diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb
index 0808166..5749699 100644
--- a/your-code/challenge-1.ipynb
+++ b/your-code/challenge-1.ipynb
@@ -66,9 +66,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'ironhack s q website is'"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"def clean_up(s):\n",
" \"\"\"\n",
@@ -79,7 +90,19 @@
"\n",
" Returns:\n",
" A string that has been cleaned up.\n",
- " \"\"\""
+ " \"\"\"\n",
+ " import re\n",
+ " \n",
+ " # remove URLs\n",
+ " s = re.sub(r'http.+', '', s)\n",
+ " \n",
+ " # remove special characters\n",
+ " s = re.sub(r'[^a-zA-Z\\s]+', ' ', s)\n",
+ " \n",
+ " return \" \".join([word for word in s.lower().split()])\n",
+ " \n",
+ "\n",
+ "clean_up(\"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\\\")\")"
]
},
{
@@ -101,10 +124,35 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package punkt to\n",
+ "[nltk_data] /Users/rickardramhoj/nltk_data...\n",
+ "[nltk_data] Package punkt is already up-to-date!\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "['ironhack', 's', 'q', 'website', 'is']"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
+ "# import nltk and tokenizer\n",
+ "import nltk\n",
+ "from nltk.tokenize import word_tokenize\n",
+ "nltk.download(\"punkt\")\n",
+ "\n",
"def tokenize(s):\n",
" \"\"\"\n",
" Tokenize a string.\n",
@@ -114,7 +162,12 @@
"\n",
" Returns:\n",
" A list of words as the result of tokenization.\n",
- " \"\"\""
+ " \"\"\"\n",
+ " \n",
+ " return [word.lower() for word in word_tokenize(s) if word.isalpha()]\n",
+ "\n",
+ "# test it\n",
+ "tokenize(\"ironhack s q website is\")"
]
},
{
@@ -145,9 +198,29 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package wordnet to\n",
+ "[nltk_data] /Users/rickardramhoj/nltk_data...\n",
+ "[nltk_data] Package wordnet is already up-to-date!\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "['ironhack', 's', 'q', 'websit', 'is']"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"def stem_and_lemmatize(l):\n",
" \"\"\"\n",
@@ -158,7 +231,33 @@
"\n",
" Returns:\n",
" A list of strings after being stemmed and lemmatized.\n",
- " \"\"\""
+ " \"\"\"\n",
+ " \n",
+ " # import libraries\n",
+ " from nltk.stem import PorterStemmer\n",
+ " nltk.download(\"wordnet\")\n",
+ " from nltk.stem import WordNetLemmatizer\n",
+ " \n",
+ " #initiate stemmer \n",
+ " stemmer = PorterStemmer()\n",
+ " \n",
+ " # initiate lemmatizer\n",
+ " lemmatizer = WordNetLemmatizer()\n",
+ " \n",
+ " # define list\n",
+ " new_list = []\n",
+ " \n",
+ " # loop words and append lemmatized and stemmed\n",
+ " for word in l:\n",
+ " lemmatized = lemmatizer.lemmatize(word)\n",
+ " new_list.append(stemmer.stem(lemmatized))\n",
+ " #new_list.append({\"stemmed\": stemmer.stem(word), \"lemmatized\": lemmatizer.lemmatize(word)})\n",
+ " \n",
+ " return new_list\n",
+ " \n",
+ "# test it\n",
+ "stem_and_lemmatize(['ironhack', 's', 'q', 'website', 'is'])\n",
+ " "
]
},
{
@@ -176,9 +275,29 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 12,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package stopwords to\n",
+ "[nltk_data] /Users/rickardramhoj/nltk_data...\n",
+ "[nltk_data] Package stopwords is already up-to-date!\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "['ironhack', 'q', 'websit']"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"def remove_stopwords(l):\n",
" \"\"\"\n",
@@ -189,7 +308,18 @@
"\n",
" Returns:\n",
" A list of strings after stop words are removed.\n",
- " \"\"\""
+ " \"\"\"\n",
+ " \n",
+ " # import stopwords\n",
+ " nltk.download(\"stopwords\")\n",
+ " from nltk.corpus import stopwords\n",
+ " \n",
+ " return [word for word in l if word not in stopwords.words()]\n",
+ "\n",
+ "# test it\n",
+ "remove_stopwords(['ironhack', 's', 'q', 'websit', 'is'])\n",
+ " \n",
+ " "
]
},
{
@@ -204,7 +334,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -218,7 +348,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.3"
+ "version": "3.7.6"
}
},
"nbformat": 4,
diff --git a/your-code/challenge-2.ipynb b/your-code/challenge-2.ipynb
index 6b0e116..b3f6d9d 100644
--- a/your-code/challenge-2.ipynb
+++ b/your-code/challenge-2.ipynb
@@ -46,11 +46,120 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " target | \n",
+ " id | \n",
+ " date | \n",
+ " flag | \n",
+ " user | \n",
+ " text | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 514293 | \n",
+ " 0 | \n",
+ " 2190584004 | \n",
+ " Tue Jun 16 03:08:48 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " Vicki_Gee | \n",
+ " i miss nikki nu nu already shes always there ... | \n",
+ "
\n",
+ " \n",
+ " | 142282 | \n",
+ " 0 | \n",
+ " 1881451988 | \n",
+ " Fri May 22 04:42:15 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " PatCashin | \n",
+ " So I had a dream last night. I remember a sig... | \n",
+ "
\n",
+ " \n",
+ " | 403727 | \n",
+ " 0 | \n",
+ " 2058252964 | \n",
+ " Sat Jun 06 14:34:17 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " deelectable | \n",
+ " @girlyghost ohh poor sickly you (((hugs)) ho... | \n",
+ "
\n",
+ " \n",
+ " | 649503 | \n",
+ " 0 | \n",
+ " 2237307600 | \n",
+ " Fri Jun 19 05:34:22 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " justinekepa | \n",
+ " it is raining again | \n",
+ "
\n",
+ " \n",
+ " | 610789 | \n",
+ " 0 | \n",
+ " 2224301193 | \n",
+ " Thu Jun 18 09:20:06 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " cmatt007 | \n",
+ " @MissKeriBaby wish I was in LA right now | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " target id date flag \\\n",
+ "514293 0 2190584004 Tue Jun 16 03:08:48 PDT 2009 NO_QUERY \n",
+ "142282 0 1881451988 Fri May 22 04:42:15 PDT 2009 NO_QUERY \n",
+ "403727 0 2058252964 Sat Jun 06 14:34:17 PDT 2009 NO_QUERY \n",
+ "649503 0 2237307600 Fri Jun 19 05:34:22 PDT 2009 NO_QUERY \n",
+ "610789 0 2224301193 Thu Jun 18 09:20:06 PDT 2009 NO_QUERY \n",
+ "\n",
+ " user text \n",
+ "514293 Vicki_Gee i miss nikki nu nu already shes always there ... \n",
+ "142282 PatCashin So I had a dream last night. I remember a sig... \n",
+ "403727 deelectable @girlyghost ohh poor sickly you (((hugs)) ho... \n",
+ "649503 justinekepa it is raining again \n",
+ "610789 cmatt007 @MissKeriBaby wish I was in LA right now "
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# your code here"
+ "# your code here\n",
+ "\n",
+ "# import pandas\n",
+ "import pandas as pd\n",
+ "\n",
+ "# load data limited to 20000 rows\n",
+ "tweets = pd.read_csv(\"/Users/rickardramhoj/Downloads/training.1600000.processed.noemoticon.csv\", names=['target','id','date','flag','user','text'], encoding = 'latin-1').sample(n=20000, random_state=1)\n",
+ "\n",
+ "# look at data\n",
+ "tweets.head()"
]
},
{
@@ -76,11 +185,234 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
+ "execution_count": 2,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package punkt to\n",
+ "[nltk_data] /Users/rickardramhoj/nltk_data...\n",
+ "[nltk_data] Package punkt is already up-to-date!\n",
+ "[nltk_data] Downloading package wordnet to\n",
+ "[nltk_data] /Users/rickardramhoj/nltk_data...\n",
+ "[nltk_data] Package wordnet is already up-to-date!\n",
+ "[nltk_data] Downloading package stopwords to\n",
+ "[nltk_data] /Users/rickardramhoj/nltk_data...\n",
+ "[nltk_data] Package stopwords is already up-to-date!\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " target | \n",
+ " id | \n",
+ " date | \n",
+ " flag | \n",
+ " user | \n",
+ " text | \n",
+ " text_processed | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 514293 | \n",
+ " 0 | \n",
+ " 2190584004 | \n",
+ " Tue Jun 16 03:08:48 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " Vicki_Gee | \n",
+ " i miss nikki nu nu already shes always there ... | \n",
+ " [miss, nikki, alreadi, alway, need, thank, xxx] | \n",
+ "
\n",
+ " \n",
+ " | 142282 | \n",
+ " 0 | \n",
+ " 1881451988 | \n",
+ " Fri May 22 04:42:15 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " PatCashin | \n",
+ " So I had a dream last night. I remember a sig... | \n",
+ " [dream, last, night, rememb, sign, clearli, to... | \n",
+ "
\n",
+ " \n",
+ " | 403727 | \n",
+ " 0 | \n",
+ " 2058252964 | \n",
+ " Sat Jun 06 14:34:17 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " deelectable | \n",
+ " @girlyghost ohh poor sickly you (((hugs)) ho... | \n",
+ " [girlyghost, ohh, poor, sickli, hug, hope, fee... | \n",
+ "
\n",
+ " \n",
+ " | 649503 | \n",
+ " 0 | \n",
+ " 2237307600 | \n",
+ " Fri Jun 19 05:34:22 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " justinekepa | \n",
+ " it is raining again | \n",
+ " [rain] | \n",
+ "
\n",
+ " \n",
+ " | 610789 | \n",
+ " 0 | \n",
+ " 2224301193 | \n",
+ " Thu Jun 18 09:20:06 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " cmatt007 | \n",
+ " @MissKeriBaby wish I was in LA right now | \n",
+ " [misskeribabi, wish, wa, right] | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " target id date flag \\\n",
+ "514293 0 2190584004 Tue Jun 16 03:08:48 PDT 2009 NO_QUERY \n",
+ "142282 0 1881451988 Fri May 22 04:42:15 PDT 2009 NO_QUERY \n",
+ "403727 0 2058252964 Sat Jun 06 14:34:17 PDT 2009 NO_QUERY \n",
+ "649503 0 2237307600 Fri Jun 19 05:34:22 PDT 2009 NO_QUERY \n",
+ "610789 0 2224301193 Thu Jun 18 09:20:06 PDT 2009 NO_QUERY \n",
+ "\n",
+ " user text \\\n",
+ "514293 Vicki_Gee i miss nikki nu nu already shes always there ... \n",
+ "142282 PatCashin So I had a dream last night. I remember a sig... \n",
+ "403727 deelectable @girlyghost ohh poor sickly you (((hugs)) ho... \n",
+ "649503 justinekepa it is raining again \n",
+ "610789 cmatt007 @MissKeriBaby wish I was in LA right now \n",
+ "\n",
+ " text_processed \n",
+ "514293 [miss, nikki, alreadi, alway, need, thank, xxx] \n",
+ "142282 [dream, last, night, rememb, sign, clearli, to... \n",
+ "403727 [girlyghost, ohh, poor, sickli, hug, hope, fee... \n",
+ "649503 [rain] \n",
+ "610789 [misskeribabi, wish, wa, right] "
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# your code here"
+ "# your code here\n",
+ "\n",
+ "# import nltk and tokenizer\n",
+ "import nltk\n",
+ "from nltk.tokenize import word_tokenize\n",
+ "nltk.download(\"punkt\")\n",
+ "from nltk.stem import PorterStemmer\n",
+ "nltk.download(\"wordnet\")\n",
+ "from nltk.stem import WordNetLemmatizer\n",
+ "nltk.download(\"stopwords\")\n",
+ "from nltk.corpus import stopwords\n",
+ "\n",
+ "# define functions\n",
+ "def clean_up(s):\n",
+ " \"\"\"\n",
+ " Cleans up numbers, URLs, and special characters from a string.\n",
+ "\n",
+ " Args:\n",
+ " s: The string to be cleaned up.\n",
+ "\n",
+ " Returns:\n",
+ " A string that has been cleaned up.\n",
+ " \"\"\"\n",
+ " import re\n",
+ " \n",
+ " # remove URLs\n",
+ " s = re.sub(r'http.+', '', s)\n",
+ " \n",
+ " # remove special characters\n",
+ " s = re.sub(r'[^a-zA-Z\\s]+', ' ', s)\n",
+ " \n",
+ " return \" \".join([word for word in s.lower().split()])\n",
+ "\n",
+ "def tokenize(s):\n",
+ " \"\"\"\n",
+ " Tokenize a string.\n",
+ "\n",
+ " Args:\n",
+ " s: String to be tokenized.\n",
+ "\n",
+ " Returns:\n",
+ " A list of words as the result of tokenization.\n",
+ " \"\"\"\n",
+ " \n",
+ " return [word.lower() for word in word_tokenize(s) if word.isalpha()]\n",
+ "\n",
+ "def stem_and_lemmatize(l):\n",
+ " \"\"\"\n",
+ " Perform stemming and lemmatization on a list of words.\n",
+ "\n",
+ " Args:\n",
+ " l: A list of strings.\n",
+ "\n",
+ " Returns:\n",
+ " A list of strings after being stemmed and lemmatized.\n",
+ " \"\"\"\n",
+ " \n",
+ " #initiate stemmer \n",
+ " stemmer = PorterStemmer()\n",
+ " \n",
+ " # initiate lemmatizer\n",
+ " lemmatizer = WordNetLemmatizer()\n",
+ " \n",
+ " # define list\n",
+ " new_list = []\n",
+ " \n",
+ " # loop words and append lemmatized and stemmed\n",
+ " for word in l:\n",
+ " lemmatized = lemmatizer.lemmatize(word)\n",
+ " new_list.append(stemmer.stem(lemmatized))\n",
+ " #new_list.append({\"stemmed\": stemmer.stem(word), \"lemmatized\": lemmatizer.lemmatize(word)})\n",
+ " \n",
+ " return new_list\n",
+ "\n",
+ "def remove_stopwords(l):\n",
+ " \"\"\"\n",
+ " Remove English stopwords from a list of strings.\n",
+ "\n",
+ " Args:\n",
+ " l: A list of strings.\n",
+ "\n",
+ " Returns:\n",
+ " A list of strings after stop words are removed.\n",
+ " \"\"\"\n",
+ " \n",
+ " return [word for word in l if word not in stopwords.words()]\n",
+ "\n",
+ "tweets[\"text_processed\"] = tweets[\"text\"].apply(clean_up)\n",
+ "tweets[\"text_processed\"] = tweets[\"text_processed\"].apply(tokenize)\n",
+ "tweets[\"text_processed\"] = tweets[\"text_processed\"].apply(stem_and_lemmatize)\n",
+ "tweets[\"text_processed\"] = tweets[\"text_processed\"].apply(remove_stopwords)\n",
+ "\n",
+ "# check dataframe\n",
+ "tweets.head()"
]
},
{
@@ -98,11 +430,1034 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['go',\n",
+ " 'get',\n",
+ " 'wa',\n",
+ " 'day',\n",
+ " 'good',\n",
+ " 'like',\n",
+ " 'love',\n",
+ " 'work',\n",
+ " 'got',\n",
+ " 'quot',\n",
+ " 'time',\n",
+ " 'today',\n",
+ " 'miss',\n",
+ " 'lol',\n",
+ " 'thank',\n",
+ " 'back',\n",
+ " 'realli',\n",
+ " 'know',\n",
+ " 'think',\n",
+ " 'see',\n",
+ " 'feel',\n",
+ " 'amp',\n",
+ " 'watch',\n",
+ " 'need',\n",
+ " 'still',\n",
+ " 'night',\n",
+ " 'well',\n",
+ " 'make',\n",
+ " 'hope',\n",
+ " 'oh',\n",
+ " 'home',\n",
+ " 'new',\n",
+ " 'look',\n",
+ " 'twitter',\n",
+ " 'much',\n",
+ " 'last',\n",
+ " 'wish',\n",
+ " 'morn',\n",
+ " 'great',\n",
+ " 'wait',\n",
+ " 'sad',\n",
+ " 'tomorrow',\n",
+ " 'would',\n",
+ " 'sleep',\n",
+ " 'haha',\n",
+ " 'right',\n",
+ " 'whi',\n",
+ " 'follow',\n",
+ " 'thing',\n",
+ " 'fun',\n",
+ " 'tonight',\n",
+ " 'onli',\n",
+ " 'happi',\n",
+ " 'week',\n",
+ " 'friend',\n",
+ " 'bad',\n",
+ " 'nice',\n",
+ " 'veri',\n",
+ " 'sorri',\n",
+ " 'hi',\n",
+ " 'say',\n",
+ " 'way',\n",
+ " 'better',\n",
+ " 'school',\n",
+ " 'could',\n",
+ " 'hate',\n",
+ " 'yeah',\n",
+ " 'bed',\n",
+ " 'start',\n",
+ " 'tweet',\n",
+ " 'peopl',\n",
+ " 'gon',\n",
+ " 'though',\n",
+ " 'hour',\n",
+ " 'show',\n",
+ " 'even',\n",
+ " 'guy',\n",
+ " 'weekend',\n",
+ " 'ye',\n",
+ " 'life',\n",
+ " 'play',\n",
+ " 'everyon',\n",
+ " 'let',\n",
+ " 'final',\n",
+ " 'cant',\n",
+ " 'littl',\n",
+ " 'hey',\n",
+ " 'wan',\n",
+ " 'use',\n",
+ " 'movi',\n",
+ " 'lt',\n",
+ " 'rain',\n",
+ " 'sick',\n",
+ " 'year',\n",
+ " 'first',\n",
+ " 'ok',\n",
+ " 'tire',\n",
+ " 'find',\n",
+ " 'awesom',\n",
+ " 'x',\n",
+ " 'never',\n",
+ " 'next',\n",
+ " 'best',\n",
+ " 'soon',\n",
+ " 'ani',\n",
+ " 'talk',\n",
+ " 'call',\n",
+ " 'done',\n",
+ " 'head',\n",
+ " 'phone',\n",
+ " 'sure',\n",
+ " 'dont',\n",
+ " 'long',\n",
+ " 'alreadi',\n",
+ " 'help',\n",
+ " 'alway',\n",
+ " 'anoth',\n",
+ " 'hurt',\n",
+ " 'mayb',\n",
+ " 'keep',\n",
+ " 'pleas',\n",
+ " 'cool',\n",
+ " 'lot',\n",
+ " 'yay',\n",
+ " 'song',\n",
+ " 'someth',\n",
+ " 'girl',\n",
+ " 'eat',\n",
+ " 'made',\n",
+ " 'bore',\n",
+ " 'old',\n",
+ " 'enjoy',\n",
+ " 'suck',\n",
+ " 'leav',\n",
+ " 'yet',\n",
+ " 'ever',\n",
+ " 'read',\n",
+ " 'thought',\n",
+ " 'becaus',\n",
+ " 'lost',\n",
+ " 'ur',\n",
+ " 'readi',\n",
+ " 'away',\n",
+ " 'pretti',\n",
+ " 'actual',\n",
+ " 'finish',\n",
+ " 'live',\n",
+ " 'sound',\n",
+ " 'hous',\n",
+ " 'went',\n",
+ " 'excit',\n",
+ " 'listen',\n",
+ " 'hear',\n",
+ " 'game',\n",
+ " 'someon',\n",
+ " 'summer',\n",
+ " 'happen',\n",
+ " 'w',\n",
+ " 'left',\n",
+ " 'guess',\n",
+ " 'earli',\n",
+ " 'noth',\n",
+ " 'wonder',\n",
+ " 'befor',\n",
+ " 'b',\n",
+ " 'late',\n",
+ " 'babi',\n",
+ " 'job',\n",
+ " 'sinc',\n",
+ " 'doe',\n",
+ " 'tell',\n",
+ " 'omg',\n",
+ " 'glad',\n",
+ " 'ugh',\n",
+ " 'gone',\n",
+ " 'weather',\n",
+ " 'car',\n",
+ " 'mom',\n",
+ " 'kid',\n",
+ " 'p',\n",
+ " 'wow',\n",
+ " 'later',\n",
+ " 'saw',\n",
+ " 'beauti',\n",
+ " 'damn',\n",
+ " 'r',\n",
+ " 'amaz',\n",
+ " 'bit',\n",
+ " 'birthday',\n",
+ " 'stop',\n",
+ " 'mean',\n",
+ " 'big',\n",
+ " 'said',\n",
+ " 'check',\n",
+ " 'hot',\n",
+ " 'parti',\n",
+ " 'give',\n",
+ " 'sun',\n",
+ " 'god',\n",
+ " 'run',\n",
+ " 'may',\n",
+ " 'two',\n",
+ " 'updat',\n",
+ " 'shit',\n",
+ " 'must',\n",
+ " 'seem',\n",
+ " 'move',\n",
+ " 'world',\n",
+ " 'hard',\n",
+ " 'put',\n",
+ " 'iphon',\n",
+ " 'free',\n",
+ " 'cold',\n",
+ " 'luck',\n",
+ " 'yesterday',\n",
+ " 'th',\n",
+ " 'studi',\n",
+ " 'stay',\n",
+ " 'might',\n",
+ " 'meet',\n",
+ " 'month',\n",
+ " 'found',\n",
+ " 'book',\n",
+ " 'boy',\n",
+ " 'music',\n",
+ " 'friday',\n",
+ " 'lunch',\n",
+ " 'gt',\n",
+ " 'mani',\n",
+ " 'woke',\n",
+ " 'exam',\n",
+ " 'fuck',\n",
+ " 'aww',\n",
+ " 'buy',\n",
+ " 'shop',\n",
+ " 'tho',\n",
+ " 'around',\n",
+ " 'least',\n",
+ " 'monday',\n",
+ " 'post',\n",
+ " 'stuff',\n",
+ " 'clean',\n",
+ " 'cri',\n",
+ " 'famili',\n",
+ " 'okay',\n",
+ " 'anyth',\n",
+ " 'video',\n",
+ " 'xx',\n",
+ " 'forward',\n",
+ " 'drink',\n",
+ " 'welcom',\n",
+ " 'everyth',\n",
+ " 'sooo',\n",
+ " 'pictur',\n",
+ " 'busi',\n",
+ " 'believ',\n",
+ " 'hahaha',\n",
+ " 'drive',\n",
+ " 'food',\n",
+ " 'train',\n",
+ " 'stupid',\n",
+ " 'walk',\n",
+ " 'everi',\n",
+ " 'anyon',\n",
+ " 'sweet',\n",
+ " 'sunday',\n",
+ " 'probabl',\n",
+ " 'outsid',\n",
+ " 'win',\n",
+ " 'turn',\n",
+ " 'plan',\n",
+ " 'poor',\n",
+ " 'chang',\n",
+ " 'almost',\n",
+ " 'write',\n",
+ " 'hair',\n",
+ " 'far',\n",
+ " 'dad',\n",
+ " 'wrong',\n",
+ " 'dream',\n",
+ " 'place',\n",
+ " 'real',\n",
+ " 'cute',\n",
+ " 'kill',\n",
+ " 'ask',\n",
+ " 'rememb',\n",
+ " 'tv',\n",
+ " 'goodnight',\n",
+ " 'caus',\n",
+ " 'fan',\n",
+ " 'blog',\n",
+ " 'repli',\n",
+ " 'wake',\n",
+ " 'rest',\n",
+ " 'funni',\n",
+ " 'total',\n",
+ " 'quit',\n",
+ " 'eye',\n",
+ " 'worri',\n",
+ " 'anymor',\n",
+ " 'class',\n",
+ " 'room',\n",
+ " 'came',\n",
+ " 'hit',\n",
+ " 'dinner',\n",
+ " 'money',\n",
+ " 'true',\n",
+ " 'mother',\n",
+ " 'without',\n",
+ " 'sister',\n",
+ " 'els',\n",
+ " 'hang',\n",
+ " 'send',\n",
+ " 'offic',\n",
+ " 'news',\n",
+ " 'brother',\n",
+ " 'word',\n",
+ " 'seen',\n",
+ " 'whole',\n",
+ " 'danc',\n",
+ " 'aw',\n",
+ " 'onc',\n",
+ " 'open',\n",
+ " 'either',\n",
+ " 'vote',\n",
+ " 'took',\n",
+ " 'link',\n",
+ " 'pain',\n",
+ " 'break',\n",
+ " 'person',\n",
+ " 'headach',\n",
+ " 'coffe',\n",
+ " 'www',\n",
+ " 'half',\n",
+ " 'hehe',\n",
+ " 'saturday',\n",
+ " 'idea',\n",
+ " 'hug',\n",
+ " 'st',\n",
+ " 'bring',\n",
+ " 'hello',\n",
+ " 'anyway',\n",
+ " 'photo',\n",
+ " 'ah',\n",
+ " 'onlin',\n",
+ " 'text',\n",
+ " 'abl',\n",
+ " 'enough',\n",
+ " 'g',\n",
+ " 'set',\n",
+ " 'close',\n",
+ " 'full',\n",
+ " 'crap',\n",
+ " 'awww',\n",
+ " 'kinda',\n",
+ " 'jealou',\n",
+ " 'cours',\n",
+ " 'trip',\n",
+ " 'reason',\n",
+ " 'dude',\n",
+ " 'crazi',\n",
+ " 'fall',\n",
+ " 'comput',\n",
+ " 'fix',\n",
+ " 'season',\n",
+ " 'heard',\n",
+ " 'pm',\n",
+ " 'forgot',\n",
+ " 'didnt',\n",
+ " 'kind',\n",
+ " 'fine',\n",
+ " 'site',\n",
+ " 'heart',\n",
+ " 'wont',\n",
+ " 'interest',\n",
+ " 'bought',\n",
+ " 'high',\n",
+ " 'mr',\n",
+ " 'add',\n",
+ " 'visit',\n",
+ " 'rock',\n",
+ " 'pay',\n",
+ " 'awak',\n",
+ " 'favorit',\n",
+ " 'relax',\n",
+ " 'problem',\n",
+ " 'sunni',\n",
+ " 'super',\n",
+ " 'star',\n",
+ " 'red',\n",
+ " 'beach',\n",
+ " 'line',\n",
+ " 'learn',\n",
+ " 'ago',\n",
+ " 'soo',\n",
+ " 'cuz',\n",
+ " 'asleep',\n",
+ " 'dead',\n",
+ " 'smile',\n",
+ " 'mileycyru',\n",
+ " 'sign',\n",
+ " 'fail',\n",
+ " 'hell',\n",
+ " 'sore',\n",
+ " 'tommcfli',\n",
+ " 'ride',\n",
+ " 'math',\n",
+ " 'afternoon',\n",
+ " 'lose',\n",
+ " 'part',\n",
+ " 'serious',\n",
+ " 'mind',\n",
+ " 'lucki',\n",
+ " 'drop',\n",
+ " 'power',\n",
+ " 'congrat',\n",
+ " 'definit',\n",
+ " 'concert',\n",
+ " 'facebook',\n",
+ " 'ticket',\n",
+ " 'ladi',\n",
+ " 'instead',\n",
+ " 'email',\n",
+ " 'shower',\n",
+ " 'hand',\n",
+ " 'mad',\n",
+ " 'offici',\n",
+ " 'youtub',\n",
+ " 'breakfast',\n",
+ " 'short',\n",
+ " 'internet',\n",
+ " 'boo',\n",
+ " 'broke',\n",
+ " 'wear',\n",
+ " 'sometim',\n",
+ " 'order',\n",
+ " 'nd',\n",
+ " 'dear',\n",
+ " 'bet',\n",
+ " 'agre',\n",
+ " 'perfect',\n",
+ " 'btw',\n",
+ " 'park',\n",
+ " 'award',\n",
+ " 'gym',\n",
+ " 'catch',\n",
+ " 'togeth',\n",
+ " 'test',\n",
+ " 'team',\n",
+ " 'suppos',\n",
+ " 'figur',\n",
+ " 'sat',\n",
+ " 'homework',\n",
+ " 'mention',\n",
+ " 'june',\n",
+ " 'alon',\n",
+ " 'beat',\n",
+ " 'nap',\n",
+ " 'pack',\n",
+ " 'soooo',\n",
+ " 'pick',\n",
+ " 'yea',\n",
+ " 'sing',\n",
+ " 'lmao',\n",
+ " 'nite',\n",
+ " 'xxx',\n",
+ " 'second',\n",
+ " 'album',\n",
+ " 'stuck',\n",
+ " 'store',\n",
+ " 'wed',\n",
+ " 'sigh',\n",
+ " 'goin',\n",
+ " 'ice',\n",
+ " 'vacat',\n",
+ " 'air',\n",
+ " 'upload',\n",
+ " 'coupl',\n",
+ " 'water',\n",
+ " 'hungri',\n",
+ " 'cook',\n",
+ " 'cousin',\n",
+ " 'xd',\n",
+ " 'dress',\n",
+ " 'easi',\n",
+ " 'side',\n",
+ " 'laptop',\n",
+ " 'account',\n",
+ " 'holiday',\n",
+ " 'wors',\n",
+ " 'revis',\n",
+ " 'bless',\n",
+ " 'foot',\n",
+ " 'decid',\n",
+ " 'join',\n",
+ " 'mood',\n",
+ " 'spend',\n",
+ " 'point',\n",
+ " 'moment',\n",
+ " 'window',\n",
+ " 'stori',\n",
+ " 'save',\n",
+ " 'graduat',\n",
+ " 'past',\n",
+ " 'top',\n",
+ " 'sleepi',\n",
+ " 'ipod',\n",
+ " 'f',\n",
+ " 'current',\n",
+ " 'understand',\n",
+ " 'yep',\n",
+ " 'throat',\n",
+ " 'chanc',\n",
+ " 'town',\n",
+ " 'differ',\n",
+ " 'hmm',\n",
+ " 'told',\n",
+ " 'bout',\n",
+ " 'mtv',\n",
+ " 'cream',\n",
+ " 'annoy',\n",
+ " 'answer',\n",
+ " 'age',\n",
+ " 'mac',\n",
+ " 'forget',\n",
+ " 'road',\n",
+ " 'thursday',\n",
+ " 'tour',\n",
+ " 'fast',\n",
+ " 'load',\n",
+ " 'celebr',\n",
+ " 'realiz',\n",
+ " 'jonasbroth',\n",
+ " 'shoot',\n",
+ " 'camp',\n",
+ " 'scare',\n",
+ " 'cut',\n",
+ " 'co',\n",
+ " 'knew',\n",
+ " 'date',\n",
+ " 'ddlovato',\n",
+ " 'shirt',\n",
+ " 'episod',\n",
+ " 'fli',\n",
+ " 'chat',\n",
+ " 'ahhh',\n",
+ " 'complet',\n",
+ " 'kick',\n",
+ " 'fb',\n",
+ " 'lazi',\n",
+ " 'ive',\n",
+ " 'especi',\n",
+ " 'card',\n",
+ " 'ppl',\n",
+ " 'mum',\n",
+ " 'rather',\n",
+ " 'black',\n",
+ " 'download',\n",
+ " 'list',\n",
+ " 'tea',\n",
+ " 'chocol',\n",
+ " 'yr',\n",
+ " 'count',\n",
+ " 'expect',\n",
+ " 'cancel',\n",
+ " 'flight',\n",
+ " 'appl',\n",
+ " 'ate',\n",
+ " 'number',\n",
+ " 'film',\n",
+ " 'state',\n",
+ " 'bye',\n",
+ " 'pizza',\n",
+ " 'worst',\n",
+ " 'wtf',\n",
+ " 'wine',\n",
+ " 'juli',\n",
+ " 'present',\n",
+ " 'meant',\n",
+ " 'english',\n",
+ " 'manag',\n",
+ " 'share',\n",
+ " 'colleg',\n",
+ " 'sent',\n",
+ " 'servic',\n",
+ " 'ach',\n",
+ " 'question',\n",
+ " 'flu',\n",
+ " 'depress',\n",
+ " 'nope',\n",
+ " 'freak',\n",
+ " 'laugh',\n",
+ " 'smell',\n",
+ " 'unfortun',\n",
+ " 'due',\n",
+ " 'woman',\n",
+ " 'beer',\n",
+ " 'messag',\n",
+ " 'bitch',\n",
+ " 'worth',\n",
+ " 'search',\n",
+ " 'websit',\n",
+ " 'bike',\n",
+ " 'comment',\n",
+ " 'touch',\n",
+ " 'shoe',\n",
+ " 'parent',\n",
+ " 'church',\n",
+ " 'sunshin',\n",
+ " 'swim',\n",
+ " 'miley',\n",
+ " 'boyfriend',\n",
+ " 'pool',\n",
+ " 'stomach',\n",
+ " 'lil',\n",
+ " 'burn',\n",
+ " 'support',\n",
+ " 'followfriday',\n",
+ " 'mess',\n",
+ " 'lay',\n",
+ " 'weird',\n",
+ " 'leg',\n",
+ " 'bum',\n",
+ " 'appar',\n",
+ " 'becom',\n",
+ " 'father',\n",
+ " 'cake',\n",
+ " 'hill',\n",
+ " 'case',\n",
+ " 'airport',\n",
+ " 'event',\n",
+ " 'shame',\n",
+ " 'hr',\n",
+ " 'london',\n",
+ " 'surpris',\n",
+ " 'round',\n",
+ " 'tummi',\n",
+ " 'hmmm',\n",
+ " 'voic',\n",
+ " 'broken',\n",
+ " 'blue',\n",
+ " 'safe',\n",
+ " 'spent',\n",
+ " 'fell',\n",
+ " 'possibl',\n",
+ " 'practic',\n",
+ " 'type',\n",
+ " 'usual',\n",
+ " 'yummi',\n",
+ " 'insid',\n",
+ " 'stress',\n",
+ " 'warm',\n",
+ " 'absolut',\n",
+ " 'normal',\n",
+ " 'alright',\n",
+ " 'cheer',\n",
+ " 'moon',\n",
+ " 'idk',\n",
+ " 'hold',\n",
+ " 'horribl',\n",
+ " 'stand',\n",
+ " 'near',\n",
+ " 'page',\n",
+ " 'chill',\n",
+ " 'note',\n",
+ " 'shot',\n",
+ " 'doctor',\n",
+ " 'uk',\n",
+ " 'xoxo',\n",
+ " 'club',\n",
+ " 'fire',\n",
+ " 'pray',\n",
+ " 'huge',\n",
+ " 'taken',\n",
+ " 'ball',\n",
+ " 'cup',\n",
+ " 'slow',\n",
+ " 'shall',\n",
+ " 'cd',\n",
+ " 'scari',\n",
+ " 'deserv',\n",
+ " 'disappoint',\n",
+ " 'block',\n",
+ " 'box',\n",
+ " 'pass',\n",
+ " 'exactli',\n",
+ " 'green',\n",
+ " 'sooooo',\n",
+ " 'bro',\n",
+ " 'band',\n",
+ " 'paper',\n",
+ " 'confus',\n",
+ " 'terribl',\n",
+ " 'myspac',\n",
+ " 'ear',\n",
+ " 'tan',\n",
+ " 'notic',\n",
+ " 'except',\n",
+ " 'twitpic',\n",
+ " 'googl',\n",
+ " 'issu',\n",
+ " 'low',\n",
+ " 'doesnt',\n",
+ " 'lie',\n",
+ " 'fair',\n",
+ " 'sadli',\n",
+ " 'mile',\n",
+ " 'mate',\n",
+ " 'project',\n",
+ " 'app',\n",
+ " 'pop',\n",
+ " 'countri',\n",
+ " 'wit',\n",
+ " 'longer',\n",
+ " 'crash',\n",
+ " 'raini',\n",
+ " 'arm',\n",
+ " 'david',\n",
+ " 'cover',\n",
+ " 'sell',\n",
+ " 'wednesday',\n",
+ " 'sim',\n",
+ " 'shift',\n",
+ " 'finger',\n",
+ " 'woo',\n",
+ " 'hubbi',\n",
+ " 'fit',\n",
+ " 'gorgeou',\n",
+ " 'gosh',\n",
+ " 'ff',\n",
+ " 'hangov',\n",
+ " 'caught',\n",
+ " 'mail',\n",
+ " 'return',\n",
+ " 'luv',\n",
+ " 'fight',\n",
+ " 'earlier',\n",
+ " 'yup',\n",
+ " 'plu',\n",
+ " 'lame',\n",
+ " 'gave',\n",
+ " 'ahead',\n",
+ " 'invit',\n",
+ " 'connect',\n",
+ " 'act',\n",
+ " 'special',\n",
+ " 'nearli',\n",
+ " 'isnt',\n",
+ " 'prepar',\n",
+ " 'watchin',\n",
+ " 'via',\n",
+ " 'white',\n",
+ " 'taylor',\n",
+ " 'sort',\n",
+ " 'interview',\n",
+ " 'blackberri',\n",
+ " 'recommend',\n",
+ " 'tom',\n",
+ " 'argh',\n",
+ " 'guitar',\n",
+ " 'bill',\n",
+ " 'front',\n",
+ " 'offer',\n",
+ " 'fact',\n",
+ " 'piec',\n",
+ " 'upset',\n",
+ " 'joy',\n",
+ " 'glass',\n",
+ " 'radio',\n",
+ " 'tuesday',\n",
+ " 'xo',\n",
+ " 'matter',\n",
+ " 'fill',\n",
+ " 'inde',\n",
+ " 'exhaust',\n",
+ " 'self',\n",
+ " 'congratul',\n",
+ " 'avail',\n",
+ " 'deal',\n",
+ " 'lone',\n",
+ " 'chees',\n",
+ " 'version',\n",
+ " 'john',\n",
+ " 'breath',\n",
+ " 'proud',\n",
+ " 'product',\n",
+ " 'gig',\n",
+ " 'teeth',\n",
+ " 'blah',\n",
+ " 'quiet',\n",
+ " 'kiss',\n",
+ " 'view',\n",
+ " 'drunk',\n",
+ " 'fever',\n",
+ " 'singl',\n",
+ " 'forev',\n",
+ " 'folk',\n",
+ " 'gettin',\n",
+ " 'pink',\n",
+ " 'bird',\n",
+ " 'promis',\n",
+ " 'goodby',\n",
+ " 'everybodi',\n",
+ " 'experi',\n",
+ " 'quick',\n",
+ " 'import',\n",
+ " 'tune',\n",
+ " 'although',\n",
+ " 'daughter',\n",
+ " 'sexi',\n",
+ " 'sky',\n",
+ " 'aint',\n",
+ " 'grow',\n",
+ " 'small',\n",
+ " 'fantast',\n",
+ " 'bar',\n",
+ " 'three',\n",
+ " 'design',\n",
+ " 'nose',\n",
+ " 'huh',\n",
+ " 'thx',\n",
+ " 'lesson',\n",
+ " 'fish',\n",
+ " 'bday',\n",
+ " 'arriv',\n",
+ " 'storm',\n",
+ " 'mix',\n",
+ " 'seat',\n",
+ " 'lake',\n",
+ " 'daddi',\n",
+ " 'fav',\n",
+ " 'piss',\n",
+ " 'blood',\n",
+ " 'knee',\n",
+ " 'child',\n",
+ " 'havent',\n",
+ " 'middl',\n",
+ " 'memori',\n",
+ " 'bag',\n",
+ " 'group',\n",
+ " 'dvd',\n",
+ " 'dure',\n",
+ " 'art',\n",
+ " 'behind',\n",
+ " 'jack',\n",
+ " 'paint',\n",
+ " 'hospit',\n",
+ " 'swine',\n",
+ " 'futur',\n",
+ " 'perform',\n",
+ " 'magic',\n",
+ " 'cloth',\n",
+ " 'remind',\n",
+ " 'stick',\n",
+ " 'web',\n",
+ " 'australia',\n",
+ " 'releas',\n",
+ " 'dentist',\n",
+ " 'land',\n",
+ " 'appreci',\n",
+ " 'session',\n",
+ " 'ring',\n",
+ " 'bloodi',\n",
+ " 'track',\n",
+ " 'info',\n",
+ " 'peac',\n",
+ " 'ouch',\n",
+ " 'delet',\n",
+ " 'channel',\n",
+ " 'nail',\n",
+ " 'race',\n",
+ " 'addict',\n",
+ " 'ran',\n",
+ " 'ny',\n",
+ " 'bbq',\n",
+ " 'gut',\n",
+ " 'trend',\n",
+ " 'cooki',\n",
+ " 'tear',\n",
+ " 'camera',\n",
+ " 'joe',\n",
+ " 'french',\n",
+ " 'roll',\n",
+ " 'buddi',\n",
+ " 'system',\n",
+ " 'edit',\n",
+ " 'doubl',\n",
+ " 'twilight',\n",
+ " 'puppi',\n",
+ " 'tast',\n",
+ " 'ng',\n",
+ " 'nobodi',\n",
+ " 'histori',\n",
+ " 'jon',\n",
+ " 'chicken',\n",
+ " 'compani',\n",
+ " 'plz',\n",
+ " 'mommi',\n",
+ " 'feelin',\n",
+ " 'record',\n",
+ " 'light',\n",
+ " 'ohh',\n",
+ " 'dr',\n",
+ " 'young',\n",
+ " 'rule',\n",
+ " 'form',\n",
+ " 'garden',\n",
+ " 'neck',\n",
+ " 'felt',\n",
+ " 'badli',\n",
+ " 'dark',\n",
+ " 'street',\n",
+ " 'slept',\n",
+ " 'local',\n",
+ " 'continu',\n",
+ " 'adam',\n",
+ " 'yum',\n",
+ " 'shut',\n",
+ " 'ahh',\n",
+ " 'fri',\n",
+ " 'brain',\n",
+ " 'profil',\n",
+ " 'gay',\n",
+ " 'teacher',\n",
+ " 'davidarchi',\n",
+ " 'file',\n",
+ " 'jk',\n",
+ " 'travel',\n",
+ " 'nyc',\n",
+ " 'natur',\n",
+ " 'vip',\n",
+ " 'dunno',\n",
+ " 'suggest',\n",
+ " 'whatev',\n",
+ " 'dm',\n",
+ " 'allow',\n",
+ " 'trek',\n",
+ " 'inspir',\n",
+ " 'extra',\n",
+ " 'board',\n",
+ " 'four',\n",
+ " 'shoulder',\n",
+ " 'contact',\n",
+ " 'bowl',\n",
+ " 'space',\n",
+ " 'pl',\n",
+ " 'twit',\n",
+ " 'ooh',\n",
+ " 'entertain',\n",
+ " 'king',\n",
+ " 'babe',\n",
+ " 'rob',\n",
+ " 'laker',\n",
+ " 'tree',\n",
+ " 'festiv',\n",
+ " 'ruin',\n",
+ " 'random',\n",
+ " 'joke',\n",
+ " 'bear',\n",
+ " 'hahah',\n",
+ " 'gift',\n",
+ " 'trailer',\n",
+ " 'marri',\n",
+ " 'midnight',\n",
+ " 'bunch',\n",
+ " 'forc',\n",
+ " 'vega',\n",
+ " 'pc',\n",
+ " 'entir',\n",
+ " 'report',\n",
+ " 'essay',\n",
+ " 'shine',\n",
+ " 'plane',\n",
+ " 'apart',\n",
+ " 'fam',\n",
+ " 'alot',\n",
+ " 'kate',\n",
+ " 'wind',\n",
+ " 'inform',\n",
+ " 'paid',\n",
+ " 'hannah',\n",
+ " 'bright',\n",
+ " 'eh',\n",
+ " 'cloud',\n",
+ " 'along',\n",
+ " 'nick',\n",
+ " 'chines',\n",
+ " 'seriou',\n",
+ " 'mmm',\n",
+ " 'seri',\n",
+ " 'thru',\n",
+ " 'teach',\n",
+ " 'chillin',\n",
+ " 'bb',\n",
+ " 'key',\n",
+ " 'begin',\n",
+ " 'rip',\n",
+ " 'server',\n",
+ " 'mama',\n",
+ " 'sum',\n",
+ " ...]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# your code here"
+ "# your code here\n",
+ "\n",
+ "# put together all words in a list\n",
+ "words = [item for sublist in tweets[\"text_processed\"] for item in sublist]\n",
+ "\n",
+ "# get frequency distribution\n",
+ "freqs = nltk.FreqDist(words)\n",
+ "\n",
+ "# select top 5000\n",
+ "top5000 = list(freqs)[:5000]\n",
+ "#top5000 = freqs.most_common(5000)\n",
+ "\n",
+ "top5000\n"
]
},
{
@@ -167,11 +1522,1050 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "({'go': False,\n",
+ " 'get': False,\n",
+ " 'wa': False,\n",
+ " 'day': False,\n",
+ " 'good': True,\n",
+ " 'like': False,\n",
+ " 'love': False,\n",
+ " 'work': False,\n",
+ " 'got': False,\n",
+ " 'quot': False,\n",
+ " 'time': False,\n",
+ " 'today': True,\n",
+ " 'miss': False,\n",
+ " 'lol': False,\n",
+ " 'thank': False,\n",
+ " 'back': False,\n",
+ " 'realli': False,\n",
+ " 'know': False,\n",
+ " 'think': False,\n",
+ " 'see': False,\n",
+ " 'feel': False,\n",
+ " 'amp': False,\n",
+ " 'watch': False,\n",
+ " 'need': True,\n",
+ " 'still': False,\n",
+ " 'night': False,\n",
+ " 'well': False,\n",
+ " 'make': False,\n",
+ " 'hope': False,\n",
+ " 'oh': False,\n",
+ " 'home': False,\n",
+ " 'new': False,\n",
+ " 'look': False,\n",
+ " 'twitter': False,\n",
+ " 'much': False,\n",
+ " 'last': False,\n",
+ " 'wish': False,\n",
+ " 'morn': False,\n",
+ " 'great': False,\n",
+ " 'wait': False,\n",
+ " 'sad': False,\n",
+ " 'tomorrow': False,\n",
+ " 'would': False,\n",
+ " 'sleep': False,\n",
+ " 'haha': False,\n",
+ " 'right': False,\n",
+ " 'whi': False,\n",
+ " 'follow': False,\n",
+ " 'thing': False,\n",
+ " 'fun': False,\n",
+ " 'tonight': False,\n",
+ " 'onli': False,\n",
+ " 'happi': True,\n",
+ " 'week': False,\n",
+ " 'friend': False,\n",
+ " 'bad': False,\n",
+ " 'nice': False,\n",
+ " 'veri': False,\n",
+ " 'sorri': False,\n",
+ " 'hi': False,\n",
+ " 'say': False,\n",
+ " 'way': False,\n",
+ " 'better': False,\n",
+ " 'school': False,\n",
+ " 'could': False,\n",
+ " 'hate': False,\n",
+ " 'yeah': False,\n",
+ " 'bed': False,\n",
+ " 'start': False,\n",
+ " 'tweet': False,\n",
+ " 'peopl': False,\n",
+ " 'gon': False,\n",
+ " 'though': False,\n",
+ " 'hour': False,\n",
+ " 'show': False,\n",
+ " 'even': False,\n",
+ " 'guy': False,\n",
+ " 'weekend': False,\n",
+ " 'ye': False,\n",
+ " 'life': False,\n",
+ " 'play': False,\n",
+ " 'everyon': False,\n",
+ " 'let': False,\n",
+ " 'final': False,\n",
+ " 'cant': False,\n",
+ " 'littl': False,\n",
+ " 'hey': False,\n",
+ " 'wan': False,\n",
+ " 'use': False,\n",
+ " 'movi': False,\n",
+ " 'lt': False,\n",
+ " 'rain': False,\n",
+ " 'sick': False,\n",
+ " 'year': False,\n",
+ " 'first': False,\n",
+ " 'ok': False,\n",
+ " 'tire': False,\n",
+ " 'find': True,\n",
+ " 'awesom': False,\n",
+ " 'x': False,\n",
+ " 'never': False,\n",
+ " 'next': False,\n",
+ " 'best': False,\n",
+ " 'soon': False,\n",
+ " 'ani': False,\n",
+ " 'talk': False,\n",
+ " 'call': False,\n",
+ " 'done': False,\n",
+ " 'head': False,\n",
+ " 'phone': False,\n",
+ " 'sure': False,\n",
+ " 'dont': False,\n",
+ " 'long': False,\n",
+ " 'alreadi': False,\n",
+ " 'help': False,\n",
+ " 'alway': False,\n",
+ " 'anoth': False,\n",
+ " 'hurt': False,\n",
+ " 'mayb': False,\n",
+ " 'keep': False,\n",
+ " 'pleas': False,\n",
+ " 'cool': False,\n",
+ " 'lot': False,\n",
+ " 'yay': False,\n",
+ " 'song': False,\n",
+ " 'someth': False,\n",
+ " 'girl': False,\n",
+ " 'eat': False,\n",
+ " 'made': False,\n",
+ " 'bore': False,\n",
+ " 'old': False,\n",
+ " 'enjoy': False,\n",
+ " 'suck': False,\n",
+ " 'leav': False,\n",
+ " 'yet': False,\n",
+ " 'ever': False,\n",
+ " 'read': False,\n",
+ " 'thought': False,\n",
+ " 'becaus': False,\n",
+ " 'lost': True,\n",
+ " 'ur': False,\n",
+ " 'readi': False,\n",
+ " 'away': False,\n",
+ " 'pretti': False,\n",
+ " 'actual': False,\n",
+ " 'finish': False,\n",
+ " 'live': False,\n",
+ " 'sound': False,\n",
+ " 'hous': False,\n",
+ " 'went': False,\n",
+ " 'excit': False,\n",
+ " 'listen': False,\n",
+ " 'hear': False,\n",
+ " 'game': False,\n",
+ " 'someon': False,\n",
+ " 'summer': False,\n",
+ " 'happen': False,\n",
+ " 'w': False,\n",
+ " 'left': False,\n",
+ " 'guess': False,\n",
+ " 'earli': False,\n",
+ " 'noth': False,\n",
+ " 'wonder': False,\n",
+ " 'befor': False,\n",
+ " 'b': False,\n",
+ " 'late': False,\n",
+ " 'babi': False,\n",
+ " 'job': False,\n",
+ " 'sinc': False,\n",
+ " 'doe': False,\n",
+ " 'tell': False,\n",
+ " 'omg': False,\n",
+ " 'glad': False,\n",
+ " 'ugh': False,\n",
+ " 'gone': False,\n",
+ " 'weather': False,\n",
+ " 'car': False,\n",
+ " 'mom': False,\n",
+ " 'kid': False,\n",
+ " 'p': False,\n",
+ " 'wow': False,\n",
+ " 'later': False,\n",
+ " 'saw': False,\n",
+ " 'beauti': False,\n",
+ " 'damn': False,\n",
+ " 'r': False,\n",
+ " 'amaz': False,\n",
+ " 'bit': False,\n",
+ " 'birthday': False,\n",
+ " 'stop': False,\n",
+ " 'mean': False,\n",
+ " 'big': False,\n",
+ " 'said': False,\n",
+ " 'check': False,\n",
+ " 'hot': False,\n",
+ " 'parti': True,\n",
+ " 'give': False,\n",
+ " 'sun': False,\n",
+ " 'god': False,\n",
+ " 'run': False,\n",
+ " 'may': False,\n",
+ " 'two': False,\n",
+ " 'updat': False,\n",
+ " 'shit': False,\n",
+ " 'must': False,\n",
+ " 'seem': False,\n",
+ " 'move': False,\n",
+ " 'world': False,\n",
+ " 'hard': False,\n",
+ " 'put': False,\n",
+ " 'iphon': False,\n",
+ " 'free': False,\n",
+ " 'cold': False,\n",
+ " 'luck': False,\n",
+ " 'yesterday': False,\n",
+ " 'th': False,\n",
+ " 'studi': False,\n",
+ " 'stay': False,\n",
+ " 'might': False,\n",
+ " 'meet': False,\n",
+ " 'month': False,\n",
+ " 'found': False,\n",
+ " 'book': False,\n",
+ " 'boy': False,\n",
+ " 'music': False,\n",
+ " 'friday': False,\n",
+ " 'lunch': False,\n",
+ " 'gt': False,\n",
+ " 'mani': False,\n",
+ " 'woke': False,\n",
+ " 'exam': False,\n",
+ " 'fuck': False,\n",
+ " 'aww': False,\n",
+ " 'buy': False,\n",
+ " 'shop': False,\n",
+ " 'tho': False,\n",
+ " 'around': False,\n",
+ " 'least': False,\n",
+ " 'monday': False,\n",
+ " 'post': False,\n",
+ " 'stuff': False,\n",
+ " 'clean': False,\n",
+ " 'cri': False,\n",
+ " 'famili': False,\n",
+ " 'okay': False,\n",
+ " 'anyth': False,\n",
+ " 'video': False,\n",
+ " 'xx': False,\n",
+ " 'forward': False,\n",
+ " 'drink': False,\n",
+ " 'welcom': False,\n",
+ " 'everyth': False,\n",
+ " 'sooo': False,\n",
+ " 'pictur': False,\n",
+ " 'busi': False,\n",
+ " 'believ': False,\n",
+ " 'hahaha': False,\n",
+ " 'drive': False,\n",
+ " 'food': False,\n",
+ " 'train': False,\n",
+ " 'stupid': False,\n",
+ " 'walk': False,\n",
+ " 'everi': False,\n",
+ " 'anyon': False,\n",
+ " 'sweet': False,\n",
+ " 'sunday': False,\n",
+ " 'probabl': False,\n",
+ " 'outsid': False,\n",
+ " 'win': False,\n",
+ " 'turn': False,\n",
+ " 'plan': False,\n",
+ " 'poor': False,\n",
+ " 'chang': False,\n",
+ " 'almost': False,\n",
+ " 'write': False,\n",
+ " 'hair': False,\n",
+ " 'far': False,\n",
+ " 'dad': False,\n",
+ " 'wrong': False,\n",
+ " 'dream': False,\n",
+ " 'place': False,\n",
+ " 'real': False,\n",
+ " 'cute': False,\n",
+ " 'kill': False,\n",
+ " 'ask': False,\n",
+ " 'rememb': False,\n",
+ " 'tv': False,\n",
+ " 'goodnight': False,\n",
+ " 'caus': False,\n",
+ " 'fan': False,\n",
+ " 'blog': False,\n",
+ " 'repli': False,\n",
+ " 'wake': False,\n",
+ " 'rest': False,\n",
+ " 'funni': False,\n",
+ " 'total': False,\n",
+ " 'quit': False,\n",
+ " 'eye': False,\n",
+ " 'worri': False,\n",
+ " 'anymor': False,\n",
+ " 'class': False,\n",
+ " 'room': False,\n",
+ " 'came': False,\n",
+ " 'hit': False,\n",
+ " 'dinner': False,\n",
+ " 'money': False,\n",
+ " 'true': False,\n",
+ " 'mother': False,\n",
+ " 'without': False,\n",
+ " 'sister': False,\n",
+ " 'els': False,\n",
+ " 'hang': False,\n",
+ " 'send': False,\n",
+ " 'offic': False,\n",
+ " 'news': False,\n",
+ " 'brother': False,\n",
+ " 'word': False,\n",
+ " 'seen': False,\n",
+ " 'whole': False,\n",
+ " 'danc': False,\n",
+ " 'aw': False,\n",
+ " 'onc': False,\n",
+ " 'open': False,\n",
+ " 'either': False,\n",
+ " 'vote': False,\n",
+ " 'took': True,\n",
+ " 'link': False,\n",
+ " 'pain': False,\n",
+ " 'break': False,\n",
+ " 'person': False,\n",
+ " 'headach': False,\n",
+ " 'coffe': False,\n",
+ " 'www': False,\n",
+ " 'half': False,\n",
+ " 'hehe': False,\n",
+ " 'saturday': False,\n",
+ " 'idea': False,\n",
+ " 'hug': False,\n",
+ " 'st': False,\n",
+ " 'bring': False,\n",
+ " 'hello': False,\n",
+ " 'anyway': False,\n",
+ " 'photo': False,\n",
+ " 'ah': False,\n",
+ " 'onlin': False,\n",
+ " 'text': False,\n",
+ " 'abl': False,\n",
+ " 'enough': False,\n",
+ " 'g': False,\n",
+ " 'set': False,\n",
+ " 'close': False,\n",
+ " 'full': False,\n",
+ " 'crap': False,\n",
+ " 'awww': False,\n",
+ " 'kinda': False,\n",
+ " 'jealou': False,\n",
+ " 'cours': False,\n",
+ " 'trip': False,\n",
+ " 'reason': False,\n",
+ " 'dude': False,\n",
+ " 'crazi': False,\n",
+ " 'fall': False,\n",
+ " 'comput': False,\n",
+ " 'fix': False,\n",
+ " 'season': False,\n",
+ " 'heard': False,\n",
+ " 'pm': False,\n",
+ " 'forgot': False,\n",
+ " 'didnt': False,\n",
+ " 'kind': False,\n",
+ " 'fine': False,\n",
+ " 'site': False,\n",
+ " 'heart': False,\n",
+ " 'wont': False,\n",
+ " 'interest': False,\n",
+ " 'bought': False,\n",
+ " 'high': False,\n",
+ " 'mr': False,\n",
+ " 'add': False,\n",
+ " 'visit': False,\n",
+ " 'rock': False,\n",
+ " 'pay': False,\n",
+ " 'awak': False,\n",
+ " 'favorit': False,\n",
+ " 'relax': False,\n",
+ " 'problem': False,\n",
+ " 'sunni': False,\n",
+ " 'super': False,\n",
+ " 'star': False,\n",
+ " 'red': False,\n",
+ " 'beach': False,\n",
+ " 'line': False,\n",
+ " 'learn': False,\n",
+ " 'ago': False,\n",
+ " 'soo': False,\n",
+ " 'cuz': False,\n",
+ " 'asleep': False,\n",
+ " 'dead': False,\n",
+ " 'smile': False,\n",
+ " 'mileycyru': False,\n",
+ " 'sign': False,\n",
+ " 'fail': False,\n",
+ " 'hell': False,\n",
+ " 'sore': False,\n",
+ " 'tommcfli': False,\n",
+ " 'ride': False,\n",
+ " 'math': False,\n",
+ " 'afternoon': False,\n",
+ " 'lose': False,\n",
+ " 'part': False,\n",
+ " 'serious': False,\n",
+ " 'mind': False,\n",
+ " 'lucki': False,\n",
+ " 'drop': False,\n",
+ " 'power': False,\n",
+ " 'congrat': False,\n",
+ " 'definit': False,\n",
+ " 'concert': False,\n",
+ " 'facebook': False,\n",
+ " 'ticket': False,\n",
+ " 'ladi': False,\n",
+ " 'instead': False,\n",
+ " 'email': False,\n",
+ " 'shower': False,\n",
+ " 'hand': False,\n",
+ " 'mad': False,\n",
+ " 'offici': False,\n",
+ " 'youtub': False,\n",
+ " 'breakfast': False,\n",
+ " 'short': False,\n",
+ " 'internet': False,\n",
+ " 'boo': False,\n",
+ " 'broke': False,\n",
+ " 'wear': False,\n",
+ " 'sometim': False,\n",
+ " 'order': False,\n",
+ " 'nd': False,\n",
+ " 'dear': False,\n",
+ " 'bet': False,\n",
+ " 'agre': False,\n",
+ " 'perfect': False,\n",
+ " 'btw': False,\n",
+ " 'park': False,\n",
+ " 'award': False,\n",
+ " 'gym': False,\n",
+ " 'catch': False,\n",
+ " 'togeth': False,\n",
+ " 'test': False,\n",
+ " 'team': False,\n",
+ " 'suppos': False,\n",
+ " 'figur': False,\n",
+ " 'sat': False,\n",
+ " 'homework': False,\n",
+ " 'mention': False,\n",
+ " 'june': False,\n",
+ " 'alon': False,\n",
+ " 'beat': False,\n",
+ " 'nap': False,\n",
+ " 'pack': False,\n",
+ " 'soooo': False,\n",
+ " 'pick': False,\n",
+ " 'yea': False,\n",
+ " 'sing': False,\n",
+ " 'lmao': False,\n",
+ " 'nite': False,\n",
+ " 'xxx': False,\n",
+ " 'second': False,\n",
+ " 'album': False,\n",
+ " 'stuck': False,\n",
+ " 'store': False,\n",
+ " 'wed': False,\n",
+ " 'sigh': False,\n",
+ " 'goin': False,\n",
+ " 'ice': False,\n",
+ " 'vacat': False,\n",
+ " 'air': False,\n",
+ " 'upload': False,\n",
+ " 'coupl': False,\n",
+ " 'water': False,\n",
+ " 'hungri': False,\n",
+ " 'cook': False,\n",
+ " 'cousin': False,\n",
+ " 'xd': False,\n",
+ " 'dress': False,\n",
+ " 'easi': False,\n",
+ " 'side': False,\n",
+ " 'laptop': False,\n",
+ " 'account': False,\n",
+ " 'holiday': False,\n",
+ " 'wors': False,\n",
+ " 'revis': False,\n",
+ " 'bless': False,\n",
+ " 'foot': False,\n",
+ " 'decid': False,\n",
+ " 'join': False,\n",
+ " 'mood': False,\n",
+ " 'spend': False,\n",
+ " 'point': False,\n",
+ " 'moment': False,\n",
+ " 'window': False,\n",
+ " 'stori': False,\n",
+ " 'save': False,\n",
+ " 'graduat': False,\n",
+ " 'past': False,\n",
+ " 'top': False,\n",
+ " 'sleepi': False,\n",
+ " 'ipod': False,\n",
+ " 'f': False,\n",
+ " 'current': False,\n",
+ " 'understand': False,\n",
+ " 'yep': False,\n",
+ " 'throat': False,\n",
+ " 'chanc': False,\n",
+ " 'town': False,\n",
+ " 'differ': False,\n",
+ " 'hmm': False,\n",
+ " 'told': False,\n",
+ " 'bout': False,\n",
+ " 'mtv': False,\n",
+ " 'cream': False,\n",
+ " 'annoy': False,\n",
+ " 'answer': False,\n",
+ " 'age': False,\n",
+ " 'mac': False,\n",
+ " 'forget': False,\n",
+ " 'road': False,\n",
+ " 'thursday': False,\n",
+ " 'tour': False,\n",
+ " 'fast': False,\n",
+ " 'load': False,\n",
+ " 'celebr': False,\n",
+ " 'realiz': False,\n",
+ " 'jonasbroth': False,\n",
+ " 'shoot': False,\n",
+ " 'camp': False,\n",
+ " 'scare': False,\n",
+ " 'cut': False,\n",
+ " 'co': False,\n",
+ " 'knew': False,\n",
+ " 'date': False,\n",
+ " 'ddlovato': False,\n",
+ " 'shirt': False,\n",
+ " 'episod': False,\n",
+ " 'fli': False,\n",
+ " 'chat': False,\n",
+ " 'ahhh': False,\n",
+ " 'complet': False,\n",
+ " 'kick': False,\n",
+ " 'fb': False,\n",
+ " 'lazi': False,\n",
+ " 'ive': False,\n",
+ " 'especi': False,\n",
+ " 'card': False,\n",
+ " 'ppl': False,\n",
+ " 'mum': False,\n",
+ " 'rather': False,\n",
+ " 'black': False,\n",
+ " 'download': False,\n",
+ " 'list': False,\n",
+ " 'tea': False,\n",
+ " 'chocol': False,\n",
+ " 'yr': False,\n",
+ " 'count': False,\n",
+ " 'expect': False,\n",
+ " 'cancel': False,\n",
+ " 'flight': False,\n",
+ " 'appl': False,\n",
+ " 'ate': False,\n",
+ " 'number': False,\n",
+ " 'film': False,\n",
+ " 'state': False,\n",
+ " 'bye': False,\n",
+ " 'pizza': False,\n",
+ " 'worst': False,\n",
+ " 'wtf': False,\n",
+ " 'wine': False,\n",
+ " 'juli': False,\n",
+ " 'present': False,\n",
+ " 'meant': False,\n",
+ " 'english': False,\n",
+ " 'manag': False,\n",
+ " 'share': False,\n",
+ " 'colleg': False,\n",
+ " 'sent': False,\n",
+ " 'servic': False,\n",
+ " 'ach': False,\n",
+ " 'question': False,\n",
+ " 'flu': False,\n",
+ " 'depress': False,\n",
+ " 'nope': False,\n",
+ " 'freak': False,\n",
+ " 'laugh': False,\n",
+ " 'smell': False,\n",
+ " 'unfortun': False,\n",
+ " 'due': False,\n",
+ " 'woman': False,\n",
+ " 'beer': False,\n",
+ " 'messag': False,\n",
+ " 'bitch': False,\n",
+ " 'worth': False,\n",
+ " 'search': False,\n",
+ " 'websit': False,\n",
+ " 'bike': False,\n",
+ " 'comment': False,\n",
+ " 'touch': False,\n",
+ " 'shoe': False,\n",
+ " 'parent': False,\n",
+ " 'church': False,\n",
+ " 'sunshin': False,\n",
+ " 'swim': False,\n",
+ " 'miley': False,\n",
+ " 'boyfriend': False,\n",
+ " 'pool': False,\n",
+ " 'stomach': False,\n",
+ " 'lil': False,\n",
+ " 'burn': False,\n",
+ " 'support': False,\n",
+ " 'followfriday': False,\n",
+ " 'mess': False,\n",
+ " 'lay': False,\n",
+ " 'weird': False,\n",
+ " 'leg': False,\n",
+ " 'bum': False,\n",
+ " 'appar': False,\n",
+ " 'becom': False,\n",
+ " 'father': False,\n",
+ " 'cake': False,\n",
+ " 'hill': False,\n",
+ " 'case': False,\n",
+ " 'airport': False,\n",
+ " 'event': False,\n",
+ " 'shame': False,\n",
+ " 'hr': False,\n",
+ " 'london': False,\n",
+ " 'surpris': False,\n",
+ " 'round': False,\n",
+ " 'tummi': False,\n",
+ " 'hmmm': False,\n",
+ " 'voic': False,\n",
+ " 'broken': False,\n",
+ " 'blue': False,\n",
+ " 'safe': False,\n",
+ " 'spent': False,\n",
+ " 'fell': False,\n",
+ " 'possibl': False,\n",
+ " 'practic': False,\n",
+ " 'type': False,\n",
+ " 'usual': False,\n",
+ " 'yummi': False,\n",
+ " 'insid': False,\n",
+ " 'stress': False,\n",
+ " 'warm': False,\n",
+ " 'absolut': False,\n",
+ " 'normal': False,\n",
+ " 'alright': False,\n",
+ " 'cheer': False,\n",
+ " 'moon': False,\n",
+ " 'idk': False,\n",
+ " 'hold': False,\n",
+ " 'horribl': False,\n",
+ " 'stand': False,\n",
+ " 'near': False,\n",
+ " 'page': False,\n",
+ " 'chill': False,\n",
+ " 'note': False,\n",
+ " 'shot': False,\n",
+ " 'doctor': False,\n",
+ " 'uk': False,\n",
+ " 'xoxo': False,\n",
+ " 'club': False,\n",
+ " 'fire': False,\n",
+ " 'pray': False,\n",
+ " 'huge': False,\n",
+ " 'taken': False,\n",
+ " 'ball': False,\n",
+ " 'cup': False,\n",
+ " 'slow': False,\n",
+ " 'shall': False,\n",
+ " 'cd': False,\n",
+ " 'scari': False,\n",
+ " 'deserv': False,\n",
+ " 'disappoint': False,\n",
+ " 'block': False,\n",
+ " 'box': False,\n",
+ " 'pass': False,\n",
+ " 'exactli': False,\n",
+ " 'green': False,\n",
+ " 'sooooo': False,\n",
+ " 'bro': False,\n",
+ " 'band': False,\n",
+ " 'paper': False,\n",
+ " 'confus': False,\n",
+ " 'terribl': False,\n",
+ " 'myspac': False,\n",
+ " 'ear': False,\n",
+ " 'tan': False,\n",
+ " 'notic': False,\n",
+ " 'except': False,\n",
+ " 'twitpic': False,\n",
+ " 'googl': False,\n",
+ " 'issu': False,\n",
+ " 'low': False,\n",
+ " 'doesnt': False,\n",
+ " 'lie': False,\n",
+ " 'fair': False,\n",
+ " 'sadli': False,\n",
+ " 'mile': False,\n",
+ " 'mate': False,\n",
+ " 'project': False,\n",
+ " 'app': False,\n",
+ " 'pop': False,\n",
+ " 'countri': False,\n",
+ " 'wit': False,\n",
+ " 'longer': False,\n",
+ " 'crash': False,\n",
+ " 'raini': False,\n",
+ " 'arm': False,\n",
+ " 'david': False,\n",
+ " 'cover': False,\n",
+ " 'sell': False,\n",
+ " 'wednesday': False,\n",
+ " 'sim': False,\n",
+ " 'shift': False,\n",
+ " 'finger': False,\n",
+ " 'woo': False,\n",
+ " 'hubbi': False,\n",
+ " 'fit': False,\n",
+ " 'gorgeou': False,\n",
+ " 'gosh': False,\n",
+ " 'ff': False,\n",
+ " 'hangov': False,\n",
+ " 'caught': False,\n",
+ " 'mail': False,\n",
+ " 'return': False,\n",
+ " 'luv': False,\n",
+ " 'fight': False,\n",
+ " 'earlier': False,\n",
+ " 'yup': False,\n",
+ " 'plu': False,\n",
+ " 'lame': False,\n",
+ " 'gave': False,\n",
+ " 'ahead': False,\n",
+ " 'invit': True,\n",
+ " 'connect': False,\n",
+ " 'act': False,\n",
+ " 'special': False,\n",
+ " 'nearli': False,\n",
+ " 'isnt': False,\n",
+ " 'prepar': False,\n",
+ " 'watchin': False,\n",
+ " 'via': False,\n",
+ " 'white': False,\n",
+ " 'taylor': False,\n",
+ " 'sort': False,\n",
+ " 'interview': False,\n",
+ " 'blackberri': False,\n",
+ " 'recommend': False,\n",
+ " 'tom': False,\n",
+ " 'argh': False,\n",
+ " 'guitar': False,\n",
+ " 'bill': False,\n",
+ " 'front': False,\n",
+ " 'offer': False,\n",
+ " 'fact': False,\n",
+ " 'piec': False,\n",
+ " 'upset': False,\n",
+ " 'joy': False,\n",
+ " 'glass': False,\n",
+ " 'radio': False,\n",
+ " 'tuesday': False,\n",
+ " 'xo': False,\n",
+ " 'matter': False,\n",
+ " 'fill': False,\n",
+ " 'inde': False,\n",
+ " 'exhaust': False,\n",
+ " 'self': False,\n",
+ " 'congratul': False,\n",
+ " 'avail': False,\n",
+ " 'deal': False,\n",
+ " 'lone': False,\n",
+ " 'chees': False,\n",
+ " 'version': False,\n",
+ " 'john': False,\n",
+ " 'breath': False,\n",
+ " 'proud': False,\n",
+ " 'product': False,\n",
+ " 'gig': False,\n",
+ " 'teeth': False,\n",
+ " 'blah': False,\n",
+ " 'quiet': False,\n",
+ " 'kiss': False,\n",
+ " 'view': False,\n",
+ " 'drunk': False,\n",
+ " 'fever': False,\n",
+ " 'singl': False,\n",
+ " 'forev': False,\n",
+ " 'folk': False,\n",
+ " 'gettin': False,\n",
+ " 'pink': False,\n",
+ " 'bird': False,\n",
+ " 'promis': False,\n",
+ " 'goodby': False,\n",
+ " 'everybodi': False,\n",
+ " 'experi': False,\n",
+ " 'quick': False,\n",
+ " 'import': False,\n",
+ " 'tune': False,\n",
+ " 'although': False,\n",
+ " 'daughter': False,\n",
+ " 'sexi': False,\n",
+ " 'sky': False,\n",
+ " 'aint': False,\n",
+ " 'grow': False,\n",
+ " 'small': False,\n",
+ " 'fantast': False,\n",
+ " 'bar': False,\n",
+ " 'three': False,\n",
+ " 'design': False,\n",
+ " 'nose': False,\n",
+ " 'huh': False,\n",
+ " 'thx': False,\n",
+ " 'lesson': False,\n",
+ " 'fish': False,\n",
+ " 'bday': False,\n",
+ " 'arriv': False,\n",
+ " 'storm': False,\n",
+ " 'mix': False,\n",
+ " 'seat': False,\n",
+ " 'lake': False,\n",
+ " 'daddi': False,\n",
+ " 'fav': False,\n",
+ " 'piss': False,\n",
+ " 'blood': False,\n",
+ " 'knee': False,\n",
+ " 'child': False,\n",
+ " 'havent': False,\n",
+ " 'middl': False,\n",
+ " 'memori': False,\n",
+ " 'bag': False,\n",
+ " 'group': False,\n",
+ " 'dvd': False,\n",
+ " 'dure': False,\n",
+ " 'art': False,\n",
+ " 'behind': False,\n",
+ " 'jack': False,\n",
+ " 'paint': False,\n",
+ " 'hospit': False,\n",
+ " 'swine': False,\n",
+ " 'futur': False,\n",
+ " 'perform': False,\n",
+ " 'magic': False,\n",
+ " 'cloth': False,\n",
+ " 'remind': False,\n",
+ " 'stick': False,\n",
+ " 'web': False,\n",
+ " 'australia': False,\n",
+ " 'releas': False,\n",
+ " 'dentist': False,\n",
+ " 'land': False,\n",
+ " 'appreci': False,\n",
+ " 'session': False,\n",
+ " 'ring': False,\n",
+ " 'bloodi': False,\n",
+ " 'track': False,\n",
+ " 'info': False,\n",
+ " 'peac': False,\n",
+ " 'ouch': False,\n",
+ " 'delet': False,\n",
+ " 'channel': False,\n",
+ " 'nail': False,\n",
+ " 'race': False,\n",
+ " 'addict': False,\n",
+ " 'ran': False,\n",
+ " 'ny': False,\n",
+ " 'bbq': False,\n",
+ " 'gut': False,\n",
+ " 'trend': False,\n",
+ " 'cooki': False,\n",
+ " 'tear': False,\n",
+ " 'camera': False,\n",
+ " 'joe': False,\n",
+ " 'french': False,\n",
+ " 'roll': False,\n",
+ " 'buddi': False,\n",
+ " 'system': False,\n",
+ " 'edit': False,\n",
+ " 'doubl': False,\n",
+ " 'twilight': False,\n",
+ " 'puppi': False,\n",
+ " 'tast': False,\n",
+ " 'ng': False,\n",
+ " 'nobodi': False,\n",
+ " 'histori': False,\n",
+ " 'jon': False,\n",
+ " 'chicken': False,\n",
+ " 'compani': False,\n",
+ " 'plz': False,\n",
+ " 'mommi': False,\n",
+ " 'feelin': False,\n",
+ " 'record': False,\n",
+ " 'light': False,\n",
+ " 'ohh': False,\n",
+ " 'dr': False,\n",
+ " 'young': False,\n",
+ " 'rule': False,\n",
+ " 'form': False,\n",
+ " 'garden': False,\n",
+ " 'neck': False,\n",
+ " 'felt': False,\n",
+ " 'badli': False,\n",
+ " 'dark': False,\n",
+ " 'street': False,\n",
+ " 'slept': False,\n",
+ " 'local': False,\n",
+ " 'continu': False,\n",
+ " 'adam': False,\n",
+ " 'yum': False,\n",
+ " 'shut': False,\n",
+ " 'ahh': False,\n",
+ " 'fri': False,\n",
+ " 'brain': False,\n",
+ " 'profil': False,\n",
+ " 'gay': False,\n",
+ " 'teacher': False,\n",
+ " 'davidarchi': False,\n",
+ " 'file': False,\n",
+ " 'jk': False,\n",
+ " 'travel': False,\n",
+ " 'nyc': False,\n",
+ " 'natur': False,\n",
+ " 'vip': False,\n",
+ " 'dunno': False,\n",
+ " 'suggest': False,\n",
+ " 'whatev': False,\n",
+ " 'dm': False,\n",
+ " 'allow': False,\n",
+ " 'trek': False,\n",
+ " 'inspir': False,\n",
+ " 'extra': False,\n",
+ " 'board': False,\n",
+ " 'four': False,\n",
+ " 'shoulder': False,\n",
+ " 'contact': False,\n",
+ " 'bowl': False,\n",
+ " 'space': False,\n",
+ " 'pl': False,\n",
+ " 'twit': False,\n",
+ " 'ooh': False,\n",
+ " 'entertain': False,\n",
+ " 'king': False,\n",
+ " 'babe': False,\n",
+ " 'rob': False,\n",
+ " 'laker': False,\n",
+ " 'tree': False,\n",
+ " 'festiv': False,\n",
+ " 'ruin': False,\n",
+ " 'random': False,\n",
+ " 'joke': False,\n",
+ " 'bear': False,\n",
+ " 'hahah': False,\n",
+ " 'gift': False,\n",
+ " 'trailer': False,\n",
+ " 'marri': False,\n",
+ " 'midnight': False,\n",
+ " 'bunch': False,\n",
+ " 'forc': False,\n",
+ " 'vega': False,\n",
+ " 'pc': False,\n",
+ " 'entir': False,\n",
+ " 'report': False,\n",
+ " 'essay': False,\n",
+ " 'shine': False,\n",
+ " 'plane': False,\n",
+ " 'apart': False,\n",
+ " 'fam': False,\n",
+ " 'alot': False,\n",
+ " 'kate': False,\n",
+ " 'wind': False,\n",
+ " 'inform': False,\n",
+ " 'paid': False,\n",
+ " 'hannah': False,\n",
+ " 'bright': False,\n",
+ " 'eh': False,\n",
+ " 'cloud': False,\n",
+ " 'along': False,\n",
+ " 'nick': False,\n",
+ " 'chines': False,\n",
+ " 'seriou': False,\n",
+ " 'mmm': False,\n",
+ " 'seri': False,\n",
+ " 'thru': False,\n",
+ " 'teach': False,\n",
+ " 'chillin': False,\n",
+ " 'bb': False,\n",
+ " 'key': False,\n",
+ " 'begin': False,\n",
+ " 'rip': False,\n",
+ " 'server': False,\n",
+ " 'mama': False,\n",
+ " 'sum': False,\n",
+ " ...},\n",
+ " True)"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# your code here"
+    "# Pair every tweet's token list with its boolean label, then shuffle the pairs.\n",
+    "import random\n",
+    "from nltk.corpus import movie_reviews\n",
+    "\n",
+    "# cast the label column to bool so every label is simply True/False\n",
+    "tweets[\"target\"] = tweets[\"target\"].astype('bool')\n",
+    "\n",
+    "# zip the two columns directly; iterrows() would build a whole Series per row\n",
+    "documents = list(zip(tweets[\"text_processed\"], tweets[\"target\"]))\n",
+    "\n",
+    "random.shuffle(documents)\n",
+    "\n",
+    "# Feature extractor: flag which vocabulary words occur in a document.\n",
+    "\n",
+    "def find_features(document):\n",
+    "    \"\"\"Return a {word: bool} dict with one entry per word in top5000.\n",
+    "\n",
+    "    The value is True when the word occurs in `document`, otherwise False.\n",
+    "    \"\"\"\n",
+    "    present = set(document)  # set membership tests are O(1)\n",
+    "    return {word: word in present for word in top5000}\n",
+ "\n",
+    "# turn every (tokens, label) document into a (feature dict, label) pair\n",
+    "featuresets = [(find_features(tokens), label) for tokens, label in documents]\n",
+    "\n",
+    "# display the first pair as a sanity check\n",
+    "featuresets[0]\n",
+    "\n"
]
},
{
@@ -210,11 +2604,42 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 5,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Most Informative Features\n",
+ " sad = True False : True = 17.8 : 1.0\n",
+ " followfriday = True True : False = 17.1 : 1.0\n",
+ " congratul = True True : False = 15.1 : 1.0\n",
+ " welcom = True True : False = 12.3 : 1.0\n",
+ " kate = True False : True = 12.3 : 1.0\n",
+ " cancel = True False : True = 11.8 : 1.0\n",
+ " tune = True True : False = 11.7 : 1.0\n",
+ " dentist = True False : True = 11.6 : 1.0\n",
+ " knee = True False : True = 11.6 : 1.0\n",
+ " lone = True False : True = 11.6 : 1.0\n",
+ "None\n"
+ ]
+ }
+ ],
"source": [
- "# your code here"
+    "# Split the feature sets and train a Naive Bayes classifier.\n",
+    "\n",
+    "# training portion: the first 16000 feature sets\n",
+    "training_set = featuresets[:16000]\n",
+    "\n",
+    "# held-out portion: everything after the first 16000\n",
+    "testing_set = featuresets[16000:]\n",
+    "\n",
+    "# fit NLTK's Naive Bayes classifier on the training portion\n",
+    "classifier = nltk.NaiveBayesClassifier.train(training_set)\n",
+    "\n",
+    "# the method prints its table itself and returns None, so it is not wrapped in print()\n",
+    "classifier.show_most_informative_features()\n"
]
},
{
@@ -230,11 +2655,22 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Classifier accuracy percent: 0.7285\n"
+ ]
+ }
+ ],
"source": [
- "# your code here"
+    "# Evaluate the trained classifier on the held-out test set.\n",
+    "\n",
+    "# accuracy() returns a fraction between 0 and 1; the .2% format spec renders it as a percentage\n",
+    "print(f\"Classifier accuracy percent: {nltk.classify.accuracy(classifier, testing_set):.2%}\")"
]
},
{
@@ -252,7 +2688,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@@ -270,7 +2706,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -288,7 +2724,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -298,7 +2734,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -312,7 +2748,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.3"
+ "version": "3.7.6"
}
},
"nbformat": 4,