diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb
index 0808166..f2b7884 100644
--- a/your-code/challenge-1.ipynb
+++ b/your-code/challenge-1.ipynb
@@ -66,9 +66,30 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
+ "source": [
+ "import re\n",
+ "import nltk"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'ironhack s q website is'"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"def clean_up(s):\n",
" \"\"\"\n",
@@ -79,7 +100,14 @@
"\n",
" Returns:\n",
" A string that has been cleaned up.\n",
- " \"\"\""
+ " \"\"\"\n",
+ " string = re.sub(r'http\\S+', '', s)\n",
+ " return re.sub('[^A-Za-z]+', ' ', string).lower().strip()\n",
+ " \n",
+ "test = \"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\"\n",
+ "\n",
+ "test_string = clean_up(test)\n",
+ "test_string"
]
},
{
@@ -101,9 +129,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['ironhack', 's', 'q', 'website', 'is']"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"def tokenize(s):\n",
" \"\"\"\n",
@@ -114,7 +153,11 @@
"\n",
" Returns:\n",
" A list of words as the result of tokenization.\n",
- " \"\"\""
+ " \"\"\"\n",
+ " return nltk.word_tokenize(s)\n",
+ "\n",
+ "test_string = tokenize(test_string)\n",
+ "test_string"
]
},
{
@@ -145,11 +188,17 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
+ "# Nope, something went wrong, I'll use another set of words\n",
+ "\n",
+ "import nltk\n",
+ "from nltk.stem import PorterStemmer, WordNetLemmatizer\n",
+ "\n",
"def stem_and_lemmatize(l):\n",
+ " \n",
" \"\"\"\n",
" Perform stemming and lemmatization on a list of words.\n",
"\n",
@@ -158,7 +207,17 @@
"\n",
" Returns:\n",
" A list of strings after being stemmed and lemmatized.\n",
- " \"\"\""
+ " \"\"\"\n",
+ " ps = nltk.PorterStemmer()\n",
+ " lemmatizer = nltk.WordNetLemmatizer()\n",
+ " l2 = []\n",
+ " \n",
+ " for w in l:\n",
+ " s = ps.stem(w)\n",
+ " s = lemmatizer.lemmatize(s)\n",
+ " l2 += [s]\n",
+ " \n",
+ " return l2\n"
]
},
{
@@ -176,7 +235,16 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from nltk.corpus import stopwords"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -189,7 +257,10 @@
"\n",
" Returns:\n",
" A list of strings after stop words are removed.\n",
- " \"\"\""
+ " \"\"\"\n",
+ " stop_words = stopwords.words('english')\n",
+ "\n",
+ " return [w for w in l if w not in stop_words]"
]
},
{
@@ -218,7 +289,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.3"
+ "version": "3.10.9"
}
},
"nbformat": 4,
diff --git a/your-code/challenge-2.ipynb b/your-code/challenge-2.ipynb
index 6b0e116..bca22d3 100644
--- a/your-code/challenge-2.ipynb
+++ b/your-code/challenge-2.ipynb
@@ -18,8 +18,8 @@
"\n",
"```python\n",
">>> from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
- ">>> txt = \"Ironhack is a Global Tech School ranked num 2 worldwide.
",
- "
",
+ ">>> txt = \"Ironhack is a Global Tech School ranked num 2 worldwide.
\n",
+ "
\n",
"Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do.\"\n",
">>> analyzer = SentimentIntensityAnalyzer()\n",
">>> analyzer.polarity_scores(txt)\n",
@@ -46,11 +46,250 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from nltk.corpus import stopwords\n",
+ "import re\n",
+ "import nltk\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def clean_up(s):\n",
+ "    \"\"\"\n",
+ "    Strip URLs, digits and special characters from a string.\n",
+ "\n",
+ "    Args:\n",
+ "        s: The string to be cleaned up.\n",
+ "\n",
+ "    Returns:\n",
+ "        The lowercased text, keeping only letters separated by single spaces.\n",
+ "    \"\"\"\n",
+ "    without_urls = re.sub(r'http\\S+', '', s)\n",
+ "    return re.sub(r'[^A-Za-z]+', ' ', without_urls).lower().strip()\n",
+ "\n",
+ "\n",
+ "def tokenize(s):\n",
+ "    \"\"\"\n",
+ "    Split a string into word tokens with NLTK.\n",
+ "\n",
+ "    Args:\n",
+ "        s: String to be tokenized.\n",
+ "\n",
+ "    Returns:\n",
+ "        A list of words as the result of tokenization.\n",
+ "    \"\"\"\n",
+ "    return nltk.word_tokenize(s)\n",
+ "\n",
+ "\n",
+ "def stem_and_lemmatize(l):\n",
+ "    \"\"\"\n",
+ "    Stem each word with the Porter stemmer, then lemmatize the stem.\n",
+ "\n",
+ "    Args:\n",
+ "        l: A list of strings.\n",
+ "\n",
+ "    Returns:\n",
+ "        A list of strings after being stemmed and lemmatized.\n",
+ "    \"\"\"\n",
+ "    stemmer = nltk.PorterStemmer()\n",
+ "    lemmatizer = nltk.WordNetLemmatizer()\n",
+ "    return [lemmatizer.lemmatize(stemmer.stem(word)) for word in l]\n",
+ "\n",
+ "\n",
+ "# Cache of NLTK's English stop words, filled on first use.\n",
+ "_STOP_WORDS = set()\n",
+ "\n",
+ "\n",
+ "def remove_stopwords(l):\n",
+ "    \"\"\"\n",
+ "    Remove English stopwords from a list of strings.\n",
+ "\n",
+ "    Args:\n",
+ "        l: A list of strings.\n",
+ "\n",
+ "    Returns:\n",
+ "        A list of strings after stop words are removed.\n",
+ "    \"\"\"\n",
+ "    if not _STOP_WORDS:\n",
+ "        # Load once: re-reading the corpus and scanning a list for every\n",
+ "        # tweet is needless work; set membership is O(1).\n",
+ "        _STOP_WORDS.update(stopwords.words('english'))\n",
+ "    return [word for word in l if word not in _STOP_WORDS]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1467810369 | \n",
+ " Mon Apr 06 22:19:45 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " _TheSpecialOne_ | \n",
+ " @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 1467810672 | \n",
+ " Mon Apr 06 22:19:49 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " scotthamilton | \n",
+ " is upset that he can't update his Facebook by ... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0 | \n",
+ " 1467810917 | \n",
+ " Mon Apr 06 22:19:53 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " mattycus | \n",
+ " @Kenichan I dived many times for the ball. Man... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY _TheSpecialOne_ \\\n",
+ "0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton \n",
+ "1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus \n",
+ "\n",
+ " @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D \n",
+ "0 is upset that he can't update his Facebook by ... \n",
+ "1 @Kenichan I dived many times for the ball. Man... "
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "# The CSV has no header row: header=None keeps the first tweet as data instead of using it as column names.\n",
+ "tweets = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin1', header=None)\n",
+ "tweets.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " target | \n",
+ " id | \n",
+ " date | \n",
+ " flag | \n",
+ " user | \n",
+ " text | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 1467810672 | \n",
+ " Mon Apr 06 22:19:49 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " scotthamilton | \n",
+ " is upset that he can't update his Facebook by ... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0 | \n",
+ " 1467810917 | \n",
+ " Mon Apr 06 22:19:53 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " mattycus | \n",
+ " @Kenichan I dived many times for the ball. Man... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " target id date flag user \\\n",
+ "0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton \n",
+ "1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus \n",
+ "\n",
+ " text \n",
+ "0 is upset that he can't update his Facebook by ... \n",
+ "1 @Kenichan I dived many times for the ball. Man... "
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tweets = tweets.set_axis(['target','id','date','flag','user','text'], axis=1)\n",
+ "tweets.head(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sample = tweets.sample(n=20000)\n",
+ "sample['target'] = sample['target'].replace({4: 1})"
]
},
{
@@ -76,11 +315,206 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " target | \n",
+ " id | \n",
+ " date | \n",
+ " flag | \n",
+ " user | \n",
+ " text | \n",
+ " text_processed | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 593613 | \n",
+ " 0 | \n",
+ " 2218286287 | \n",
+ " Wed Jun 17 22:12:28 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " kaitlbean | \n",
+ " @06eleven hey! Where have all your salacious t... | \n",
+ " [eleven, hey, salaci, tweet, gone] | \n",
+ "
\n",
+ " \n",
+ " | 489821 | \n",
+ " 0 | \n",
+ " 2183125226 | \n",
+ " Mon Jun 15 14:04:46 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " Arelly | \n",
+ " Trying to study for exams -> failing | \n",
+ " [tri, studi, exam, gt, fail] | \n",
+ "
\n",
+ " \n",
+ " | 856761 | \n",
+ " 1 | \n",
+ " 1573804695 | \n",
+ " Tue Apr 21 02:56:37 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " elliottbledsoe | \n",
+ " just had lime cous-cous with @SarahMoran and @... | \n",
+ " [lime, cou, cou, sarahmoran, ehon, place] | \n",
+ "
\n",
+ " \n",
+ " | 1505181 | \n",
+ " 1 | \n",
+ " 2072316741 | \n",
+ " Sun Jun 07 20:47:40 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " suzieqjenny | \n",
+ " @daisywoo you didn't know NPH could sing!?!?! ... | \n",
+ " [daisywoo, know, nph, could, sing, saw, rent, ... | \n",
+ "
\n",
+ " \n",
+ " | 769617 | \n",
+ " 0 | \n",
+ " 2301511058 | \n",
+ " Tue Jun 23 15:32:19 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " noeyfashowey | \n",
+ " @sgarcia408 fuck.. It was never meant to be.. ... | \n",
+ " [sgarcia, fuck, wa, never, meant, jealou, wan,... | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 1498706 | \n",
+ " 1 | \n",
+ " 2070638801 | \n",
+ " Sun Jun 07 18:02:59 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " OinkRachel | \n",
+ " Just took a chance that can change my life . Lol | \n",
+ " [took, chanc, chang, life, lol] | \n",
+ "
\n",
+ " \n",
+ " | 660759 | \n",
+ " 0 | \n",
+ " 2242829124 | \n",
+ " Fri Jun 19 12:41:01 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " lexi_diaz | \n",
+ " time to go out to the job!!! im pretty tired b... | \n",
+ " [time, go, job, im, pretti, tire, go] | \n",
+ "
\n",
+ " \n",
+ " | 1502259 | \n",
+ " 1 | \n",
+ " 2071639205 | \n",
+ " Sun Jun 07 19:42:23 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " bowlwiki | \n",
+ " 0.6 Is that enough? @HarlemLanes Almost there | \n",
+ " [enough, harlemlan, almost] | \n",
+ "
\n",
+ " \n",
+ " | 1572472 | \n",
+ " 1 | \n",
+ " 2188940716 | \n",
+ " Mon Jun 15 22:53:30 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " capedcrusader13 | \n",
+ " @abbiirocks for what? i got hit by a car | \n",
+ " [abbiirock, got, hit, car] | \n",
+ "
\n",
+ " \n",
+ " | 212968 | \n",
+ " 0 | \n",
+ " 1974879860 | \n",
+ " Sat May 30 14:00:23 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " lauralem79 | \n",
+ " not able to download my badass pics | \n",
+ " [abl, download, badass, pic] | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
20000 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " target id date flag \\\n",
+ "593613 0 2218286287 Wed Jun 17 22:12:28 PDT 2009 NO_QUERY \n",
+ "489821 0 2183125226 Mon Jun 15 14:04:46 PDT 2009 NO_QUERY \n",
+ "856761 1 1573804695 Tue Apr 21 02:56:37 PDT 2009 NO_QUERY \n",
+ "1505181 1 2072316741 Sun Jun 07 20:47:40 PDT 2009 NO_QUERY \n",
+ "769617 0 2301511058 Tue Jun 23 15:32:19 PDT 2009 NO_QUERY \n",
+ "... ... ... ... ... \n",
+ "1498706 1 2070638801 Sun Jun 07 18:02:59 PDT 2009 NO_QUERY \n",
+ "660759 0 2242829124 Fri Jun 19 12:41:01 PDT 2009 NO_QUERY \n",
+ "1502259 1 2071639205 Sun Jun 07 19:42:23 PDT 2009 NO_QUERY \n",
+ "1572472 1 2188940716 Mon Jun 15 22:53:30 PDT 2009 NO_QUERY \n",
+ "212968 0 1974879860 Sat May 30 14:00:23 PDT 2009 NO_QUERY \n",
+ "\n",
+ " user text \\\n",
+ "593613 kaitlbean @06eleven hey! Where have all your salacious t... \n",
+ "489821 Arelly Trying to study for exams -> failing \n",
+ "856761 elliottbledsoe just had lime cous-cous with @SarahMoran and @... \n",
+ "1505181 suzieqjenny @daisywoo you didn't know NPH could sing!?!?! ... \n",
+ "769617 noeyfashowey @sgarcia408 fuck.. It was never meant to be.. ... \n",
+ "... ... ... \n",
+ "1498706 OinkRachel Just took a chance that can change my life . Lol \n",
+ "660759 lexi_diaz time to go out to the job!!! im pretty tired b... \n",
+ "1502259 bowlwiki 0.6 Is that enough? @HarlemLanes Almost there \n",
+ "1572472 capedcrusader13 @abbiirocks for what? i got hit by a car \n",
+ "212968 lauralem79 not able to download my badass pics \n",
+ "\n",
+ " text_processed \n",
+ "593613 [eleven, hey, salaci, tweet, gone] \n",
+ "489821 [tri, studi, exam, gt, fail] \n",
+ "856761 [lime, cou, cou, sarahmoran, ehon, place] \n",
+ "1505181 [daisywoo, know, nph, could, sing, saw, rent, ... \n",
+ "769617 [sgarcia, fuck, wa, never, meant, jealou, wan,... \n",
+ "... ... \n",
+ "1498706 [took, chanc, chang, life, lol] \n",
+ "660759 [time, go, job, im, pretti, tire, go] \n",
+ "1502259 [enough, harlemlan, almost] \n",
+ "1572472 [abbiirock, got, hit, car] \n",
+ "212968 [abl, download, badass, pic] \n",
+ "\n",
+ "[20000 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# your code here"
+ "sample['text_processed'] = sample['text'].apply(clean_up).apply(tokenize).apply(remove_stopwords).apply(stem_and_lemmatize).apply(remove_stopwords)  # also filter before stemming: stems like 'thi'/'wa' no longer match NLTK's stop list\n",
+ "sample"
]
},
{
@@ -98,11 +532,1029 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['eleven',\n",
+ " 'hey',\n",
+ " 'salaci',\n",
+ " 'tweet',\n",
+ " 'gone',\n",
+ " 'tri',\n",
+ " 'studi',\n",
+ " 'exam',\n",
+ " 'gt',\n",
+ " 'fail',\n",
+ " 'lime',\n",
+ " 'cou',\n",
+ " 'sarahmoran',\n",
+ " 'ehon',\n",
+ " 'place',\n",
+ " 'daisywoo',\n",
+ " 'know',\n",
+ " 'nph',\n",
+ " 'could',\n",
+ " 'sing',\n",
+ " 'saw',\n",
+ " 'rent',\n",
+ " 'la',\n",
+ " 'wa',\n",
+ " 'fantast',\n",
+ " 'sgarcia',\n",
+ " 'fuck',\n",
+ " 'never',\n",
+ " 'meant',\n",
+ " 'jealou',\n",
+ " 'wan',\n",
+ " 'na',\n",
+ " 'buy',\n",
+ " 'makeup',\n",
+ " 'world',\n",
+ " 'hah',\n",
+ " 'albiezushi',\n",
+ " 'cooper',\n",
+ " 'go',\n",
+ " 'think',\n",
+ " 'may',\n",
+ " 'leav',\n",
+ " 'friday',\n",
+ " 'made',\n",
+ " 'best',\n",
+ " 'french',\n",
+ " 'toast',\n",
+ " 'life',\n",
+ " 'ate',\n",
+ " 'second',\n",
+ " 'bc',\n",
+ " 'hungri',\n",
+ " 'even',\n",
+ " 'rememb',\n",
+ " 'tast',\n",
+ " 'like',\n",
+ " 'sausag',\n",
+ " 'mcmuffin',\n",
+ " 'breakfast',\n",
+ " 'hahha',\n",
+ " 'mac',\n",
+ " 'luv',\n",
+ " 'watch',\n",
+ " 'quot',\n",
+ " 'nuclear',\n",
+ " 'hurrican',\n",
+ " 'recommend',\n",
+ " 'mamaw',\n",
+ " 'goe',\n",
+ " 'home',\n",
+ " 'today',\n",
+ " 'count',\n",
+ " 'day',\n",
+ " 'geoffrey',\n",
+ " 'come',\n",
+ " 'sea',\n",
+ " 'lt',\n",
+ " 'jamesdoc',\n",
+ " 'veri',\n",
+ " 'excit',\n",
+ " 'moment',\n",
+ " 'percentgrey',\n",
+ " 'thi',\n",
+ " 'time',\n",
+ " 'besid',\n",
+ " 'work',\n",
+ " 'someth',\n",
+ " 'interest',\n",
+ " 'talk',\n",
+ " 'okay',\n",
+ " 'back',\n",
+ " 'minut',\n",
+ " 'headach',\n",
+ " 'amp',\n",
+ " 'wait',\n",
+ " 'ibuprofin',\n",
+ " 'kick',\n",
+ " 'aha',\n",
+ " 'heyhunt',\n",
+ " 'follow',\n",
+ " 'friend',\n",
+ " 'michaela',\n",
+ " 'twitter',\n",
+ " 'siiick',\n",
+ " 'well',\n",
+ " 'least',\n",
+ " 'school',\n",
+ " 'xd',\n",
+ " 'miss',\n",
+ " 'tommi',\n",
+ " 'boy',\n",
+ " 'girl',\n",
+ " 'night',\n",
+ " 'tonight',\n",
+ " 'ugh',\n",
+ " 'four',\n",
+ " 'nathan',\n",
+ " 'rd',\n",
+ " 'anniversari',\n",
+ " 'hope',\n",
+ " 'play',\n",
+ " 'luna',\n",
+ " 'tomorrow',\n",
+ " 'get',\n",
+ " 'lvl',\n",
+ " 'current',\n",
+ " 'opal',\n",
+ " 'reinforc',\n",
+ " 'set',\n",
+ " 'lot',\n",
+ " 'vit',\n",
+ " 'ftw',\n",
+ " 'lynda',\n",
+ " 'im',\n",
+ " 'see',\n",
+ " 'crocrock',\n",
+ " 'jacksassradio',\n",
+ " 'jack',\n",
+ " 'pictur',\n",
+ " 'want',\n",
+ " 'parti',\n",
+ " 'villa',\n",
+ " 'sometim',\n",
+ " 'qejp',\n",
+ " 'b',\n",
+ " 'clean',\n",
+ " 'share',\n",
+ " 'ancient',\n",
+ " 'glambert',\n",
+ " 'hi',\n",
+ " 'gradauat',\n",
+ " 'london',\n",
+ " 'warn',\n",
+ " 'longer',\n",
+ " 'bu',\n",
+ " 'nearli',\n",
+ " 'hour',\n",
+ " 'fin',\n",
+ " 'park',\n",
+ " 'last',\n",
+ " 'befor',\n",
+ " 'realis',\n",
+ " 'shmelvywelvi',\n",
+ " 'need',\n",
+ " 'umm',\n",
+ " 'safe',\n",
+ " 'n',\n",
+ " 'sound',\n",
+ " 'worst',\n",
+ " 'ever',\n",
+ " 'girlycut',\n",
+ " 'good',\n",
+ " 'pinksealight',\n",
+ " 'nail',\n",
+ " 'fab',\n",
+ " 'would',\n",
+ " 'stay',\n",
+ " 'choic',\n",
+ " 'djndayo',\n",
+ " 'glad',\n",
+ " 'u',\n",
+ " 'ur',\n",
+ " 'germ',\n",
+ " 'weaken',\n",
+ " 'victim',\n",
+ " 'flu',\n",
+ " 'digloung',\n",
+ " 'definit',\n",
+ " 'comm',\n",
+ " 'ca',\n",
+ " 'abl',\n",
+ " 'hear',\n",
+ " 'guest',\n",
+ " 'loud',\n",
+ " 'nois',\n",
+ " 'clubb',\n",
+ " 'endear',\n",
+ " 'clear',\n",
+ " 'lavieenros',\n",
+ " 'cup',\n",
+ " 'fine',\n",
+ " 'darl',\n",
+ " 'hello',\n",
+ " 'helo',\n",
+ " 'everyoneeeeeee',\n",
+ " 'gordon',\n",
+ " 'peterson',\n",
+ " 'newseum',\n",
+ " 'member',\n",
+ " 'onli',\n",
+ " 'becaus',\n",
+ " 'high',\n",
+ " 'seat',\n",
+ " 'demand',\n",
+ " 'aunt',\n",
+ " 'rip',\n",
+ " 'chula',\n",
+ " 'erika',\n",
+ " 'roman',\n",
+ " 'love',\n",
+ " 'congratul',\n",
+ " 'brettport',\n",
+ " 'laura',\n",
+ " 'first',\n",
+ " 'babi',\n",
+ " 'brett',\n",
+ " 'mani',\n",
+ " 'maven',\n",
+ " 'bill',\n",
+ " 'hader',\n",
+ " 'curragh',\n",
+ " 'weekend',\n",
+ " 'said',\n",
+ " 'old',\n",
+ " 'one',\n",
+ " 'tree',\n",
+ " 'hill',\n",
+ " 'indulg',\n",
+ " 'runner',\n",
+ " 'knee',\n",
+ " 'mydecemberht',\n",
+ " 'daaannng',\n",
+ " 'blow',\n",
+ " 'sorri',\n",
+ " 'rohitsabu',\n",
+ " 'focu',\n",
+ " 'badavarasc',\n",
+ " 'pov',\n",
+ " 'tho',\n",
+ " 'cant',\n",
+ " 'figur',\n",
+ " 'mf',\n",
+ " 'cud',\n",
+ " 'make',\n",
+ " 'select',\n",
+ " 'shot',\n",
+ " 'hurray',\n",
+ " 'tf',\n",
+ " 'updat',\n",
+ " 'done',\n",
+ " 'anoth',\n",
+ " 'week',\n",
+ " 'owieh',\n",
+ " 'watchin',\n",
+ " 'nikki',\n",
+ " 'granger',\n",
+ " 'big',\n",
+ " 'brother',\n",
+ " 'quiz',\n",
+ " 'show',\n",
+ " 'mental',\n",
+ " 'report',\n",
+ " 'mom',\n",
+ " 'mall',\n",
+ " 'america',\n",
+ " 'actual',\n",
+ " 'walk',\n",
+ " 'autism',\n",
+ " 'fun',\n",
+ " 'theatr',\n",
+ " 'town',\n",
+ " 'hospit',\n",
+ " 'visit',\n",
+ " 'cuzzo',\n",
+ " 'mcraddict',\n",
+ " 'wish',\n",
+ " 'lcaller',\n",
+ " 'took',\n",
+ " 'long',\n",
+ " 'mate',\n",
+ " 'upload',\n",
+ " 'track',\n",
+ " 'p',\n",
+ " 'bit',\n",
+ " 'distract',\n",
+ " 'lol',\n",
+ " 'ilaura',\n",
+ " 'etsi',\n",
+ " 'sure',\n",
+ " 'c',\n",
+ " 'better',\n",
+ " 'whi',\n",
+ " 'amaz',\n",
+ " 'kid',\n",
+ " 'epandu',\n",
+ " 'pain',\n",
+ " 'gain',\n",
+ " 'man',\n",
+ " 'outsid',\n",
+ " 'dean',\n",
+ " 'crush',\n",
+ " 'calvin',\n",
+ " 'cri',\n",
+ " 'soon',\n",
+ " 'great',\n",
+ " 'fli',\n",
+ " 'sf',\n",
+ " 'warp',\n",
+ " 'month',\n",
+ " 'el',\n",
+ " 'cardiff',\n",
+ " 'doe',\n",
+ " 'anyon',\n",
+ " 'hammertim',\n",
+ " 'anymor',\n",
+ " 'ha',\n",
+ " 'mummmyyyyi',\n",
+ " 'word',\n",
+ " 'super',\n",
+ " 'realli',\n",
+ " 'hot',\n",
+ " 'let',\n",
+ " 'onlin',\n",
+ " 'box',\n",
+ " 'refresh',\n",
+ " 'chillin',\n",
+ " 'stephani',\n",
+ " 'denton',\n",
+ " 'blah',\n",
+ " 'much',\n",
+ " 'aappa',\n",
+ " 'thnx',\n",
+ " 'remind',\n",
+ " 'xx',\n",
+ " 'cold',\n",
+ " 'wet',\n",
+ " 'shop',\n",
+ " 'rain',\n",
+ " 'oh',\n",
+ " 'dio',\n",
+ " 'liciou',\n",
+ " 'girrrlllll',\n",
+ " 'kno',\n",
+ " 'live',\n",
+ " 'atl',\n",
+ " 'bleed',\n",
+ " 'carey',\n",
+ " 'anim',\n",
+ " 'yay',\n",
+ " 'plu',\n",
+ " 'mint',\n",
+ " 'justa',\n",
+ " 'coffe',\n",
+ " 'morn',\n",
+ " 'lunch',\n",
+ " 'refrigir',\n",
+ " 'brownfamilycat',\n",
+ " 'delay',\n",
+ " 'join',\n",
+ " 'facebook',\n",
+ " 'kale',\n",
+ " 'recip',\n",
+ " 'tip',\n",
+ " 'scale',\n",
+ " 'favor',\n",
+ " 'christian',\n",
+ " 'foot',\n",
+ " 'snuggi',\n",
+ " 'call',\n",
+ " 'fuggi',\n",
+ " 'zut',\n",
+ " 'radio',\n",
+ " 'global',\n",
+ " 'gourmand',\n",
+ " 'f',\n",
+ " 'thank',\n",
+ " 'still',\n",
+ " 'trip',\n",
+ " 'ye',\n",
+ " 'nyc',\n",
+ " 'excel',\n",
+ " 'heard',\n",
+ " 'anyth',\n",
+ " 'year',\n",
+ " 'break',\n",
+ " 'roxio',\n",
+ " 'found',\n",
+ " 'susan',\n",
+ " 'boyl',\n",
+ " 'yep',\n",
+ " 'bgt',\n",
+ " 'hopsit',\n",
+ " 'cuz',\n",
+ " 'went',\n",
+ " 'mad',\n",
+ " 'couldnt',\n",
+ " 'handl',\n",
+ " 'fame',\n",
+ " 'poor',\n",
+ " 'thing',\n",
+ " 'creativeblok',\n",
+ " 'interweb',\n",
+ " 'dead',\n",
+ " 'phone',\n",
+ " 'offici',\n",
+ " 'start',\n",
+ " 'end',\n",
+ " 'summer',\n",
+ " 'commentari',\n",
+ " 'bob',\n",
+ " 'green',\n",
+ " 'cnn',\n",
+ " 'lifetim',\n",
+ " 'littl',\n",
+ " 'haze',\n",
+ " 'hmm',\n",
+ " 'name',\n",
+ " 'rtkmusic',\n",
+ " 'forget',\n",
+ " 'tuesday',\n",
+ " 'compliment',\n",
+ " 'new',\n",
+ " 'arancinibabi',\n",
+ " 'shhh',\n",
+ " 'tell',\n",
+ " 'everyon',\n",
+ " 'especi',\n",
+ " 'half',\n",
+ " 'guy',\n",
+ " 'luck',\n",
+ " 'ya',\n",
+ " 'biolog',\n",
+ " 'practic',\n",
+ " 'tmmrw',\n",
+ " 'clueless',\n",
+ " 'sanjukta',\n",
+ " 'ask',\n",
+ " 'gf',\n",
+ " 'regular',\n",
+ " 'tweeter',\n",
+ " 'lack',\n",
+ " 'hooter',\n",
+ " 'wing',\n",
+ " 'suck',\n",
+ " 'crave',\n",
+ " 'chicken',\n",
+ " 'horribl',\n",
+ " 'finish',\n",
+ " 'number',\n",
+ " 'themonth',\n",
+ " 'djnycesf',\n",
+ " 'damn',\n",
+ " 'leeadership',\n",
+ " 'confer',\n",
+ " 'lost',\n",
+ " 'readi',\n",
+ " 'kill',\n",
+ " 'professor',\n",
+ " 'head',\n",
+ " 'hurt',\n",
+ " 'transform',\n",
+ " 'reveng',\n",
+ " 'fallen',\n",
+ " 'final',\n",
+ " 'young',\n",
+ " 'ladi',\n",
+ " 'understand',\n",
+ " 'lakergirl',\n",
+ " 'wakefield',\n",
+ " 'consist',\n",
+ " 'pitcher',\n",
+ " 'jason',\n",
+ " 'coachbagluv',\n",
+ " 'shud',\n",
+ " 'sleep',\n",
+ " 'haha',\n",
+ " 'got',\n",
+ " 'offer',\n",
+ " 'job',\n",
+ " 'stanst',\n",
+ " 'express',\n",
+ " 'also',\n",
+ " 'cut',\n",
+ " 'cost',\n",
+ " 'control',\n",
+ " 'train',\n",
+ " 'instead',\n",
+ " 'impress',\n",
+ " 'gestapo',\n",
+ " 'secur',\n",
+ " 'entranc',\n",
+ " 'gosh',\n",
+ " 'beeen',\n",
+ " 'caught',\n",
+ " 'happpyy',\n",
+ " 'soakeddd',\n",
+ " 'xxxx',\n",
+ " 'sharan',\n",
+ " 'machi',\n",
+ " 'twenti',\n",
+ " 'happen',\n",
+ " 'gon',\n",
+ " 'crazi',\n",
+ " 'noth',\n",
+ " 'movi',\n",
+ " 'seen',\n",
+ " 'nemo',\n",
+ " 'twittervers',\n",
+ " 'favorit',\n",
+ " 'repli',\n",
+ " 'icon',\n",
+ " 'top',\n",
+ " 'right',\n",
+ " 'corner',\n",
+ " 'look',\n",
+ " 'happi',\n",
+ " 'jcphilli',\n",
+ " 'closet',\n",
+ " 'spring',\n",
+ " 'princesspooh',\n",
+ " 'wonder',\n",
+ " 'short',\n",
+ " 'danc',\n",
+ " 'xoxo',\n",
+ " 'sexi',\n",
+ " 'pooh',\n",
+ " 'bum',\n",
+ " 'fit',\n",
+ " 'stuff',\n",
+ " 'room',\n",
+ " 'run',\n",
+ " 'late',\n",
+ " 'headin',\n",
+ " 'middleofnowher',\n",
+ " 'texa',\n",
+ " 'grandpar',\n",
+ " 'rofl',\n",
+ " 'gunther',\n",
+ " 'joeymcintyr',\n",
+ " 'awwwww',\n",
+ " 'pari',\n",
+ " 'everi',\n",
+ " 'shame',\n",
+ " 'didnt',\n",
+ " 'weeeeeee',\n",
+ " 'hahahaha',\n",
+ " 'expect',\n",
+ " 'chang',\n",
+ " 'mojolocollc',\n",
+ " 'articl',\n",
+ " 'small',\n",
+ " 'pretti',\n",
+ " 'ankurb',\n",
+ " 'might',\n",
+ " 'came',\n",
+ " 'nolif',\n",
+ " 'administr',\n",
+ " 'ing',\n",
+ " 'tax',\n",
+ " 'onepag',\n",
+ " 'design',\n",
+ " 'way',\n",
+ " 'cross',\n",
+ " 'finger',\n",
+ " 'pussycatdol',\n",
+ " 'bad',\n",
+ " 'plea',\n",
+ " 'god',\n",
+ " 'mameekin',\n",
+ " 'say',\n",
+ " 'neither',\n",
+ " 'aceduec',\n",
+ " 'listen',\n",
+ " 'bamma',\n",
+ " 'wear',\n",
+ " 'dem',\n",
+ " 'crinkl',\n",
+ " 'fall',\n",
+ " 'straight',\n",
+ " 'tommcfli',\n",
+ " 'help',\n",
+ " 'yr',\n",
+ " 'side',\n",
+ " 'tom',\n",
+ " 'load',\n",
+ " 'lovebug',\n",
+ " 'gi',\n",
+ " 'laptop',\n",
+ " 'drank',\n",
+ " 'sangria',\n",
+ " 'five',\n",
+ " 'catch',\n",
+ " 'breath',\n",
+ " 'softwaregoddess',\n",
+ " 'talkin',\n",
+ " 'barista',\n",
+ " 'buddi',\n",
+ " 'epicturtl',\n",
+ " 'aaronrva',\n",
+ " 'heaven',\n",
+ " 'hug',\n",
+ " 'arm',\n",
+ " 'around',\n",
+ " 'shoulder',\n",
+ " 'brought',\n",
+ " 'fux',\n",
+ " 'ddlovato',\n",
+ " 'arent',\n",
+ " 'mtv',\n",
+ " 'award',\n",
+ " 'forward',\n",
+ " 'tv',\n",
+ " 'cupcak',\n",
+ " 'sensitivech',\n",
+ " 'face',\n",
+ " 'susanpau',\n",
+ " 'appreci',\n",
+ " 'bloodi',\n",
+ " 'nose',\n",
+ " 'season',\n",
+ " 'takethatnew',\n",
+ " 'stand',\n",
+ " 'bought',\n",
+ " 'mistak',\n",
+ " 'panic',\n",
+ " 'allid',\n",
+ " 'certainli',\n",
+ " 'uyennguyen',\n",
+ " 'fulli',\n",
+ " 'stalk',\n",
+ " 'qld',\n",
+ " 'video',\n",
+ " 'lili',\n",
+ " 'allen',\n",
+ " 'singl',\n",
+ " 'st',\n",
+ " 'june',\n",
+ " 'exit',\n",
+ " 'peopl',\n",
+ " 'stop',\n",
+ " 'cough',\n",
+ " 'near',\n",
+ " 'scare',\n",
+ " 'contract',\n",
+ " 'swine',\n",
+ " 'nd',\n",
+ " 'hate',\n",
+ " 'sick',\n",
+ " 'dog',\n",
+ " 'cuddl',\n",
+ " 'star',\n",
+ " 'pool',\n",
+ " 'soft',\n",
+ " 'wave',\n",
+ " 'put',\n",
+ " 'full',\n",
+ " 'fantasi',\n",
+ " 'mode',\n",
+ " 'grandma',\n",
+ " 'backyard',\n",
+ " 'nice',\n",
+ " 'matti',\n",
+ " 'daddi',\n",
+ " 'sabbyaz',\n",
+ " 'blog',\n",
+ " 'email',\n",
+ " 'hungout',\n",
+ " 'drake',\n",
+ " 'nate',\n",
+ " 'michael',\n",
+ " 'jen',\n",
+ " 'nick',\n",
+ " 'cori',\n",
+ " 'graduat',\n",
+ " 'holi',\n",
+ " 'rosiereap',\n",
+ " 'l',\n",
+ " 'mother',\n",
+ " 'gfalcon',\n",
+ " 'looov',\n",
+ " 'album',\n",
+ " 'voic',\n",
+ " 'sad',\n",
+ " 'hunternjadezmom',\n",
+ " 'ok',\n",
+ " 'aplusk',\n",
+ " 'link',\n",
+ " 'imagin',\n",
+ " 'di',\n",
+ " 'soooo',\n",
+ " 'uuu',\n",
+ " 'xpb',\n",
+ " 'omgpop',\n",
+ " 'officialtila',\n",
+ " 'id',\n",
+ " 'bout',\n",
+ " 'crushin',\n",
+ " 'dont',\n",
+ " 'whore',\n",
+ " 'fanx',\n",
+ " 'mattyde',\n",
+ " 'twitblock',\n",
+ " 'api',\n",
+ " 'left',\n",
+ " 'er',\n",
+ " 'com',\n",
+ " 'seem',\n",
+ " 'jeannefromnc',\n",
+ " 'brantanamo',\n",
+ " 'round',\n",
+ " 'tea',\n",
+ " 'chees',\n",
+ " 'alon',\n",
+ " 'un',\n",
+ " 'care',\n",
+ " 'awak',\n",
+ " 'havent',\n",
+ " 'slept',\n",
+ " 'meh',\n",
+ " 'superpurpl',\n",
+ " 'ayt',\n",
+ " 'wit',\n",
+ " 'ppl',\n",
+ " 'though',\n",
+ " 'edit',\n",
+ " 'next',\n",
+ " 'vlog',\n",
+ " 'jholti',\n",
+ " 'ahhhhhhhh',\n",
+ " 'pupi',\n",
+ " 'serious',\n",
+ " 'loo',\n",
+ " 'mechan',\n",
+ " 'chat',\n",
+ " 'mirella',\n",
+ " 'text',\n",
+ " 'messag',\n",
+ " 'somebodi',\n",
+ " 'te',\n",
+ " 'real',\n",
+ " 'jonaskevin',\n",
+ " 'callmestephani',\n",
+ " 'lucki',\n",
+ " 'havnt',\n",
+ " 'gym',\n",
+ " 'yet',\n",
+ " 'omg',\n",
+ " 'ive',\n",
+ " 'stori',\n",
+ " 'bed',\n",
+ " 'lamp',\n",
+ " 'die',\n",
+ " 'darkstarbuck',\n",
+ " 'exceed',\n",
+ " 'band',\n",
+ " 'morrow',\n",
+ " 'someon',\n",
+ " 'meeeeeeee',\n",
+ " 'thisisdavina',\n",
+ " 'direct',\n",
+ " 'yer',\n",
+ " 'loyal',\n",
+ " 'wud',\n",
+ " 'older',\n",
+ " 'sister',\n",
+ " 'russia',\n",
+ " 'soooooooo',\n",
+ " 'mccainblogett',\n",
+ " 'rather',\n",
+ " 'pirat',\n",
+ " 'nik',\n",
+ " 'base',\n",
+ " 'servic',\n",
+ " 'dm',\n",
+ " 'find',\n",
+ " 'reesemarcel',\n",
+ " 'cute',\n",
+ " 'rees',\n",
+ " 'feel',\n",
+ " 'cool',\n",
+ " 'sunni',\n",
+ " 'gut',\n",
+ " 'birthday',\n",
+ " 'clubg',\n",
+ " 'stuck',\n",
+ " 'till',\n",
+ " 'nine',\n",
+ " 'book',\n",
+ " 'closer',\n",
+ " 'hotel',\n",
+ " 'sdcc',\n",
+ " 'boo',\n",
+ " 'hero',\n",
+ " 'panel',\n",
+ " 'strombo',\n",
+ " 'funni',\n",
+ " 'passwordreset',\n",
+ " 'welcom',\n",
+ " 'btw',\n",
+ " 'commiser',\n",
+ " 'micaiah',\n",
+ " 'taken',\n",
+ " 'soo',\n",
+ " 'rob',\n",
+ " 'papz',\n",
+ " 'whistl',\n",
+ " 'yell',\n",
+ " 'athim',\n",
+ " 'stone',\n",
+ " 'heart',\n",
+ " 'griffintech',\n",
+ " 'line',\n",
+ " 'g',\n",
+ " 'yeah',\n",
+ " 'add',\n",
+ " 'deidg',\n",
+ " 'maggieshiel',\n",
+ " 'probabl',\n",
+ " 'disappoint',\n",
+ " 'scifi',\n",
+ " 'coursee',\n",
+ " 'jessicastrust',\n",
+ " 'ditch',\n",
+ " 'strontiumfox',\n",
+ " 'gong',\n",
+ " 'glasgow',\n",
+ " 'teh',\n",
+ " 'hayl',\n",
+ " 'eileen',\n",
+ " 'hard',\n",
+ " 'battl',\n",
+ " 'fax',\n",
+ " 'machin',\n",
+ " 'win',\n",
+ " 'suppis',\n",
+ " 'goin',\n",
+ " 'til',\n",
+ " 'gg',\n",
+ " 'episod',\n",
+ " 'tire',\n",
+ " 'review',\n",
+ " 'traffic',\n",
+ " 'take',\n",
+ " 'away',\n",
+ " 'exhaust',\n",
+ " 'w',\n",
+ " 'lang',\n",
+ " 'owner',\n",
+ " 'secondlif',\n",
+ " 'read',\n",
+ " 'yesterday',\n",
+ " 'languag',\n",
+ " 'teacher',\n",
+ " 'jalt',\n",
+ " 'mingoent',\n",
+ " 'almost',\n",
+ " 'woken',\n",
+ " 'earli',\n",
+ " 'jennposs',\n",
+ " 'alreadi',\n",
+ " 'gezz',\n",
+ " 'record',\n",
+ " 'organis',\n",
+ " 'juli',\n",
+ " 'momadthenomad',\n",
+ " 'mo',\n",
+ " 'test',\n",
+ " 'dev',\n",
+ " 'open',\n",
+ " 'awesom',\n",
+ " 'comeback',\n",
+ " 'rogi',\n",
+ " 'whole',\n",
+ " 'match',\n",
+ " 'inspit',\n",
+ " 'comp',\n",
+ " 'graphic',\n",
+ " 'paper',\n",
+ " 'moro',\n",
+ " 'billyraycyru',\n",
+ " 'shoot',\n",
+ " 'th',\n",
+ " 'tour',\n",
+ " 'nickkkjonasss',\n",
+ " 'omj',\n",
+ " 'jona',\n",
+ " 'everyth',\n",
+ " 'vancouv',\n",
+ " 'filip',\n",
+ " 'murteira',\n",
+ " 'neosurgehost',\n",
+ " 'shocker',\n",
+ " 'experi',\n",
+ " 'ltaloi',\n",
+ " 'unfollow',\n",
+ " 'calendar',\n",
+ " 'annoy',\n",
+ " 'dump',\n",
+ " 'date',\n",
+ " 'save',\n",
+ " 'ill',\n",
+ " 'alway',\n",
+ " 'owe',\n",
+ " 'fuckasauru',\n",
+ " 'chickenshit',\n",
+ " 'excus',\n",
+ " 'backbon',\n",
+ " 'hide',\n",
+ " 'ashley',\n",
+ " 'hotdogsladi',\n",
+ " 'app',\n",
+ " 'support',\n",
+ " 'cascandar',\n",
+ " 'whatisit',\n",
+ " 'doin',\n",
+ " 'bore',\n",
+ " 'sunday',\n",
+ " 'gurli',\n",
+ " 'promis',\n",
+ " 'tenni',\n",
+ " 'cloth',\n",
+ " 'vnakic',\n",
+ " 'dude',\n",
+ " 'surpris',\n",
+ " 'dougi',\n",
+ " 'poynter',\n",
+ " 'regret',\n",
+ " 'ufff',\n",
+ " 'offlin',\n",
+ " 'moldavian',\n",
+ " 'italylogu',\n",
+ " 'jpack',\n",
+ " 'lie',\n",
+ " 'husband',\n",
+ " 'bio',\n",
+ " 'nephew',\n",
+ " 'bday',\n",
+ " 'gettin',\n",
+ " 'drink',\n",
+ " 'wif',\n",
+ " 'punit',\n",
+ " 'r',\n",
+ " 'wootwoot',\n",
+ " 'none',\n",
+ " 'mk',\n",
+ " 'skeptwiit',\n",
+ " 'notsiralansugar',\n",
+ " 'apprentic',\n",
+ " 'yayitsrobot',\n",
+ " 'cover',\n",
+ " 'cubicl',\n",
+ " 'sit',\n",
+ " 'faaaar',\n",
+ " 'cell',\n",
+ " 'carrier',\n",
+ " 'smartphon',\n",
+ " 'option',\n",
+ " 'blackberri',\n",
+ " 'window',\n",
+ " 'mobil',\n",
+ " 'either',\n",
+ " 'palm',\n",
+ " 'absolut',\n",
+ " 'sweetheart',\n",
+ " 'drift',\n",
+ " 'peac',\n",
+ " 'ian',\n",
+ " 'broskey',\n",
+ " 'mucho',\n",
+ " 'slightli',\n",
+ " 'emo',\n",
+ " 'footi',\n",
+ " 'pub',\n",
+ " 'front',\n",
+ " 'dpsrecord',\n",
+ " 'dammit',\n",
+ " 'wockeez',\n",
+ " 'meet',\n",
+ " 'greet',\n",
+ " 'chiago',\n",
+ " 'beuati',\n",
+ " 'realiz',\n",
+ " 'le',\n",
+ " 'pervi',\n",
+ " 'thought',\n",
+ " 'wont',\n",
+ " 'later',\n",
+ " 'michaelaranda',\n",
+ " 'orlando',\n",
+ " 'slipknot',\n",
+ " ...]"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# your code here"
+ "# Count every token across all processed tweets.\n",
+ "cfdist = nltk.FreqDist(word for tweet in sample['text_processed'] for word in tweet)\n",
+ "\n",
+ "# keys() keeps first-seen order, so slicing it took the first 5000 distinct\n",
+ "# words encountered, not the most frequent ones; most_common() ranks by count.\n",
+ "top_words = [word for word, _ in cfdist.most_common(5000)]\n",
+ "\n",
+ "top_words"
]
},
{
@@ -167,11 +1619,28 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 19,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "20000\n"
+ ]
+ }
+ ],
"source": [
- "# your code here"
+ "def find_features(document):\n",
+ "    \"\"\"Map every word in top_words to whether it occurs in document.\"\"\"\n",
+ "    present = set(document)\n",
+ "    return {candidate: candidate in present for candidate in top_words}\n",
+ "\n",
+ "feature_sets = [\n",
+ "    (find_features(tokens), label)\n",
+ "    for tokens, label in zip(sample['text_processed'], sample['target'])\n",
+ "]\n",
+ "print(len(feature_sets))"
]
},
{
@@ -210,11 +1679,12 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+ "train_set, test_set = feature_sets[:len(feature_sets) // 2], feature_sets[len(feature_sets) // 2:]  # 50/50 split instead of a hard-coded 10000\n",
+ "classifier = nltk.NaiveBayesClassifier.train(train_set)"
]
},
{
@@ -228,13 +1698,38 @@
"As mentioned in one of the tutorial videos, a Naive Bayes model is considered OK if your accuracy score is over 0.6. If your accuracy score is over 0.7, you've done a great job!"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.7203\n",
+ "Most Informative Features\n",
+ " sick = True 0 : 1 = 23.1 : 1.0\n",
+ " sad = True 0 : 1 = 17.4 : 1.0\n",
+ " hurt = True 0 : 1 = 17.3 : 1.0\n",
+ " stomach = True 0 : 1 = 15.3 : 1.0\n",
+ " unfortun = True 0 : 1 = 15.3 : 1.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(nltk.classify.util.accuracy(classifier, test_set))\n",
+ "classifier.show_most_informative_features(n=5)"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+ "# How the hell are those tiny laptops from class doing this???\n",
+ "# My fan goes crazy every time I run some of these cells u.u'"
]
},
{
@@ -312,7 +1807,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.3"
+ "version": "3.10.9"
}
},
"nbformat": 4,