diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb index 0808166..f2b7884 100644 --- a/your-code/challenge-1.ipynb +++ b/your-code/challenge-1.ipynb @@ -66,9 +66,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], + "source": [ + "import re\n", + "import nltk" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'ironhack s q website is'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def clean_up(s):\n", " \"\"\"\n", @@ -79,7 +100,14 @@ "\n", " Returns:\n", " A string that has been cleaned up.\n", - " \"\"\"" + " \"\"\"\n", + " string = re.sub(r'http\\S+', '', s)\n", + " return re.sub('[^A-Za-z]+', ' ', string).lower().strip()\n", + " \n", + "test = \"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\"\n", + "\n", + "test_string = clean_up(test)\n", + "test_string" ] }, { @@ -101,9 +129,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['ironhack', 's', 'q', 'website', 'is']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def tokenize(s):\n", " \"\"\"\n", @@ -114,7 +153,11 @@ "\n", " Returns:\n", " A list of words as the result of tokenization.\n", - " \"\"\"" + " \"\"\"\n", + " return nltk.word_tokenize(s)\n", + "\n", + "test_string = tokenize(test_string)\n", + "test_string" ] }, { @@ -145,11 +188,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ + "# Nope, something went wrong, I'll use another set of words\n", + "\n", + "import nltk\n", + "from nltk.stem import PorterStemmer, WordNetLemmatizer\n", + "\n", "def stem_and_lemmatize(l):\n", + " \n", " \"\"\"\n", " Perform stemming and 
lemmatization on a list of words.\n", "\n", @@ -158,7 +207,17 @@ "\n", " Returns:\n", " A list of strings after being stemmed and lemmatized.\n", - " \"\"\"" + " \"\"\"\n", + " ps = nltk.PorterStemmer()\n", + " lemmatizer = nltk.WordNetLemmatizer()\n", + " l2 = []\n", + " \n", + " for w in l:\n", + " s = ps.stem(w)\n", + " s = lemmatizer.lemmatize(s)\n", + " l2 += [s]\n", + " \n", + " return l2\n" ] }, { @@ -176,7 +235,16 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.corpus import stopwords" + ] + }, + { + "cell_type": "code", + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -189,7 +257,10 @@ "\n", " Returns:\n", " A list of strings after stop words are removed.\n", - " \"\"\"" + " \"\"\"\n", + " stop_words = stopwords.words('english')\n", + "\n", + " return [w for w in l if w not in stop_words]" ] }, { @@ -218,7 +289,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.10.9" } }, "nbformat": 4, diff --git a/your-code/challenge-2.ipynb b/your-code/challenge-2.ipynb index 6b0e116..bca22d3 100644 --- a/your-code/challenge-2.ipynb +++ b/your-code/challenge-2.ipynb @@ -18,8 +18,8 @@ "\n", "```python\n", ">>> from nltk.sentiment.vader import SentimentIntensityAnalyzer\n", - ">>> txt = \"Ironhack is a Global Tech School ranked num 2 worldwide. 
", - "
", + ">>> txt = \"Ironhack is a Global Tech School ranked num 2 worldwide. 
\n", + "
\n", "Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do.\"\n", ">>> analyzer = SentimentIntensityAnalyzer()\n", ">>> analyzer.polarity_scores(txt)\n", @@ -46,11 +46,250 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "import pandas as pd\n", + "import numpy as np\n", + "from nltk.corpus import stopwords\n", + "import re\n", + "import nltk\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def clean_up(s):\n", + " \"\"\"\n", + " Cleans up numbers, URLs, and special characters from a string.\n", + "\n", + " Args:\n", + " s: The string to be cleaned up.\n", + "\n", + " Returns:\n", + " A string that has been cleaned up.\n", + " \"\"\"\n", + " string = re.sub(r'http\\S+', '', s)\n", + " return re.sub('[^A-Za-z]+', ' ', string).lower().strip()\n", + "\n", + "def tokenize(s):\n", + " \"\"\"\n", + " Tokenize a string.\n", + "\n", + " Args:\n", + " s: String to be tokenized.\n", + "\n", + " Returns:\n", + " A list of words as the result of tokenization.\n", + " \"\"\"\n", + " return nltk.word_tokenize(s)\n", + "\n", + "def stem_and_lemmatize(l):\n", + " \n", + " \"\"\"\n", + " Perform stemming and lemmatization on a list of words.\n", + "\n", + " Args:\n", + " l: A list of strings.\n", + "\n", + " Returns:\n", + " A list of strings after being stemmed and lemmatized.\n", + " \"\"\"\n", + " ps = nltk.PorterStemmer()\n", + " lemmatizer = nltk.WordNetLemmatizer()\n", + " l2 = []\n", + " \n", + " for w in l:\n", + " s = ps.stem(w)\n", + " s = lemmatizer.lemmatize(s)\n", + " l2 += [s]\n", + " \n", + " return l2\n", + "\n", + "\n", + "def remove_stopwords(l):\n", + " \"\"\"\n", + " Remove English stopwords from a list of strings.\n", + "\n", + " Args:\n", + " l: A list of strings.\n", + "\n", + " Returns:\n", + " A list of strings after stop 
words are removed.\n", + " \"\"\"\n", + " stop_words = stopwords.words('english')\n", + "\n", + " return [w for w in l if w not in stop_words]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01467810369Mon Apr 06 22:19:45 PDT 2009NO_QUERY_TheSpecialOne_@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D
001467810672Mon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...
101467810917Mon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...
\n", + "
" + ], + "text/plain": [ + " 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY _TheSpecialOne_ \\\n", + "0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton \n", + "1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus \n", + "\n", + " @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D \n", + "0 is upset that he can't update his Facebook by ... \n", + "1 @Kenichan I dived many times for the ball. Man... " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "tweets = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin1')\n", + "tweets.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
targetiddateflagusertext
001467810672Mon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...
101467810917Mon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...
\n", + "
" + ], + "text/plain": [ + " target id date flag user \\\n", + "0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY scotthamilton \n", + "1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY mattycus \n", + "\n", + " text \n", + "0 is upset that he can't update his Facebook by ... \n", + "1 @Kenichan I dived many times for the ball. Man... " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tweets.columns = ['target','id','date','flag','user','text']\n", + "tweets.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "sample = tweets.sample(20000)\n", + "sample['target'] = sample['target'].replace(4, 1)" ] }, { @@ -76,11 +315,206 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
targetiddateflagusertexttext_processed
59361302218286287Wed Jun 17 22:12:28 PDT 2009NO_QUERYkaitlbean@06eleven hey! Where have all your salacious t...[eleven, hey, salaci, tweet, gone]
48982102183125226Mon Jun 15 14:04:46 PDT 2009NO_QUERYArellyTrying to study for exams -> failing[tri, studi, exam, gt, fail]
85676111573804695Tue Apr 21 02:56:37 PDT 2009NO_QUERYelliottbledsoejust had lime cous-cous with @SarahMoran and @...[lime, cou, cou, sarahmoran, ehon, place]
150518112072316741Sun Jun 07 20:47:40 PDT 2009NO_QUERYsuzieqjenny@daisywoo you didn't know NPH could sing!?!?! ...[daisywoo, know, nph, could, sing, saw, rent, ...
76961702301511058Tue Jun 23 15:32:19 PDT 2009NO_QUERYnoeyfashowey@sgarcia408 fuck.. It was never meant to be.. ...[sgarcia, fuck, wa, never, meant, jealou, wan,...
........................
149870612070638801Sun Jun 07 18:02:59 PDT 2009NO_QUERYOinkRachelJust took a chance that can change my life . Lol[took, chanc, chang, life, lol]
66075902242829124Fri Jun 19 12:41:01 PDT 2009NO_QUERYlexi_diaztime to go out to the job!!! im pretty tired b...[time, go, job, im, pretti, tire, go]
150225912071639205Sun Jun 07 19:42:23 PDT 2009NO_QUERYbowlwiki0.6 Is that enough? @HarlemLanes Almost there[enough, harlemlan, almost]
157247212188940716Mon Jun 15 22:53:30 PDT 2009NO_QUERYcapedcrusader13@abbiirocks for what? i got hit by a car[abbiirock, got, hit, car]
21296801974879860Sat May 30 14:00:23 PDT 2009NO_QUERYlauralem79not able to download my badass pics[abl, download, badass, pic]
\n", + "

20000 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " target id date flag \\\n", + "593613 0 2218286287 Wed Jun 17 22:12:28 PDT 2009 NO_QUERY \n", + "489821 0 2183125226 Mon Jun 15 14:04:46 PDT 2009 NO_QUERY \n", + "856761 1 1573804695 Tue Apr 21 02:56:37 PDT 2009 NO_QUERY \n", + "1505181 1 2072316741 Sun Jun 07 20:47:40 PDT 2009 NO_QUERY \n", + "769617 0 2301511058 Tue Jun 23 15:32:19 PDT 2009 NO_QUERY \n", + "... ... ... ... ... \n", + "1498706 1 2070638801 Sun Jun 07 18:02:59 PDT 2009 NO_QUERY \n", + "660759 0 2242829124 Fri Jun 19 12:41:01 PDT 2009 NO_QUERY \n", + "1502259 1 2071639205 Sun Jun 07 19:42:23 PDT 2009 NO_QUERY \n", + "1572472 1 2188940716 Mon Jun 15 22:53:30 PDT 2009 NO_QUERY \n", + "212968 0 1974879860 Sat May 30 14:00:23 PDT 2009 NO_QUERY \n", + "\n", + " user text \\\n", + "593613 kaitlbean @06eleven hey! Where have all your salacious t... \n", + "489821 Arelly Trying to study for exams -> failing \n", + "856761 elliottbledsoe just had lime cous-cous with @SarahMoran and @... \n", + "1505181 suzieqjenny @daisywoo you didn't know NPH could sing!?!?! ... \n", + "769617 noeyfashowey @sgarcia408 fuck.. It was never meant to be.. ... \n", + "... ... ... \n", + "1498706 OinkRachel Just took a chance that can change my life . Lol \n", + "660759 lexi_diaz time to go out to the job!!! im pretty tired b... \n", + "1502259 bowlwiki 0.6 Is that enough? @HarlemLanes Almost there \n", + "1572472 capedcrusader13 @abbiirocks for what? i got hit by a car \n", + "212968 lauralem79 not able to download my badass pics \n", + "\n", + " text_processed \n", + "593613 [eleven, hey, salaci, tweet, gone] \n", + "489821 [tri, studi, exam, gt, fail] \n", + "856761 [lime, cou, cou, sarahmoran, ehon, place] \n", + "1505181 [daisywoo, know, nph, could, sing, saw, rent, ... \n", + "769617 [sgarcia, fuck, wa, never, meant, jealou, wan,... \n", + "... ... 
\n", + "1498706 [took, chanc, chang, life, lol] \n", + "660759 [time, go, job, im, pretti, tire, go] \n", + "1502259 [enough, harlemlan, almost] \n", + "1572472 [abbiirock, got, hit, car] \n", + "212968 [abl, download, badass, pic] \n", + "\n", + "[20000 rows x 7 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# your code here" + "sample['text_processed'] = sample['text'].apply(clean_up).apply(tokenize).apply(stem_and_lemmatize).apply(remove_stopwords)\n", + "sample" ] }, { @@ -98,11 +532,1029 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['eleven',\n", + " 'hey',\n", + " 'salaci',\n", + " 'tweet',\n", + " 'gone',\n", + " 'tri',\n", + " 'studi',\n", + " 'exam',\n", + " 'gt',\n", + " 'fail',\n", + " 'lime',\n", + " 'cou',\n", + " 'sarahmoran',\n", + " 'ehon',\n", + " 'place',\n", + " 'daisywoo',\n", + " 'know',\n", + " 'nph',\n", + " 'could',\n", + " 'sing',\n", + " 'saw',\n", + " 'rent',\n", + " 'la',\n", + " 'wa',\n", + " 'fantast',\n", + " 'sgarcia',\n", + " 'fuck',\n", + " 'never',\n", + " 'meant',\n", + " 'jealou',\n", + " 'wan',\n", + " 'na',\n", + " 'buy',\n", + " 'makeup',\n", + " 'world',\n", + " 'hah',\n", + " 'albiezushi',\n", + " 'cooper',\n", + " 'go',\n", + " 'think',\n", + " 'may',\n", + " 'leav',\n", + " 'friday',\n", + " 'made',\n", + " 'best',\n", + " 'french',\n", + " 'toast',\n", + " 'life',\n", + " 'ate',\n", + " 'second',\n", + " 'bc',\n", + " 'hungri',\n", + " 'even',\n", + " 'rememb',\n", + " 'tast',\n", + " 'like',\n", + " 'sausag',\n", + " 'mcmuffin',\n", + " 'breakfast',\n", + " 'hahha',\n", + " 'mac',\n", + " 'luv',\n", + " 'watch',\n", + " 'quot',\n", + " 'nuclear',\n", + " 'hurrican',\n", + " 'recommend',\n", + " 'mamaw',\n", + " 'goe',\n", + " 'home',\n", + " 'today',\n", + " 'count',\n", + " 'day',\n", + " 'geoffrey',\n", + " 'come',\n", + " 
'sea',\n", + " 'lt',\n", + " 'jamesdoc',\n", + " 'veri',\n", + " 'excit',\n", + " 'moment',\n", + " 'percentgrey',\n", + " 'thi',\n", + " 'time',\n", + " 'besid',\n", + " 'work',\n", + " 'someth',\n", + " 'interest',\n", + " 'talk',\n", + " 'okay',\n", + " 'back',\n", + " 'minut',\n", + " 'headach',\n", + " 'amp',\n", + " 'wait',\n", + " 'ibuprofin',\n", + " 'kick',\n", + " 'aha',\n", + " 'heyhunt',\n", + " 'follow',\n", + " 'friend',\n", + " 'michaela',\n", + " 'twitter',\n", + " 'siiick',\n", + " 'well',\n", + " 'least',\n", + " 'school',\n", + " 'xd',\n", + " 'miss',\n", + " 'tommi',\n", + " 'boy',\n", + " 'girl',\n", + " 'night',\n", + " 'tonight',\n", + " 'ugh',\n", + " 'four',\n", + " 'nathan',\n", + " 'rd',\n", + " 'anniversari',\n", + " 'hope',\n", + " 'play',\n", + " 'luna',\n", + " 'tomorrow',\n", + " 'get',\n", + " 'lvl',\n", + " 'current',\n", + " 'opal',\n", + " 'reinforc',\n", + " 'set',\n", + " 'lot',\n", + " 'vit',\n", + " 'ftw',\n", + " 'lynda',\n", + " 'im',\n", + " 'see',\n", + " 'crocrock',\n", + " 'jacksassradio',\n", + " 'jack',\n", + " 'pictur',\n", + " 'want',\n", + " 'parti',\n", + " 'villa',\n", + " 'sometim',\n", + " 'qejp',\n", + " 'b',\n", + " 'clean',\n", + " 'share',\n", + " 'ancient',\n", + " 'glambert',\n", + " 'hi',\n", + " 'gradauat',\n", + " 'london',\n", + " 'warn',\n", + " 'longer',\n", + " 'bu',\n", + " 'nearli',\n", + " 'hour',\n", + " 'fin',\n", + " 'park',\n", + " 'last',\n", + " 'befor',\n", + " 'realis',\n", + " 'shmelvywelvi',\n", + " 'need',\n", + " 'umm',\n", + " 'safe',\n", + " 'n',\n", + " 'sound',\n", + " 'worst',\n", + " 'ever',\n", + " 'girlycut',\n", + " 'good',\n", + " 'pinksealight',\n", + " 'nail',\n", + " 'fab',\n", + " 'would',\n", + " 'stay',\n", + " 'choic',\n", + " 'djndayo',\n", + " 'glad',\n", + " 'u',\n", + " 'ur',\n", + " 'germ',\n", + " 'weaken',\n", + " 'victim',\n", + " 'flu',\n", + " 'digloung',\n", + " 'definit',\n", + " 'comm',\n", + " 'ca',\n", + " 'abl',\n", + " 'hear',\n", + " 'guest',\n", + 
" 'loud',\n", + " 'nois',\n", + " 'clubb',\n", + " 'endear',\n", + " 'clear',\n", + " 'lavieenros',\n", + " 'cup',\n", + " 'fine',\n", + " 'darl',\n", + " 'hello',\n", + " 'helo',\n", + " 'everyoneeeeeee',\n", + " 'gordon',\n", + " 'peterson',\n", + " 'newseum',\n", + " 'member',\n", + " 'onli',\n", + " 'becaus',\n", + " 'high',\n", + " 'seat',\n", + " 'demand',\n", + " 'aunt',\n", + " 'rip',\n", + " 'chula',\n", + " 'erika',\n", + " 'roman',\n", + " 'love',\n", + " 'congratul',\n", + " 'brettport',\n", + " 'laura',\n", + " 'first',\n", + " 'babi',\n", + " 'brett',\n", + " 'mani',\n", + " 'maven',\n", + " 'bill',\n", + " 'hader',\n", + " 'curragh',\n", + " 'weekend',\n", + " 'said',\n", + " 'old',\n", + " 'one',\n", + " 'tree',\n", + " 'hill',\n", + " 'indulg',\n", + " 'runner',\n", + " 'knee',\n", + " 'mydecemberht',\n", + " 'daaannng',\n", + " 'blow',\n", + " 'sorri',\n", + " 'rohitsabu',\n", + " 'focu',\n", + " 'badavarasc',\n", + " 'pov',\n", + " 'tho',\n", + " 'cant',\n", + " 'figur',\n", + " 'mf',\n", + " 'cud',\n", + " 'make',\n", + " 'select',\n", + " 'shot',\n", + " 'hurray',\n", + " 'tf',\n", + " 'updat',\n", + " 'done',\n", + " 'anoth',\n", + " 'week',\n", + " 'owieh',\n", + " 'watchin',\n", + " 'nikki',\n", + " 'granger',\n", + " 'big',\n", + " 'brother',\n", + " 'quiz',\n", + " 'show',\n", + " 'mental',\n", + " 'report',\n", + " 'mom',\n", + " 'mall',\n", + " 'america',\n", + " 'actual',\n", + " 'walk',\n", + " 'autism',\n", + " 'fun',\n", + " 'theatr',\n", + " 'town',\n", + " 'hospit',\n", + " 'visit',\n", + " 'cuzzo',\n", + " 'mcraddict',\n", + " 'wish',\n", + " 'lcaller',\n", + " 'took',\n", + " 'long',\n", + " 'mate',\n", + " 'upload',\n", + " 'track',\n", + " 'p',\n", + " 'bit',\n", + " 'distract',\n", + " 'lol',\n", + " 'ilaura',\n", + " 'etsi',\n", + " 'sure',\n", + " 'c',\n", + " 'better',\n", + " 'whi',\n", + " 'amaz',\n", + " 'kid',\n", + " 'epandu',\n", + " 'pain',\n", + " 'gain',\n", + " 'man',\n", + " 'outsid',\n", + " 'dean',\n", + " 
'crush',\n", + " 'calvin',\n", + " 'cri',\n", + " 'soon',\n", + " 'great',\n", + " 'fli',\n", + " 'sf',\n", + " 'warp',\n", + " 'month',\n", + " 'el',\n", + " 'cardiff',\n", + " 'doe',\n", + " 'anyon',\n", + " 'hammertim',\n", + " 'anymor',\n", + " 'ha',\n", + " 'mummmyyyyi',\n", + " 'word',\n", + " 'super',\n", + " 'realli',\n", + " 'hot',\n", + " 'let',\n", + " 'onlin',\n", + " 'box',\n", + " 'refresh',\n", + " 'chillin',\n", + " 'stephani',\n", + " 'denton',\n", + " 'blah',\n", + " 'much',\n", + " 'aappa',\n", + " 'thnx',\n", + " 'remind',\n", + " 'xx',\n", + " 'cold',\n", + " 'wet',\n", + " 'shop',\n", + " 'rain',\n", + " 'oh',\n", + " 'dio',\n", + " 'liciou',\n", + " 'girrrlllll',\n", + " 'kno',\n", + " 'live',\n", + " 'atl',\n", + " 'bleed',\n", + " 'carey',\n", + " 'anim',\n", + " 'yay',\n", + " 'plu',\n", + " 'mint',\n", + " 'justa',\n", + " 'coffe',\n", + " 'morn',\n", + " 'lunch',\n", + " 'refrigir',\n", + " 'brownfamilycat',\n", + " 'delay',\n", + " 'join',\n", + " 'facebook',\n", + " 'kale',\n", + " 'recip',\n", + " 'tip',\n", + " 'scale',\n", + " 'favor',\n", + " 'christian',\n", + " 'foot',\n", + " 'snuggi',\n", + " 'call',\n", + " 'fuggi',\n", + " 'zut',\n", + " 'radio',\n", + " 'global',\n", + " 'gourmand',\n", + " 'f',\n", + " 'thank',\n", + " 'still',\n", + " 'trip',\n", + " 'ye',\n", + " 'nyc',\n", + " 'excel',\n", + " 'heard',\n", + " 'anyth',\n", + " 'year',\n", + " 'break',\n", + " 'roxio',\n", + " 'found',\n", + " 'susan',\n", + " 'boyl',\n", + " 'yep',\n", + " 'bgt',\n", + " 'hopsit',\n", + " 'cuz',\n", + " 'went',\n", + " 'mad',\n", + " 'couldnt',\n", + " 'handl',\n", + " 'fame',\n", + " 'poor',\n", + " 'thing',\n", + " 'creativeblok',\n", + " 'interweb',\n", + " 'dead',\n", + " 'phone',\n", + " 'offici',\n", + " 'start',\n", + " 'end',\n", + " 'summer',\n", + " 'commentari',\n", + " 'bob',\n", + " 'green',\n", + " 'cnn',\n", + " 'lifetim',\n", + " 'littl',\n", + " 'haze',\n", + " 'hmm',\n", + " 'name',\n", + " 'rtkmusic',\n", + " 
'forget',\n", + " 'tuesday',\n", + " 'compliment',\n", + " 'new',\n", + " 'arancinibabi',\n", + " 'shhh',\n", + " 'tell',\n", + " 'everyon',\n", + " 'especi',\n", + " 'half',\n", + " 'guy',\n", + " 'luck',\n", + " 'ya',\n", + " 'biolog',\n", + " 'practic',\n", + " 'tmmrw',\n", + " 'clueless',\n", + " 'sanjukta',\n", + " 'ask',\n", + " 'gf',\n", + " 'regular',\n", + " 'tweeter',\n", + " 'lack',\n", + " 'hooter',\n", + " 'wing',\n", + " 'suck',\n", + " 'crave',\n", + " 'chicken',\n", + " 'horribl',\n", + " 'finish',\n", + " 'number',\n", + " 'themonth',\n", + " 'djnycesf',\n", + " 'damn',\n", + " 'leeadership',\n", + " 'confer',\n", + " 'lost',\n", + " 'readi',\n", + " 'kill',\n", + " 'professor',\n", + " 'head',\n", + " 'hurt',\n", + " 'transform',\n", + " 'reveng',\n", + " 'fallen',\n", + " 'final',\n", + " 'young',\n", + " 'ladi',\n", + " 'understand',\n", + " 'lakergirl',\n", + " 'wakefield',\n", + " 'consist',\n", + " 'pitcher',\n", + " 'jason',\n", + " 'coachbagluv',\n", + " 'shud',\n", + " 'sleep',\n", + " 'haha',\n", + " 'got',\n", + " 'offer',\n", + " 'job',\n", + " 'stanst',\n", + " 'express',\n", + " 'also',\n", + " 'cut',\n", + " 'cost',\n", + " 'control',\n", + " 'train',\n", + " 'instead',\n", + " 'impress',\n", + " 'gestapo',\n", + " 'secur',\n", + " 'entranc',\n", + " 'gosh',\n", + " 'beeen',\n", + " 'caught',\n", + " 'happpyy',\n", + " 'soakeddd',\n", + " 'xxxx',\n", + " 'sharan',\n", + " 'machi',\n", + " 'twenti',\n", + " 'happen',\n", + " 'gon',\n", + " 'crazi',\n", + " 'noth',\n", + " 'movi',\n", + " 'seen',\n", + " 'nemo',\n", + " 'twittervers',\n", + " 'favorit',\n", + " 'repli',\n", + " 'icon',\n", + " 'top',\n", + " 'right',\n", + " 'corner',\n", + " 'look',\n", + " 'happi',\n", + " 'jcphilli',\n", + " 'closet',\n", + " 'spring',\n", + " 'princesspooh',\n", + " 'wonder',\n", + " 'short',\n", + " 'danc',\n", + " 'xoxo',\n", + " 'sexi',\n", + " 'pooh',\n", + " 'bum',\n", + " 'fit',\n", + " 'stuff',\n", + " 'room',\n", + " 'run',\n", + " 
'late',\n", + " 'headin',\n", + " 'middleofnowher',\n", + " 'texa',\n", + " 'grandpar',\n", + " 'rofl',\n", + " 'gunther',\n", + " 'joeymcintyr',\n", + " 'awwwww',\n", + " 'pari',\n", + " 'everi',\n", + " 'shame',\n", + " 'didnt',\n", + " 'weeeeeee',\n", + " 'hahahaha',\n", + " 'expect',\n", + " 'chang',\n", + " 'mojolocollc',\n", + " 'articl',\n", + " 'small',\n", + " 'pretti',\n", + " 'ankurb',\n", + " 'might',\n", + " 'came',\n", + " 'nolif',\n", + " 'administr',\n", + " 'ing',\n", + " 'tax',\n", + " 'onepag',\n", + " 'design',\n", + " 'way',\n", + " 'cross',\n", + " 'finger',\n", + " 'pussycatdol',\n", + " 'bad',\n", + " 'plea',\n", + " 'god',\n", + " 'mameekin',\n", + " 'say',\n", + " 'neither',\n", + " 'aceduec',\n", + " 'listen',\n", + " 'bamma',\n", + " 'wear',\n", + " 'dem',\n", + " 'crinkl',\n", + " 'fall',\n", + " 'straight',\n", + " 'tommcfli',\n", + " 'help',\n", + " 'yr',\n", + " 'side',\n", + " 'tom',\n", + " 'load',\n", + " 'lovebug',\n", + " 'gi',\n", + " 'laptop',\n", + " 'drank',\n", + " 'sangria',\n", + " 'five',\n", + " 'catch',\n", + " 'breath',\n", + " 'softwaregoddess',\n", + " 'talkin',\n", + " 'barista',\n", + " 'buddi',\n", + " 'epicturtl',\n", + " 'aaronrva',\n", + " 'heaven',\n", + " 'hug',\n", + " 'arm',\n", + " 'around',\n", + " 'shoulder',\n", + " 'brought',\n", + " 'fux',\n", + " 'ddlovato',\n", + " 'arent',\n", + " 'mtv',\n", + " 'award',\n", + " 'forward',\n", + " 'tv',\n", + " 'cupcak',\n", + " 'sensitivech',\n", + " 'face',\n", + " 'susanpau',\n", + " 'appreci',\n", + " 'bloodi',\n", + " 'nose',\n", + " 'season',\n", + " 'takethatnew',\n", + " 'stand',\n", + " 'bought',\n", + " 'mistak',\n", + " 'panic',\n", + " 'allid',\n", + " 'certainli',\n", + " 'uyennguyen',\n", + " 'fulli',\n", + " 'stalk',\n", + " 'qld',\n", + " 'video',\n", + " 'lili',\n", + " 'allen',\n", + " 'singl',\n", + " 'st',\n", + " 'june',\n", + " 'exit',\n", + " 'peopl',\n", + " 'stop',\n", + " 'cough',\n", + " 'near',\n", + " 'scare',\n", + " 'contract',\n", + 
" 'swine',\n", + " 'nd',\n", + " 'hate',\n", + " 'sick',\n", + " 'dog',\n", + " 'cuddl',\n", + " 'star',\n", + " 'pool',\n", + " 'soft',\n", + " 'wave',\n", + " 'put',\n", + " 'full',\n", + " 'fantasi',\n", + " 'mode',\n", + " 'grandma',\n", + " 'backyard',\n", + " 'nice',\n", + " 'matti',\n", + " 'daddi',\n", + " 'sabbyaz',\n", + " 'blog',\n", + " 'email',\n", + " 'hungout',\n", + " 'drake',\n", + " 'nate',\n", + " 'michael',\n", + " 'jen',\n", + " 'nick',\n", + " 'cori',\n", + " 'graduat',\n", + " 'holi',\n", + " 'rosiereap',\n", + " 'l',\n", + " 'mother',\n", + " 'gfalcon',\n", + " 'looov',\n", + " 'album',\n", + " 'voic',\n", + " 'sad',\n", + " 'hunternjadezmom',\n", + " 'ok',\n", + " 'aplusk',\n", + " 'link',\n", + " 'imagin',\n", + " 'di',\n", + " 'soooo',\n", + " 'uuu',\n", + " 'xpb',\n", + " 'omgpop',\n", + " 'officialtila',\n", + " 'id',\n", + " 'bout',\n", + " 'crushin',\n", + " 'dont',\n", + " 'whore',\n", + " 'fanx',\n", + " 'mattyde',\n", + " 'twitblock',\n", + " 'api',\n", + " 'left',\n", + " 'er',\n", + " 'com',\n", + " 'seem',\n", + " 'jeannefromnc',\n", + " 'brantanamo',\n", + " 'round',\n", + " 'tea',\n", + " 'chees',\n", + " 'alon',\n", + " 'un',\n", + " 'care',\n", + " 'awak',\n", + " 'havent',\n", + " 'slept',\n", + " 'meh',\n", + " 'superpurpl',\n", + " 'ayt',\n", + " 'wit',\n", + " 'ppl',\n", + " 'though',\n", + " 'edit',\n", + " 'next',\n", + " 'vlog',\n", + " 'jholti',\n", + " 'ahhhhhhhh',\n", + " 'pupi',\n", + " 'serious',\n", + " 'loo',\n", + " 'mechan',\n", + " 'chat',\n", + " 'mirella',\n", + " 'text',\n", + " 'messag',\n", + " 'somebodi',\n", + " 'te',\n", + " 'real',\n", + " 'jonaskevin',\n", + " 'callmestephani',\n", + " 'lucki',\n", + " 'havnt',\n", + " 'gym',\n", + " 'yet',\n", + " 'omg',\n", + " 'ive',\n", + " 'stori',\n", + " 'bed',\n", + " 'lamp',\n", + " 'die',\n", + " 'darkstarbuck',\n", + " 'exceed',\n", + " 'band',\n", + " 'morrow',\n", + " 'someon',\n", + " 'meeeeeeee',\n", + " 'thisisdavina',\n", + " 'direct',\n", + " 
'yer',\n", + " 'loyal',\n", + " 'wud',\n", + " 'older',\n", + " 'sister',\n", + " 'russia',\n", + " 'soooooooo',\n", + " 'mccainblogett',\n", + " 'rather',\n", + " 'pirat',\n", + " 'nik',\n", + " 'base',\n", + " 'servic',\n", + " 'dm',\n", + " 'find',\n", + " 'reesemarcel',\n", + " 'cute',\n", + " 'rees',\n", + " 'feel',\n", + " 'cool',\n", + " 'sunni',\n", + " 'gut',\n", + " 'birthday',\n", + " 'clubg',\n", + " 'stuck',\n", + " 'till',\n", + " 'nine',\n", + " 'book',\n", + " 'closer',\n", + " 'hotel',\n", + " 'sdcc',\n", + " 'boo',\n", + " 'hero',\n", + " 'panel',\n", + " 'strombo',\n", + " 'funni',\n", + " 'passwordreset',\n", + " 'welcom',\n", + " 'btw',\n", + " 'commiser',\n", + " 'micaiah',\n", + " 'taken',\n", + " 'soo',\n", + " 'rob',\n", + " 'papz',\n", + " 'whistl',\n", + " 'yell',\n", + " 'athim',\n", + " 'stone',\n", + " 'heart',\n", + " 'griffintech',\n", + " 'line',\n", + " 'g',\n", + " 'yeah',\n", + " 'add',\n", + " 'deidg',\n", + " 'maggieshiel',\n", + " 'probabl',\n", + " 'disappoint',\n", + " 'scifi',\n", + " 'coursee',\n", + " 'jessicastrust',\n", + " 'ditch',\n", + " 'strontiumfox',\n", + " 'gong',\n", + " 'glasgow',\n", + " 'teh',\n", + " 'hayl',\n", + " 'eileen',\n", + " 'hard',\n", + " 'battl',\n", + " 'fax',\n", + " 'machin',\n", + " 'win',\n", + " 'suppis',\n", + " 'goin',\n", + " 'til',\n", + " 'gg',\n", + " 'episod',\n", + " 'tire',\n", + " 'review',\n", + " 'traffic',\n", + " 'take',\n", + " 'away',\n", + " 'exhaust',\n", + " 'w',\n", + " 'lang',\n", + " 'owner',\n", + " 'secondlif',\n", + " 'read',\n", + " 'yesterday',\n", + " 'languag',\n", + " 'teacher',\n", + " 'jalt',\n", + " 'mingoent',\n", + " 'almost',\n", + " 'woken',\n", + " 'earli',\n", + " 'jennposs',\n", + " 'alreadi',\n", + " 'gezz',\n", + " 'record',\n", + " 'organis',\n", + " 'juli',\n", + " 'momadthenomad',\n", + " 'mo',\n", + " 'test',\n", + " 'dev',\n", + " 'open',\n", + " 'awesom',\n", + " 'comeback',\n", + " 'rogi',\n", + " 'whole',\n", + " 'match',\n", + " 
'inspit',\n", + " 'comp',\n", + " 'graphic',\n", + " 'paper',\n", + " 'moro',\n", + " 'billyraycyru',\n", + " 'shoot',\n", + " 'th',\n", + " 'tour',\n", + " 'nickkkjonasss',\n", + " 'omj',\n", + " 'jona',\n", + " 'everyth',\n", + " 'vancouv',\n", + " 'filip',\n", + " 'murteira',\n", + " 'neosurgehost',\n", + " 'shocker',\n", + " 'experi',\n", + " 'ltaloi',\n", + " 'unfollow',\n", + " 'calendar',\n", + " 'annoy',\n", + " 'dump',\n", + " 'date',\n", + " 'save',\n", + " 'ill',\n", + " 'alway',\n", + " 'owe',\n", + " 'fuckasauru',\n", + " 'chickenshit',\n", + " 'excus',\n", + " 'backbon',\n", + " 'hide',\n", + " 'ashley',\n", + " 'hotdogsladi',\n", + " 'app',\n", + " 'support',\n", + " 'cascandar',\n", + " 'whatisit',\n", + " 'doin',\n", + " 'bore',\n", + " 'sunday',\n", + " 'gurli',\n", + " 'promis',\n", + " 'tenni',\n", + " 'cloth',\n", + " 'vnakic',\n", + " 'dude',\n", + " 'surpris',\n", + " 'dougi',\n", + " 'poynter',\n", + " 'regret',\n", + " 'ufff',\n", + " 'offlin',\n", + " 'moldavian',\n", + " 'italylogu',\n", + " 'jpack',\n", + " 'lie',\n", + " 'husband',\n", + " 'bio',\n", + " 'nephew',\n", + " 'bday',\n", + " 'gettin',\n", + " 'drink',\n", + " 'wif',\n", + " 'punit',\n", + " 'r',\n", + " 'wootwoot',\n", + " 'none',\n", + " 'mk',\n", + " 'skeptwiit',\n", + " 'notsiralansugar',\n", + " 'apprentic',\n", + " 'yayitsrobot',\n", + " 'cover',\n", + " 'cubicl',\n", + " 'sit',\n", + " 'faaaar',\n", + " 'cell',\n", + " 'carrier',\n", + " 'smartphon',\n", + " 'option',\n", + " 'blackberri',\n", + " 'window',\n", + " 'mobil',\n", + " 'either',\n", + " 'palm',\n", + " 'absolut',\n", + " 'sweetheart',\n", + " 'drift',\n", + " 'peac',\n", + " 'ian',\n", + " 'broskey',\n", + " 'mucho',\n", + " 'slightli',\n", + " 'emo',\n", + " 'footi',\n", + " 'pub',\n", + " 'front',\n", + " 'dpsrecord',\n", + " 'dammit',\n", + " 'wockeez',\n", + " 'meet',\n", + " 'greet',\n", + " 'chiago',\n", + " 'beuati',\n", + " 'realiz',\n", + " 'le',\n", + " 'pervi',\n", + " 'thought',\n", + " 
'wont',\n", + " 'later',\n", + " 'michaelaranda',\n", + " 'orlando',\n", + " 'slipknot',\n", + " ...]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# your code here" + "cfdist = nltk.FreqDist()\n", + "\n", + "for tweet in sample['text_processed']:\n", + " for word in tweet:\n", + " cfdist[word] += 1\n", + "\n", + "top_words = list(cfdist.keys())[:5000]\n", + "top_words" ] }, { @@ -167,11 +1619,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "20000\n" + ] + } + ], "source": [ - "# your code here" + "def find_features(document):\n", + " words = set(document)\n", + " features = {}\n", + " for w in top_words:\n", + " features[w] = (w in words)\n", + " \n", + " return features\n", + " \n", + "feature_sets = [(find_features(tweet), target) for (tweet, target) in list(zip(sample['text_processed'], sample['target']))]\n", + "print(len(feature_sets))" ] }, { @@ -210,11 +1679,12 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "train_set, test_set = feature_sets[:10000], feature_sets[10000:]\n", + "classifier = nltk.NaiveBayesClassifier.train(train_set)" ] }, { @@ -228,13 +1698,38 @@ "As mentioned in one of the tutorial videos, a Naive Bayes model is considered OK if your accuracy score is over 0.6. If your accuracy score is over 0.7, you've done a great job!" 
] }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7203\n", + "Most Informative Features\n", + " sick = True 0 : 1 = 23.1 : 1.0\n", + " sad = True 0 : 1 = 17.4 : 1.0\n", + " hurt = True 0 : 1 = 17.3 : 1.0\n", + " stomach = True 0 : 1 = 15.3 : 1.0\n", + " unfortun = True 0 : 1 = 15.3 : 1.0\n" + ] + } + ], + "source": [ + "print(nltk.classify.accuracy(classifier, test_set))\n", + "classifier.show_most_informative_features(5)" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "# How the hell are those tiny laptops from class doing this???\n", + "# My fan goes crazy every time I run some of these cells u.u'" ] }, { @@ -312,7 +1807,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.10.9" } }, "nbformat": 4,