From d5d28fbd1497f141c3f1b0b356bcdbdc8c86d357 Mon Sep 17 00:00:00 2001 From: Joao Carrasco Soares Date: Mon, 13 Mar 2023 17:21:16 +0000 Subject: [PATCH] done --- your-code/challenge-1.ipynb | 175 +++-- your-code/challenge-2.ipynb | 1228 ++++++++++++++++++++++++++++++++++- 2 files changed, 1349 insertions(+), 54 deletions(-) diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb index 0808166..450f6de 100644 --- a/your-code/challenge-1.ipynb +++ b/your-code/challenge-1.ipynb @@ -66,20 +66,46 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 69, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "from nltk.tokenize import word_tokenize" + ] + }, + { + "cell_type": "code", + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "def clean_up(s):\n", - " \"\"\"\n", - " Cleans up numbers, URLs, and special characters from a string.\n", - "\n", - " Args:\n", - " s: The string to be cleaned up.\n", - "\n", - " Returns:\n", - " A string that has been cleaned up.\n", - " \"\"\"" + " s = re.sub(r'http\\S+', ' ', s)\n", + " s = re.sub(r'[^\\w\\s]+|\\d+', ' ', s)\n", + " s = s.strip()\n", + " s = re.sub(r'\\s+', ' ', s)\n", + " \n", + " return s.lower()" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ironhack s q website is\n" + ] + } + ], + "source": [ + "s = \"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\\\")\"\n", + "clean_sentence = clean_up(s)\n", + "print(clean_sentence)" ] }, { @@ -101,20 +127,40 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 72, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.tokenize import word_tokenize" + ] + }, + { + "cell_type": "code", + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "def tokenize(s):\n", - " \"\"\"\n", - " Tokenize a string.\n", - "\n", - " Args:\n", - " s: String to be tokenized.\n", - "\n", - " Returns:\n", - " A list of words as the result of tokenization.\n", - " \"\"\"" + " return word_tokenize(s)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['ironhack', 's', 'q', 'website', 'is']\n" + ] + } + ], + "source": [ + "text = 'ironhack s q website is'\n", + "tokens = tokenize(text)\n", + "print(tokens)" ] }, { @@ -145,20 +191,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.stem import SnowballStemmer, WordNetLemmatizer" + ] + }, + { + "cell_type": "code", + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "def stem_and_lemmatize(l):\n", - " \"\"\"\n", - " Perform stemming and lemmatization on a list of words.\n", - "\n", - " Args:\n", - " l: A list of strings.\n", - "\n", - " Returns:\n", - " A list of strings after being stemmed and lemmatized.\n", - " \"\"\"" + " stemmer = SnowballStemmer('english')\n", + " lemmatizer = WordNetLemmatizer()\n", + " stems = [stemmer.stem(l) for l in l]\n", + " lemmas = [lemmatizer.lemmatize(l) for l in l]\n", + " return stems, lemmas" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(['@', 'i', 'r', 'o', 'n', 'h', 'a', 'c', 'k', \"'\", 's', '-', '#', 'q', ' ', 'w', 'e', 'b', 's', 'i', 't', 'e', ' ', '7', '7', '6', '-', 'i', 's', ' ', 'h', 't', 't', 'p', ':', '/', '/', 'i', 'r', 'o', 'n', 'h', 'a', 'c', 'k', '.', 'c', 'o', 'm', ' ', '[', '(', '2', '0', '1', '8', ')', ']', '\"', ')'], ['@', 'I', 'r', 'o', 'n', 'h', 'a', 'c', 'k', \"'\", 's', '-', '#', 'Q', ' ', 'w', 'e', 'b', 's', 'i', 't', 'e', ' ', '7', '7', '6', '-', 'i', 's', ' ', 'h', 't', 't', 'p', ':', '/', '/', 'i', 'r', 'o', 'n', 'h', 'a', 'c', 'k', '.', 'c', 'o', 'm', ' ', '[', '(', '2', '0', '1', '8', ')', ']', '\"', ')'])\n" + ] + } + ], + "source": [ + "print(stem_and_lemmatize(s))" ] }, { @@ -176,20 +244,40 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.corpus import stopwords" + ] + }, + { + "cell_type": "code", + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ "def remove_stopwords(l):\n", - " \"\"\"\n", - " Remove English stopwords from a list of strings.\n", - "\n", - " Args:\n", - " l: A list of strings.\n", - "\n", - " Returns:\n", - " A list of strings after stop words are removed.\n", - " \"\"\"" + " stop_words = set(stopwords.words('english'))\n", + " filtered_words = [l for l in l if l not in stop_words]\n", + " return filtered_words" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['@', 'I', 'r', 'n', 'h', 'c', 'k', \"'\", '-', '#', 'Q', ' ', 'w', 'e', 'b', 'e', ' ', '7', '7', '6', '-', ' ', 'h', 'p', ':', '/', '/', 'r', 'n', 'h', 'c', 'k', '.', 'c', ' ', '[', '(', '2', '0', '1', '8', ')', ']', '\"', ')']\n" + ] + } + ], + "source": [ + "print(remove_stopwords(s))" ] }, { @@ -218,7 +306,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.10.7" + }, + "vscode": { + "interpreter": { + "hash": "721db305ef1fd1fc91cdf20e400af694a949fe540ac5f48c160f31c7e384879d" + } } }, "nbformat": 4, diff --git a/your-code/challenge-2.ipynb b/your-code/challenge-2.ipynb index 6b0e116..51c32f1 100644 --- a/your-code/challenge-2.ipynb +++ b/your-code/challenge-2.ipynb @@ -18,8 +18,8 @@ "\n", "```python\n", ">>> from nltk.sentiment.vader import SentimentIntensityAnalyzer\n", - ">>> txt = \"Ironhack is a Global Tech School ranked num 2 worldwide. 
", - "
", + ">>> txt = \"Ironhack is a Global Tech School ranked num 2 worldwide. 
\n", + "
\n", "Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do.\"\n", ">>> analyzer = SentimentIntensityAnalyzer()\n", ">>> analyzer.polarity_scores(txt)\n", @@ -46,11 +46,116 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
targetiddateflagusertext
001467810369Mon Apr 06 22:19:45 PDT 2009NO_QUERY_TheSpecialOne_@switchfoot http://twitpic.com/2y1zl - Awww, t...
101467810672Mon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by ...
201467810917Mon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Man...
301467811184Mon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFmy whole body feels itchy and like its on fire
401467811193Mon Apr 06 22:19:57 PDT 2009NO_QUERYKaroli@nationwideclass no, it's not behaving at all....
\n", + "
" + ], + "text/plain": [ + " target id date flag \\\n", + "0 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY \n", + "1 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY \n", + "2 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY \n", + "3 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n", + "4 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n", + "\n", + " user text \n", + "0 _TheSpecialOne_ @switchfoot http://twitpic.com/2y1zl - Awww, t... \n", + "1 scotthamilton is upset that he can't update his Facebook by ... \n", + "2 mattycus @Kenichan I dived many times for the ball. Man... \n", + "3 ElleCTF my whole body feels itchy and like its on fire \n", + "4 Karoli @nationwideclass no, it's not behaving at all.... " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# your code here" + "import pandas as pd\n", + "df = pd.read_csv('noemoticon.csv',encoding='latin-1', header=None)\n", + "df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']\n", + "df.head()\n", + "\n", + "##was giving me an error, had to add multiple arg." ] }, { @@ -76,11 +181,54 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "import re\n", + "from nltk.tokenize import word_tokenize\n", + "from nltk.stem.snowball import SnowballStemmer\n", + "from nltk.stem.wordnet import WordNetLemmatizer\n", + "from nltk.corpus import stopwords" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def process_text(s):\n", + " # Clean up the text\n", + " s = re.sub(r'http\\S+', ' ', s)\n", + " s = re.sub(r'[^\\w\\s]+|\\d+', ' ', s)\n", + " s = s.strip()\n", + " s = re.sub(r'\\s+', ' ', s)\n", + " s = s.lower()\n", + "\n", + " # Tokenize the text\n", + " tokens = word_tokenize(s)\n", + "\n", + " # Remove stop words\n", + " stop_words = set(stopwords.words('english'))\n", + " filtered_words = [w for w in tokens if not w in stop_words]\n", + "\n", + " # Stem and lemmatize the words\n", + " stemmer = SnowballStemmer('english')\n", + " lemmatizer = WordNetLemmatizer()\n", + " stems = [stemmer.stem(w) for w in filtered_words]\n", + " lemmas = [lemmatizer.lemmatize(w) for w in filtered_words]\n", + "\n", + " return stems, lemmas" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "df['text_processed'] = df['text'].apply(process_text).apply(lambda x: x[0]) # or x[1] for lemmas" ] }, { @@ -102,7 +250,1031 @@ "metadata": {}, "outputs": [], "source": [ - "# your code here" + "from nltk import FreqDist\n", + "\n", + "all_words = [word for text in df['text_processed'] for word in text]\n", + "freq_dist = FreqDist(all_words)\n", + "top_words = freq_dist.most_common(5000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('go', 138687),\n", + " ('get', 110838),\n", + " ('day', 109146),\n", + " ('good', 92565),\n", + " ('work', 87870),\n", + " ('like', 83831),\n", + " ('love', 82663),\n", + " ('quot', 73383),\n", + " ('got', 71107),\n", + " ('today', 68697),\n", + " ('time', 66391),\n", + " ('lol', 59472),\n", + " ('thank', 59434),\n", + " ('back', 57401),\n", + " ('one', 57368),\n", + " ('want', 57339),\n", + " ('miss', 56915),\n", + " ('u', 56586),\n", + " ('know', 54960),\n", + " ('see', 51368),\n", + " ('feel', 51266),\n", + " ('think', 51190),\n", + " ('im', 50695),\n", + " ('realli', 50076),\n", + " ('amp', 48767),\n", + " ('night', 45850),\n", + " ('hope', 44935),\n", + " ('watch', 43639),\n", + " ('still', 43618),\n", + " ('need', 43438),\n", + " ('make', 43217),\n", + " ('well', 42947),\n", + " ('new', 42452),\n", + " ('na', 42081),\n", + " ('home', 40829),\n", + " ('oh', 39946),\n", + " ('look', 39752),\n", + " ('come', 39117),\n", + " ('much', 37149),\n", + " ('last', 36279),\n", + " ('twitter', 36166),\n", + " ('morn', 35723),\n", + " ('tomorrow', 34328),\n", + " ('wish', 34038),\n", + " ('great', 33853),\n", + " ('wait', 32603),\n", + " ('sad', 32527),\n", + " ('sleep', 32458),\n", + " ('haha', 31533),\n", + " ('bad', 28835),\n", + " ('fun', 28719),\n", + " ('week', 28612),\n", + " ('tri', 28245),\n", + " ('right', 28131),\n", + " ('follow', 28039),\n", + " ('happi', 27720),\n", + " ('would', 27330),\n", + " ('friend', 26606),\n", + " ('thing', 26519),\n", + " ('sorri', 26429),\n", + " ('tonight', 26182),\n", + " ('say', 25568),\n", + " ('way', 25205),\n", + " ('take', 24740),\n", + " ('gon', 24096),\n", + " ('nice', 24083),\n", + " ('though', 24017),\n", + " ('better', 23263),\n", + " ('hate', 23019),\n", + " ('even', 22879),\n", + " ('yeah', 22478),\n", + " ('bed', 22430),\n", + " ('tweet', 22369),\n", + " ('could', 21928),\n", + " ('start', 21798),\n", + " ('school', 21078),\n", + " ('hour', 21063),\n", + " ('peopl', 20982),\n", + " ('show', 20770),\n", + " ('guy', 19702),\n", + " ('play', 19688),\n", + " ('weekend', 19616),\n", + " ('hey', 19112),\n", + " ('final', 18962),\n", + " ('awesom', 18777),\n", + " ('yes', 18693),\n", + " ('next', 18669),\n", + " ('let', 18599),\n", + " ('lt', 18562),\n", + " ('use', 18509),\n", + " ('dont', 18235),\n", + " ('never', 17945),\n", + " ('soon', 17857),\n", + " ('cant', 17684),\n", + " ('tire', 17371),\n", + " ('long', 17247),\n", + " ('rain', 17199),\n", + " ('pleas', 17170),\n", + " ('littl', 16978),\n", + " ('first', 16845),\n", + " ('life', 16821),\n", + " ('year', 16817),\n", + " ('everyon', 16801),\n", + " ('wan', 16751),\n", + " ('movi', 16632),\n", + " ('x', 16563),\n", + " ('best', 16512),\n", + " ('sick', 16473),\n", + " ('ok', 16232),\n", + " ('girl', 15877),\n", + " ('find', 15798),\n", + " ('call', 15626),\n", + " ('suck', 15521),\n", + " ('sure', 15465),\n", + " ('done', 15365),\n", + " ('help', 15336),\n", + " ('bore', 15314),\n", + " ('head', 15246),\n", + " ('alway', 14971),\n", + " ('talk', 14926),\n", + " ('keep', 14792),\n", + " ('alreadi', 14773),\n", + " ('cool', 14731),\n", + " ('lot', 14582),\n", + " ('anoth', 14576),\n", + " ('live', 14573),\n", + " ('someth', 14481),\n", + " ('us', 14465),\n", + " ('eat', 14360),\n", + " ('phone', 14348),\n", + " ('man', 14267),\n", + " ('leav', 14186),\n", + " ('read', 14155),\n", + " ('hurt', 14041),\n", + " ('readi', 14001),\n", + " ('made', 13854),\n", + " ('yay', 13796),\n", + " ('enjoy', 13746),\n", + " ('song', 13481),\n", + " ('hous', 13432),\n", + " ('yet', 13398),\n", + " ('went', 13308),\n", + " ('ur', 13304),\n", + " ('ever', 13155),\n", + " ('n', 13070),\n", + " ('sound', 12856),\n", + " ('thought', 12754),\n", + " ('pretti', 12720),\n", + " ('mayb', 12681),\n", + " ('amaz', 12511),\n", + " ('excit', 12463),\n", + " ('away', 12315),\n", + " ('summer', 12289),\n", + " ('game', 12276),\n", + " ('finish', 12262),\n", + " ('omg', 12231),\n", + " ('old', 12185),\n", + " ('tell', 12170),\n", + " ('guess', 12169),\n", + " ('damn', 11997),\n", + " ('mean', 11897),\n", + " ('listen', 11840),\n", + " ('earli', 11830),\n", + " ('someon', 11740),\n", + " ('check', 11588),\n", + " ('bit', 11543),\n", + " ('babi', 11525),\n", + " ('left', 11507),\n", + " ('lost', 11479),\n", + " ('give', 11477),\n", + " ('end', 11181),\n", + " ('big', 11180),\n", + " ('hot', 11160),\n", + " ('wow', 11159),\n", + " ('parti', 11157),\n", + " ('late', 11148),\n", + " ('noth', 11087),\n", + " ('hear', 11074),\n", + " ('w', 10944),\n", + " ('ya', 10911),\n", + " ('b', 10729),\n", + " ('glad', 10633),\n", + " ('actual', 10609),\n", + " ('pic', 10595),\n", + " ('birthday', 10583),\n", + " ('happen', 10538),\n", + " ('hard', 10528),\n", + " ('sun', 10455),\n", + " ('stop', 10454),\n", + " ('also', 10439),\n", + " ('weather', 10392),\n", + " ('later', 10386),\n", + " ('two', 10312),\n", + " ('mom', 10205),\n", + " ('wonder', 10185),\n", + " ('stuff', 10133),\n", + " ('ugh', 10059),\n", + " ('put', 10054),\n", + " ('ta', 9991),\n", + " ('saw', 9923),\n", + " ('run', 9897),\n", + " ('god', 9825),\n", + " ('exam', 9822),\n", + " ('fuck', 9783),\n", + " ('stay', 9777),\n", + " ('car', 9772),\n", + " ('might', 9767),\n", + " ('th', 9726),\n", + " ('music', 9693),\n", + " ('world', 9661),\n", + " ('yesterday', 9650),\n", + " ('kid', 9598),\n", + " ('said', 9595),\n", + " ('that', 9532),\n", + " ('r', 9490),\n", + " ('meet', 9489),\n", + " ('sinc', 9462),\n", + " ('hi', 9384),\n", + " ('job', 9375),\n", + " ('post', 9287),\n", + " ('beauti', 9280),\n", + " ('updat', 9253),\n", + " ('sunday', 9216),\n", + " ('friday', 9151),\n", + " ('monday', 9088),\n", + " ('around', 9087),\n", + " ('video', 9057),\n", + " ('mani', 9031),\n", + " ('seem', 8992),\n", + " ('com', 8885),\n", + " ('cold', 8803),\n", + " ('luck', 8715),\n", + " ('found', 8691),\n", + " ('must', 8688),\n", + " ('poor', 8682),\n", + " ('cri', 8638),\n", + " ('book', 8590),\n", + " ('move', 8569),\n", + " ('die', 8520),\n", + " ('aww', 8473),\n", + " ('busi', 8441),\n", + " ('boy', 8425),\n", + " ('gone', 8362),\n", + " ('may', 8350),\n", + " ('buy', 8204),\n", + " ('shop', 8154),\n", + " ('famili', 8153),\n", + " ('anyth', 8150),\n", + " ('plan', 8085),\n", + " ('studi', 8074),\n", + " ('woke', 8032),\n", + " ('least', 8028),\n", + " ('hair', 8004),\n", + " ('food', 7998),\n", + " ('total', 7990),\n", + " ('month', 7990),\n", + " ('okay', 7982),\n", + " ('iphon', 7947),\n", + " ('till', 7940),\n", + " ('cute', 7933),\n", + " ('lunch', 7880),\n", + " ('almost', 7876),\n", + " ('free', 7854),\n", + " ('tho', 7848),\n", + " ('win', 7772),\n", + " ('sweet', 7730),\n", + " ('far', 7705),\n", + " ('believ', 7694),\n", + " ('drink', 7678),\n", + " ('dinner', 7673),\n", + " ('pictur', 7664),\n", + " ('caus', 7652),\n", + " ('chang', 7628),\n", + " ('place', 7626),\n", + " ('funni', 7623),\n", + " ('everyth', 7623),\n", + " ('class', 7588),\n", + " ('shit', 7575),\n", + " ('welcom', 7572),\n", + " ('p', 7460),\n", + " ('gt', 7451),\n", + " ('anyon', 7439),\n", + " ('drive', 7417),\n", + " ('forward', 7341),\n", + " ('turn', 7335),\n", + " ('sit', 7312),\n", + " ('mine', 7298),\n", + " ('without', 7232),\n", + " ('walk', 7222),\n", + " ('ask', 7220),\n", + " ('real', 7148),\n", + " ('name', 7142),\n", + " ('everi', 7080),\n", + " ('dream', 7050),\n", + " ('write', 7036),\n", + " ('stupid', 7019),\n", + " ('idea', 6985),\n", + " ('dad', 6973),\n", + " ('hahaha', 6934),\n", + " ('send', 6933),\n", + " ('outsid', 6929),\n", + " ('ill', 6896),\n", + " ('clean', 6895),\n", + " ('coffe', 6881),\n", + " ('enough', 6815),\n", + " ('room', 6747),\n", + " ('wrong', 6711),\n", + " ('fan', 6670),\n", + " ('anymor', 6666),\n", + " ('wake', 6655),\n", + " ('dog', 6650),\n", + " ('didnt', 6614),\n", + " ('probabl', 6602),\n", + " ('saturday', 6521),\n", + " ('ha', 6469),\n", + " ('tv', 6434),\n", + " ('c', 6425),\n", + " ('money', 6409),\n", + " ('minut', 6408),\n", + " ('repli', 6317),\n", + " ('person', 6288),\n", + " ('xx', 6254),\n", + " ('eye', 6245),\n", + " ('break', 6242),\n", + " ('sooo', 6237),\n", + " ('face', 6231),\n", + " ('serious', 6230),\n", + " ('rememb', 6178),\n", + " ('headach', 6171),\n", + " ('hit', 6158),\n", + " ('aw', 6151),\n", + " ('rock', 6139),\n", + " ('brother', 6124),\n", + " ('fail', 6069),\n", + " ('blog', 6048),\n", + " ('beach', 6025),\n", + " ('train', 6010),\n", + " ('came', 6007),\n", + " ('whole', 6001),\n", + " ('hang', 5986),\n", + " ('seen', 5983),\n", + " ('crazi', 5977),\n", + " ('kinda', 5975),\n", + " ('open', 5968),\n", + " ('mother', 5942),\n", + " ('pain', 5922),\n", + " ('rest', 5920),\n", + " ('kill', 5898),\n", + " ('â', 5897),\n", + " ('close', 5878),\n", + " ('super', 5819),\n", + " ('word', 5818),\n", + " ('comput', 5767),\n", + " ('care', 5742),\n", + " ('quit', 5726),\n", + " ('text', 5724),\n", + " ('half', 5713),\n", + " ('took', 5711),\n", + " ('hell', 5683),\n", + " ('hello', 5668),\n", + " ('awww', 5654),\n", + " ('news', 5644),\n", + " ('anyway', 5633),\n", + " ('true', 5609),\n", + " ('worri', 5605),\n", + " ('goodnight', 5549),\n", + " ('part', 5546),\n", + " ('pm', 5534),\n", + " ('e', 5512),\n", + " ('heart', 5497),\n", + " ('abl', 5475),\n", + " ('forgot', 5472),\n", + " ('problem', 5465),\n", + " ('trip', 5459),\n", + " ('els', 5458),\n", + " ('ago', 5421),\n", + " ('kind', 5405),\n", + " ('offic', 5404),\n", + " ('bring', 5401),\n", + " ('either', 5366),\n", + " ('mind', 5366),\n", + " ('photo', 5349),\n", + " ('full', 5340),\n", + " ('boo', 5320),\n", + " ('ah', 5293),\n", + " ('link', 5288),\n", + " ('danc', 5270),\n", + " ('ð', 5269),\n", + " ('pay', 5238),\n", + " ('soo', 5226),\n", + " ('hug', 5208),\n", + " ('sister', 5200),\n", + " ('ñ', 5178),\n", + " ('cuz', 5154),\n", + " ('alon', 5096),\n", + " ('internet', 5096),\n", + " ('hehe', 5067),\n", + " ('fall', 5042),\n", + " ('test', 5030),\n", + " ('btw', 5011),\n", + " ('stuck', 4981),\n", + " ('heard', 4961),\n", + " ('sometim', 4961),\n", + " ('cours', 4958),\n", + " ('email', 4957),\n", + " ('pick', 4951),\n", + " ('ticket', 4950),\n", + " ('st', 4937),\n", + " ('g', 4901),\n", + " ('site', 4899),\n", + " ('www', 4842),\n", + " ('set', 4834),\n", + " ('learn', 4827),\n", + " ('interest', 4775),\n", + " ('wont', 4772),\n", + " ('pass', 4768),\n", + " ('hand', 4766),\n", + " ('shower', 4750),\n", + " ('vote', 4740),\n", + " ('nite', 4732),\n", + " ('onlin', 4722),\n", + " ('concert', 4720),\n", + " ('add', 4713),\n", + " ('k', 4689),\n", + " ('season', 4671),\n", + " ('visit', 4663),\n", + " ('dude', 4662),\n", + " ('fine', 4651),\n", + " ('ice', 4644),\n", + " ('mileycyrus', 4626),\n", + " ('awak', 4619),\n", + " ('suppos', 4594),\n", + " ('breakfast', 4586),\n", + " ('fix', 4573),\n", + " ('facebook', 4571),\n", + " ('cat', 4536),\n", + " ('told', 4516),\n", + " ('favorit', 4503),\n", + " ('goe', 4490),\n", + " ('ass', 4482),\n", + " ('sunni', 4469),\n", + " ('wear', 4447),\n", + " ('catch', 4440),\n", + " ('pack', 4439),\n", + " ('til', 4437),\n", + " ('smile', 4432),\n", + " ('high', 4406),\n", + " ('broke', 4403),\n", + " ('lmao', 4382),\n", + " ('cut', 4381),\n", + " ('bought', 4363),\n", + " ('june', 4352),\n", + " ('spend', 4347),\n", + " ('lucki', 4319),\n", + " ('crap', 4298),\n", + " ('l', 4284),\n", + " ('mad', 4265),\n", + " ('la', 4256),\n", + " ('asleep', 4246),\n", + " ('afternoon', 4240),\n", + " ('hungri', 4236),\n", + " ('reason', 4223),\n", + " ('red', 4210),\n", + " ('ride', 4204),\n", + " ('min', 4172),\n", + " ('sign', 4170),\n", + " ('definit', 4162),\n", + " ('agre', 4157),\n", + " ('ladi', 4144),\n", + " ('laugh', 4140),\n", + " ('bye', 4108),\n", + " ('instead', 4089),\n", + " ('jealous', 4055),\n", + " ('short', 4042),\n", + " ('perfect', 4031),\n", + " ('yea', 4028),\n", + " ('xd', 4013),\n", + " ('stori', 3991),\n", + " ('page', 3991),\n", + " ('second', 3990),\n", + " ('nap', 3990),\n", + " ('top', 3988),\n", + " ('bout', 3983),\n", + " ('wed', 3979),\n", + " ('sore', 3977),\n", + " ('citi', 3975),\n", + " ('album', 3953),\n", + " ('sigh', 3945),\n", + " ('homework', 3935),\n", + " ('messag', 3926),\n", + " ('dead', 3921),\n", + " ('tommcfli', 3913),\n", + " ('graduat', 3913),\n", + " ('dear', 3912),\n", + " ('figur', 3904),\n", + " ('join', 3901),\n", + " ('sing', 3886),\n", + " ('list', 3866),\n", + " ('tour', 3862),\n", + " ('togeth', 3857),\n", + " ('date', 3856),\n", + " ('near', 3848),\n", + " ('youtub', 3843),\n", + " ('soooo', 3837),\n", + " ('congrat', 3811),\n", + " ('laptop', 3810),\n", + " ('holiday', 3809),\n", + " ('star', 3806),\n", + " ('park', 3802),\n", + " ('water', 3783),\n", + " ('award', 3780),\n", + " ('save', 3776),\n", + " ('store', 3776),\n", + " ('point', 3773),\n", + " ('coupl', 3752),\n", + " ('goin', 3749),\n", + " ('revis', 3734),\n", + " ('moment', 3730),\n", + " ('complet', 3721),\n", + " ('relax', 3702),\n", + " ('drop', 3699),\n", + " ('town', 3693),\n", + " ('line', 3666),\n", + " ('side', 3658),\n", + " ('download', 3647),\n", + " ('dress', 3647),\n", + " ('church', 3637),\n", + " ('order', 3629),\n", + " ('account', 3601),\n", + " ('cook', 3591),\n", + " ('annoy', 3583),\n", + " ('ipod', 3573),\n", + " ('tea', 3565),\n", + " ('share', 3563),\n", + " ('weird', 3560),\n", + " ('answer', 3558),\n", + " ('ppl', 3533),\n", + " ('offici', 3517),\n", + " ('cream', 3514),\n", + " ('less', 3512),\n", + " ('decid', 3495),\n", + " ('ddlovato', 3493),\n", + " ('ive', 3489),\n", + " ('gym', 3451),\n", + " ('lose', 3450),\n", + " ('scare', 3442),\n", + " ('forget', 3430),\n", + " ('f', 3422),\n", + " ('air', 3420),\n", + " ('mood', 3419),\n", + " ('lil', 3378),\n", + " ('realiz', 3370),\n", + " ('math', 3343),\n", + " ('unfortun', 3341),\n", + " ('chat', 3339),\n", + " ('fli', 3331),\n", + " ('english', 3329),\n", + " ('nd', 3324),\n", + " ('mum', 3318),\n", + " ('understand', 3313),\n", + " ('past', 3303),\n", + " ('fb', 3297),\n", + " ('chocol', 3295),\n", + " ('differ', 3278),\n", + " ('pool', 3276),\n", + " ('band', 3270),\n", + " ('usual', 3264),\n", + " ('comment', 3254),\n", + " ('ate', 3252),\n", + " ('episod', 3252),\n", + " ('fast', 3250),\n", + " ('ahh', 3215),\n", + " ('knew', 3210),\n", + " ('window', 3195),\n", + " ('upload', 3194),\n", + " ('kick', 3170),\n", + " ('worst', 3169),\n", + " ('london', 3152),\n", + " ('support', 3144),\n", + " ('broken', 3137),\n", + " ('chanc', 3132),\n", + " ('load', 3130),\n", + " ('horribl', 3128),\n", + " ('parent', 3126),\n", + " ('flight', 3126),\n", + " ('hmm', 3108),\n", + " ('black', 3106),\n", + " ('yep', 3093),\n", + " ('question', 3092),\n", + " ('throat', 3090),\n", + " ('cheer', 3089),\n", + " ('team', 3085),\n", + " ('three', 3083),\n", + " ('worth', 3083),\n", + " ('sat', 3070),\n", + " ('sleepi', 3051),\n", + " ('sunshin', 3041),\n", + " ('da', 3038),\n", + " ('upset', 3037),\n", + " ('card', 3032),\n", + " ('via', 3030),\n", + " ('special', 3022),\n", + " ('fair', 3012),\n", + " ('xxx', 3008),\n", + " ('mac', 3007),\n", + " ('bless', 3006),\n", + " ('depress', 3005),\n", + " ('shirt', 3001),\n", + " ('slow', 2996),\n", + " ('myspac', 2986),\n", + " ('em', 2974),\n", + " ('number', 2973),\n", + " ('ad', 2968),\n", + " ('beat', 2961),\n", + " ('leg', 2958),\n", + " ('sent', 2950),\n", + " ('green', 2950),\n", + " ('followfriday', 2949),\n", + " ('jona', 2939),\n", + " ('gave', 2927),\n", + " ('bet', 2910),\n", + " ('colleg', 2902),\n", + " ('sim', 2900),\n", + " ('record', 2889),\n", + " ('project', 2883),\n", + " ('appar', 2880),\n", + " ('paper', 2870),\n", + " ('cake', 2866),\n", + " ('tuesday', 2864),\n", + " ('moon', 2852),\n", + " ('app', 2845),\n", + " ('websit', 2845),\n", + " ('what', 2843),\n", + " ('finger', 2840),\n", + " ('beer', 2839),\n", + " ('vacat', 2838),\n", + " ('power', 2834),\n", + " ('blue', 2834),\n", + " ('warm', 2826),\n", + " ('film', 2815),\n", + " ('fell', 2814),\n", + " ('uk', 2812),\n", + " ('light', 2811),\n", + " ('garden', 2811),\n", + " ('wors', 2810),\n", + " ('easi', 2788),\n", + " ('possibl', 2777),\n", + " ('juli', 2776),\n", + " ('doesnt', 2769),\n", + " ('miley', 2768),\n", + " ('rather', 2763),\n", + " ('bodi', 2761),\n", + " ('longer', 2758),\n", + " ('bday', 2757),\n", + " ('nope', 2753),\n", + " ('mr', 2741),\n", + " ('flu', 2734),\n", + " ('shoe', 2732),\n", + " ('disappoint', 2725),\n", + " ('huge', 2723),\n", + " ('mess', 2721),\n", + " ('freak', 2717),\n", + " ('googl', 2715),\n", + " ('mtv', 2714),\n", + " ('wtf', 2710),\n", + " ('due', 2708),\n", + " ('absolut', 2700),\n", + " ('celebr', 2697),\n", + " ('spent', 2694),\n", + " ('safe', 2688),\n", + " ('chill', 2683),\n", + " ('plus', 2682),\n", + " ('bike', 2680),\n", + " ('lay', 2671),\n", + " ('shame', 2667),\n", + " ('voic', 2665),\n", + " ('cancel', 2662),\n", + " ('age', 2661),\n", + " ('burn', 2661),\n", + " ('lazi', 2652),\n", + " ('thx', 2647),\n", + " ('cousin', 2639),\n", + " ('white', 2633),\n", + " ('forev', 2630),\n", + " ('earlier', 2622),\n", + " ('stress', 2616),\n", + " ('ahhh', 2607),\n", + " ('stomach', 2603),\n", + " ('touch', 2601),\n", + " ('babe', 2595),\n", + " ('thursday', 2589),\n", + " ('hold', 2588),\n", + " ('swim', 2585),\n", + " ('remind', 2584),\n", + " ('quick', 2581),\n", + " ('david', 2564),\n", + " ('shot', 2560),\n", + " ('bus', 2559),\n", + " ('except', 2558),\n", + " ('idk', 2555),\n", + " ('especi', 2554),\n", + " ('camp', 2553),\n", + " ('lie', 2545),\n", + " ('manag', 2541),\n", + " ('son', 2540),\n", + " ('exact', 2540),\n", + " ('camera', 2539),\n", + " ('v', 2536),\n", + " ('slept', 2534),\n", + " ('box', 2531),\n", + " ('½', 2526),\n", + " ('appreci', 2525),\n", + " ('met', 2524),\n", + " ('boyfriend', 2523),\n", + " ('appl', 2523),\n", + " ('pray', 2521),\n", + " ('bum', 2519),\n", + " ('crash', 2511),\n", + " ('tom', 2505),\n", + " ('sort', 2497),\n", + " ('shoot', 2494),\n", + " ('surpris', 2493),\n", + " ('type', 2487),\n", + " ('current', 2485),\n", + " ('luv', 2478),\n", + " ('insid', 2477),\n", + " ('yummi', 2466),\n", + " ('hrs', 2460),\n", + " ('fight', 2456),\n", + " ('piss', 2455),\n", + " ('block', 2453),\n", + " ('present', 2431),\n", + " ('airport', 2428),\n", + " ('note', 2428),\n", + " ('father', 2426),\n", + " ('jonasbroth', 2421),\n", + " ('wit', 2416),\n", + " ('cover', 2415),\n", + " ('pizza', 2409),\n", + " ('case', 2407),\n", + " ('havent', 2407),\n", + " ('servic', 2407),\n", + " ('mail', 2403),\n", + " ('terribl', 2401),\n", + " ('club', 2399),\n", + " ('road', 2397),\n", + " ('bbq', 2395),\n", + " ('random', 2390),\n", + " ('confus', 2389),\n", + " ('arriv', 2386),\n", + " ('invit', 2384),\n", + " ('radio', 2379),\n", + " ('bitch', 2378),\n", + " ('hospit', 2373),\n", + " ('chicken', 2369),\n", + " ('meant', 2368),\n", + " ('expect', 2368),\n", + " ('small', 2358),\n", + " ('raini', 2350),\n", + " ('deal', 2346),\n", + " ('fit', 2345),\n", + " ('interview', 2341),\n", + " ('storm', 2341),\n", + " ('hubbi', 2336),\n", + " ('h', 2332),\n", + " ('tummi', 2330),\n", + " ('design', 2330),\n", + " ('cloth', 2329),\n", + " ('ps', 2326),\n", + " ('count', 2321),\n", + " ('tast', 2320),\n", + " ('dm', 2320),\n", + " ('doctor', 2310),\n", + " ('hill', 2305),\n", + " ('proud', 2294),\n", + " ('notic', 2292),\n", + " ('smell', 2290),\n", + " ('twilight', 2287),\n", + " ('laker', 2279),\n", + " ('lone', 2270),\n", + " ('addict', 2270),\n", + " ('felt', 2269),\n", + " ('cup', 2269),\n", + " ('mention', 2268),\n", + " ('speak', 2259),\n", + " ('stand', 2258),\n", + " ('shall', 2255),\n", + " ('wine', 2253),\n", + " ('alright', 2247),\n", + " ('begin', 2240),\n", + " ('search', 2235),\n", + " ('goodby', 2235),\n", + " ('cd', 2233),\n", + " ('peac', 2230),\n", + " ('yup', 2224),\n", + " ('ach', 2216),\n", + " ('fact', 2215),\n", + " ('issu', 2211),\n", + " ('gorgeous', 2211),\n", + " ('product', 2206),\n", + " ('bag', 2205),\n", + " ('lame', 2203),\n", + " ('practic', 2201),\n", + " ('wednesday', 2200),\n", + " ('yo', 2199),\n", + " ('wash', 2199),\n", + " ('pull', 2194),\n", + " ('woo', 2184),\n", + " ('j', 2183),\n", + " ('feet', 2176),\n", + " ('connect', 2170),\n", + " ('hmmm', 2163),\n", + " ('front', 2161),\n", + " ('kiss', 2159),\n", + " ('pink', 2156),\n", + " ('glass', 2154),\n", + " ('bar', 2153),\n", + " ('tan', 2150),\n", + " ('roll', 2148),\n", + " ('tear', 2146),\n", + " ('whatev', 2141),\n", + " ('compani', 2140),\n", + " ('cos', 2134),\n", + " ('bro', 2130),\n", + " ('taken', 2130),\n", + " ('ouch', 2130),\n", + " ('xoxo', 2124),\n", + " ('french', 2122),\n", + " ('apart', 2114),\n", + " ('scari', 2111),\n", + " ('state', 2105),\n", + " ('joke', 2104),\n", + " ('ball', 2098),\n", + " ('exhaust', 2094),\n", + " ('event', 2092),\n", + " ('memori', 2086),\n", + " ('drunk', 2086),\n", + " ('becom', 2080),\n", + " ('mile', 2079),\n", + " ('paint', 2078),\n", + " ('normal', 2074),\n", + " ('ear', 2072),\n", + " ('everybodi', 2067),\n", + " ('daughter', 2063),\n", + " ('jus', 2062),\n", + " ('mommi', 2055),\n", + " ('guitar', 2051),\n", + " ('à', 2051),\n", + " ('round', 2050),\n", + " ('isnt', 2050),\n", + " ('mate', 2043),\n", + " ('behind', 2040),\n", + " ('version', 2038),\n", + " ('prob', 2035),\n", + " ('sold', 2026),\n", + " ('travel', 2025),\n", + " ('rip', 2023),\n", + " ('releas', 2020),\n", + " ('art', 2016),\n", + " ('gettin', 2016),\n", + " ('door', 2015),\n", + " ('plane', 2010),\n", + " ('return', 2003),\n", + " ('promis', 2001),\n", + " ('although', 2001),\n", + " ('hangov', 1994),\n", + " ('fire', 1992),\n", + " ('matter', 1990),\n", + " ('sell', 1990),\n", + " ('singl', 1986),\n", + " ('web', 1986),\n", + " ('arm', 1985),\n", + " ('cross', 1985),\n", + " ('sis', 1983),\n", + " ('puppi', 1982),\n", + " ('vega', 1981),\n", + " ('wife', 1981),\n", + " ('magic', 1979),\n", + " ('allow', 1976),\n", + " ('along', 1973),\n", + " ('pop', 1972),\n", + " ('sale', 1968),\n", + " ('fantast', 1961),\n", + " ('hahah', 1947),\n", + " ('countri', 1943),\n", + " ('fish', 1939),\n", + " ('fri', 1938),\n", + " ('clear', 1932),\n", + " ('alot', 1931),\n", + " ('dark', 1925),\n", + " ('group', 1924),\n", + " ('bug', 1917),\n", + " ('wat', 1915),\n", + " ('self', 1914),\n", + " ('ï', 1909),\n", + " ('bb', 1907),\n", + " ('hotel', 1902),\n", + " ('cooki', 1902),\n", + " ('ruin', 1900),\n", + " ('death', 1892),\n", + " ('track', 1888),\n", + " ('eh', 1888),\n", + " ('ahead', 1879),\n", + " ('act', 1875),\n", + " ('screen', 1874),\n", + " ('huh', 1866),\n", + " ('wast', 1864),\n", + " ('twit', 1860),\n", + " ('ohh', 1854),\n", + " ('hun', 1851),\n", + " ('gosh', 1849),\n", + " ('histori', 1849),\n", + " ('inde', 1849),\n", + " ('angel', 1848),\n", + " ('instal', 1847),\n", + " ('ff', 1846),\n", + " ('deserv', 1844),\n", + " ('perform', 1841),\n", + " ('nick', 1840),\n", + " ('buddi', 1839),\n", + " ('aint', 1835),\n", + " ('bird', 1832),\n", + " ('fml', 1824),\n", + " ('profil', 1824),\n", + " ('cough', 1817),\n", + " ('race', 1810),\n", + " ('low', 1808),\n", + " ('daddi', 1807),\n", + " ('vip', 1805),\n", + " ('dvd', 1802),\n", + " ('major', 1800),\n", + " ('chees', 1797),\n", + " ('nobodi', 1797),\n", + " ('fill', 1796),\n", + " ('heat', 1795),\n", + " ('yum', 1791),\n", + " ('street', 1789),\n", + " ('land', 1788),\n", + " ('sexi', 1787),\n", + " ('fat', 1786),\n", + " ('extra', 1786),\n", + " ('traffic', 1773),\n", + " ('bloodi', 1770),\n", + " ('gay', 1769),\n", + " ('troubl', 1766),\n", + " ('grow', 1766),\n", + " ('delet', 1765),\n", + " ('tweetdeck', 1759),\n", + " ('throw', 1759),\n", + " ('posit', 1754),\n", + " ('blood', 1749),\n", + " ('pc', 1746),\n", + " ('gut', 1746),\n", + " ('nose', 1744),\n", + " ('vid', 1742),\n", + " ('ran', 1732),\n", + " ('men', 1731),\n", + " ('nail', 1731),\n", + " ('prepar', 1731),\n", + " ('edit', 1730),\n", + " ('other', 1722),\n", + " ('recommend', 1717),\n", + " ('itun', 1717),\n", + " ('watchin', 1716),\n", + " ('step', 1711),\n", + " ('view', 1709),\n", + " ('taylor', 1709),\n", + " ('somewher', 1707),\n", + " ('awwww', 1707),\n", + " ('mall', 1706),\n", + " ('rd', 1705),\n", + " ('direct', 1704),\n", + " ('coz', 1703),\n", + " ('fam', 1702),\n", + " ('caught', 1702),\n", + " ('joe', 1702),\n", + " ('suggest', 1700),\n", + " ('inspir', 1699),\n", + " ('dunno', 1699),\n", + " ('chillin', 1699),\n", + " ('dang', 1698),\n", + " ('result', 1697),\n", + " ('shine', 1697),\n", + " ('info', 1693),\n", + " ('bill', 1692),\n", + " ('shut', 1692),\n", + " ('market', 1691),\n", + " ('ooh', 1690),\n", + " ('def', 1686),\n", + " ('fav', 1682),\n", + " ('fever', 1682),\n", + " ('mark', 1681),\n", + " ('nyc', 1680),\n", + " ('anim', 1680),\n", + " ('report', 1675),\n", + " ('stick', 1674),\n", + " ('jon', 1672),\n", + " ('blah', 1670),\n", + " ('sky', 1669),\n", + " ('congratul', 1667),\n", + " ...]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "top_words" ] }, { @@ -171,7 +1343,20 @@ "metadata": {}, "outputs": [], "source": [ - "# your code here" + "from nltk.tokenize import word_tokenize\n", + "\n", + "# top 5000 words\n", + "top_words = [w[0] for w in word_features[:5000]]\n", + "\n", + "# list of tuples, each containing a dictionary of word features and the label\n", + "features = []\n", + "for tweet, label in zip(tweets, labels):\n", + " # tokenize tweet\n", + " words = set(word_tokenize(tweet.lower()))\n", + " # create dictionary of word features\n", + " word_features = {word: (word in words) for word in top_words}\n", + " # append to list of features\n", + " features.append((word_features, label))" ] }, { @@ -210,11 +1395,21 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "# your code here\n", + "import nltk\n", + "\n", + "# Split the feature set into training and test sets\n", + "train_set, test_set = featuresets[:800], featuresets[800:]\n", + "\n", + "# Create and train a Bayes classifier instance\n", + "classifier = nltk.NaiveBayesClassifier.train(train_set)\n", + "\n", + "# Inspect the most important features\n", + "classifier.show_most_informative_features(10)" ] }, { @@ -234,7 +1429,9 @@ "metadata": {}, "outputs": [], "source": [ - "# your code here" + "# your code here\n", + "accuracy = nltk.classify.accuracy(classifier, test_set)\n", + "print(\"Accuracy:\", accuracy)" ] }, { @@ -312,7 +1509,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.10.7" + }, + "vscode": { + "interpreter": { + "hash": "721db305ef1fd1fc91cdf20e400af694a949fe540ac5f48c160f31c7e384879d" + } } }, "nbformat": 4,