diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb
index 0808166..450f6de 100644
--- a/your-code/challenge-1.ipynb
+++ b/your-code/challenge-1.ipynb
@@ -66,20 +66,46 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 69,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "from nltk.tokenize import word_tokenize"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"def clean_up(s):\n",
- " \"\"\"\n",
- " Cleans up numbers, URLs, and special characters from a string.\n",
- "\n",
- " Args:\n",
- " s: The string to be cleaned up.\n",
- "\n",
- " Returns:\n",
- " A string that has been cleaned up.\n",
- " \"\"\""
+ " s = re.sub(r'http\\S+', ' ', s)\n",
+ " s = re.sub(r'[^\\w\\s]+|\\d+', ' ', s)\n",
+ " s = s.strip()\n",
+ " s = re.sub(r'\\s+', ' ', s)\n",
+ " \n",
+ " return s.lower()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ironhack s q website is\n"
+ ]
+ }
+ ],
+ "source": [
+ "s = \"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\\\")\"\n",
+ "clean_sentence = clean_up(s)\n",
+ "print(clean_sentence)"
]
},
{
@@ -101,20 +127,40 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 72,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from nltk.tokenize import word_tokenize"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"def tokenize(s):\n",
- " \"\"\"\n",
- " Tokenize a string.\n",
- "\n",
- " Args:\n",
- " s: String to be tokenized.\n",
- "\n",
- " Returns:\n",
- " A list of words as the result of tokenization.\n",
- " \"\"\""
+ " return word_tokenize(s)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['ironhack', 's', 'q', 'website', 'is']\n"
+ ]
+ }
+ ],
+ "source": [
+ "text = 'ironhack s q website is'\n",
+ "tokens = tokenize(text)\n",
+ "print(tokens)"
]
},
{
@@ -145,20 +191,42 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 75,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from nltk.stem import SnowballStemmer, WordNetLemmatizer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"def stem_and_lemmatize(l):\n",
- " \"\"\"\n",
- " Perform stemming and lemmatization on a list of words.\n",
- "\n",
- " Args:\n",
- " l: A list of strings.\n",
- "\n",
- " Returns:\n",
- " A list of strings after being stemmed and lemmatized.\n",
- " \"\"\""
+ " stemmer = SnowballStemmer('english')\n",
+ " lemmatizer = WordNetLemmatizer()\n",
+ " stems = [stemmer.stem(l) for l in l]\n",
+ " lemmas = [lemmatizer.lemmatize(l) for l in l]\n",
+ " return stems, lemmas"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(['@', 'i', 'r', 'o', 'n', 'h', 'a', 'c', 'k', \"'\", 's', '-', '#', 'q', ' ', 'w', 'e', 'b', 's', 'i', 't', 'e', ' ', '7', '7', '6', '-', 'i', 's', ' ', 'h', 't', 't', 'p', ':', '/', '/', 'i', 'r', 'o', 'n', 'h', 'a', 'c', 'k', '.', 'c', 'o', 'm', ' ', '[', '(', '2', '0', '1', '8', ')', ']', '\"', ')'], ['@', 'I', 'r', 'o', 'n', 'h', 'a', 'c', 'k', \"'\", 's', '-', '#', 'Q', ' ', 'w', 'e', 'b', 's', 'i', 't', 'e', ' ', '7', '7', '6', '-', 'i', 's', ' ', 'h', 't', 't', 'p', ':', '/', '/', 'i', 'r', 'o', 'n', 'h', 'a', 'c', 'k', '.', 'c', 'o', 'm', ' ', '[', '(', '2', '0', '1', '8', ')', ']', '\"', ')'])\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(stem_and_lemmatize(s))"
]
},
{
@@ -176,20 +244,40 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 78,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from nltk.corpus import stopwords"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"def remove_stopwords(l):\n",
- " \"\"\"\n",
- " Remove English stopwords from a list of strings.\n",
- "\n",
- " Args:\n",
- " l: A list of strings.\n",
- "\n",
- " Returns:\n",
- " A list of strings after stop words are removed.\n",
- " \"\"\""
+ " stop_words = set(stopwords.words('english'))\n",
+ " filtered_words = [l for l in l if l not in stop_words]\n",
+ " return filtered_words"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['@', 'I', 'r', 'n', 'h', 'c', 'k', \"'\", '-', '#', 'Q', ' ', 'w', 'e', 'b', 'e', ' ', '7', '7', '6', '-', ' ', 'h', 'p', ':', '/', '/', 'r', 'n', 'h', 'c', 'k', '.', 'c', ' ', '[', '(', '2', '0', '1', '8', ')', ']', '\"', ')']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(remove_stopwords(s))"
]
},
{
@@ -218,7 +306,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.3"
+ "version": "3.10.7"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "721db305ef1fd1fc91cdf20e400af694a949fe540ac5f48c160f31c7e384879d"
+ }
}
},
"nbformat": 4,
diff --git a/your-code/challenge-2.ipynb b/your-code/challenge-2.ipynb
index 6b0e116..51c32f1 100644
--- a/your-code/challenge-2.ipynb
+++ b/your-code/challenge-2.ipynb
@@ -18,8 +18,8 @@
"\n",
"```python\n",
">>> from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
- ">>> txt = \"Ironhack is a Global Tech School ranked num 2 worldwide.
",
- "
",
+ ">>> txt = \"Ironhack is a Global Tech School ranked num 2 worldwide.
\n",
+ "
\n",
"Our mission is to help people transform their careers and join a thriving community of tech professionals that love what they do.\"\n",
">>> analyzer = SentimentIntensityAnalyzer()\n",
">>> analyzer.polarity_scores(txt)\n",
@@ -46,11 +46,116 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " target | \n",
+ " id | \n",
+ " date | \n",
+ " flag | \n",
+ " user | \n",
+ " text | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 1467810369 | \n",
+ " Mon Apr 06 22:19:45 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " _TheSpecialOne_ | \n",
+ " @switchfoot http://twitpic.com/2y1zl - Awww, t... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0 | \n",
+ " 1467810672 | \n",
+ " Mon Apr 06 22:19:49 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " scotthamilton | \n",
+ " is upset that he can't update his Facebook by ... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0 | \n",
+ " 1467810917 | \n",
+ " Mon Apr 06 22:19:53 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " mattycus | \n",
+ " @Kenichan I dived many times for the ball. Man... | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0 | \n",
+ " 1467811184 | \n",
+ " Mon Apr 06 22:19:57 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " ElleCTF | \n",
+ " my whole body feels itchy and like its on fire | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0 | \n",
+ " 1467811193 | \n",
+ " Mon Apr 06 22:19:57 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " Karoli | \n",
+ " @nationwideclass no, it's not behaving at all.... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " target id date flag \\\n",
+ "0 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY \n",
+ "1 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY \n",
+ "2 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY \n",
+ "3 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n",
+ "4 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n",
+ "\n",
+ " user text \n",
+ "0 _TheSpecialOne_ @switchfoot http://twitpic.com/2y1zl - Awww, t... \n",
+ "1 scotthamilton is upset that he can't update his Facebook by ... \n",
+ "2 mattycus @Kenichan I dived many times for the ball. Man... \n",
+ "3 ElleCTF my whole body feels itchy and like its on fire \n",
+ "4 Karoli @nationwideclass no, it's not behaving at all.... "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# your code here"
+ "import pandas as pd\n",
+ "df = pd.read_csv('noemoticon.csv',encoding='latin-1', header=None)\n",
+ "df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']\n",
+ "df.head()\n",
+ "\n",
+ "# read_csv raised an error with its default arguments, so encoding and header are passed explicitly."
]
},
{
@@ -76,11 +181,54 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+ "import re\n",
+ "from nltk.tokenize import word_tokenize\n",
+ "from nltk.stem.snowball import SnowballStemmer\n",
+ "from nltk.stem.wordnet import WordNetLemmatizer\n",
+ "from nltk.corpus import stopwords"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def process_text(s):\n",
+ " # Clean up the text\n",
+ " s = re.sub(r'http\\S+', ' ', s)\n",
+ " s = re.sub(r'[^\\w\\s]+|\\d+', ' ', s)\n",
+ " s = s.strip()\n",
+ " s = re.sub(r'\\s+', ' ', s)\n",
+ " s = s.lower()\n",
+ "\n",
+ " # Tokenize the text\n",
+ " tokens = word_tokenize(s)\n",
+ "\n",
+ " # Remove stop words\n",
+ " stop_words = set(stopwords.words('english'))\n",
+ " filtered_words = [w for w in tokens if not w in stop_words]\n",
+ "\n",
+ " # Stem and lemmatize the words\n",
+ " stemmer = SnowballStemmer('english')\n",
+ " lemmatizer = WordNetLemmatizer()\n",
+ " stems = [stemmer.stem(w) for w in filtered_words]\n",
+ " lemmas = [lemmatizer.lemmatize(w) for w in filtered_words]\n",
+ "\n",
+ " return stems, lemmas"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['text_processed'] = df['text'].apply(process_text).apply(lambda x: x[0]) # or x[1] for lemmas"
]
},
{
@@ -102,7 +250,1031 @@
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+ "from nltk import FreqDist\n",
+ "\n",
+ "all_words = [word for text in df['text_processed'] for word in text]\n",
+ "freq_dist = FreqDist(all_words)\n",
+ "top_words = freq_dist.most_common(5000)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[('go', 138687),\n",
+ " ('get', 110838),\n",
+ " ('day', 109146),\n",
+ " ('good', 92565),\n",
+ " ('work', 87870),\n",
+ " ('like', 83831),\n",
+ " ('love', 82663),\n",
+ " ('quot', 73383),\n",
+ " ('got', 71107),\n",
+ " ('today', 68697),\n",
+ " ('time', 66391),\n",
+ " ('lol', 59472),\n",
+ " ('thank', 59434),\n",
+ " ('back', 57401),\n",
+ " ('one', 57368),\n",
+ " ('want', 57339),\n",
+ " ('miss', 56915),\n",
+ " ('u', 56586),\n",
+ " ('know', 54960),\n",
+ " ('see', 51368),\n",
+ " ('feel', 51266),\n",
+ " ('think', 51190),\n",
+ " ('im', 50695),\n",
+ " ('realli', 50076),\n",
+ " ('amp', 48767),\n",
+ " ('night', 45850),\n",
+ " ('hope', 44935),\n",
+ " ('watch', 43639),\n",
+ " ('still', 43618),\n",
+ " ('need', 43438),\n",
+ " ('make', 43217),\n",
+ " ('well', 42947),\n",
+ " ('new', 42452),\n",
+ " ('na', 42081),\n",
+ " ('home', 40829),\n",
+ " ('oh', 39946),\n",
+ " ('look', 39752),\n",
+ " ('come', 39117),\n",
+ " ('much', 37149),\n",
+ " ('last', 36279),\n",
+ " ('twitter', 36166),\n",
+ " ('morn', 35723),\n",
+ " ('tomorrow', 34328),\n",
+ " ('wish', 34038),\n",
+ " ('great', 33853),\n",
+ " ('wait', 32603),\n",
+ " ('sad', 32527),\n",
+ " ('sleep', 32458),\n",
+ " ('haha', 31533),\n",
+ " ('bad', 28835),\n",
+ " ('fun', 28719),\n",
+ " ('week', 28612),\n",
+ " ('tri', 28245),\n",
+ " ('right', 28131),\n",
+ " ('follow', 28039),\n",
+ " ('happi', 27720),\n",
+ " ('would', 27330),\n",
+ " ('friend', 26606),\n",
+ " ('thing', 26519),\n",
+ " ('sorri', 26429),\n",
+ " ('tonight', 26182),\n",
+ " ('say', 25568),\n",
+ " ('way', 25205),\n",
+ " ('take', 24740),\n",
+ " ('gon', 24096),\n",
+ " ('nice', 24083),\n",
+ " ('though', 24017),\n",
+ " ('better', 23263),\n",
+ " ('hate', 23019),\n",
+ " ('even', 22879),\n",
+ " ('yeah', 22478),\n",
+ " ('bed', 22430),\n",
+ " ('tweet', 22369),\n",
+ " ('could', 21928),\n",
+ " ('start', 21798),\n",
+ " ('school', 21078),\n",
+ " ('hour', 21063),\n",
+ " ('peopl', 20982),\n",
+ " ('show', 20770),\n",
+ " ('guy', 19702),\n",
+ " ('play', 19688),\n",
+ " ('weekend', 19616),\n",
+ " ('hey', 19112),\n",
+ " ('final', 18962),\n",
+ " ('awesom', 18777),\n",
+ " ('yes', 18693),\n",
+ " ('next', 18669),\n",
+ " ('let', 18599),\n",
+ " ('lt', 18562),\n",
+ " ('use', 18509),\n",
+ " ('dont', 18235),\n",
+ " ('never', 17945),\n",
+ " ('soon', 17857),\n",
+ " ('cant', 17684),\n",
+ " ('tire', 17371),\n",
+ " ('long', 17247),\n",
+ " ('rain', 17199),\n",
+ " ('pleas', 17170),\n",
+ " ('littl', 16978),\n",
+ " ('first', 16845),\n",
+ " ('life', 16821),\n",
+ " ('year', 16817),\n",
+ " ('everyon', 16801),\n",
+ " ('wan', 16751),\n",
+ " ('movi', 16632),\n",
+ " ('x', 16563),\n",
+ " ('best', 16512),\n",
+ " ('sick', 16473),\n",
+ " ('ok', 16232),\n",
+ " ('girl', 15877),\n",
+ " ('find', 15798),\n",
+ " ('call', 15626),\n",
+ " ('suck', 15521),\n",
+ " ('sure', 15465),\n",
+ " ('done', 15365),\n",
+ " ('help', 15336),\n",
+ " ('bore', 15314),\n",
+ " ('head', 15246),\n",
+ " ('alway', 14971),\n",
+ " ('talk', 14926),\n",
+ " ('keep', 14792),\n",
+ " ('alreadi', 14773),\n",
+ " ('cool', 14731),\n",
+ " ('lot', 14582),\n",
+ " ('anoth', 14576),\n",
+ " ('live', 14573),\n",
+ " ('someth', 14481),\n",
+ " ('us', 14465),\n",
+ " ('eat', 14360),\n",
+ " ('phone', 14348),\n",
+ " ('man', 14267),\n",
+ " ('leav', 14186),\n",
+ " ('read', 14155),\n",
+ " ('hurt', 14041),\n",
+ " ('readi', 14001),\n",
+ " ('made', 13854),\n",
+ " ('yay', 13796),\n",
+ " ('enjoy', 13746),\n",
+ " ('song', 13481),\n",
+ " ('hous', 13432),\n",
+ " ('yet', 13398),\n",
+ " ('went', 13308),\n",
+ " ('ur', 13304),\n",
+ " ('ever', 13155),\n",
+ " ('n', 13070),\n",
+ " ('sound', 12856),\n",
+ " ('thought', 12754),\n",
+ " ('pretti', 12720),\n",
+ " ('mayb', 12681),\n",
+ " ('amaz', 12511),\n",
+ " ('excit', 12463),\n",
+ " ('away', 12315),\n",
+ " ('summer', 12289),\n",
+ " ('game', 12276),\n",
+ " ('finish', 12262),\n",
+ " ('omg', 12231),\n",
+ " ('old', 12185),\n",
+ " ('tell', 12170),\n",
+ " ('guess', 12169),\n",
+ " ('damn', 11997),\n",
+ " ('mean', 11897),\n",
+ " ('listen', 11840),\n",
+ " ('earli', 11830),\n",
+ " ('someon', 11740),\n",
+ " ('check', 11588),\n",
+ " ('bit', 11543),\n",
+ " ('babi', 11525),\n",
+ " ('left', 11507),\n",
+ " ('lost', 11479),\n",
+ " ('give', 11477),\n",
+ " ('end', 11181),\n",
+ " ('big', 11180),\n",
+ " ('hot', 11160),\n",
+ " ('wow', 11159),\n",
+ " ('parti', 11157),\n",
+ " ('late', 11148),\n",
+ " ('noth', 11087),\n",
+ " ('hear', 11074),\n",
+ " ('w', 10944),\n",
+ " ('ya', 10911),\n",
+ " ('b', 10729),\n",
+ " ('glad', 10633),\n",
+ " ('actual', 10609),\n",
+ " ('pic', 10595),\n",
+ " ('birthday', 10583),\n",
+ " ('happen', 10538),\n",
+ " ('hard', 10528),\n",
+ " ('sun', 10455),\n",
+ " ('stop', 10454),\n",
+ " ('also', 10439),\n",
+ " ('weather', 10392),\n",
+ " ('later', 10386),\n",
+ " ('two', 10312),\n",
+ " ('mom', 10205),\n",
+ " ('wonder', 10185),\n",
+ " ('stuff', 10133),\n",
+ " ('ugh', 10059),\n",
+ " ('put', 10054),\n",
+ " ('ta', 9991),\n",
+ " ('saw', 9923),\n",
+ " ('run', 9897),\n",
+ " ('god', 9825),\n",
+ " ('exam', 9822),\n",
+ " ('fuck', 9783),\n",
+ " ('stay', 9777),\n",
+ " ('car', 9772),\n",
+ " ('might', 9767),\n",
+ " ('th', 9726),\n",
+ " ('music', 9693),\n",
+ " ('world', 9661),\n",
+ " ('yesterday', 9650),\n",
+ " ('kid', 9598),\n",
+ " ('said', 9595),\n",
+ " ('that', 9532),\n",
+ " ('r', 9490),\n",
+ " ('meet', 9489),\n",
+ " ('sinc', 9462),\n",
+ " ('hi', 9384),\n",
+ " ('job', 9375),\n",
+ " ('post', 9287),\n",
+ " ('beauti', 9280),\n",
+ " ('updat', 9253),\n",
+ " ('sunday', 9216),\n",
+ " ('friday', 9151),\n",
+ " ('monday', 9088),\n",
+ " ('around', 9087),\n",
+ " ('video', 9057),\n",
+ " ('mani', 9031),\n",
+ " ('seem', 8992),\n",
+ " ('com', 8885),\n",
+ " ('cold', 8803),\n",
+ " ('luck', 8715),\n",
+ " ('found', 8691),\n",
+ " ('must', 8688),\n",
+ " ('poor', 8682),\n",
+ " ('cri', 8638),\n",
+ " ('book', 8590),\n",
+ " ('move', 8569),\n",
+ " ('die', 8520),\n",
+ " ('aww', 8473),\n",
+ " ('busi', 8441),\n",
+ " ('boy', 8425),\n",
+ " ('gone', 8362),\n",
+ " ('may', 8350),\n",
+ " ('buy', 8204),\n",
+ " ('shop', 8154),\n",
+ " ('famili', 8153),\n",
+ " ('anyth', 8150),\n",
+ " ('plan', 8085),\n",
+ " ('studi', 8074),\n",
+ " ('woke', 8032),\n",
+ " ('least', 8028),\n",
+ " ('hair', 8004),\n",
+ " ('food', 7998),\n",
+ " ('total', 7990),\n",
+ " ('month', 7990),\n",
+ " ('okay', 7982),\n",
+ " ('iphon', 7947),\n",
+ " ('till', 7940),\n",
+ " ('cute', 7933),\n",
+ " ('lunch', 7880),\n",
+ " ('almost', 7876),\n",
+ " ('free', 7854),\n",
+ " ('tho', 7848),\n",
+ " ('win', 7772),\n",
+ " ('sweet', 7730),\n",
+ " ('far', 7705),\n",
+ " ('believ', 7694),\n",
+ " ('drink', 7678),\n",
+ " ('dinner', 7673),\n",
+ " ('pictur', 7664),\n",
+ " ('caus', 7652),\n",
+ " ('chang', 7628),\n",
+ " ('place', 7626),\n",
+ " ('funni', 7623),\n",
+ " ('everyth', 7623),\n",
+ " ('class', 7588),\n",
+ " ('shit', 7575),\n",
+ " ('welcom', 7572),\n",
+ " ('p', 7460),\n",
+ " ('gt', 7451),\n",
+ " ('anyon', 7439),\n",
+ " ('drive', 7417),\n",
+ " ('forward', 7341),\n",
+ " ('turn', 7335),\n",
+ " ('sit', 7312),\n",
+ " ('mine', 7298),\n",
+ " ('without', 7232),\n",
+ " ('walk', 7222),\n",
+ " ('ask', 7220),\n",
+ " ('real', 7148),\n",
+ " ('name', 7142),\n",
+ " ('everi', 7080),\n",
+ " ('dream', 7050),\n",
+ " ('write', 7036),\n",
+ " ('stupid', 7019),\n",
+ " ('idea', 6985),\n",
+ " ('dad', 6973),\n",
+ " ('hahaha', 6934),\n",
+ " ('send', 6933),\n",
+ " ('outsid', 6929),\n",
+ " ('ill', 6896),\n",
+ " ('clean', 6895),\n",
+ " ('coffe', 6881),\n",
+ " ('enough', 6815),\n",
+ " ('room', 6747),\n",
+ " ('wrong', 6711),\n",
+ " ('fan', 6670),\n",
+ " ('anymor', 6666),\n",
+ " ('wake', 6655),\n",
+ " ('dog', 6650),\n",
+ " ('didnt', 6614),\n",
+ " ('probabl', 6602),\n",
+ " ('saturday', 6521),\n",
+ " ('ha', 6469),\n",
+ " ('tv', 6434),\n",
+ " ('c', 6425),\n",
+ " ('money', 6409),\n",
+ " ('minut', 6408),\n",
+ " ('repli', 6317),\n",
+ " ('person', 6288),\n",
+ " ('xx', 6254),\n",
+ " ('eye', 6245),\n",
+ " ('break', 6242),\n",
+ " ('sooo', 6237),\n",
+ " ('face', 6231),\n",
+ " ('serious', 6230),\n",
+ " ('rememb', 6178),\n",
+ " ('headach', 6171),\n",
+ " ('hit', 6158),\n",
+ " ('aw', 6151),\n",
+ " ('rock', 6139),\n",
+ " ('brother', 6124),\n",
+ " ('fail', 6069),\n",
+ " ('blog', 6048),\n",
+ " ('beach', 6025),\n",
+ " ('train', 6010),\n",
+ " ('came', 6007),\n",
+ " ('whole', 6001),\n",
+ " ('hang', 5986),\n",
+ " ('seen', 5983),\n",
+ " ('crazi', 5977),\n",
+ " ('kinda', 5975),\n",
+ " ('open', 5968),\n",
+ " ('mother', 5942),\n",
+ " ('pain', 5922),\n",
+ " ('rest', 5920),\n",
+ " ('kill', 5898),\n",
+ " ('â', 5897),\n",
+ " ('close', 5878),\n",
+ " ('super', 5819),\n",
+ " ('word', 5818),\n",
+ " ('comput', 5767),\n",
+ " ('care', 5742),\n",
+ " ('quit', 5726),\n",
+ " ('text', 5724),\n",
+ " ('half', 5713),\n",
+ " ('took', 5711),\n",
+ " ('hell', 5683),\n",
+ " ('hello', 5668),\n",
+ " ('awww', 5654),\n",
+ " ('news', 5644),\n",
+ " ('anyway', 5633),\n",
+ " ('true', 5609),\n",
+ " ('worri', 5605),\n",
+ " ('goodnight', 5549),\n",
+ " ('part', 5546),\n",
+ " ('pm', 5534),\n",
+ " ('e', 5512),\n",
+ " ('heart', 5497),\n",
+ " ('abl', 5475),\n",
+ " ('forgot', 5472),\n",
+ " ('problem', 5465),\n",
+ " ('trip', 5459),\n",
+ " ('els', 5458),\n",
+ " ('ago', 5421),\n",
+ " ('kind', 5405),\n",
+ " ('offic', 5404),\n",
+ " ('bring', 5401),\n",
+ " ('either', 5366),\n",
+ " ('mind', 5366),\n",
+ " ('photo', 5349),\n",
+ " ('full', 5340),\n",
+ " ('boo', 5320),\n",
+ " ('ah', 5293),\n",
+ " ('link', 5288),\n",
+ " ('danc', 5270),\n",
+ " ('ð', 5269),\n",
+ " ('pay', 5238),\n",
+ " ('soo', 5226),\n",
+ " ('hug', 5208),\n",
+ " ('sister', 5200),\n",
+ " ('ñ', 5178),\n",
+ " ('cuz', 5154),\n",
+ " ('alon', 5096),\n",
+ " ('internet', 5096),\n",
+ " ('hehe', 5067),\n",
+ " ('fall', 5042),\n",
+ " ('test', 5030),\n",
+ " ('btw', 5011),\n",
+ " ('stuck', 4981),\n",
+ " ('heard', 4961),\n",
+ " ('sometim', 4961),\n",
+ " ('cours', 4958),\n",
+ " ('email', 4957),\n",
+ " ('pick', 4951),\n",
+ " ('ticket', 4950),\n",
+ " ('st', 4937),\n",
+ " ('g', 4901),\n",
+ " ('site', 4899),\n",
+ " ('www', 4842),\n",
+ " ('set', 4834),\n",
+ " ('learn', 4827),\n",
+ " ('interest', 4775),\n",
+ " ('wont', 4772),\n",
+ " ('pass', 4768),\n",
+ " ('hand', 4766),\n",
+ " ('shower', 4750),\n",
+ " ('vote', 4740),\n",
+ " ('nite', 4732),\n",
+ " ('onlin', 4722),\n",
+ " ('concert', 4720),\n",
+ " ('add', 4713),\n",
+ " ('k', 4689),\n",
+ " ('season', 4671),\n",
+ " ('visit', 4663),\n",
+ " ('dude', 4662),\n",
+ " ('fine', 4651),\n",
+ " ('ice', 4644),\n",
+ " ('mileycyrus', 4626),\n",
+ " ('awak', 4619),\n",
+ " ('suppos', 4594),\n",
+ " ('breakfast', 4586),\n",
+ " ('fix', 4573),\n",
+ " ('facebook', 4571),\n",
+ " ('cat', 4536),\n",
+ " ('told', 4516),\n",
+ " ('favorit', 4503),\n",
+ " ('goe', 4490),\n",
+ " ('ass', 4482),\n",
+ " ('sunni', 4469),\n",
+ " ('wear', 4447),\n",
+ " ('catch', 4440),\n",
+ " ('pack', 4439),\n",
+ " ('til', 4437),\n",
+ " ('smile', 4432),\n",
+ " ('high', 4406),\n",
+ " ('broke', 4403),\n",
+ " ('lmao', 4382),\n",
+ " ('cut', 4381),\n",
+ " ('bought', 4363),\n",
+ " ('june', 4352),\n",
+ " ('spend', 4347),\n",
+ " ('lucki', 4319),\n",
+ " ('crap', 4298),\n",
+ " ('l', 4284),\n",
+ " ('mad', 4265),\n",
+ " ('la', 4256),\n",
+ " ('asleep', 4246),\n",
+ " ('afternoon', 4240),\n",
+ " ('hungri', 4236),\n",
+ " ('reason', 4223),\n",
+ " ('red', 4210),\n",
+ " ('ride', 4204),\n",
+ " ('min', 4172),\n",
+ " ('sign', 4170),\n",
+ " ('definit', 4162),\n",
+ " ('agre', 4157),\n",
+ " ('ladi', 4144),\n",
+ " ('laugh', 4140),\n",
+ " ('bye', 4108),\n",
+ " ('instead', 4089),\n",
+ " ('jealous', 4055),\n",
+ " ('short', 4042),\n",
+ " ('perfect', 4031),\n",
+ " ('yea', 4028),\n",
+ " ('xd', 4013),\n",
+ " ('stori', 3991),\n",
+ " ('page', 3991),\n",
+ " ('second', 3990),\n",
+ " ('nap', 3990),\n",
+ " ('top', 3988),\n",
+ " ('bout', 3983),\n",
+ " ('wed', 3979),\n",
+ " ('sore', 3977),\n",
+ " ('citi', 3975),\n",
+ " ('album', 3953),\n",
+ " ('sigh', 3945),\n",
+ " ('homework', 3935),\n",
+ " ('messag', 3926),\n",
+ " ('dead', 3921),\n",
+ " ('tommcfli', 3913),\n",
+ " ('graduat', 3913),\n",
+ " ('dear', 3912),\n",
+ " ('figur', 3904),\n",
+ " ('join', 3901),\n",
+ " ('sing', 3886),\n",
+ " ('list', 3866),\n",
+ " ('tour', 3862),\n",
+ " ('togeth', 3857),\n",
+ " ('date', 3856),\n",
+ " ('near', 3848),\n",
+ " ('youtub', 3843),\n",
+ " ('soooo', 3837),\n",
+ " ('congrat', 3811),\n",
+ " ('laptop', 3810),\n",
+ " ('holiday', 3809),\n",
+ " ('star', 3806),\n",
+ " ('park', 3802),\n",
+ " ('water', 3783),\n",
+ " ('award', 3780),\n",
+ " ('save', 3776),\n",
+ " ('store', 3776),\n",
+ " ('point', 3773),\n",
+ " ('coupl', 3752),\n",
+ " ('goin', 3749),\n",
+ " ('revis', 3734),\n",
+ " ('moment', 3730),\n",
+ " ('complet', 3721),\n",
+ " ('relax', 3702),\n",
+ " ('drop', 3699),\n",
+ " ('town', 3693),\n",
+ " ('line', 3666),\n",
+ " ('side', 3658),\n",
+ " ('download', 3647),\n",
+ " ('dress', 3647),\n",
+ " ('church', 3637),\n",
+ " ('order', 3629),\n",
+ " ('account', 3601),\n",
+ " ('cook', 3591),\n",
+ " ('annoy', 3583),\n",
+ " ('ipod', 3573),\n",
+ " ('tea', 3565),\n",
+ " ('share', 3563),\n",
+ " ('weird', 3560),\n",
+ " ('answer', 3558),\n",
+ " ('ppl', 3533),\n",
+ " ('offici', 3517),\n",
+ " ('cream', 3514),\n",
+ " ('less', 3512),\n",
+ " ('decid', 3495),\n",
+ " ('ddlovato', 3493),\n",
+ " ('ive', 3489),\n",
+ " ('gym', 3451),\n",
+ " ('lose', 3450),\n",
+ " ('scare', 3442),\n",
+ " ('forget', 3430),\n",
+ " ('f', 3422),\n",
+ " ('air', 3420),\n",
+ " ('mood', 3419),\n",
+ " ('lil', 3378),\n",
+ " ('realiz', 3370),\n",
+ " ('math', 3343),\n",
+ " ('unfortun', 3341),\n",
+ " ('chat', 3339),\n",
+ " ('fli', 3331),\n",
+ " ('english', 3329),\n",
+ " ('nd', 3324),\n",
+ " ('mum', 3318),\n",
+ " ('understand', 3313),\n",
+ " ('past', 3303),\n",
+ " ('fb', 3297),\n",
+ " ('chocol', 3295),\n",
+ " ('differ', 3278),\n",
+ " ('pool', 3276),\n",
+ " ('band', 3270),\n",
+ " ('usual', 3264),\n",
+ " ('comment', 3254),\n",
+ " ('ate', 3252),\n",
+ " ('episod', 3252),\n",
+ " ('fast', 3250),\n",
+ " ('ahh', 3215),\n",
+ " ('knew', 3210),\n",
+ " ('window', 3195),\n",
+ " ('upload', 3194),\n",
+ " ('kick', 3170),\n",
+ " ('worst', 3169),\n",
+ " ('london', 3152),\n",
+ " ('support', 3144),\n",
+ " ('broken', 3137),\n",
+ " ('chanc', 3132),\n",
+ " ('load', 3130),\n",
+ " ('horribl', 3128),\n",
+ " ('parent', 3126),\n",
+ " ('flight', 3126),\n",
+ " ('hmm', 3108),\n",
+ " ('black', 3106),\n",
+ " ('yep', 3093),\n",
+ " ('question', 3092),\n",
+ " ('throat', 3090),\n",
+ " ('cheer', 3089),\n",
+ " ('team', 3085),\n",
+ " ('three', 3083),\n",
+ " ('worth', 3083),\n",
+ " ('sat', 3070),\n",
+ " ('sleepi', 3051),\n",
+ " ('sunshin', 3041),\n",
+ " ('da', 3038),\n",
+ " ('upset', 3037),\n",
+ " ('card', 3032),\n",
+ " ('via', 3030),\n",
+ " ('special', 3022),\n",
+ " ('fair', 3012),\n",
+ " ('xxx', 3008),\n",
+ " ('mac', 3007),\n",
+ " ('bless', 3006),\n",
+ " ('depress', 3005),\n",
+ " ('shirt', 3001),\n",
+ " ('slow', 2996),\n",
+ " ('myspac', 2986),\n",
+ " ('em', 2974),\n",
+ " ('number', 2973),\n",
+ " ('ad', 2968),\n",
+ " ('beat', 2961),\n",
+ " ('leg', 2958),\n",
+ " ('sent', 2950),\n",
+ " ('green', 2950),\n",
+ " ('followfriday', 2949),\n",
+ " ('jona', 2939),\n",
+ " ('gave', 2927),\n",
+ " ('bet', 2910),\n",
+ " ('colleg', 2902),\n",
+ " ('sim', 2900),\n",
+ " ('record', 2889),\n",
+ " ('project', 2883),\n",
+ " ('appar', 2880),\n",
+ " ('paper', 2870),\n",
+ " ('cake', 2866),\n",
+ " ('tuesday', 2864),\n",
+ " ('moon', 2852),\n",
+ " ('app', 2845),\n",
+ " ('websit', 2845),\n",
+ " ('what', 2843),\n",
+ " ('finger', 2840),\n",
+ " ('beer', 2839),\n",
+ " ('vacat', 2838),\n",
+ " ('power', 2834),\n",
+ " ('blue', 2834),\n",
+ " ('warm', 2826),\n",
+ " ('film', 2815),\n",
+ " ('fell', 2814),\n",
+ " ('uk', 2812),\n",
+ " ('light', 2811),\n",
+ " ('garden', 2811),\n",
+ " ('wors', 2810),\n",
+ " ('easi', 2788),\n",
+ " ('possibl', 2777),\n",
+ " ('juli', 2776),\n",
+ " ('doesnt', 2769),\n",
+ " ('miley', 2768),\n",
+ " ('rather', 2763),\n",
+ " ('bodi', 2761),\n",
+ " ('longer', 2758),\n",
+ " ('bday', 2757),\n",
+ " ('nope', 2753),\n",
+ " ('mr', 2741),\n",
+ " ('flu', 2734),\n",
+ " ('shoe', 2732),\n",
+ " ('disappoint', 2725),\n",
+ " ('huge', 2723),\n",
+ " ('mess', 2721),\n",
+ " ('freak', 2717),\n",
+ " ('googl', 2715),\n",
+ " ('mtv', 2714),\n",
+ " ('wtf', 2710),\n",
+ " ('due', 2708),\n",
+ " ('absolut', 2700),\n",
+ " ('celebr', 2697),\n",
+ " ('spent', 2694),\n",
+ " ('safe', 2688),\n",
+ " ('chill', 2683),\n",
+ " ('plus', 2682),\n",
+ " ('bike', 2680),\n",
+ " ('lay', 2671),\n",
+ " ('shame', 2667),\n",
+ " ('voic', 2665),\n",
+ " ('cancel', 2662),\n",
+ " ('age', 2661),\n",
+ " ('burn', 2661),\n",
+ " ('lazi', 2652),\n",
+ " ('thx', 2647),\n",
+ " ('cousin', 2639),\n",
+ " ('white', 2633),\n",
+ " ('forev', 2630),\n",
+ " ('earlier', 2622),\n",
+ " ('stress', 2616),\n",
+ " ('ahhh', 2607),\n",
+ " ('stomach', 2603),\n",
+ " ('touch', 2601),\n",
+ " ('babe', 2595),\n",
+ " ('thursday', 2589),\n",
+ " ('hold', 2588),\n",
+ " ('swim', 2585),\n",
+ " ('remind', 2584),\n",
+ " ('quick', 2581),\n",
+ " ('david', 2564),\n",
+ " ('shot', 2560),\n",
+ " ('bus', 2559),\n",
+ " ('except', 2558),\n",
+ " ('idk', 2555),\n",
+ " ('especi', 2554),\n",
+ " ('camp', 2553),\n",
+ " ('lie', 2545),\n",
+ " ('manag', 2541),\n",
+ " ('son', 2540),\n",
+ " ('exact', 2540),\n",
+ " ('camera', 2539),\n",
+ " ('v', 2536),\n",
+ " ('slept', 2534),\n",
+ " ('box', 2531),\n",
+ " ('½', 2526),\n",
+ " ('appreci', 2525),\n",
+ " ('met', 2524),\n",
+ " ('boyfriend', 2523),\n",
+ " ('appl', 2523),\n",
+ " ('pray', 2521),\n",
+ " ('bum', 2519),\n",
+ " ('crash', 2511),\n",
+ " ('tom', 2505),\n",
+ " ('sort', 2497),\n",
+ " ('shoot', 2494),\n",
+ " ('surpris', 2493),\n",
+ " ('type', 2487),\n",
+ " ('current', 2485),\n",
+ " ('luv', 2478),\n",
+ " ('insid', 2477),\n",
+ " ('yummi', 2466),\n",
+ " ('hrs', 2460),\n",
+ " ('fight', 2456),\n",
+ " ('piss', 2455),\n",
+ " ('block', 2453),\n",
+ " ('present', 2431),\n",
+ " ('airport', 2428),\n",
+ " ('note', 2428),\n",
+ " ('father', 2426),\n",
+ " ('jonasbroth', 2421),\n",
+ " ('wit', 2416),\n",
+ " ('cover', 2415),\n",
+ " ('pizza', 2409),\n",
+ " ('case', 2407),\n",
+ " ('havent', 2407),\n",
+ " ('servic', 2407),\n",
+ " ('mail', 2403),\n",
+ " ('terribl', 2401),\n",
+ " ('club', 2399),\n",
+ " ('road', 2397),\n",
+ " ('bbq', 2395),\n",
+ " ('random', 2390),\n",
+ " ('confus', 2389),\n",
+ " ('arriv', 2386),\n",
+ " ('invit', 2384),\n",
+ " ('radio', 2379),\n",
+ " ('bitch', 2378),\n",
+ " ('hospit', 2373),\n",
+ " ('chicken', 2369),\n",
+ " ('meant', 2368),\n",
+ " ('expect', 2368),\n",
+ " ('small', 2358),\n",
+ " ('raini', 2350),\n",
+ " ('deal', 2346),\n",
+ " ('fit', 2345),\n",
+ " ('interview', 2341),\n",
+ " ('storm', 2341),\n",
+ " ('hubbi', 2336),\n",
+ " ('h', 2332),\n",
+ " ('tummi', 2330),\n",
+ " ('design', 2330),\n",
+ " ('cloth', 2329),\n",
+ " ('ps', 2326),\n",
+ " ('count', 2321),\n",
+ " ('tast', 2320),\n",
+ " ('dm', 2320),\n",
+ " ('doctor', 2310),\n",
+ " ('hill', 2305),\n",
+ " ('proud', 2294),\n",
+ " ('notic', 2292),\n",
+ " ('smell', 2290),\n",
+ " ('twilight', 2287),\n",
+ " ('laker', 2279),\n",
+ " ('lone', 2270),\n",
+ " ('addict', 2270),\n",
+ " ('felt', 2269),\n",
+ " ('cup', 2269),\n",
+ " ('mention', 2268),\n",
+ " ('speak', 2259),\n",
+ " ('stand', 2258),\n",
+ " ('shall', 2255),\n",
+ " ('wine', 2253),\n",
+ " ('alright', 2247),\n",
+ " ('begin', 2240),\n",
+ " ('search', 2235),\n",
+ " ('goodby', 2235),\n",
+ " ('cd', 2233),\n",
+ " ('peac', 2230),\n",
+ " ('yup', 2224),\n",
+ " ('ach', 2216),\n",
+ " ('fact', 2215),\n",
+ " ('issu', 2211),\n",
+ " ('gorgeous', 2211),\n",
+ " ('product', 2206),\n",
+ " ('bag', 2205),\n",
+ " ('lame', 2203),\n",
+ " ('practic', 2201),\n",
+ " ('wednesday', 2200),\n",
+ " ('yo', 2199),\n",
+ " ('wash', 2199),\n",
+ " ('pull', 2194),\n",
+ " ('woo', 2184),\n",
+ " ('j', 2183),\n",
+ " ('feet', 2176),\n",
+ " ('connect', 2170),\n",
+ " ('hmmm', 2163),\n",
+ " ('front', 2161),\n",
+ " ('kiss', 2159),\n",
+ " ('pink', 2156),\n",
+ " ('glass', 2154),\n",
+ " ('bar', 2153),\n",
+ " ('tan', 2150),\n",
+ " ('roll', 2148),\n",
+ " ('tear', 2146),\n",
+ " ('whatev', 2141),\n",
+ " ('compani', 2140),\n",
+ " ('cos', 2134),\n",
+ " ('bro', 2130),\n",
+ " ('taken', 2130),\n",
+ " ('ouch', 2130),\n",
+ " ('xoxo', 2124),\n",
+ " ('french', 2122),\n",
+ " ('apart', 2114),\n",
+ " ('scari', 2111),\n",
+ " ('state', 2105),\n",
+ " ('joke', 2104),\n",
+ " ('ball', 2098),\n",
+ " ('exhaust', 2094),\n",
+ " ('event', 2092),\n",
+ " ('memori', 2086),\n",
+ " ('drunk', 2086),\n",
+ " ('becom', 2080),\n",
+ " ('mile', 2079),\n",
+ " ('paint', 2078),\n",
+ " ('normal', 2074),\n",
+ " ('ear', 2072),\n",
+ " ('everybodi', 2067),\n",
+ " ('daughter', 2063),\n",
+ " ('jus', 2062),\n",
+ " ('mommi', 2055),\n",
+ " ('guitar', 2051),\n",
+ " ('à', 2051),\n",
+ " ('round', 2050),\n",
+ " ('isnt', 2050),\n",
+ " ('mate', 2043),\n",
+ " ('behind', 2040),\n",
+ " ('version', 2038),\n",
+ " ('prob', 2035),\n",
+ " ('sold', 2026),\n",
+ " ('travel', 2025),\n",
+ " ('rip', 2023),\n",
+ " ('releas', 2020),\n",
+ " ('art', 2016),\n",
+ " ('gettin', 2016),\n",
+ " ('door', 2015),\n",
+ " ('plane', 2010),\n",
+ " ('return', 2003),\n",
+ " ('promis', 2001),\n",
+ " ('although', 2001),\n",
+ " ('hangov', 1994),\n",
+ " ('fire', 1992),\n",
+ " ('matter', 1990),\n",
+ " ('sell', 1990),\n",
+ " ('singl', 1986),\n",
+ " ('web', 1986),\n",
+ " ('arm', 1985),\n",
+ " ('cross', 1985),\n",
+ " ('sis', 1983),\n",
+ " ('puppi', 1982),\n",
+ " ('vega', 1981),\n",
+ " ('wife', 1981),\n",
+ " ('magic', 1979),\n",
+ " ('allow', 1976),\n",
+ " ('along', 1973),\n",
+ " ('pop', 1972),\n",
+ " ('sale', 1968),\n",
+ " ('fantast', 1961),\n",
+ " ('hahah', 1947),\n",
+ " ('countri', 1943),\n",
+ " ('fish', 1939),\n",
+ " ('fri', 1938),\n",
+ " ('clear', 1932),\n",
+ " ('alot', 1931),\n",
+ " ('dark', 1925),\n",
+ " ('group', 1924),\n",
+ " ('bug', 1917),\n",
+ " ('wat', 1915),\n",
+ " ('self', 1914),\n",
+ " ('ï', 1909),\n",
+ " ('bb', 1907),\n",
+ " ('hotel', 1902),\n",
+ " ('cooki', 1902),\n",
+ " ('ruin', 1900),\n",
+ " ('death', 1892),\n",
+ " ('track', 1888),\n",
+ " ('eh', 1888),\n",
+ " ('ahead', 1879),\n",
+ " ('act', 1875),\n",
+ " ('screen', 1874),\n",
+ " ('huh', 1866),\n",
+ " ('wast', 1864),\n",
+ " ('twit', 1860),\n",
+ " ('ohh', 1854),\n",
+ " ('hun', 1851),\n",
+ " ('gosh', 1849),\n",
+ " ('histori', 1849),\n",
+ " ('inde', 1849),\n",
+ " ('angel', 1848),\n",
+ " ('instal', 1847),\n",
+ " ('ff', 1846),\n",
+ " ('deserv', 1844),\n",
+ " ('perform', 1841),\n",
+ " ('nick', 1840),\n",
+ " ('buddi', 1839),\n",
+ " ('aint', 1835),\n",
+ " ('bird', 1832),\n",
+ " ('fml', 1824),\n",
+ " ('profil', 1824),\n",
+ " ('cough', 1817),\n",
+ " ('race', 1810),\n",
+ " ('low', 1808),\n",
+ " ('daddi', 1807),\n",
+ " ('vip', 1805),\n",
+ " ('dvd', 1802),\n",
+ " ('major', 1800),\n",
+ " ('chees', 1797),\n",
+ " ('nobodi', 1797),\n",
+ " ('fill', 1796),\n",
+ " ('heat', 1795),\n",
+ " ('yum', 1791),\n",
+ " ('street', 1789),\n",
+ " ('land', 1788),\n",
+ " ('sexi', 1787),\n",
+ " ('fat', 1786),\n",
+ " ('extra', 1786),\n",
+ " ('traffic', 1773),\n",
+ " ('bloodi', 1770),\n",
+ " ('gay', 1769),\n",
+ " ('troubl', 1766),\n",
+ " ('grow', 1766),\n",
+ " ('delet', 1765),\n",
+ " ('tweetdeck', 1759),\n",
+ " ('throw', 1759),\n",
+ " ('posit', 1754),\n",
+ " ('blood', 1749),\n",
+ " ('pc', 1746),\n",
+ " ('gut', 1746),\n",
+ " ('nose', 1744),\n",
+ " ('vid', 1742),\n",
+ " ('ran', 1732),\n",
+ " ('men', 1731),\n",
+ " ('nail', 1731),\n",
+ " ('prepar', 1731),\n",
+ " ('edit', 1730),\n",
+ " ('other', 1722),\n",
+ " ('recommend', 1717),\n",
+ " ('itun', 1717),\n",
+ " ('watchin', 1716),\n",
+ " ('step', 1711),\n",
+ " ('view', 1709),\n",
+ " ('taylor', 1709),\n",
+ " ('somewher', 1707),\n",
+ " ('awwww', 1707),\n",
+ " ('mall', 1706),\n",
+ " ('rd', 1705),\n",
+ " ('direct', 1704),\n",
+ " ('coz', 1703),\n",
+ " ('fam', 1702),\n",
+ " ('caught', 1702),\n",
+ " ('joe', 1702),\n",
+ " ('suggest', 1700),\n",
+ " ('inspir', 1699),\n",
+ " ('dunno', 1699),\n",
+ " ('chillin', 1699),\n",
+ " ('dang', 1698),\n",
+ " ('result', 1697),\n",
+ " ('shine', 1697),\n",
+ " ('info', 1693),\n",
+ " ('bill', 1692),\n",
+ " ('shut', 1692),\n",
+ " ('market', 1691),\n",
+ " ('ooh', 1690),\n",
+ " ('def', 1686),\n",
+ " ('fav', 1682),\n",
+ " ('fever', 1682),\n",
+ " ('mark', 1681),\n",
+ " ('nyc', 1680),\n",
+ " ('anim', 1680),\n",
+ " ('report', 1675),\n",
+ " ('stick', 1674),\n",
+ " ('jon', 1672),\n",
+ " ('blah', 1670),\n",
+ " ('sky', 1669),\n",
+ " ('congratul', 1667),\n",
+ " ...]"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "top_words"
]
},
{
@@ -171,7 +1343,20 @@
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+ "from nltk.tokenize import word_tokenize\n",
+ "\n",
+ "# the 5000 most frequent words (top_words holds (word, count) pairs from FreqDist.most_common)\n",
+ "feature_words = [word for word, _ in top_words[:5000]]\n",
+ "\n",
+ "# list of tuples, each containing a dictionary of word features and the label\n",
+ "features = []\n",
+ "for tokens, label in zip(df['text_processed'], df['target']):\n",
+ " # text_processed already holds the stemmed tokens that top_words was counted from\n",
+ " words = set(tokens)\n",
+ " # create dictionary of word features\n",
+ " word_features = {word: (word in words) for word in feature_words}\n",
+ " # append to list of features\n",
+ " features.append((word_features, label))"
]
},
{
@@ -210,11 +1395,21 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+ "# your code here\n",
+ "import nltk\n",
+ "\n",
+ "# Split the feature set into training and test sets\n",
+ "train_set, test_set = features[:800], features[800:]\n",
+ "\n",
+ "# Create and train a Bayes classifier instance\n",
+ "classifier = nltk.NaiveBayesClassifier.train(train_set)\n",
+ "\n",
+ "# Inspect the most important features\n",
+ "classifier.show_most_informative_features(10)"
]
},
{
@@ -234,7 +1429,9 @@
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+ "# your code here\n",
+ "accuracy = nltk.classify.accuracy(classifier, test_set)\n",
+ "print(\"Accuracy:\", accuracy)"
]
},
{
@@ -312,7 +1509,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.3"
+ "version": "3.10.7"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "721db305ef1fd1fc91cdf20e400af694a949fe540ac5f48c160f31c7e384879d"
+ }
}
},
"nbformat": 4,