diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb
index 0808166..b361fa2 100644
--- a/your-code/challenge-1.ipynb
+++ b/your-code/challenge-1.ipynb
@@ -66,11 +66,29 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "This is a test string with a URL special characters numbers and multiple whitespaces\n"
+ ]
+ }
+ ],
"source": [
- "def clean_up(s):\n",
+ "import re\n",
+ "\n",
+ "\n",
+ "def clean_up(text):\n",
+ " text = re.sub(r'http\\S+|www\\.\\S+', '', text)\n",
+ " text = re.sub(r\"[^a-zA-Z\\s']\", ' ', text)\n",
+ " text = re.sub(r'\\s+', ' ', text).strip()\n",
+ " return text\n",
+ " \n",
+ " \n",
+ " \n",
" \"\"\"\n",
" Cleans up numbers, URLs, and special characters from a string.\n",
"\n",
@@ -79,7 +97,13 @@
"\n",
" Returns:\n",
" A string that has been cleaned up.\n",
- " \"\"\""
+ " \"\"\"\n",
+ " \n",
+ "test_string = \"This is a test string with a URL http://example.com, special characters #@!, numbers 123, and multiple whitespaces.\"\n",
+ "\n",
+ "cleaned_text = clean_up(test_string)\n",
+ "\n",
+ "print(cleaned_text)\n"
]
},
{
@@ -101,12 +125,54 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['ironhack', 's', 'q', 'website', 'is']\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package punkt to\n",
+ "[nltk_data] C:\\Users\\User\\AppData\\Roaming\\nltk_data...\n",
+ "[nltk_data] Package punkt is already up-to-date!\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'\\n Tokenize a string.\\n\\n Args:\\n s: String to be tokenized.\\n\\n Returns:\\n A list of words as the result of tokenization.\\n'"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "def tokenize(s):\n",
- " \"\"\"\n",
+ "import nltk\n",
+ "from nltk.tokenize import word_tokenize\n",
+ "\n",
+ "nltk.download('punkt')\n",
+ "\n",
+ "\n",
+ "def tokenize(text):\n",
+ " return word_tokenize(text) \n",
+ " \n",
+ "\n",
+ " \n",
+ "test_string = \"ironhack s q website is\"\n",
+ "tokens = tokenize(test_string)\n",
+ "print(tokens) \n",
+ " \n",
+ "\n",
+ "\"\"\"\n",
" Tokenize a string.\n",
"\n",
" Args:\n",
@@ -114,7 +180,7 @@
"\n",
" Returns:\n",
" A list of words as the result of tokenization.\n",
- " \"\"\""
+ "\"\"\""
]
},
{
@@ -145,11 +211,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
- "def stem_and_lemmatize(l):\n",
+ "import nltk\n",
+ "from nltk.stem import PorterStemmer, WordNetLemmatizer\n",
+ "\n",
+ "\n",
+ "\n",
+ "def stem_and_lemmatize(words):\n",
+ " \n",
+ " stemmer = PorterStemmer()\n",
+ " lemmatizer = WordNetLemmatizer()\n",
+ " \n",
+ " stemmed = [stemmer.stem(word) for word in words]\n",
+ " lemmatized = [lemmatizer.lemmatize(word) for word in words]\n",
+ " \n",
+ " return stemmed, lemmatized\n",
" \"\"\"\n",
" Perform stemming and lemmatization on a list of words.\n",
"\n",
@@ -176,11 +255,32 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 4,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package stopwords to\n",
+ "[nltk_data] C:\\Users\\User\\AppData\\Roaming\\nltk_data...\n",
+ "[nltk_data] Package stopwords is already up-to-date!\n"
+ ]
+ }
+ ],
"source": [
- "def remove_stopwords(l):\n",
+ "import nltk\n",
+ "from nltk.corpus import stopwords\n",
+ "\n",
+ "nltk.download('stopwords')\n",
+ "\n",
+ "\n",
+ "def remove_stopwords(words):\n",
+ " stop_words = set(stopwords.words('english'))\n",
+ " \n",
+ " clean_words = [word for word in words if word.lower() not in stop_words]\n",
+ " \n",
+ " return clean_words\n",
" \"\"\"\n",
" Remove English stopwords from a list of strings.\n",
"\n",
@@ -204,7 +304,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -218,7 +318,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.3"
+ "version": "3.11.5"
}
},
"nbformat": 4,
diff --git a/your-code/challenge-2.ipynb b/your-code/challenge-2.ipynb
index 6b0e116..0770d69 100644
--- a/your-code/challenge-2.ipynb
+++ b/your-code/challenge-2.ipynb
@@ -46,11 +46,227 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
+ "execution_count": 71,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " target | \n",
+ " id | \n",
+ " date | \n",
+ " flag | \n",
+ " user | \n",
+ " text | \n",
+ " is_positive | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 1467810672 | \n",
+ " Mon Apr 06 22:19:49 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " scotthamilton | \n",
+ " is upset that he can't update his Facebook by ... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0 | \n",
+ " 1467810917 | \n",
+ " Mon Apr 06 22:19:53 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " mattycus | \n",
+ " @Kenichan I dived many times for the ball. Man... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0 | \n",
+ " 1467811184 | \n",
+ " Mon Apr 06 22:19:57 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " ElleCTF | \n",
+ " my whole body feels itchy and like its on fire | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0 | \n",
+ " 1467811193 | \n",
+ " Mon Apr 06 22:19:57 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " Karoli | \n",
+ " @nationwideclass no, it's not behaving at all.... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0 | \n",
+ " 1467811372 | \n",
+ " Mon Apr 06 22:20:00 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " joy_wolf | \n",
+ " @Kwesidei not the whole crew | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 1599994 | \n",
+ " 4 | \n",
+ " 2193601966 | \n",
+ " Tue Jun 16 08:40:49 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " AmandaMarie1028 | \n",
+ " Just woke up. Having no school is the best fee... | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 1599995 | \n",
+ " 4 | \n",
+ " 2193601969 | \n",
+ " Tue Jun 16 08:40:49 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " TheWDBoards | \n",
+ " TheWDB.com - Very cool to hear old Walt interv... | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 1599996 | \n",
+ " 4 | \n",
+ " 2193601991 | \n",
+ " Tue Jun 16 08:40:49 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " bpbabe | \n",
+ " Are you ready for your MoJo Makeover? Ask me f... | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 1599997 | \n",
+ " 4 | \n",
+ " 2193602064 | \n",
+ " Tue Jun 16 08:40:49 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " tinydiamondz | \n",
+ " Happy 38th Birthday to my boo of alll time!!! ... | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 1599998 | \n",
+ " 4 | \n",
+ " 2193602129 | \n",
+ " Tue Jun 16 08:40:50 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " RyanTrevMorris | \n",
+ " happy #charitytuesday @theNSPCC @SparksCharity... | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1599999 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " target id date flag \\\n",
+ "0 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY \n",
+ "1 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY \n",
+ "2 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n",
+ "3 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n",
+ "4 0 1467811372 Mon Apr 06 22:20:00 PDT 2009 NO_QUERY \n",
+ "... ... ... ... ... \n",
+ "1599994 4 2193601966 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY \n",
+ "1599995 4 2193601969 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY \n",
+ "1599996 4 2193601991 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY \n",
+ "1599997 4 2193602064 Tue Jun 16 08:40:49 PDT 2009 NO_QUERY \n",
+ "1599998 4 2193602129 Tue Jun 16 08:40:50 PDT 2009 NO_QUERY \n",
+ "\n",
+ " user text \\\n",
+ "0 scotthamilton is upset that he can't update his Facebook by ... \n",
+ "1 mattycus @Kenichan I dived many times for the ball. Man... \n",
+ "2 ElleCTF my whole body feels itchy and like its on fire \n",
+ "3 Karoli @nationwideclass no, it's not behaving at all.... \n",
+ "4 joy_wolf @Kwesidei not the whole crew \n",
+ "... ... ... \n",
+ "1599994 AmandaMarie1028 Just woke up. Having no school is the best fee... \n",
+ "1599995 TheWDBoards TheWDB.com - Very cool to hear old Walt interv... \n",
+ "1599996 bpbabe Are you ready for your MoJo Makeover? Ask me f... \n",
+ "1599997 tinydiamondz Happy 38th Birthday to my boo of alll time!!! ... \n",
+ "1599998 RyanTrevMorris happy #charitytuesday @theNSPCC @SparksCharity... \n",
+ "\n",
+ " is_positive \n",
+ "0 False \n",
+ "1 False \n",
+ "2 False \n",
+ "3 False \n",
+ "4 False \n",
+ "... ... \n",
+ "1599994 True \n",
+ "1599995 True \n",
+ "1599996 True \n",
+ "1599997 True \n",
+ "1599998 True \n",
+ "\n",
+ "[1599999 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 71,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# your code here"
+ "import pandas as pd\n",
+ "import re\n",
+ "import nltk\n",
+ "from nltk.corpus import stopwords\n",
+ "from nltk.stem import PorterStemmer, WordNetLemmatizer\n",
+ "from nltk.probability import FreqDist\n",
+ "from nltk.classify import NaiveBayesClassifier\n",
+ "from nltk.classify.util import accuracy as nltk_accuracy\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "\n",
+ "# your code here\n",
+    "df = pd.read_csv(r'C:\Users\User\Desktop\iRonhack\Classes at Iron\Week 20\Day 1\lab-nlp\your-code\twitter_data.csv', encoding='ISO-8859-1')  # TODO(review): use a relative path so the notebook runs on other machines\n",
+ "\n",
+ "df.columns = ['target','id','date','flag','user','text']\n",
+ "df\n",
+ "\n",
+ "\n",
+ "#map\n",
+ "df['is_positive'] = df['target'].map({0: False, 4: True})\n",
+ "df"
]
},
{
@@ -76,11 +292,538 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 72,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " target | \n",
+ " id | \n",
+ " date | \n",
+ " flag | \n",
+ " user | \n",
+ " text | \n",
+ " is_positive | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 541200 | \n",
+ " 0 | \n",
+ " 2200003313 | \n",
+ " Tue Jun 16 18:18:13 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " DEWGetMeTho77 | \n",
+ " @Nkluvr4eva My poor little dumpling In Holmde... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 750 | \n",
+ " 0 | \n",
+ " 1467998601 | \n",
+ " Mon Apr 06 23:11:18 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " Young_J | \n",
+ " I'm off too bed. I gotta wake up hella early t... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 766711 | \n",
+ " 0 | \n",
+ " 2300049112 | \n",
+ " Tue Jun 23 13:40:12 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " dougnawoschik | \n",
+ " I havent been able to listen to it yet My spe... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 285055 | \n",
+ " 0 | \n",
+ " 1993474319 | \n",
+ " Mon Jun 01 10:26:09 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " thireven | \n",
+ " now remembers why solving a relatively big equ... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 705995 | \n",
+ " 0 | \n",
+ " 2256551006 | \n",
+ " Sat Jun 20 12:56:51 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " taracollins086 | \n",
+ " Ate too much, feel sick | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 1374482 | \n",
+ " 4 | \n",
+ " 2051447103 | \n",
+ " Fri Jun 05 22:02:36 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " _Jaska | \n",
+ " @girlwonder24 Thanks. | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 667014 | \n",
+ " 0 | \n",
+ " 2245469948 | \n",
+ " Fri Jun 19 16:10:39 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " julianicolao | \n",
+ " trying to study for the biggest test, next wee... | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 1451234 | \n",
+ " 4 | \n",
+ " 2063022808 | \n",
+ " Sun Jun 07 01:05:46 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " ElaineToni | \n",
+ " Just finished watching Your Song Presents: Boy... | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 1181412 | \n",
+ " 4 | \n",
+ " 1982082859 | \n",
+ " Sun May 31 10:29:36 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " lindseyrd20 | \n",
+ " @janfran813 awww i can't wait to get one | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 517910 | \n",
+ " 0 | \n",
+ " 2191411932 | \n",
+ " Tue Jun 16 05:13:13 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " serraannisa | \n",
+ " doing nothing | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
20000 rows × 7 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " target id date flag \\\n",
+ "541200 0 2200003313 Tue Jun 16 18:18:13 PDT 2009 NO_QUERY \n",
+ "750 0 1467998601 Mon Apr 06 23:11:18 PDT 2009 NO_QUERY \n",
+ "766711 0 2300049112 Tue Jun 23 13:40:12 PDT 2009 NO_QUERY \n",
+ "285055 0 1993474319 Mon Jun 01 10:26:09 PDT 2009 NO_QUERY \n",
+ "705995 0 2256551006 Sat Jun 20 12:56:51 PDT 2009 NO_QUERY \n",
+ "... ... ... ... ... \n",
+ "1374482 4 2051447103 Fri Jun 05 22:02:36 PDT 2009 NO_QUERY \n",
+ "667014 0 2245469948 Fri Jun 19 16:10:39 PDT 2009 NO_QUERY \n",
+ "1451234 4 2063022808 Sun Jun 07 01:05:46 PDT 2009 NO_QUERY \n",
+ "1181412 4 1982082859 Sun May 31 10:29:36 PDT 2009 NO_QUERY \n",
+ "517910 0 2191411932 Tue Jun 16 05:13:13 PDT 2009 NO_QUERY \n",
+ "\n",
+ " user text \\\n",
+ "541200 DEWGetMeTho77 @Nkluvr4eva My poor little dumpling In Holmde... \n",
+ "750 Young_J I'm off too bed. I gotta wake up hella early t... \n",
+ "766711 dougnawoschik I havent been able to listen to it yet My spe... \n",
+ "285055 thireven now remembers why solving a relatively big equ... \n",
+ "705995 taracollins086 Ate too much, feel sick \n",
+ "... ... ... \n",
+ "1374482 _Jaska @girlwonder24 Thanks. \n",
+ "667014 julianicolao trying to study for the biggest test, next wee... \n",
+ "1451234 ElaineToni Just finished watching Your Song Presents: Boy... \n",
+ "1181412 lindseyrd20 @janfran813 awww i can't wait to get one \n",
+ "517910 serraannisa doing nothing \n",
+ "\n",
+ " is_positive \n",
+ "541200 False \n",
+ "750 False \n",
+ "766711 False \n",
+ "285055 False \n",
+ "705995 False \n",
+ "... ... \n",
+ "1374482 True \n",
+ "667014 False \n",
+ "1451234 True \n",
+ "1181412 True \n",
+ "517910 False \n",
+ "\n",
+ "[20000 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 72,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# your code here"
+ "import re\n",
+ "import nltk\n",
+ "from nltk.tokenize import word_tokenize\n",
+    "\n",
+ "from nltk.stem import PorterStemmer, WordNetLemmatizer\n",
+    "\n",
+ "from nltk.corpus import stopwords\n",
+ "import random\n",
+ "\n",
+ "\n",
+ "# your code here\n",
+ "\n",
+ "\n",
+ "df_sampled = df.sample(n=20000, random_state=42)\n",
+ "df_sampled\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " target | \n",
+ " id | \n",
+ " date | \n",
+ " flag | \n",
+ " user | \n",
+ " text | \n",
+ " is_positive | \n",
+ " clean | \n",
+ " token | \n",
+ " stemmed_and_lemmatized | \n",
+ " text_processed | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 541200 | \n",
+ " 0 | \n",
+ " 2200003313 | \n",
+ " Tue Jun 16 18:18:13 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " DEWGetMeTho77 | \n",
+ " @Nkluvr4eva My poor little dumpling In Holmde... | \n",
+ " False | \n",
+ " Nkluvr eva My poor little dumpling In Holmdel ... | \n",
+ " [Nkluvr, eva, My, poor, little, dumpling, In, ... | \n",
+ " [Nkluvr, eva, My, poor, little, dumpling, In, ... | \n",
+ " [Nkluvr, eva, poor, little, dumpling, Holmdel,... | \n",
+ "
\n",
+ " \n",
+ " | 750 | \n",
+ " 0 | \n",
+ " 1467998601 | \n",
+ " Mon Apr 06 23:11:18 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " Young_J | \n",
+ " I'm off too bed. I gotta wake up hella early t... | \n",
+ " False | \n",
+ " I'm off too bed I gotta wake up hella early to... | \n",
+ " [I, 'm, off, too, bed, I, got, ta, wake, up, h... | \n",
+ " [I, 'm, off, too, bed, I, got, ta, wake, up, h... | \n",
+ " ['m, bed, got, ta, wake, hella, early, tomorro... | \n",
+ "
\n",
+ " \n",
+ " | 766711 | \n",
+ " 0 | \n",
+ " 2300049112 | \n",
+ " Tue Jun 23 13:40:12 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " dougnawoschik | \n",
+ " I havent been able to listen to it yet My spe... | \n",
+ " False | \n",
+ " I havent been able to listen to it yet My spea... | \n",
+ " [I, havent, been, able, to, listen, to, it, ye... | \n",
+ " [I, havent, been, able, to, listen, to, it, ye... | \n",
+ " [havent, able, listen, yet, speaker, busted] | \n",
+ "
\n",
+ " \n",
+ " | 285055 | \n",
+ " 0 | \n",
+ " 1993474319 | \n",
+ " Mon Jun 01 10:26:09 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " thireven | \n",
+ " now remembers why solving a relatively big equ... | \n",
+ " False | \n",
+ " now remembers why solving a relatively big equ... | \n",
+ " [now, remembers, why, solving, a, relatively, ... | \n",
+ " [now, remembers, why, solving, a, relatively, ... | \n",
+ " [remembers, solving, relatively, big, equation... | \n",
+ "
\n",
+ " \n",
+ " | 705995 | \n",
+ " 0 | \n",
+ " 2256551006 | \n",
+ " Sat Jun 20 12:56:51 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " taracollins086 | \n",
+ " Ate too much, feel sick | \n",
+ " False | \n",
+ " Ate too much feel sick | \n",
+ " [Ate, too, much, feel, sick] | \n",
+ " [Ate, too, much, feel, sick] | \n",
+ " [Ate, much, feel, sick] | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 1374482 | \n",
+ " 4 | \n",
+ " 2051447103 | \n",
+ " Fri Jun 05 22:02:36 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " _Jaska | \n",
+ " @girlwonder24 Thanks. | \n",
+ " True | \n",
+ " girlwonder Thanks | \n",
+ " [girlwonder, Thanks] | \n",
+ " [girlwonder, Thanks] | \n",
+ " [girlwonder, Thanks] | \n",
+ "
\n",
+ " \n",
+ " | 667014 | \n",
+ " 0 | \n",
+ " 2245469948 | \n",
+ " Fri Jun 19 16:10:39 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " julianicolao | \n",
+ " trying to study for the biggest test, next wee... | \n",
+ " False | \n",
+ " trying to study for the biggest test next week... | \n",
+ " [trying, to, study, for, the, biggest, test, n... | \n",
+ " [trying, to, study, for, the, biggest, test, n... | \n",
+ " [trying, study, biggest, test, next, week, n't... | \n",
+ "
\n",
+ " \n",
+ " | 1451234 | \n",
+ " 4 | \n",
+ " 2063022808 | \n",
+ " Sun Jun 07 01:05:46 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " ElaineToni | \n",
+ " Just finished watching Your Song Presents: Boy... | \n",
+ " True | \n",
+ " Just finished watching Your Song Presents Boys... | \n",
+ " [Just, finished, watching, Your, Song, Present... | \n",
+ " [Just, finished, watching, Your, Song, Present... | \n",
+ " [finished, watching, Song, Presents, Boystown] | \n",
+ "
\n",
+ " \n",
+ " | 1181412 | \n",
+ " 4 | \n",
+ " 1982082859 | \n",
+ " Sun May 31 10:29:36 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " lindseyrd20 | \n",
+ " @janfran813 awww i can't wait to get one | \n",
+ " True | \n",
+ " janfran awww i can't wait to get one | \n",
+ " [janfran, awww, i, ca, n't, wait, to, get, one] | \n",
+ " [janfran, awww, i, ca, n't, wait, to, get, one] | \n",
+ " [janfran, awww, ca, n't, wait, get, one] | \n",
+ "
\n",
+ " \n",
+ " | 517910 | \n",
+ " 0 | \n",
+ " 2191411932 | \n",
+ " Tue Jun 16 05:13:13 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " serraannisa | \n",
+ " doing nothing | \n",
+ " False | \n",
+ " doing nothing | \n",
+ " [doing, nothing] | \n",
+ " [doing, nothing] | \n",
+ " [nothing] | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
20000 rows × 11 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " target id date flag \\\n",
+ "541200 0 2200003313 Tue Jun 16 18:18:13 PDT 2009 NO_QUERY \n",
+ "750 0 1467998601 Mon Apr 06 23:11:18 PDT 2009 NO_QUERY \n",
+ "766711 0 2300049112 Tue Jun 23 13:40:12 PDT 2009 NO_QUERY \n",
+ "285055 0 1993474319 Mon Jun 01 10:26:09 PDT 2009 NO_QUERY \n",
+ "705995 0 2256551006 Sat Jun 20 12:56:51 PDT 2009 NO_QUERY \n",
+ "... ... ... ... ... \n",
+ "1374482 4 2051447103 Fri Jun 05 22:02:36 PDT 2009 NO_QUERY \n",
+ "667014 0 2245469948 Fri Jun 19 16:10:39 PDT 2009 NO_QUERY \n",
+ "1451234 4 2063022808 Sun Jun 07 01:05:46 PDT 2009 NO_QUERY \n",
+ "1181412 4 1982082859 Sun May 31 10:29:36 PDT 2009 NO_QUERY \n",
+ "517910 0 2191411932 Tue Jun 16 05:13:13 PDT 2009 NO_QUERY \n",
+ "\n",
+ " user text \\\n",
+ "541200 DEWGetMeTho77 @Nkluvr4eva My poor little dumpling In Holmde... \n",
+ "750 Young_J I'm off too bed. I gotta wake up hella early t... \n",
+ "766711 dougnawoschik I havent been able to listen to it yet My spe... \n",
+ "285055 thireven now remembers why solving a relatively big equ... \n",
+ "705995 taracollins086 Ate too much, feel sick \n",
+ "... ... ... \n",
+ "1374482 _Jaska @girlwonder24 Thanks. \n",
+ "667014 julianicolao trying to study for the biggest test, next wee... \n",
+ "1451234 ElaineToni Just finished watching Your Song Presents: Boy... \n",
+ "1181412 lindseyrd20 @janfran813 awww i can't wait to get one \n",
+ "517910 serraannisa doing nothing \n",
+ "\n",
+ " is_positive clean \\\n",
+ "541200 False Nkluvr eva My poor little dumpling In Holmdel ... \n",
+ "750 False I'm off too bed I gotta wake up hella early to... \n",
+ "766711 False I havent been able to listen to it yet My spea... \n",
+ "285055 False now remembers why solving a relatively big equ... \n",
+ "705995 False Ate too much feel sick \n",
+ "... ... ... \n",
+ "1374482 True girlwonder Thanks \n",
+ "667014 False trying to study for the biggest test next week... \n",
+ "1451234 True Just finished watching Your Song Presents Boys... \n",
+ "1181412 True janfran awww i can't wait to get one \n",
+ "517910 False doing nothing \n",
+ "\n",
+ " token \\\n",
+ "541200 [Nkluvr, eva, My, poor, little, dumpling, In, ... \n",
+ "750 [I, 'm, off, too, bed, I, got, ta, wake, up, h... \n",
+ "766711 [I, havent, been, able, to, listen, to, it, ye... \n",
+ "285055 [now, remembers, why, solving, a, relatively, ... \n",
+ "705995 [Ate, too, much, feel, sick] \n",
+ "... ... \n",
+ "1374482 [girlwonder, Thanks] \n",
+ "667014 [trying, to, study, for, the, biggest, test, n... \n",
+ "1451234 [Just, finished, watching, Your, Song, Present... \n",
+ "1181412 [janfran, awww, i, ca, n't, wait, to, get, one] \n",
+ "517910 [doing, nothing] \n",
+ "\n",
+ " stemmed_and_lemmatized \\\n",
+ "541200 [Nkluvr, eva, My, poor, little, dumpling, In, ... \n",
+ "750 [I, 'm, off, too, bed, I, got, ta, wake, up, h... \n",
+ "766711 [I, havent, been, able, to, listen, to, it, ye... \n",
+ "285055 [now, remembers, why, solving, a, relatively, ... \n",
+ "705995 [Ate, too, much, feel, sick] \n",
+ "... ... \n",
+ "1374482 [girlwonder, Thanks] \n",
+ "667014 [trying, to, study, for, the, biggest, test, n... \n",
+ "1451234 [Just, finished, watching, Your, Song, Present... \n",
+ "1181412 [janfran, awww, i, ca, n't, wait, to, get, one] \n",
+ "517910 [doing, nothing] \n",
+ "\n",
+ " text_processed \n",
+ "541200 [Nkluvr, eva, poor, little, dumpling, Holmdel,... \n",
+ "750 ['m, bed, got, ta, wake, hella, early, tomorro... \n",
+ "766711 [havent, able, listen, yet, speaker, busted] \n",
+ "285055 [remembers, solving, relatively, big, equation... \n",
+ "705995 [Ate, much, feel, sick] \n",
+ "... ... \n",
+ "1374482 [girlwonder, Thanks] \n",
+ "667014 [trying, study, biggest, test, next, week, n't... \n",
+ "1451234 [finished, watching, Song, Presents, Boystown] \n",
+ "1181412 [janfran, awww, ca, n't, wait, get, one] \n",
+ "517910 [nothing] \n",
+ "\n",
+ "[20000 rows x 11 columns]"
+ ]
+ },
+ "execution_count": 73,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "def clean_up(text):\n",
+ " text = re.sub(r'http\\S+|www\\.\\S+', '', text)\n",
+ " text = re.sub(r\"[^a-zA-Z\\s']\", ' ', text)\n",
+ " text = re.sub(r'\\s+', ' ', text).strip()\n",
+ " return text\n",
+ "\n",
+ "def tokenize(text):\n",
+ " return nltk.word_tokenize(text)\n",
+ "\n",
+ "def stem_and_lemmatize(words):\n",
+ " stemmer = PorterStemmer()\n",
+ " lemmatizer = WordNetLemmatizer()\n",
+    "    stemmed = [stemmer.stem(word) for word in words]  # NOTE(review): 'stemmed' is never returned -- only the lemmatized tokens flow downstream\n",
+ " lemmatized = [lemmatizer.lemmatize(word) for word in words]\n",
+ " return lemmatized \n",
+ "\n",
+ "def remove_stopwords(words):\n",
+ " stop_words = set(stopwords.words('english'))\n",
+ " return [word for word in words if word.lower() not in stop_words]\n",
+ "\n",
+ "\n",
+ "\n",
+ "df_sampled['clean'] = df_sampled['text'].apply(clean_up)\n",
+ "df_sampled['token'] = df_sampled['clean'].apply(tokenize)\n",
+ "df_sampled['stemmed_and_lemmatized'] = df_sampled['token'].apply(stem_and_lemmatize)\n",
+ "df_sampled['text_processed'] = df_sampled['stemmed_and_lemmatized'].apply(remove_stopwords)\n",
+ "\n",
+ "df_sampled\n",
+ "\n"
]
},
{
@@ -98,11 +841,1048 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 74,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Top 5,000 words example (first 10): [(\"n't\", 2268), (\"'s\", 2150), (\"'m\", 1610), ('wa', 1412), ('day', 1217), ('get', 1044), ('like', 945), ('good', 933), ('quot', 921), ('go', 898), ('work', 824), ('today', 772), ('love', 756), ('got', 741), ('time', 729), ('one', 695), ('going', 693), ('u', 683), ('know', 680), ('back', 595), ('want', 571), ('amp', 567), ('really', 550), ('think', 535), ('night', 535), ('im', 532), ('see', 528), ('na', 506), (\"'ll\", 488), ('ca', 487), ('lol', 483), ('home', 477), ('new', 476), ('still', 475), ('well', 468), ('much', 466), ('ha', 462), ('need', 459), ('feel', 437), ('miss', 435), ('last', 409), ('make', 408), ('tomorrow', 400), (\"'re\", 375), ('great', 361), ('would', 359), ('morning', 357), ('bad', 337), ('fun', 326), ('sad', 323), ('sleep', 322), ('come', 315), ('wish', 313), ('week', 312), ('tonight', 307), ('say', 305), ('right', 303), ('thing', 298), ('oh', 297), ('friend', 296), ('could', 295), ('nice', 295), ('though', 294), ('haha', 294), (\"'\", 293), ('thanks', 290), (\"'ve\", 285), ('wait', 281), ('gon', 275), ('bed', 274), ('look', 274), ('hope', 274), ('better', 269), ('way', 268), ('lt', 267), ('getting', 266), ('hate', 259), ('twitter', 258), ('people', 251), ('hour', 249), ('sorry', 237), ('weekend', 232), ('Thanks', 228), ('show', 226), ('little', 225), ('happy', 225), ('next', 222), ('school', 221), ('Good', 220), ('doe', 218), ('Oh', 216), ('sick', 215), ('even', 214), ('take', 211), ('dont', 209), ('watching', 208), ('guy', 207), ('working', 207), ('LOL', 205), ('soon', 205)]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "[\"n't\",\n",
+ " \"'s\",\n",
+ " \"'m\",\n",
+ " 'wa',\n",
+ " 'day',\n",
+ " 'get',\n",
+ " 'like',\n",
+ " 'good',\n",
+ " 'quot',\n",
+ " 'go',\n",
+ " 'work',\n",
+ " 'today',\n",
+ " 'love',\n",
+ " 'got',\n",
+ " 'time',\n",
+ " 'one',\n",
+ " 'going',\n",
+ " 'u',\n",
+ " 'know',\n",
+ " 'back',\n",
+ " 'want',\n",
+ " 'amp',\n",
+ " 'really',\n",
+ " 'think',\n",
+ " 'night',\n",
+ " 'im',\n",
+ " 'see',\n",
+ " 'na',\n",
+ " \"'ll\",\n",
+ " 'ca',\n",
+ " 'lol',\n",
+ " 'home',\n",
+ " 'new',\n",
+ " 'still',\n",
+ " 'well',\n",
+ " 'much',\n",
+ " 'ha',\n",
+ " 'need',\n",
+ " 'feel',\n",
+ " 'miss',\n",
+ " 'last',\n",
+ " 'make',\n",
+ " 'tomorrow',\n",
+ " \"'re\",\n",
+ " 'great',\n",
+ " 'would',\n",
+ " 'morning',\n",
+ " 'bad',\n",
+ " 'fun',\n",
+ " 'sad',\n",
+ " 'sleep',\n",
+ " 'come',\n",
+ " 'wish',\n",
+ " 'week',\n",
+ " 'tonight',\n",
+ " 'say',\n",
+ " 'right',\n",
+ " 'thing',\n",
+ " 'oh',\n",
+ " 'friend',\n",
+ " 'could',\n",
+ " 'nice',\n",
+ " 'though',\n",
+ " 'haha',\n",
+ " \"'\",\n",
+ " 'thanks',\n",
+ " \"'ve\",\n",
+ " 'wait',\n",
+ " 'gon',\n",
+ " 'bed',\n",
+ " 'look',\n",
+ " 'hope',\n",
+ " 'better',\n",
+ " 'way',\n",
+ " 'lt',\n",
+ " 'getting',\n",
+ " 'hate',\n",
+ " 'twitter',\n",
+ " 'people',\n",
+ " 'hour',\n",
+ " 'sorry',\n",
+ " 'weekend',\n",
+ " 'Thanks',\n",
+ " 'show',\n",
+ " 'little',\n",
+ " 'happy',\n",
+ " 'next',\n",
+ " 'school',\n",
+ " 'Good',\n",
+ " 'doe',\n",
+ " 'Oh',\n",
+ " 'sick',\n",
+ " 'even',\n",
+ " 'take',\n",
+ " 'dont',\n",
+ " 'watching',\n",
+ " 'guy',\n",
+ " 'working',\n",
+ " 'LOL',\n",
+ " 'soon',\n",
+ " 'life',\n",
+ " 'girl',\n",
+ " 'cant',\n",
+ " 'watch',\n",
+ " 'year',\n",
+ " 'x',\n",
+ " 'always',\n",
+ " 'movie',\n",
+ " 'already',\n",
+ " 'everyone',\n",
+ " 'tweet',\n",
+ " 'long',\n",
+ " 'yeah',\n",
+ " 'tired',\n",
+ " 'first',\n",
+ " 'never',\n",
+ " 'wan',\n",
+ " 'suck',\n",
+ " 'sure',\n",
+ " 'start',\n",
+ " 'awesome',\n",
+ " 'find',\n",
+ " 'let',\n",
+ " 'something',\n",
+ " 'yet',\n",
+ " 'phone',\n",
+ " 'best',\n",
+ " 'pretty',\n",
+ " \"'d\",\n",
+ " 'away',\n",
+ " 'done',\n",
+ " 'feeling',\n",
+ " 'cool',\n",
+ " 'old',\n",
+ " 'song',\n",
+ " 'man',\n",
+ " 'looking',\n",
+ " 'sun',\n",
+ " 'thought',\n",
+ " 'please',\n",
+ " 'yes',\n",
+ " 'another',\n",
+ " 'bit',\n",
+ " 'lot',\n",
+ " 'help',\n",
+ " 'hurt',\n",
+ " 'wo',\n",
+ " 'house',\n",
+ " 'made',\n",
+ " 'ever',\n",
+ " 'keep',\n",
+ " 'n',\n",
+ " 'ya',\n",
+ " 'game',\n",
+ " 'ready',\n",
+ " 'went',\n",
+ " 'guess',\n",
+ " 'th',\n",
+ " 'ok',\n",
+ " 'early',\n",
+ " 'mean',\n",
+ " 'follow',\n",
+ " 'sound',\n",
+ " 'hard',\n",
+ " 'pic',\n",
+ " 'rain',\n",
+ " 'left',\n",
+ " 'summer',\n",
+ " 'Im',\n",
+ " 'hey',\n",
+ " 'thank',\n",
+ " 'missed',\n",
+ " 'ur',\n",
+ " 'trying',\n",
+ " 'lost',\n",
+ " 'big',\n",
+ " 'stuff',\n",
+ " 'w',\n",
+ " 'ta',\n",
+ " 'said',\n",
+ " 'late',\n",
+ " 'Ca',\n",
+ " 'party',\n",
+ " 'call',\n",
+ " 'video',\n",
+ " 'tell',\n",
+ " 'someone',\n",
+ " 'baby',\n",
+ " 'yesterday',\n",
+ " 'play',\n",
+ " 'car',\n",
+ " 'found',\n",
+ " 'luck',\n",
+ " 'mom',\n",
+ " 'nothing',\n",
+ " 'maybe',\n",
+ " 'many',\n",
+ " 'birthday',\n",
+ " 'Twitter',\n",
+ " 'follower',\n",
+ " 'weather',\n",
+ " 'two',\n",
+ " 'also',\n",
+ " 'bored',\n",
+ " 'exam',\n",
+ " 'gone',\n",
+ " 'might',\n",
+ " 'waiting',\n",
+ " 'read',\n",
+ " 'funny',\n",
+ " 'hot',\n",
+ " 'world',\n",
+ " 'gt',\n",
+ " 'finally',\n",
+ " 'job',\n",
+ " 'damn',\n",
+ " 'since',\n",
+ " 'excited',\n",
+ " 'later',\n",
+ " 'amazing',\n",
+ " 'check',\n",
+ " 'hear',\n",
+ " 'family',\n",
+ " 'making',\n",
+ " 'live',\n",
+ " 'head',\n",
+ " 'saw',\n",
+ " 'talk',\n",
+ " 'Thank',\n",
+ " 'Got',\n",
+ " 'coming',\n",
+ " 'anything',\n",
+ " 'Going',\n",
+ " 'cold',\n",
+ " 'Well',\n",
+ " 'try',\n",
+ " 'almost',\n",
+ " 'end',\n",
+ " 'give',\n",
+ " 'around',\n",
+ " 'boy',\n",
+ " 'thats',\n",
+ " 'put',\n",
+ " 'till',\n",
+ " 'tho',\n",
+ " 'leave',\n",
+ " 'glad',\n",
+ " 'use',\n",
+ " 'beautiful',\n",
+ " 'dad',\n",
+ " 'far',\n",
+ " 'b',\n",
+ " 'book',\n",
+ " 'place',\n",
+ " 'fan',\n",
+ " 'lunch',\n",
+ " 'missing',\n",
+ " 'cry',\n",
+ " 'must',\n",
+ " 'stop',\n",
+ " 'least',\n",
+ " 'Hope',\n",
+ " 'stay',\n",
+ " 'music',\n",
+ " 'Happy',\n",
+ " 'free',\n",
+ " 'forward',\n",
+ " 'picture',\n",
+ " 'xx',\n",
+ " 'wanted',\n",
+ " 'food',\n",
+ " 'Hey',\n",
+ " 'iPhone',\n",
+ " 'update',\n",
+ " 'Sorry',\n",
+ " 'r',\n",
+ " 'class',\n",
+ " 'woke',\n",
+ " 'omg',\n",
+ " 'yay',\n",
+ " 'may',\n",
+ " 'eat',\n",
+ " 'kid',\n",
+ " 'busy',\n",
+ " 'cause',\n",
+ " 'anymore',\n",
+ " 'totally',\n",
+ " 'thinking',\n",
+ " 'headache',\n",
+ " 'actually',\n",
+ " 'dog',\n",
+ " 'dinner',\n",
+ " 'minute',\n",
+ " 'U',\n",
+ " 'okay',\n",
+ " 'sweet',\n",
+ " 'lovely',\n",
+ " 'shit',\n",
+ " 'New',\n",
+ " 'ill',\n",
+ " 'idea',\n",
+ " 'win',\n",
+ " 'poor',\n",
+ " 'without',\n",
+ " 'came',\n",
+ " 'month',\n",
+ " 'word',\n",
+ " 'Love',\n",
+ " 'hair',\n",
+ " 'cute',\n",
+ " 'wrong',\n",
+ " 'Day',\n",
+ " 'wow',\n",
+ " 'believe',\n",
+ " 'face',\n",
+ " 'anyone',\n",
+ " 'Sunday',\n",
+ " 'name',\n",
+ " 'every',\n",
+ " 'everything',\n",
+ " 'able',\n",
+ " 'playing',\n",
+ " 'didnt',\n",
+ " 'sooo',\n",
+ " 'kinda',\n",
+ " 'buy',\n",
+ " 'Yeah',\n",
+ " 'part',\n",
+ " 'p',\n",
+ " 'Morning',\n",
+ " 'mine',\n",
+ " 'finished',\n",
+ " 'room',\n",
+ " 'mileycyrus',\n",
+ " 'else',\n",
+ " 'alone',\n",
+ " 'eye',\n",
+ " 'listening',\n",
+ " 'either',\n",
+ " 'ticket',\n",
+ " 'enough',\n",
+ " 'heard',\n",
+ " 'stupid',\n",
+ " 'OMG',\n",
+ " 'outside',\n",
+ " 'following',\n",
+ " 'hug',\n",
+ " 'mind',\n",
+ " 'meet',\n",
+ " 'true',\n",
+ " 'eating',\n",
+ " 'Monday',\n",
+ " 'final',\n",
+ " 'coffee',\n",
+ " 'study',\n",
+ " 'real',\n",
+ " 'computer',\n",
+ " 'blog',\n",
+ " 'post',\n",
+ " 'break',\n",
+ " 'person',\n",
+ " 'enjoy',\n",
+ " 'Haha',\n",
+ " 'reading',\n",
+ " 'whole',\n",
+ " 'hand',\n",
+ " 'dream',\n",
+ " 'hehe',\n",
+ " 'talking',\n",
+ " 'album',\n",
+ " 'aww',\n",
+ " 'crazy',\n",
+ " 'Yes',\n",
+ " 'reply',\n",
+ " 'Watching',\n",
+ " 'Still',\n",
+ " 'probably',\n",
+ " 'photo',\n",
+ " 'add',\n",
+ " 'plan',\n",
+ " 'rest',\n",
+ " 'Damn',\n",
+ " 'half',\n",
+ " 'using',\n",
+ " 'taking',\n",
+ " 'text',\n",
+ " 'side',\n",
+ " 'hahaha',\n",
+ " 'Friday',\n",
+ " 'run',\n",
+ " 'fine',\n",
+ " 'stuck',\n",
+ " 'heart',\n",
+ " 'seen',\n",
+ " 'news',\n",
+ " 'full',\n",
+ " 'god',\n",
+ " 'forgot',\n",
+ " 'hit',\n",
+ " 'Great',\n",
+ " 'seems',\n",
+ " 'seeing',\n",
+ " 'hi',\n",
+ " 'trip',\n",
+ " 'course',\n",
+ " 'pain',\n",
+ " 'kind',\n",
+ " 'money',\n",
+ " 'change',\n",
+ " 'beach',\n",
+ " 'told',\n",
+ " 'nite',\n",
+ " 'started',\n",
+ " 'shopping',\n",
+ " 'hopefully',\n",
+ " 'super',\n",
+ " 'took',\n",
+ " 'problem',\n",
+ " 'brother',\n",
+ " 'site',\n",
+ " 'boring',\n",
+ " 'com',\n",
+ " 'send',\n",
+ " 'used',\n",
+ " 'train',\n",
+ " 'pm',\n",
+ " 'tried',\n",
+ " 'nap',\n",
+ " 'died',\n",
+ " 'quite',\n",
+ " 'remember',\n",
+ " 'reason',\n",
+ " 'pay',\n",
+ " 'finish',\n",
+ " 'soo',\n",
+ " 'bought',\n",
+ " 'afternoon',\n",
+ " 'sister',\n",
+ " 'link',\n",
+ " 'ago',\n",
+ " 'P',\n",
+ " 'raining',\n",
+ " 'LOVE',\n",
+ " 'instead',\n",
+ " 'rock',\n",
+ " 'til',\n",
+ " 'crap',\n",
+ " 'Back',\n",
+ " 'drink',\n",
+ " 'cuz',\n",
+ " 'couple',\n",
+ " 'point',\n",
+ " 'Get',\n",
+ " 'concert',\n",
+ " 'drive',\n",
+ " 'tommcfly',\n",
+ " 'dude',\n",
+ " 'jealous',\n",
+ " 'running',\n",
+ " 'lmao',\n",
+ " 'boo',\n",
+ " 'welcome',\n",
+ " 'test',\n",
+ " 'sore',\n",
+ " 'Yay',\n",
+ " 'tv',\n",
+ " 'loved',\n",
+ " 'evening',\n",
+ " 'hell',\n",
+ " 'page',\n",
+ " 'yea',\n",
+ " 'walk',\n",
+ " 'season',\n",
+ " 'wonder',\n",
+ " 'list',\n",
+ " 'store',\n",
+ " 'anyway',\n",
+ " 'awake',\n",
+ " 'move',\n",
+ " 'wont',\n",
+ " 'studying',\n",
+ " 'sunny',\n",
+ " 'breakfast',\n",
+ " 'friday',\n",
+ " 'definitely',\n",
+ " 'water',\n",
+ " 'wake',\n",
+ " 'Hi',\n",
+ " 'God',\n",
+ " 'monday',\n",
+ " 'cat',\n",
+ " 'asleep',\n",
+ " 'mum',\n",
+ " 'bring',\n",
+ " 'open',\n",
+ " 'le',\n",
+ " 'leaving',\n",
+ " 'ugh',\n",
+ " 'chocolate',\n",
+ " 'hr',\n",
+ " 'email',\n",
+ " 'moment',\n",
+ " 'office',\n",
+ " 'second',\n",
+ " 'shower',\n",
+ " 'smile',\n",
+ " 'Lol',\n",
+ " 'Ugh',\n",
+ " 'hungry',\n",
+ " 'broke',\n",
+ " 'Wish',\n",
+ " 'clean',\n",
+ " 'cut',\n",
+ " 'ddlovato',\n",
+ " 'gym',\n",
+ " 'ride',\n",
+ " 'Today',\n",
+ " 'watched',\n",
+ " 'visit',\n",
+ " 'Please',\n",
+ " 'R',\n",
+ " 'ask',\n",
+ " 'number',\n",
+ " 'red',\n",
+ " 'worth',\n",
+ " 'project',\n",
+ " 'Getting',\n",
+ " 'close',\n",
+ " 'saying',\n",
+ " 'One',\n",
+ " 'lucky',\n",
+ " 'sitting',\n",
+ " 'worse',\n",
+ " 'seriously',\n",
+ " 'online',\n",
+ " 'church',\n",
+ " 'shirt',\n",
+ " 'dance',\n",
+ " 'set',\n",
+ " 'bout',\n",
+ " 'together',\n",
+ " 'wonderful',\n",
+ " 'wear',\n",
+ " 'team',\n",
+ " 'answer',\n",
+ " 'top',\n",
+ " 'June',\n",
+ " 'tea',\n",
+ " 'longer',\n",
+ " 'E',\n",
+ " 'soooo',\n",
+ " 'worry',\n",
+ " 'care',\n",
+ " 'meeting',\n",
+ " 'forget',\n",
+ " 'min',\n",
+ " 'Go',\n",
+ " 'sunday',\n",
+ " 'internet',\n",
+ " 'hang',\n",
+ " 'cream',\n",
+ " 'st',\n",
+ " 'starting',\n",
+ " 'mood',\n",
+ " 'v',\n",
+ " 'fast',\n",
+ " 'horrible',\n",
+ " 'date',\n",
+ " 'c',\n",
+ " 'Enjoy',\n",
+ " 'via',\n",
+ " 'happen',\n",
+ " 'earlier',\n",
+ " 'fucking',\n",
+ " 'ate',\n",
+ " 'favorite',\n",
+ " 'followfriday',\n",
+ " 'driving',\n",
+ " 'happened',\n",
+ " 'TV',\n",
+ " 'Saturday',\n",
+ " 'doesnt',\n",
+ " 'high',\n",
+ " 'mother',\n",
+ " 'town',\n",
+ " 'enjoying',\n",
+ " 'agree',\n",
+ " 'turn',\n",
+ " 'chance',\n",
+ " 'Wow',\n",
+ " 'parent',\n",
+ " 'website',\n",
+ " 'Finally',\n",
+ " 'question',\n",
+ " 'Ok',\n",
+ " 'Glad',\n",
+ " 'broken',\n",
+ " 'tweeting',\n",
+ " 'black',\n",
+ " 'rainy',\n",
+ " 'co',\n",
+ " 'ice',\n",
+ " 'Goodnight',\n",
+ " 'pool',\n",
+ " 'heading',\n",
+ " 'sigh',\n",
+ " 'B',\n",
+ " 'drinking',\n",
+ " 'park',\n",
+ " 'fall',\n",
+ " 'slept',\n",
+ " 'YAY',\n",
+ " 'small',\n",
+ " 'LA',\n",
+ " 'laptop',\n",
+ " 'chat',\n",
+ " 'knew',\n",
+ " 'fail',\n",
+ " 'da',\n",
+ " 'chicken',\n",
+ " 'goin',\n",
+ " 'slow',\n",
+ " 'throat',\n",
+ " 'episode',\n",
+ " 'e',\n",
+ " 'business',\n",
+ " 'garden',\n",
+ " 'homework',\n",
+ " 'passed',\n",
+ " 'upset',\n",
+ " 'comment',\n",
+ " 'sleeping',\n",
+ " 'airport',\n",
+ " 'saturday',\n",
+ " 'Let',\n",
+ " 'taken',\n",
+ " 'understand',\n",
+ " 'due',\n",
+ " 'shop',\n",
+ " 'Work',\n",
+ " 'hello',\n",
+ " 'listen',\n",
+ " 'k',\n",
+ " 'woman',\n",
+ " 'support',\n",
+ " 'aw',\n",
+ " 'Night',\n",
+ " 'star',\n",
+ " 'Time',\n",
+ " 'vote',\n",
+ " 'story',\n",
+ " 'scared',\n",
+ " 'message',\n",
+ " 'holiday',\n",
+ " 'foot',\n",
+ " 'nd',\n",
+ " 'Maybe',\n",
+ " 'awww',\n",
+ " 'line',\n",
+ " 'L',\n",
+ " 'weird',\n",
+ " 'sunshine',\n",
+ " 'fell',\n",
+ " 'seem',\n",
+ " 'english',\n",
+ " 'lady',\n",
+ " 'award',\n",
+ " 'pick',\n",
+ " 'bus',\n",
+ " 'glass',\n",
+ " 'worst',\n",
+ " 'called',\n",
+ " 'Feeling',\n",
+ " 'account',\n",
+ " 'congrats',\n",
+ " 'goodnight',\n",
+ " 'company',\n",
+ " 'Really',\n",
+ " 'xD',\n",
+ " 'son',\n",
+ " 'Poor',\n",
+ " 'dear',\n",
+ " 'mad',\n",
+ " 'Need',\n",
+ " 'order',\n",
+ " 'Last',\n",
+ " 'fuck',\n",
+ " 'past',\n",
+ " 'facebook',\n",
+ " 'rather',\n",
+ " 'havent',\n",
+ " 'spent',\n",
+ " 'load',\n",
+ " 'fix',\n",
+ " 'bag',\n",
+ " 'short',\n",
+ " 'May',\n",
+ " 'leg',\n",
+ " 'interesting',\n",
+ " 'gave',\n",
+ " 'dead',\n",
+ " 'Nice',\n",
+ " 'hoping',\n",
+ " 'different',\n",
+ " 'loving',\n",
+ " 'catch',\n",
+ " 'ipod',\n",
+ " 'absolutely',\n",
+ " 'perfect',\n",
+ " 'case',\n",
+ " 'Miss',\n",
+ " 'officially',\n",
+ " 'writing',\n",
+ " 'sometimes',\n",
+ " 'meant',\n",
+ " 'ah',\n",
+ " 'cleaning',\n",
+ " 'forever',\n",
+ " 'X',\n",
+ " 'issue',\n",
+ " 'G',\n",
+ " 'window',\n",
+ " 'dress',\n",
+ " 'idk',\n",
+ " 'Looking',\n",
+ " 'deal',\n",
+ " 'inside',\n",
+ " 'needed',\n",
+ " 'si',\n",
+ " 'moving',\n",
+ " 'profile',\n",
+ " 'write',\n",
+ " 'C',\n",
+ " 'graduation',\n",
+ " 'fight',\n",
+ " 'bday',\n",
+ " 'met',\n",
+ " 'power',\n",
+ " 'box',\n",
+ " 'cousin',\n",
+ " 'sent',\n",
+ " 'Sad',\n",
+ " 'bet',\n",
+ " 'looked',\n",
+ " 'living',\n",
+ " 'worried',\n",
+ " 'bye',\n",
+ " 'wedding',\n",
+ " 'iphone',\n",
+ " 'college',\n",
+ " 'btw',\n",
+ " 'Welcome',\n",
+ " 'touch',\n",
+ " 'kill',\n",
+ " 'Awesome',\n",
+ " 'Another',\n",
+ " 'youtube',\n",
+ " 'fantastic',\n",
+ " 'camera',\n",
+ " 'group',\n",
+ " 'vip',\n",
+ " 'cake',\n",
+ " 'sort',\n",
+ " 'cup',\n",
+ " 'especially',\n",
+ " 'city',\n",
+ " 'gorgeous',\n",
+ " 'clothes',\n",
+ " 'version',\n",
+ " 'finger',\n",
+ " 'band',\n",
+ " 'Everyone',\n",
+ " 'liked',\n",
+ " 'unfortunately',\n",
+ " 'beer',\n",
+ " 'shoot',\n",
+ " 'lonely',\n",
+ " 'bitch',\n",
+ " 'shoe',\n",
+ " 'singing',\n",
+ " 'interview',\n",
+ " 'random',\n",
+ " 'drop',\n",
+ " 'ppl',\n",
+ " 'yr',\n",
+ " 'gay',\n",
+ " 'sleepy',\n",
+ " 'white',\n",
+ " 'body',\n",
+ " 'fb',\n",
+ " 'Someone',\n",
+ " 'supposed',\n",
+ " 'info',\n",
+ " 'paper',\n",
+ " 'David',\n",
+ " 'Like',\n",
+ " 'Birthday',\n",
+ " 'lil',\n",
+ " 'Trying',\n",
+ " 'Awww',\n",
+ " 'ai',\n",
+ " 'shot',\n",
+ " 'Also',\n",
+ " 'special',\n",
+ " 'door',\n",
+ " 'sign',\n",
+ " 'hubby',\n",
+ " 'puppy',\n",
+ " 'plz',\n",
+ " 'alright',\n",
+ " 'shall',\n",
+ " 'thx',\n",
+ " 'save',\n",
+ " 'everybody',\n",
+ " 'OK',\n",
+ " 'arm',\n",
+ " 'bro',\n",
+ " 'learn',\n",
+ " 'web',\n",
+ " 'asked',\n",
+ " 'ive',\n",
+ " 'peep',\n",
+ " 'green',\n",
+ " 'See',\n",
+ " 'quick',\n",
+ " 'june',\n",
+ " 'sit',\n",
+ " 'K',\n",
+ " 'confused',\n",
+ " 'laugh',\n",
+ " 'promise',\n",
+ " 'Okay',\n",
+ " 'voice',\n",
+ " 'flight',\n",
+ " 'tear',\n",
+ " 'realize',\n",
+ " 'nose',\n",
+ " 'babe',\n",
+ " 'note',\n",
+ " 'relaxing',\n",
+ " 'Looks',\n",
+ " 'Ah',\n",
+ " 'future',\n",
+ " 'lesson',\n",
+ " 'em',\n",
+ " 'easy',\n",
+ " 'Hopefully',\n",
+ " 'Man',\n",
+ " 'yummy',\n",
+ " 'hold',\n",
+ " 'spend',\n",
+ " 'Life',\n",
+ " 'light',\n",
+ " 'Follow',\n",
+ " 'xoxo',\n",
+ " 'xxx',\n",
+ " 'blue',\n",
+ " 'Aww',\n",
+ " 'lazy',\n",
+ " 'smell',\n",
+ " 'Listening',\n",
+ " 'along',\n",
+ " 'English',\n",
+ " 'father',\n",
+ " 'WTF',\n",
+ " 'freaking',\n",
+ " 'Home',\n",
+ " 'except',\n",
+ " 'math',\n",
+ " 'tour',\n",
+ " 'age',\n",
+ " 'plane',\n",
+ " 'hanging',\n",
+ " 'share',\n",
+ " 'join',\n",
+ " 'wine',\n",
+ " 'OH',\n",
+ " 'download',\n",
+ " 'fact',\n",
+ " 'Gon',\n",
+ " 'bloody',\n",
+ " 'ahh',\n",
+ " 'stand',\n",
+ " 'none',\n",
+ " 'Hello',\n",
+ " 'fly',\n",
+ " 'j',\n",
+ " 'vacation',\n",
+ " 'wishing',\n",
+ " 'myspace',\n",
+ " 'Facebook',\n",
+ " 'paid',\n",
+ " 'N',\n",
+ " 'exciting',\n",
+ " 'matter',\n",
+ " 'mile',\n",
+ " 'round',\n",
+ " 'huge',\n",
+ " 'Went',\n",
+ " 'lame',\n",
+ " 'jus',\n",
+ " 'hospital',\n",
+ " 'figure',\n",
+ " 'giving',\n",
+ " 'topic',\n",
+ " 'ear',\n",
+ " 'afford',\n",
+ " 'nearly',\n",
+ " 'ended',\n",
+ " 'Come',\n",
+ " 'hahah',\n",
+ " 'air',\n",
+ " 'Mr',\n",
+ " 'proud',\n",
+ " 'whatever',\n",
+ " 'three',\n",
+ " 'lately',\n",
+ " 'kitty',\n",
+ " 'club',\n",
+ " 'XD',\n",
+ " 'f',\n",
+ " 'sing',\n",
+ " 'showing',\n",
+ " 'road',\n",
+ " 'Take',\n",
+ " 'beat',\n",
+ " 'London',\n",
+ " 'sale',\n",
+ " 'country',\n",
+ " 'gunna',\n",
+ " 'DM',\n",
+ " 'warm',\n",
+ " 'stopped',\n",
+ " 'worked',\n",
+ " 'boot',\n",
+ " 'fever',\n",
+ " 'Tuesday',\n",
+ " 'cheese',\n",
+ " 'exactly',\n",
+ " 'Star',\n",
+ " 'service',\n",
+ " 'tan',\n",
+ " 'cover',\n",
+ " 'Heading',\n",
+ " 'iPod',\n",
+ " 'Working',\n",
+ " 'low',\n",
+ " 'helping',\n",
+ " 'Mother',\n",
+ " 'type',\n",
+ " 'fair',\n",
+ " 'trouble',\n",
+ " 'honey',\n",
+ " 'g',\n",
+ " 'storm',\n",
+ " 'fam',\n",
+ " 'joke',\n",
+ " 'enjoyed',\n",
+ " 'sold',\n",
+ " 'mouth',\n",
+ " 'wearing',\n",
+ " 'depressing',\n",
+ " 'luv',\n",
+ " 'Boo',\n",
+ " 'card',\n",
+ " 'child',\n",
+ " 'delicious',\n",
+ " 'mall',\n",
+ " 'degree',\n",
+ " 'radio',\n",
+ " 'sooooo',\n",
+ " 'boyfriend',\n",
+ " 'packing',\n",
+ " 'played',\n",
+ " 'ahead',\n",
+ " 'Ha',\n",
+ " 'spot',\n",
+ " 'snow',\n",
+ " 'floor',\n",
+ " 'afraid',\n",
+ " 'bike',\n",
+ " 'happens',\n",
+ " 'young',\n",
+ " 'others',\n",
+ " 'drunk',\n",
+ " 'l',\n",
+ " 'GOOD',\n",
+ " 'Check',\n",
+ " 'REALLY',\n",
+ " 'flu',\n",
+ " 'tweeps',\n",
+ " 'safe',\n",
+ " 'yo',\n",
+ " 'mail',\n",
+ " 'indeed',\n",
+ " 'Fuck',\n",
+ " 'UK',\n",
+ " 'Guess',\n",
+ " 'piece',\n",
+ " 'changed',\n",
+ " ...]"
+ ]
+ },
+ "execution_count": 74,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# your code here"
+ "from nltk.probability import FreqDist\n",
+ "\n",
+ "all_words = [word for words in df_sampled['text_processed'] for word in words]\n",
+ "\n",
+ "freq_dist = FreqDist(all_words)\n",
+ "\n",
+ "top_5000_words = freq_dist.most_common(5000)\n",
+ "\n",
+ "print(\"Top 5,000 words example (first 10):\", top_5000_words[:100])\n",
+ "\n",
+ "top5000 = []\n",
+ "\n",
+ "for whatever in top_5000_words:\n",
+ " for name in whatever:\n",
+ " top5000.append(name)\n",
+ " \n",
+ "top5000 = top5000[::2]\n",
+ "\n",
+ "top5000\n",
+ "\n"
]
},
{
@@ -167,11 +1947,21 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+ "\n",
+ "def find_features(document):\n",
+ " words = set(document)\n",
+ " features = {word: (word in words) for word in top5000}\n",
+ " return features\n",
+ "\n",
+ "\n",
+ "documents = list(zip(df_sampled['text_processed'], df_sampled['is_positive']))\n",
+ "\n",
+ "\n",
+ "featuresets = [(find_features(doc), category) for (doc, category) in documents]"
]
},
{
@@ -210,11 +2000,17 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+ "\n",
+ "\n",
+ "random.seed(42)\n",
+ "random.shuffle(featuresets)\n",
+ "train_set, test_set = train_test_split(featuresets, test_size=0.2)\n",
+ "\n",
+ "classifier = NaiveBayesClassifier.train(train_set)"
]
},
{
@@ -230,75 +2026,42 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 77,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Accuracy: 0.7160\n",
+ "Most Informative Features\n",
+ " Ugh = True False : True = 24.9 : 1.0\n",
+ " throat = True False : True = 20.1 : 1.0\n",
+ " Poor = True False : True = 18.7 : 1.0\n",
+ " laugh = True True : False = 16.0 : 1.0\n",
+ " Welcome = True True : False = 15.3 : 1.0\n",
+ " Follow = True True : False = 14.0 : 1.0\n",
+ " sad = True False : True = 13.7 : 1.0\n",
+ " horrible = True False : True = 12.9 : 1.0\n",
+ " ugh = True False : True = 12.9 : 1.0\n",
+ " Hi = True True : False = 12.7 : 1.0\n"
+ ]
+ }
+ ],
"source": [
- "# your code here"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Bonus Question 1: Improve Model Performance\n",
- "\n",
- "If you are still not exhausted so far and want to dig deeper, try to improve your classifier performance. There are many aspects you can dig into, for example:\n",
- "\n",
- "* Improve stemming and lemmatization. Inspect your bag of words and the most important features. Are there any words you should furuther remove from analysis? You can append these words to further remove to the stop words list.\n",
+ "# your code here\n",
"\n",
- "* Remember we only used the top 5,000 features to build model? Try using different numbers of top features. The bottom line is to use as few features as you can without compromising your model performance. The fewer features you select into your model, the faster your model is trained. Then you can use a larger sample size to improve your model accuracy score."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# your code here"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Bonus Question 2: Machine Learning Pipeline\n",
+ "accuracy = nltk_accuracy(classifier, test_set)\n",
+ "print(f\"Accuracy: {accuracy:.4f}\")\n",
"\n",
- "In a new Jupyter Notebook, combine all your codes into a function (or a class). Your new function will execute the complete machine learning pipeline job by receiving the dataset location and output the classifier. This will allow you to use your function to predict the sentiment of any tweet in real time. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# your code here"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Bonus Question 3: Apache Spark\n",
"\n",
- "If you have completed the Apache Spark advanced topic lab, what you can do is to migrate your pipeline from local to a Databricks Notebook. Share your notebook with your instructor and classmates to show off your achievements!"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# your code here"
+ "classifier.show_most_informative_features(10)\n"
]
}
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -312,7 +2075,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.3"
+ "version": "3.11.5"
}
},
"nbformat": 4,