diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb
index 0808166..faf7854 100644
--- a/your-code/challenge-1.ipynb
+++ b/your-code/challenge-1.ipynb
@@ -66,11 +66,43 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "'ironhack s q website is'\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'ironhack s q website is'"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
+ "# will change the last \" for a ', in order to change it back again later:\n",
+ "string_ironhack = \"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]')\"\n",
+ "\n",
+ "import re\n",
+ "\n",
+ "#Defining the function:\n",
"def clean_up(s):\n",
+ " s = s.lower()\n",
+ " result = re.sub(r\"(?<=is)(.*)(?= )\", \"\", s)\n",
+ " result1 = re.sub(r\"[0-9+]\", \"\", result)\n",
+ " result2 = re.sub(r\"[^A-Za-z0-9]\", \" \", result1)\n",
+ " result3 = re.sub(r\"[0-9]\", \" \", result2)\n",
+ " result4 = result3.lstrip()\n",
+ " result5 = result4.rstrip()\n",
+ " return result5\n",
" \"\"\"\n",
" Cleans up numbers, URLs, and special characters from a string.\n",
"\n",
@@ -79,9 +111,22 @@
"\n",
" Returns:\n",
" A string that has been cleaned up.\n",
- " \"\"\""
+ " \"\"\"\n",
+ " \n",
+ "print(\"'ironhack s q website is'\")\n",
+ "clean_string = clean_up(string_ironhack)\n",
+ "clean_string\n",
+ "\n",
+ "#Done"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -101,11 +146,37 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package punkt to\n",
+ "[nltk_data] /Users/jossuebangos/nltk_data...\n",
+ "[nltk_data] Unzipping tokenizers/punkt.zip.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "['ironhack', 's', 'q', 'website', 'is']"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
+ "import nltk\n",
+ "nltk.download('punkt')\n",
+ "from nltk.tokenize import word_tokenize\n",
"def tokenize(s):\n",
+ " s = word_tokenize(s)\n",
+ " return s\n",
+ " \n",
" \"\"\"\n",
" Tokenize a string.\n",
"\n",
@@ -114,7 +185,9 @@
"\n",
" Returns:\n",
" A list of words as the result of tokenization.\n",
- " \"\"\""
+ " \"\"\"\n",
+ "tokenized_str = tokenize(clean_string)\n",
+ "tokenized_str"
]
},
{
@@ -145,11 +218,43 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package wordnet to\n",
+ "[nltk_data] /Users/jossuebangos/nltk_data...\n",
+ "[nltk_data] Unzipping corpora/wordnet.zip.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "['ironhack', 's', 'q', 'websit', 'is']"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
+ "from nltk.stem import WordNetLemmatizer\n",
+ "from nltk.stem import PorterStemmer\n",
+ "nltk.download('wordnet')\n",
"def stem_and_lemmatize(l):\n",
+ " \n",
+ " #Initializing both methods:\n",
+ " ps = PorterStemmer() \n",
+ " lemmatizer = WordNetLemmatizer()\n",
+ " \n",
+ " stemmed = [ps.stem(word) for word in l]\n",
+ " stem_and_lemmat = [lemmatizer.lemmatize(word) for word in stemmed]\n",
+ " return stem_and_lemmat\n",
+ " #lemmatizer.lemmatize(stemmed)\n",
" \"\"\"\n",
" Perform stemming and lemmatization on a list of words.\n",
"\n",
@@ -158,7 +263,9 @@
"\n",
" Returns:\n",
" A list of strings after being stemmed and lemmatized.\n",
- " \"\"\""
+ " \"\"\"\n",
+ "stem_and_lemmat = stem_and_lemmatize(tokenized_str)\n",
+ "stem_and_lemmat"
]
},
{
@@ -176,11 +283,36 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 5,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package stopwords to\n",
+ "[nltk_data] /Users/jossuebangos/nltk_data...\n",
+ "[nltk_data] Unzipping corpora/stopwords.zip.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "['ironhack', 'q', 'websit']"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
+ "from nltk.corpus import stopwords\n",
+ "nltk.download('stopwords')\n",
+ "\n",
"def remove_stopwords(l):\n",
+ " stop = stopwords.words('english')\n",
+ " return [word for word in l if word not in stop]\n",
" \"\"\"\n",
" Remove English stopwords from a list of strings.\n",
"\n",
@@ -189,7 +321,8 @@
"\n",
" Returns:\n",
" A list of strings after stop words are removed.\n",
- " \"\"\""
+ " \"\"\"\n",
+ "remove_stopwords(stem_and_lemmat)"
]
},
{
@@ -218,9 +351,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.3"
+ "version": "3.8.3"
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/your-code/challenge-2.ipynb b/your-code/challenge-2.ipynb
index 6b0e116..97b02d1 100644
--- a/your-code/challenge-2.ipynb
+++ b/your-code/challenge-2.ipynb
@@ -46,11 +46,44 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+ "#Necessary libraries:\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import seaborn as sns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Importing the dataset, with the respective column names:\n",
+ "\n",
+ "header_list = ['target', 'id', 'date', 'flag', 'user', 'text']\n",
+ "tweets = pd.read_csv(r'tweets.csv', names = header_list, encoding = 'latin-1')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#tweets.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tweets_dummy = tweets.iloc[:20000,]\n"
]
},
{
@@ -74,13 +107,359 @@
""
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " target | \n",
+ " id | \n",
+ " date | \n",
+ " flag | \n",
+ " user | \n",
+ " text | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 1467810369 | \n",
+ " Mon Apr 06 22:19:45 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " _TheSpecialOne_ | \n",
+ " @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0 | \n",
+ " 1467810672 | \n",
+ " Mon Apr 06 22:19:49 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " scotthamilton | \n",
+ " is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah! | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0 | \n",
+ " 1467810917 | \n",
+ " Mon Apr 06 22:19:53 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " mattycus | \n",
+ " @Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0 | \n",
+ " 1467811184 | \n",
+ " Mon Apr 06 22:19:57 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " ElleCTF | \n",
+ " my whole body feels itchy and like its on fire | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0 | \n",
+ " 1467811193 | \n",
+ " Mon Apr 06 22:19:57 PDT 2009 | \n",
+ " NO_QUERY | \n",
+ " Karoli | \n",
+ " @nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " target id date flag \\\n",
+ "0 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY \n",
+ "1 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY \n",
+ "2 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY \n",
+ "3 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n",
+ "4 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n",
+ "\n",
+ " user \\\n",
+ "0 _TheSpecialOne_ \n",
+ "1 scotthamilton \n",
+ "2 mattycus \n",
+ "3 ElleCTF \n",
+ "4 Karoli \n",
+ "\n",
+ " text \n",
+ "0 @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D \n",
+ "1 is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah! \n",
+ "2 @Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds \n",
+ "3 my whole body feels itchy and like its on fire \n",
+ "4 @nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. "
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tweets_dummy.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ ":3: FutureWarning: Passing a negative integer is deprecated in version 1.0 and will not be supported in future version. Instead, use None to not limit the column width.\n",
+ " pd.set_option('display.max_colwidth', -1) #To display the entire row.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0 @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D\n",
+ "1 is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah! \n",
+ "2 @Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds \n",
+ "3 my whole body feels itchy and like its on fire \n",
+ "4 @nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. \n",
+ "5 @Kwesidei not the whole crew \n",
+ "6 Need a hug \n",
+ "7 @LOLTrish hey long time no see! Yes.. Rains a bit ,only a bit LOL , I'm fine thanks , how's you ? \n",
+ "8 @Tatiana_K nope they didn't have it \n",
+ "9 @twittera que me muera ? \n",
+ "Name: text, dtype: object"
+ ]
+ },
+ "execution_count": 40,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Taking a look at the text phrases:\n",
+ "\n",
+    "pd.set_option('display.max_colwidth', None) #To display the entire row.\n",
+ "tweets_dummy['text'].head(10)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'switchfoot awww that s a bummer you shoulda got david carr of third day to do it d'"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "abc = \"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D\"\n",
+ "import regex as re\n",
+ "\n",
+ "def clean_up(s):\n",
+ " s = s.lower()\n",
+ " result = re.sub(r\"http\\S+\", \"\", s)\n",
+ " result1 = re.sub(r\"[0-9+]\", \"\", result)\n",
+ " result2 = re.sub(r\"[^A-Za-z0-9]\", \" \", result1)\n",
+ " result3 = re.sub(r\"[0-9]\", \" \", result2)\n",
+ " result4 = result3.lstrip()\n",
+ " result5 = result4.rstrip()\n",
+ " return result5\n",
+ "\n",
+ "clean_string = clean_up(abc)\n",
+ "clean_string"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['switchfoot',\n",
+ " 'awww',\n",
+ " 'that',\n",
+ " 's',\n",
+ " 'a',\n",
+ " 'bummer',\n",
+ " 'you',\n",
+ " 'shoulda',\n",
+ " 'got',\n",
+ " 'david',\n",
+ " 'carr',\n",
+ " 'of',\n",
+ " 'third',\n",
+ " 'day',\n",
+ " 'to',\n",
+ " 'do',\n",
+ " 'it',\n",
+ " 'd']"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import nltk\n",
+ "#nltk.download('punkt')\n",
+ "\n",
+ "from nltk.tokenize import word_tokenize\n",
+ "def tokenize(s):\n",
+ " s = word_tokenize(s)\n",
+ " return s\n",
+ "\n",
+ "tokenized_str = tokenize(clean_string)\n",
+ "tokenized_str"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package wordnet to\n",
+ "[nltk_data] /Users/jossuebangos/nltk_data...\n",
+ "[nltk_data] Package wordnet is already up-to-date!\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "['switchfoot',\n",
+ " 'awww',\n",
+ " 'that',\n",
+ " 's',\n",
+ " 'a',\n",
+ " 'bummer',\n",
+ " 'you',\n",
+ " 'shoulda',\n",
+ " 'got',\n",
+ " 'david',\n",
+ " 'carr',\n",
+ " 'of',\n",
+ " 'third',\n",
+ " 'day',\n",
+ " 'to',\n",
+ " 'do',\n",
+ " 'it',\n",
+ " 'd']"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Stemming and Lemmatization:\n",
+ "\n",
+ "from nltk.stem import WordNetLemmatizer\n",
+ "from nltk.stem import PorterStemmer\n",
+ "nltk.download('wordnet')\n",
+ "\n",
+ "def stem_and_lemmatize(text):\n",
+ " \n",
+ " #Initializing both methods:\n",
+ " ps = PorterStemmer() \n",
+ " lemmatizer = WordNetLemmatizer()\n",
+ " \n",
+ " stemmed = [ps.stem(word) for word in text]\n",
+ " stem_and_lemmat = [lemmatizer.lemmatize(word) for word in stemmed]\n",
+ " return stem_and_lemmat\n",
+ "\n",
+ "stem_and_lemmat = stem_and_lemmatize(tokenized_str)\n",
+ "stem_and_lemmat"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package stopwords to\n",
+ "[nltk_data] /Users/jossuebangos/nltk_data...\n",
+ "[nltk_data] Package stopwords is already up-to-date!\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "['switchfoot',\n",
+ " 'awww',\n",
+ " 'bummer',\n",
+ " 'shoulda',\n",
+ " 'got',\n",
+ " 'david',\n",
+ " 'carr',\n",
+ " 'third',\n",
+ " 'day']"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Stop words:\n",
+ "\n",
+ "from nltk.corpus import stopwords\n",
+ "nltk.download('stopwords')\n",
+ "\n",
+ "def remove_stopwords(text):\n",
+ " stop = stopwords.words('english')\n",
+ " return [word for word in text if word not in stop]\n",
+ "stop_worded = remove_stopwords(stem_and_lemmat)\n",
+ "stop_worded"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+ "tweets['text_processed'] = tweets['text'].apply(clean_up).apply(tokenize).apply(stem_and_lemmatize).apply(remove_stopwords)"
]
},
{
@@ -102,7 +481,25 @@
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+    "bagofwords = [word for lists in tweets['text_processed'] for word in lists]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "freq_words = nltk.FreqDist(bagofwords)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#freq_words.most_common(5000)"
]
},
{
@@ -171,9 +568,39 @@
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+ "word_features = list(freq_words.keys())[:5000]\n",
+ "# Sentiment Score is 0 = negative, 2 = neutral, and 4 = positive):\n",
+ "documents = list(zip(tweets['text_processed'], np.where(tweets['target'] == 4, True, False)))"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#From Video & Kaggle Notebook\n",
+ "\n",
+ "def find_features(documents):\n",
+ " words = set(documents)\n",
+ " features = {}\n",
+ " for w in word_features:\n",
+ " features[w] = (w in words)\n",
+ "\n",
+ " return features\n",
+ "\n",
+ "#For all documents, saving the feature existence booleans and their respective positive or negative categories:\n",
+ "\n",
+ "featuresets = [(find_features(rev), category) for (rev, category) in documents]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -210,11 +637,35 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+ "from nltk import NaiveBayesClassifier\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "\n",
+ "#Separating into training and testing sets:\n",
+    "training_set = featuresets[:int(len(featuresets)*0.8)]\n",
+    "testing_set = featuresets[int(len(featuresets)*0.8):]\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "classifier = nltk.NaiveBayesClassifier.train(training_set)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "classifier.show_most_informative_features()"
]
},
{
@@ -234,7 +685,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# your code here"
+ "nltk.classify.accuracy(classifier, testing_set)"
]
},
{
@@ -312,9 +763,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.3"
+ "version": "3.8.3"
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/your-code/wordcloud-phrases.png b/your-code/wordcloud-phrases.png
new file mode 100644
index 0000000..7785b01
Binary files /dev/null and b/your-code/wordcloud-phrases.png differ