diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb index 0808166..faf7854 100644 --- a/your-code/challenge-1.ipynb +++ b/your-code/challenge-1.ipynb @@ -66,11 +66,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'ironhack s q website is'\n" + ] + }, + { + "data": { + "text/plain": [ + "'ironhack s q website is'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "# will change the last \" for a ', in order to change it back again later:\n", + "string_ironhack = \"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]')\"\n", + "\n", + "import re\n", + "\n", + "#Defining the function:\n", "def clean_up(s):\n", + " s = s.lower()\n", + " result = re.sub(r\"(?<=is)(.*)(?= )\", \"\", s)\n", + " result1 = re.sub(r\"[0-9+]\", \"\", result)\n", + " result2 = re.sub(r\"[^A-Za-z0-9]\", \" \", result1)\n", + " result3 = re.sub(r\"[0-9]\", \" \", result2)\n", + " result4 = result3.lstrip()\n", + " result5 = result4.rstrip()\n", + " return result5\n", " \"\"\"\n", " Cleans up numbers, URLs, and special characters from a string.\n", "\n", @@ -79,9 +111,22 @@ "\n", " Returns:\n", " A string that has been cleaned up.\n", - " \"\"\"" + " \"\"\"\n", + " \n", + "print(\"'ironhack s q website is'\")\n", + "clean_string = clean_up(string_ironhack)\n", + "clean_string\n", + "\n", + "#Done" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {}, @@ -101,11 +146,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to\n", + "[nltk_data] /Users/jossuebangos/nltk_data...\n", + "[nltk_data] Unzipping tokenizers/punkt.zip.\n" + ] + }, + { + "data": { + "text/plain": [ + "['ironhack', 's', 'q', 'website', 'is']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "import nltk\n", + "nltk.download('punkt')\n", + "from nltk.tokenize import word_tokenize\n", "def tokenize(s):\n", + " s = word_tokenize(s)\n", + " return s\n", + " \n", " \"\"\"\n", " Tokenize a string.\n", "\n", @@ -114,7 +185,9 @@ "\n", " Returns:\n", " A list of words as the result of tokenization.\n", - " \"\"\"" + " \"\"\"\n", + "tokenized_str = tokenize(clean_string)\n", + "tokenized_str" ] }, { @@ -145,11 +218,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to\n", + "[nltk_data] /Users/jossuebangos/nltk_data...\n", + "[nltk_data] Unzipping corpora/wordnet.zip.\n" + ] + }, + { + "data": { + "text/plain": [ + "['ironhack', 's', 'q', 'websit', 'is']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "from nltk.stem import WordNetLemmatizer\n", + "from nltk.stem import PorterStemmer\n", + "nltk.download('wordnet')\n", "def stem_and_lemmatize(l):\n", + " \n", + " #Initializing both methods:\n", + " ps = PorterStemmer() \n", + " lemmatizer = WordNetLemmatizer()\n", + " \n", + " stemmed = [ps.stem(word) for word in l]\n", + " stem_and_lemmat = [lemmatizer.lemmatize(word) for word in stemmed]\n", + " return stem_and_lemmat\n", + " #lemmatizer.lemmatize(stemmed)\n", " \"\"\"\n", " Perform stemming and lemmatization on a list of words.\n", "\n", @@ -158,7 +263,9 @@ "\n", " Returns:\n", " A list of strings after being stemmed and lemmatized.\n", - " \"\"\"" + " \"\"\"\n", + "stem_and_lemmat = stem_and_lemmatize(tokenized_str)\n", + "stem_and_lemmat" ] }, { @@ -176,11 +283,36 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to\n", + "[nltk_data] /Users/jossuebangos/nltk_data...\n", + "[nltk_data] Unzipping corpora/stopwords.zip.\n" + ] + }, + { + "data": { + "text/plain": [ + "['ironhack', 'q', 'websit']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "from nltk.corpus import stopwords\n", + "nltk.download('stopwords')\n", + "\n", "def remove_stopwords(l):\n", + " stop = stopwords.words('english')\n", + " return [word for word in l if word not in stop]\n", " \"\"\"\n", " Remove English stopwords from a list of strings.\n", "\n", @@ -189,7 +321,8 @@ "\n", " Returns:\n", " A list of strings after stop words are removed.\n", - " \"\"\"" + " \"\"\"\n", + "remove_stopwords(stem_and_lemmat)" ] }, { @@ -218,9 +351,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.8.3" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/your-code/challenge-2.ipynb b/your-code/challenge-2.ipynb index 6b0e116..97b02d1 100644 --- a/your-code/challenge-2.ipynb +++ b/your-code/challenge-2.ipynb @@ -46,11 +46,44 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "#Necessary libraries:\n", + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "# Importing the dataset, with the respective column names:\n", + "\n", + "header_list = ['target', 'id', 'date', 'flag', 'user', 'text']\n", + "tweets = pd.read_csv(r'tweets.csv', names = header_list, encoding = 'latin-1')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "#tweets.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "tweets_dummy = tweets.iloc[:20000,]\n" ] }, { @@ -74,13 +107,359 @@ "![Processed Data](data-cleaning-results.png)" ] }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
targetiddateflagusertext
001467810369Mon Apr 06 22:19:45 PDT 2009NO_QUERY_TheSpecialOne_@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D
101467810672Mon Apr 06 22:19:49 PDT 2009NO_QUERYscotthamiltonis upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
201467810917Mon Apr 06 22:19:53 PDT 2009NO_QUERYmattycus@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
301467811184Mon Apr 06 22:19:57 PDT 2009NO_QUERYElleCTFmy whole body feels itchy and like its on fire
401467811193Mon Apr 06 22:19:57 PDT 2009NO_QUERYKaroli@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.
\n", + "
" + ], + "text/plain": [ + " target id date flag \\\n", + "0 0 1467810369 Mon Apr 06 22:19:45 PDT 2009 NO_QUERY \n", + "1 0 1467810672 Mon Apr 06 22:19:49 PDT 2009 NO_QUERY \n", + "2 0 1467810917 Mon Apr 06 22:19:53 PDT 2009 NO_QUERY \n", + "3 0 1467811184 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n", + "4 0 1467811193 Mon Apr 06 22:19:57 PDT 2009 NO_QUERY \n", + "\n", + " user \\\n", + "0 _TheSpecialOne_ \n", + "1 scotthamilton \n", + "2 mattycus \n", + "3 ElleCTF \n", + "4 Karoli \n", + "\n", + " text \n", + "0 @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D \n", + "1 is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah! \n", + "2 @Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds \n", + "3 my whole body feels itchy and like its on fire \n", + "4 @nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. " + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tweets_dummy.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":3: FutureWarning: Passing a negative integer is deprecated in version 1.0 and will not be supported in future version. Instead, use None to not limit the column width.\n", + " pd.set_option('display.max_colwidth', -1) #To display the entire row.\n" + ] + }, + { + "data": { + "text/plain": [ + "0 @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D\n", + "1 is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah! \n", + "2 @Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds \n", + "3 my whole body feels itchy and like its on fire \n", + "4 @nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. \n", + "5 @Kwesidei not the whole crew \n", + "6 Need a hug \n", + "7 @LOLTrish hey long time no see! Yes.. Rains a bit ,only a bit LOL , I'm fine thanks , how's you ? \n", + "8 @Tatiana_K nope they didn't have it \n", + "9 @twittera que me muera ? \n", + "Name: text, dtype: object" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Taking a look at the text phrases:\n", + "\n", + "pd.set_option('display.max_colwidth', -1) #To display the entire row.\n", + "tweets_dummy['text'].head(10)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'switchfoot awww that s a bummer you shoulda got david carr of third day to do it d'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "abc = \"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D\"\n", + "import regex as re\n", + "\n", + "def clean_up(s):\n", + " s = s.lower()\n", + " result = re.sub(r\"http\\S+\", \"\", s)\n", + " result1 = re.sub(r\"[0-9+]\", \"\", result)\n", + " result2 = re.sub(r\"[^A-Za-z0-9]\", \" \", result1)\n", + " result3 = re.sub(r\"[0-9]\", \" \", result2)\n", + " result4 = result3.lstrip()\n", + " result5 = result4.rstrip()\n", + " return result5\n", + "\n", + "clean_string = clean_up(abc)\n", + "clean_string" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['switchfoot',\n", + " 'awww',\n", + " 'that',\n", + " 's',\n", + " 'a',\n", + " 'bummer',\n", + " 'you',\n", + " 'shoulda',\n", + " 'got',\n", + " 'david',\n", + " 'carr',\n", + " 'of',\n", + " 'third',\n", + " 'day',\n", + " 'to',\n", + " 'do',\n", + " 'it',\n", + " 'd']" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import nltk\n", + "#nltk.download('punkt')\n", + "\n", + "from nltk.tokenize import word_tokenize\n", + "def tokenize(s):\n", + " s = word_tokenize(s)\n", + " return s\n", + "\n", + "tokenized_str = tokenize(clean_string)\n", + "tokenized_str" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to\n", + "[nltk_data] /Users/jossuebangos/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n" + ] + }, + { + "data": { + "text/plain": [ + "['switchfoot',\n", + " 'awww',\n", + " 'that',\n", + " 's',\n", + " 'a',\n", + " 'bummer',\n", + " 'you',\n", + " 'shoulda',\n", + " 'got',\n", + " 'david',\n", + " 'carr',\n", + " 'of',\n", + " 'third',\n", + " 'day',\n", + " 'to',\n", + " 'do',\n", + " 'it',\n", + " 'd']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Stemming and Lemmatization:\n", + "\n", + "from nltk.stem import WordNetLemmatizer\n", + "from nltk.stem import PorterStemmer\n", + "nltk.download('wordnet')\n", + "\n", + "def stem_and_lemmatize(text):\n", + " \n", + " #Initializing both methods:\n", + " ps = PorterStemmer() \n", + " lemmatizer = WordNetLemmatizer()\n", + " \n", + " stemmed = [ps.stem(word) for word in text]\n", + " stem_and_lemmat = [lemmatizer.lemmatize(word) for word in stemmed]\n", + " return stem_and_lemmat\n", + "\n", + "stem_and_lemmat = stem_and_lemmatize(tokenized_str)\n", + "stem_and_lemmat" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to\n", + "[nltk_data] /Users/jossuebangos/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ] + }, + { + "data": { + "text/plain": [ + "['switchfoot',\n", + " 'awww',\n", + " 'bummer',\n", + " 'shoulda',\n", + " 'got',\n", + " 'david',\n", + " 'carr',\n", + " 'third',\n", + " 'day']" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Stop words:\n", + "\n", + "from nltk.corpus import stopwords\n", + "nltk.download('stopwords')\n", + "\n", + "def remove_stopwords(text):\n", + " stop = stopwords.words('english')\n", + " return [word for word in text if word not in stop]\n", + "stop_worded = remove_stopwords(stem_and_lemmat)\n", + "stop_worded" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "tweets['text_processed'] = tweets['text'].apply(clean_up).apply(tokenize).apply(stem_and_lemmatize).apply(remove_stopwords)" ] }, { @@ -102,7 +481,25 @@ "metadata": {}, "outputs": [], "source": [ - "# your code here" + "bagofwords = [word for lists in tweets['text'] for word in lists]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "freq_words = nltk.FreqDist(bagofwords)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "#freq_words.most_common(5000)" ] }, { @@ -171,9 +568,39 @@ "metadata": {}, "outputs": [], "source": [ - "# your code here" + "word_features = list(freq_words.keys())[:5000]\n", + "# Sentiment Score is 0 = negative, 2 = neutral, and 4 = positive):\n", + "documents = list(zip(tweets['text_processed'], np.where(tweets['target'] == 4, True, False)))" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#From Video & Kaggle Notebook\n", + "\n", + "def find_features(documents):\n", + " words = set(documents)\n", + " features = {}\n", + " for w in word_features:\n", + " features[w] = (w in words)\n", + "\n", + " return features\n", + "\n", + "#For all documents, saving the feature existence booleans and their respective positive or negative categories:\n", + "\n", + "featuresets = [(find_features(rev), category) for (rev, category) in documents]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {}, @@ -210,11 +637,35 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "from nltk import NaiveBayesClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "#Separating into training and testing sets:\n", + "training_set = feature_sets[int(len(featuresets)*0.8):]\n", + "testing_set = feature_sets[:int(len(featuresets)*0.8)]\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "classifier = nltk.NaiveBayesClassifier.train(training_set)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "classifier.show_most_informative_features()" ] }, { @@ -234,7 +685,7 @@ "metadata": {}, "outputs": [], "source": [ - "# your code here" + "nltk.classify.accuracy(classifier, testing_set)" ] }, { @@ -312,9 +763,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.8.3" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/your-code/wordcloud-phrases.png b/your-code/wordcloud-phrases.png new file mode 100644 index 0000000..7785b01 Binary files /dev/null and b/your-code/wordcloud-phrases.png differ