From 91ca0fc840593be994aacc7f41b216da95aed8a2 Mon Sep 17 00:00:00 2001 From: joanitamateus Date: Sun, 1 Aug 2021 23:14:47 +0100 Subject: [PATCH] joanita --- your-code/challenge-1.ipynb | 129 +++++++++++++++++++++++++--- your-code/challenge-2.ipynb | 165 +++++++++++++++++++++++++++++++++--- 2 files changed, 271 insertions(+), 23 deletions(-) diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb index 0808166..7350a39 100644 --- a/your-code/challenge-1.ipynb +++ b/your-code/challenge-1.ipynb @@ -66,9 +66,58 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to\n", + "[nltk_data] C:\\Users\\joani\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n", + "[nltk_data] Downloading package stopwords to\n", + "[nltk_data] C:\\Users\\joani\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Unzipping corpora\\stopwords.zip.\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import re \n", + "import nltk\n", + "from nltk.tokenize import word_tokenize\n", + "nltk.download('punkt')\n", + "from nltk.stem import WordNetLemmatizer\n", + "from nltk.corpus import stopwords\n", + "nltk.download('stopwords')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'ironhack s q website is'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def clean_up(s):\n", " \"\"\"\n", @@ -79,7 +128,13 @@ "\n", " Returns:\n", " A string that has been cleaned up.\n", - " \"\"\"" + " \"\"\"\n", + " string = re.sub(r'http\\S+', '', s)\n", + " return re.sub('[^A-Za-z]+', ' ', string).lower().strip()\n", + "test = \"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\"\n", + " \n", + "test_string = clean_up(test)\n", + "test_string" ] }, { @@ -101,9 +156,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['ironhack', 's', 'q', 'website', 'is']" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def tokenize(s):\n", " \"\"\"\n", @@ -114,7 +180,11 @@ "\n", " Returns:\n", " A list of words as the result of tokenization.\n", - " \"\"\"" + " \"\"\"\n", + " return nltk.word_tokenize(s)\n", + "\n", + "test_string = tokenize(test_string)\n", + "test_string" ] }, { @@ -145,9 +215,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "SyntaxError", + "evalue": "invalid syntax (, line 21)", + "output_type": "error", + "traceback": [ + "\u001b[1;36m File \u001b[1;32m\"\"\u001b[1;36m, line \u001b[1;32m21\u001b[0m\n\u001b[1;33m reTurn l\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n" + ] + } + ], "source": [ "def stem_and_lemmatize(l):\n", " \"\"\"\n", @@ -158,7 +237,18 @@ "\n", " Returns:\n", " A list of strings after being stemmed and lemmatized.\n", - " \"\"\"" + " \"\"\"\n", + " ps = nltk.PorterStemmer()\n", + " lemmatizer = nltk.WordNetLemmatizer()\n", + " l = []\n", + " \n", + " for w in l:\n", + " s = ps.stem(w)\n", + " s = lemmatizer.lemmatize(s)\n", + " \n", + " l += [s]\n", + " \n", + "reTurn l" ] }, { @@ -176,9 +266,17 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 22, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ironhack q website\n" + ] + } + ], "source": [ "def remove_stopwords(l):\n", " \"\"\"\n", @@ -189,7 +287,12 @@ "\n", " Returns:\n", " A list of strings after stop words are removed.\n", - " \"\"\"" + " \"\"\"\n", + " stop_words = stopwords.words('english')\n", + " \n", + " return ' '.join([w for w in l if w not in stop_words])\n", + "\n", + "print(remove_stopwords(test_string))" ] }, { @@ -218,7 +321,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.9.6" } }, "nbformat": 4, diff --git a/your-code/challenge-2.ipynb b/your-code/challenge-2.ipynb index 6b0e116..28df31e 100644 --- a/your-code/challenge-2.ipynb +++ b/your-code/challenge-2.ipynb @@ -46,13 +46,128 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "# your code here\n", + "import pandas as pd\n", + "import nltk\n", + "from nltk.stem import WordNetLemmatizer \n", + "from nltk.corpus import stopwords\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "import numpy as np \n", + "from nltk.probability import ConditionalFreqDist\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "ename": "SyntaxError", + "evalue": "cannot assign to literal (, line 39)", + "output_type": "error", + "traceback": [ + "\u001b[1;36m File \u001b[1;32m\"\"\u001b[1;36m, line \u001b[1;32m39\u001b[0m\n\u001b[1;33m 12 = []\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m cannot assign to literal\n" + ] + } + ], + "source": [ + "def clean_up(s):\n", + " \"\"\"\n", + " Cleans up numbers, URLs, and special characters from a string.\n", + "\n", + " Args:\n", + " s: The string to be cleaned up.\n", + "\n", + " Returns:\n", + " A string that has been cleaned up.\n", + " \"\"\"\n", + " string = re.sub(r'http\\S+', '', s)\n", + " return re.sub('[^A-Za-z]+', ' ', string).lower().strip()\n", + "\n", + "def tokenize(s):\n", + " \"\"\"\n", + " Tokenize a string.\n", + "\n", + " Args:\n", + " s: String to be tokenized.\n", + "\n", + " Returns:\n", + " A list of words as the result of tokenization.\n", + " \"\"\"\n", + " return nltk.word_tokenize(s)\n", + "\n", + "def stem_and_lemmatize(l):\n", + " \"\"\"\n", + " Perform stemming and lemmatization on a list of words.\n", + "\n", + " Args:\n", + " l: A list of strings.\n", + "\n", + " Returns:\n", + " A list of strings after being stemmed and lemmatized.\n", + " \"\"\"\n", + " ps = nltk.PorterStemmer()\n", + " lemmatizer = nltk.WordNetLemmatizer()\n", + "\n", + " 12 = []\n", + " \n", + " for w in l:\n", + " s = ps.stem(w)\n", + " s = lemmatizer.lemmatize(s)\n", + " \n", + " 12 += [s]\n", + " \n", + " reTurn 12 \n", + " \n", + "\n", + "def remove_stopwords(l):\n", + " \"\"\"\n", + " Remove English stopwords from a list of strings.\n", + "\n", + " Args:\n", + " l: A list of strings.\n", + "\n", + " Returns:\n", + " A list of strings after stop words are removed.\n", + " \"\"\"\n", + " stop_words = stopwords.words('english')\n", + " \n", + " return ' '.join([w for w in l if w not in stop_words])\n", + "\n", + "print(remove_stopwords(test_string))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "sentimentt = pd.read_csv('C:\\\\Users\\\\joani\\\\Downloads\\\\archive (1)\\\\training.1600000.processed.noemoticon.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "sample = sentimentt.sample(20000)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {}, @@ -80,7 +195,9 @@ "metadata": {}, "outputs": [], "source": [ - "# your code here" + "# your code here\n", + "sample['text_processed'] = sample['text'].apply(clean_up).apply(tokenize).apply(steam_and_Lemmatize.apply(remove_stopwprds))\n", + "sample" ] }, { @@ -102,7 +219,14 @@ "metadata": {}, "outputs": [], "source": [ - "# your code here" + "# your code here\n", + "cfdist = nltk.FreqDist()\n", + "\n", + "for tweet:\n", + " cfdist[word] += 1\n", + " \n", + "top_words = list(cfdist.keys())[:5000]\n", + "top_words" ] }, { @@ -171,7 +295,24 @@ "metadata": {}, "outputs": [], "source": [ - "# your code here" + "# your code here\n", + "def find_features(document):\n", + " words = set(document)\n", + " features = {}\n", + " for w in top_words:\n", + " features[w] = (w in words)\n", + " \n", + " return features " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "feature_sets = [(find_features(tweet), target) for (tweet, targe) in list(zip(sample['text_processed'], sample['target']))]\n", + "print(len(feature_sets))" ] }, { @@ -210,11 +351,13 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "# your code here\n", + "train_set, test_set = feature_sets[:10000], feature_sets[10000:]\n", + "classifier = ntlk.NaiveBayesClassifier.train(train_set)" ] }, { @@ -230,11 +373,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "# your code here" + "# your code here\n", + "print(nltk.classify.accuracy(classifier, test_set))\n", + "classifier.show_most_informative_features(5)" ] }, { @@ -312,7 +457,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.9.6" } }, "nbformat": 4,