From 91ca0fc840593be994aacc7f41b216da95aed8a2 Mon Sep 17 00:00:00 2001
From: joanitamateus <joanitaduarte@sapo.pt>
Date: Sun, 1 Aug 2021 23:14:47 +0100
Subject: [PATCH] joanita

---
 your-code/challenge-1.ipynb | 129 +++++++++++++++++++++++++---
 your-code/challenge-2.ipynb | 165 +++++++++++++++++++++++++++++++++---
 2 files changed, 271 insertions(+), 23 deletions(-)

diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb
index 0808166..7350a39 100644
--- a/your-code/challenge-1.ipynb
+++ b/your-code/challenge-1.ipynb
@@ -66,9 +66,58 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 21,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package punkt to\n",
+      "[nltk_data]     C:\\Users\\joani\\AppData\\Roaming\\nltk_data...\n",
+      "[nltk_data]   Package punkt is already up-to-date!\n",
+      "[nltk_data] Downloading package stopwords to\n",
+      "[nltk_data]     C:\\Users\\joani\\AppData\\Roaming\\nltk_data...\n",
+      "[nltk_data]   Unzipping corpora\\stopwords.zip.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import re \n",
+    "import nltk\n",
+    "from nltk.tokenize import word_tokenize\n",
+    "nltk.download('punkt')\n",
+    "from nltk.stem import WordNetLemmatizer\n",
+    "from nltk.corpus import stopwords\n",
+    "nltk.download('stopwords')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'ironhack s q website is'"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "def clean_up(s):\n",
     "    \"\"\"\n",
@@ -79,7 +128,13 @@
     "\n",
     "    Returns:\n",
     "        A string that has been cleaned up.\n",
-    "    \"\"\""
+    "    \"\"\"\n",
+    "    string = re.sub(r'http\\S+', '', s)\n",
+    "    return re.sub('[^A-Za-z]+', ' ', string).lower().strip()\n",
+    "test = \"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\"\n",
+    "                     \n",
+    "test_string = clean_up(test)\n",
+    "test_string"
    ]
   },
   {
@@ -101,9 +156,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['ironhack', 's', 'q', 'website', 'is']"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "def tokenize(s):\n",
     "    \"\"\"\n",
@@ -114,7 +180,11 @@
     "\n",
     "    Returns:\n",
     "        A list of words as the result of tokenization.\n",
-    "    \"\"\""
+    "    \"\"\"\n",
+    "    return nltk.word_tokenize(s)\n",
+    "\n",
+    "test_string = tokenize(test_string)\n",
+    "test_string"
    ]
   },
   {
@@ -145,9 +215,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "SyntaxError",
+     "evalue": "invalid syntax (<ipython-input-2-90172a743b82>, line 21)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;36m  File \u001b[1;32m\"<ipython-input-2-90172a743b82>\"\u001b[1;36m, line \u001b[1;32m21\u001b[0m\n\u001b[1;33m    reTurn l\u001b[0m\n\u001b[1;37m           ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n"
+     ]
+    }
+   ],
    "source": [
     "def stem_and_lemmatize(l):\n",
     "    \"\"\"\n",
@@ -158,7 +237,18 @@
     "\n",
     "    Returns:\n",
     "        A list of strings after being stemmed and lemmatized.\n",
-    "    \"\"\""
+    "    \"\"\"\n",
+    "    ps = nltk.PorterStemmer()\n",
+    "    lemmatizer = nltk.WordNetLemmatizer()\n",
+    "    result = []\n",
+    "    \n",
+    "    for w in l:\n",
+    "        s = ps.stem(w)\n",
+    "        s = lemmatizer.lemmatize(s)\n",
+    "        \n",
+    "        result.append(s)\n",
+    "        \n",
+    "    return result"
    ]
   },
   {
@@ -176,9 +266,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 22,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['ironhack', 'q', 'website']\n"
+     ]
+    }
+   ],
    "source": [
     "def remove_stopwords(l):\n",
     "    \"\"\"\n",
@@ -189,7 +287,12 @@
     "\n",
     "    Returns:\n",
     "        A list of strings after stop words are removed.\n",
-    "    \"\"\""
+    "    \"\"\"\n",
+    "    stop_words = set(stopwords.words('english'))\n",
+    "    \n",
+    "    return [w for w in l if w not in stop_words]\n",
+    "\n",
+    "print(remove_stopwords(test_string))"
    ]
   },
   {
@@ -218,7 +321,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.3"
+   "version": "3.9.6"
   }
  },
  "nbformat": 4,
diff --git a/your-code/challenge-2.ipynb b/your-code/challenge-2.ipynb
index 6b0e116..28df31e 100644
--- a/your-code/challenge-2.ipynb
+++ b/your-code/challenge-2.ipynb
@@ -46,13 +46,128 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# your code here"
+    "# your code here\n",
+    "import pandas as pd\n",
+    "import nltk\n",
+    "from nltk.stem import WordNetLemmatizer \n",
+    "from nltk.corpus import stopwords\n",
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "import numpy as np \n",
+    "from nltk.probability import ConditionalFreqDist\n",
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "SyntaxError",
+     "evalue": "cannot assign to literal (<ipython-input-5-c692506fd993>, line 39)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;36m  File \u001b[1;32m\"<ipython-input-5-c692506fd993>\"\u001b[1;36m, line \u001b[1;32m39\u001b[0m\n\u001b[1;33m    12 = []\u001b[0m\n\u001b[1;37m    ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m cannot assign to literal\n"
+     ]
+    }
+   ],
+   "source": [
+    "def clean_up(s):\n",
+    "    \"\"\"\n",
+    "    Cleans up numbers, URLs, and special characters from a string.\n",
+    "\n",
+    "    Args:\n",
+    "        s: The string to be cleaned up.\n",
+    "\n",
+    "    Returns:\n",
+    "        A string that has been cleaned up.\n",
+    "    \"\"\"\n",
+    "    string = re.sub(r'http\\S+', '', s)\n",
+    "    return re.sub('[^A-Za-z]+', ' ', string).lower().strip()\n",
+    "\n",
+    "def tokenize(s):\n",
+    "    \"\"\"\n",
+    "    Tokenize a string.\n",
+    "\n",
+    "    Args:\n",
+    "        s: String to be tokenized.\n",
+    "\n",
+    "    Returns:\n",
+    "        A list of words as the result of tokenization.\n",
+    "    \"\"\"\n",
+    "    return nltk.word_tokenize(s)\n",
+    "\n",
+    "def stem_and_lemmatize(l):\n",
+    "    \"\"\"\n",
+    "    Perform stemming and lemmatization on a list of words.\n",
+    "\n",
+    "    Args:\n",
+    "        l: A list of strings.\n",
+    "\n",
+    "    Returns:\n",
+    "        A list of strings after being stemmed and lemmatized.\n",
+    "    \"\"\"\n",
+    "    ps = nltk.PorterStemmer()\n",
+    "    lemmatizer = nltk.WordNetLemmatizer()\n",
+    "\n",
+    "    result = []\n",
+    "    \n",
+    "    for w in l:\n",
+    "        s = ps.stem(w)\n",
+    "        s = lemmatizer.lemmatize(s)\n",
+    "        \n",
+    "        result.append(s)\n",
+    "        \n",
+    "    return result\n",
+    "        \n",
+    "\n",
+    "def remove_stopwords(l):\n",
+    "    \"\"\"\n",
+    "    Remove English stopwords from a list of strings.\n",
+    "\n",
+    "    Args:\n",
+    "        l: A list of strings.\n",
+    "\n",
+    "    Returns:\n",
+    "        A list of strings after stop words are removed.\n",
+    "    \"\"\"\n",
+    "    stop_words = set(stopwords.words('english'))\n",
+    "    \n",
+    "    return [w for w in l if w not in stop_words]\n",
+    "\n",
+    "print(remove_stopwords(['ironhack', 's', 'q', 'website', 'is']))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sentimentt = pd.read_csv('C:\\\\Users\\\\joani\\\\Downloads\\\\archive (1)\\\\training.1600000.processed.noemoticon.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sample = sentimentt.sample(20000)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -80,7 +195,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# your code here"
+    "# your code here\n",
+    "sample['text_processed'] = sample['text'].apply(clean_up).apply(tokenize).apply(stem_and_lemmatize).apply(remove_stopwords)\n",
+    "sample"
    ]
   },
   {
@@ -102,7 +219,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# your code here"
+    "# your code here\n",
+    "cfdist = nltk.FreqDist()\n",
+    "\n",
+    "for tweet in sample['text_processed']:\n",
+    "    cfdist.update(tweet)\n",
+    "    \n",
+    "top_words = [word for word, _ in cfdist.most_common(5000)]\n",
+    "top_words"
    ]
   },
   {
@@ -171,7 +295,24 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# your code here"
+    "# your code here\n",
+    "def find_features(document):\n",
+    "    words = set(document)\n",
+    "    features = {}\n",
+    "    for w in top_words:\n",
+    "        features[w] = (w in words)\n",
+    "        \n",
+    "    return features "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "feature_sets = [(find_features(tweet), target) for tweet, target in zip(sample['text_processed'], sample['target'])]\n",
+    "print(len(feature_sets))"
    ]
   },
   {
@@ -210,11 +351,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# your code here"
+    "# your code here\n",
+    "train_set, test_set = feature_sets[:10000], feature_sets[10000:]\n",
+    "classifier = nltk.NaiveBayesClassifier.train(train_set)"
    ]
   },
   {
@@ -230,11 +373,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# your code here"
+    "# your code here\n",
+    "print(nltk.classify.accuracy(classifier, test_set))\n",
+    "classifier.show_most_informative_features(5)"
    ]
   },
   {
@@ -312,7 +457,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.3"
+   "version": "3.9.6"
   }
  },
  "nbformat": 4,