1 change: 1 addition & 0 deletions module-3/natural-language-processing/.gitignore
@@ -0,0 +1 @@
data/
152 changes: 137 additions & 15 deletions module-3/natural-language-processing/your-code/challenge-1.ipynb
@@ -66,10 +66,12 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"def clean_up(s):\n",
" \"\"\"\n",
" Cleans up numbers, URLs, and special characters from a string.\n",
@@ -79,7 +81,40 @@
"\n",
" Returns:\n",
" A string that has been cleaned up.\n",
" \"\"\""
" \"\"\"\n",
" # Replace urls\n",
" s = re.sub(r'https?://[^\\s]*', ' ', s)\n",
" \n",
" # Replace numbers and special characters\n",
" s = re.sub(r'[\\W\\d]', ' ', s)\n",
" \n",
" return s.lower()"
]
},
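One thing the `[\W\d]` class above does not catch is the underscore, since `\W` treats it as a word character. If underscores should also be stripped, a stricter variant (a minimal sketch, not part of the original solution; `clean_up_strict` is a hypothetical name) is:

    import re

    def clean_up_strict(s):
        # hypothetical variant: drop URLs first, then keep only
        # lowercase letters and whitespace (underscores go too)
        s = re.sub(r'https?://\S*', ' ', s)
        return re.sub(r'[^a-z\s]', ' ', s.lower())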
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ironhack s q website is \n"
]
}
],
"source": [
"# @Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\")\n",
"# Expected result:\n",
"# ironhack s q website is\n",
"\n",
"text_to_clean = \"\"\"\n",
"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\")\n",
"\"\"\"\n",
"\n",
"clean_text = clean_up(text_to_clean)\n",
"print(clean_text)"
]
},
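The test covers the expected string, but note that the URL pattern only matches links with an explicit scheme; a bare domain slips through and only its dots get replaced. A quick check (hypothetical input, same function):

    # bare domains are not matched by the https?:// pattern;
    # clean_up only strips their punctuation
    print(clean_up("see www.example.com for details"))
    # -> 'see www example com for details'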
{
@@ -101,10 +136,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 21,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['ironhack', 's', 'q', 'website', 'is']\n"
]
}
],
"source": [
"from nltk import word_tokenize\n",
"\n",
"def tokenize(s):\n",
" \"\"\"\n",
" Tokenize a string.\n",
@@ -114,7 +159,12 @@
"\n",
" Returns:\n",
" A list of words as the result of tokenization.\n",
" \"\"\""
" \"\"\"\n",
" return word_tokenize(s)\n",
"\n",
"tokenized_text = tokenize(clean_text)\n",
"\n",
"print(tokenized_text)"
]
},
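`word_tokenize` relies on NLTK's pretrained Punkt tokenizer models, so on a fresh environment this cell can raise a `LookupError` until they are downloaded (a one-time setup step, analogous to the `wordnet` download below; on recent NLTK releases the resource may be named `punkt_tab` instead):

    import nltk
    nltk.download('punkt')  # required by nltk.word_tokenize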
{
@@ -145,10 +195,51 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 32,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package wordnet to /home/raul/nltk_data...\n",
"[nltk_data] Unzipping corpora/wordnet.zip.\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# This download is needed for the lemmatization process in the next cell\n",
"import nltk\n",
"nltk.download('wordnet')"
]
},
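Depending on the NLTK version, the lemmatizer may also ask for the Open Multilingual WordNet data; if the next cell raises a `LookupError` mentioning `omw-1.4`, the same download pattern applies:

    import nltk
    nltk.download('omw-1.4')  # only needed if WordNetLemmatizer complains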
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['ironhack', 's', 'q', 'websit', 'is']\n"
]
}
],
"source": [
"from nltk.stem.snowball import SnowballStemmer\n",
"from nltk.stem import WordNetLemmatizer\n",
"\n",
"def stem_and_lemmatize(l):\n",
" \"\"\"\n",
" Perform stemming and lemmatization on a list of words.\n",
@@ -158,7 +249,22 @@
"\n",
" Returns:\n",
" A list of strings after being stemmed and lemmatized.\n",
" \"\"\""
" \"\"\"\n",
" \n",
" # stemming\n",
" stemmer = SnowballStemmer(\"english\")\n",
" stemmed_list = [stemmer.stem(i) for i in l]\n",
" \n",
" # lemmatizing\n",
" lemmatizer = WordNetLemmatizer()\n",
" stemmed_and_lemmatized_list = [lemmatizer.lemmatize(i) for i in stemmed_list]\n",
" \n",
" # return the result string from the process of stemming + lemmatizing\n",
" return stemmed_and_lemmatized_list\n",
"\n",
"\n",
"stemmed_and_lemmatized_text = stem_and_lemmatize(tokenized_text)\n",
"print(stemmed_and_lemmatized_text)"
]
},
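Because stemming runs first, the lemmatizer mostly sees fragments like 'websit' that are not in WordNet, so the second pass is usually a no-op here. A minimal sketch of the reversed order, where the lemmatizer gets real dictionary words (same libraries; `lemmatize_then_stem` is a hypothetical helper name):

    from nltk.stem import WordNetLemmatizer
    from nltk.stem.snowball import SnowballStemmer

    def lemmatize_then_stem(l):
        # lemmatize first so WordNet sees real words, then stem the result
        lemmatizer = WordNetLemmatizer()
        stemmer = SnowballStemmer("english")
        return [stemmer.stem(lemmatizer.lemmatize(w)) for w in l]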
{
@@ -176,10 +282,20 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 36,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['ironhack', 'q', 'websit']\n"
]
}
],
"source": [
"from nltk.corpus import stopwords\n",
"\n",
"def remove_stopwords(l):\n",
" \"\"\"\n",
" Remove English stopwords from a list of strings.\n",
@@ -189,7 +305,13 @@
"\n",
" Returns:\n",
" A list of strings after stop words are removed.\n",
" \"\"\""
" \"\"\"\n",
" stop_words = set(stopwords.words('english'))\n",
" return [w for w in l if w not in stop_words]\n",
"\n",
"text_without_stop_words = remove_stopwords(stemmed_and_lemmatized_text)\n",
"\n",
"print(text_without_stop_words)"
]
},
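`stopwords.words('english')` reads from the NLTK stopwords corpus, another one-time download on a clean machine; wrapping the list in a `set` keeps each membership test O(1):

    import nltk
    nltk.download('stopwords')  # required by nltk.corpus.stopwords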
{
@@ -204,9 +326,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python [conda env:ironhack_env]",
"language": "python",
"name": "python3"
"name": "conda-env-ironhack_env-py"
},
"language_info": {
"codemirror_mode": {
@@ -218,9 +340,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}