ta-data-mad · CastillaAlvaro · Apr 24, 2021 · Apr 24, 2021
diff --git a/module-3/natural-language-processing/your-code/challenge-1.ipynb b/module-3/natural-language-processing/your-code/challenge-1.ipynb
@@ -66,7 +66,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "s = \"\"\"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\")\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -79,7 +97,34 @@
     "\n",
     "    Returns:\n",
     "        A string that has been cleaned up.\n",
-    "    \"\"\""
+    "    \"\"\"\n",
+    "    \n",
+    "    url = re.sub(r'https?://\\S+', '', s)  # drop http and https URLs, including any path or query\n",
+    "    numbers = re.sub(r'\\d+', '', url).lower()  # strip digits, then normalise case\n",
+    "    new_string = re.sub('[^a-z]', ' ', numbers).strip()  # digits are already gone; everything else becomes a space\n",
+    "    \n",
+    "    return new_string"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'ironhack s  q website  is'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "new_string = clean_up(s)\n",
+    "new_string"
    ]
   },
   {
@@ -101,7 +146,37 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package punkt to /home/usuario/nltk_data...\n",
+      "[nltk_data]   Package punkt is already up-to-date!\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import nltk\n",
+    "from nltk import word_tokenize\n",
+    "nltk.download('punkt')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -114,7 +189,29 @@
     "\n",
     "    Returns:\n",
     "        A list of words as the result of tokenization.\n",
-    "    \"\"\""
+    "    \"\"\"\n",
+    "    \n",
+    "    return word_tokenize(new_string)  # NOTE(review): new_string is also assigned at notebook level; confirm this is the function's own argument and not that global"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['ironhack', 's', 'q', 'website', 'is']"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tokenize(new_string)"
    ]
   },
   {
@@ -145,7 +242,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nltk.stem import PorterStemmer\n",
+    "porter = PorterStemmer()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -158,7 +265,58 @@
     "\n",
     "    Returns:\n",
     "        A list of strings after being stemmed and lemmatized.\n",
-    "    \"\"\""
+    "    \"\"\"\n",
+    "    \n",
+    "    return [porter.stem(i) for i in l]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "l = ['ironhack', 's', 'q', 'website', 'is']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['token', 'token', 'token']"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "stem_and_lemmatize(['token', 'tokenize', 'tokenization'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['ironhack', 's', 'q', 'websit', 'is']"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "stem_and_lemmatize(l)"
    ]
   },
   {
@@ -176,7 +334,37 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package stopwords to\n",
+      "[nltk_data]     /home/usuario/nltk_data...\n",
+      "[nltk_data]   Unzipping corpora/stopwords.zip.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from nltk.corpus import stopwords\n",
+    "nltk.download('stopwords')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -189,7 +377,33 @@
     "\n",
     "    Returns:\n",
     "        A list of strings after stop words are removed.\n",
-    "    \"\"\""
+    "    \"\"\"\n",
+    "    \n",
+    "    english_stopwords = set(stopwords.words('english'))  # load the corpus once instead of once per token\n",
+    "    stop_wds = [stem for stem in map(porter.stem, l) if stem not in english_stopwords]  # stem first, then filter on the stem\n",
+    "    \n",
+    "    return stop_wds\n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['ironhack', 'q', 'websit']"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "remove_stopwords(l)"
    ]
   },
   {
@@ -204,9 +418,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python [conda env:ironhack_env]",
    "language": "python",
-   "name": "python3"
+   "name": "conda-env-ironhack_env-py"
   },
   "language_info": {
    "codemirror_mode": {
@@ -218,9 +432,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.3"
+   "version": "3.8.5"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }