ta-data-mad · JaimeGarciaVel · Apr 7, 2021
diff --git a/module-3/natural-language-processing/your-code/challenge-1.ipynb b/module-3/natural-language-processing/your-code/challenge-1.ipynb
@@ -66,20 +66,40 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def clean_up(s):\n",
-    "    \"\"\"\n",
-    "    Cleans up numbers, URLs, and special characters from a string.\n",
-    "\n",
-    "    Args:\n",
-    "        s: The string to be cleaned up.\n",
+    "import re\n",
     "\n",
-    "    Returns:\n",
-    "        A string that has been cleaned up.\n",
-    "    \"\"\""
+    "def clean_up(s):\n",
+    "    \"\"\"Return s lowercased with URLs, digits and special characters removed.\"\"\"\n",
+    "    # Raw-string patterns avoid invalid-escape warnings; http/https URLs go whole.\n",
+    "    no_urls = re.sub(r'https?://\\S+', '', s)\n",
+    "    no_digits = re.sub(r'\\d+', '', no_urls).lower()\n",
+    "    # Every remaining character that is not a-z becomes a space.\n",
+    "    solution = re.sub(r'[^a-z]', ' ', no_digits).strip()\n",
+    "    return solution\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'ironhack s  q website  is'"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "clean_up(\"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\")"
    ]
   },
   {
@@ -101,20 +121,45 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package punkt to\n",
+      "[nltk_data]     /home/jaimegarcia/nltk_data...\n",
+      "[nltk_data]   Package punkt is already up-to-date!\n"
+     ]
+    }
+   ],
    "source": [
-    "def tokenize(s):\n",
-    "    \"\"\"\n",
-    "    Tokenize a string.\n",
-    "\n",
-    "    Args:\n",
-    "        s: String to be tokenized.\n",
+    "from nltk import word_tokenize\n",
+    "nltk.download('punkt')\n",
     "\n",
-    "    Returns:\n",
-    "        A list of words as the result of tokenization.\n",
-    "    \"\"\""
+    "def tokenize(s):\n",
+    "    return  word_tokenize(s)  # delegate tokenization of s to NLTK's word_tokenize"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['ironhack', 's', 'q', 'website', 'is']"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tokenize('ironhack s  q website  is')"
    ]
   },
   {
@@ -145,20 +190,50 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 26,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package wordnet to\n",
+      "[nltk_data]     /home/jaimegarcia/nltk_data...\n",
+      "[nltk_data]   Package wordnet is already up-to-date!\n"
+     ]
+    }
+   ],
    "source": [
-    "def stem_and_lemmatize(l):\n",
-    "    \"\"\"\n",
-    "    Perform stemming and lemmatization on a list of words.\n",
-    "\n",
-    "    Args:\n",
-    "        l: A list of strings.\n",
+    "from nltk.stem import WordNetLemmatizer\n",
+    "lemmatizer= WordNetLemmatizer()\n",
+    "nltk.download('wordnet')\n",
     "\n",
-    "    Returns:\n",
-    "        A list of strings after being stemmed and lemmatized.\n",
-    "    \"\"\""
+    "def stem_and_lemmatize(l):\n",
+    "    \"\"\"Return the WordNet lemma of every word in l, in order.\n",
+    "\n",
+    "    NOTE(review): despite the name, no stemming step is applied here.\n",
+    "    \"\"\"\n",
+    "    return [lemmatizer.lemmatize(word) for word in l]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['run', 'ran', 'change', 'changed', 'gonzo', 'wash', 'wa']"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "stem_and_lemmatize(['run', 'ran', 'change', 'changed', 'gonzo', 'wash', 'was'])"
    ]
   },
   {
@@ -176,20 +251,51 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 32,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package stopwords to\n",
+      "[nltk_data]     /home/jaimegarcia/nltk_data...\n",
+      "[nltk_data]   Package stopwords is already up-to-date!\n"
+     ]
+    }
+   ],
    "source": [
-    "def remove_stopwords(l):\n",
-    "    \"\"\"\n",
-    "    Remove English stopwords from a list of strings.\n",
-    "\n",
-    "    Args:\n",
-    "        l: A list of strings.\n",
+    "from nltk.corpus import stopwords\n",
+    "nltk.download('stopwords')\n",
     "\n",
-    "    Returns:\n",
-    "        A list of strings after stop words are removed.\n",
-    "    \"\"\""
+    "def remove_stopwords(l):\n",
+    "    \"\"\"Return the words of l that are not NLTK English stopwords.\n",
+    "\n",
+    "    Words are compared as given (case-sensitive) and are not lemmatized, so\n",
+    "    'was' is dropped instead of slipping through the filter as 'wa'.\n",
+    "    \"\"\"\n",
+    "    english = set(stopwords.words('english'))  # load once, O(1) lookups\n",
+    "    return [word for word in l if word not in english]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['I', 'running', 'bleeding']"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "remove_stopwords(['I', 'was', 'running', 'and', 'then', 'they', 'are', 'over', 'here', 'bleeding'])"
    ]
   },
   {
@@ -204,9 +310,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python [conda env:ironhack_env]",
    "language": "python",
-   "name": "python3"
+   "name": "conda-env-ironhack_env-py"
   },
   "language_info": {
    "codemirror_mode": {
@@ -218,9 +324,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.3"
+   "version": "3.8.5"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }