ta-data-mad · gonalvarez05 · Mar 26, 2021 · Mar 29, 2021 · Mar 29, 2021 · Mar 30, 2021
diff --git a/module-3/natural-language-processing/training.1600000.processed.noemoticon.csv.zip b/module-3/natural-language-processing/training.1600000.processed.noemoticon.csv.zip
diff --git a/module-3/natural-language-processing/your-code/Untitled.ipynb b/module-3/natural-language-processing/your-code/Untitled.ipynb
@@ -0,0 +1,6 @@
+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/module-3/natural-language-processing/your-code/challenge-1.ipynb b/module-3/natural-language-processing/your-code/challenge-1.ipynb
@@ -66,9 +66,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
+   "source": [
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'ironhack s  q website  is'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "def clean_up(s):\n",
     "    \"\"\"\n",
@@ -79,7 +99,17 @@
     "\n",
     "    Returns:\n",
     "        A string that has been cleaned up.\n",
-    "    \"\"\""
+    "        \n",
+    "    \"\"\"\n",
+    "    # Strip URLs first (http or https, any host/path); their punctuation would leave fragments.\n",
+    "    no_urls = re.sub(r'https?://\\S+', '', s)\n",
+    "    no_digits = re.sub(r'\\d+', '', no_urls)\n",
+    "    lowered = no_digits.lower()\n",
+    "    # Digits are already gone, so every remaining non-letter is turned into a space.\n",
+    "    new_string = re.sub(r'[^a-z]', ' ', lowered).strip()\n",
+    "    return new_string\n",
+    "\n",
+    "clean_up(\"@Ironhack's- Q website 776-is http://ironhack.com [(2018)]\")   "
    ]
   },
   {
@@ -101,7 +131,38 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package punkt to\n",
+      "[nltk_data]     /Users/gonzaloalvarez/nltk_data...\n",
+      "[nltk_data]   Package punkt is already up-to-date!\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import nltk\n",
+    "from nltk import word_tokenize\n",
+    "nltk.download('punkt')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -114,7 +175,32 @@
     "\n",
     "    Returns:\n",
     "        A list of words as the result of tokenization.\n",
-    "    \"\"\""
+    "    \"\"\"\n",
+    "    \n",
+    "    return word_tokenize(s)\n",
+    "\n",
+    "    \n",
+    "    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['ironhack', 's', 'q', 'website', 'is']"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tokenize('ironhack s  q website  is')"
    ]
   },
   {
@@ -145,9 +231,51 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package wordnet to\n",
+      "[nltk_data]     /Users/gonzaloalvarez/nltk_data...\n",
+      "[nltk_data]   Package wordnet is already up-to-date!\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from nltk.stem import WordNetLemmatizer\n",
+    "lemmatizer= WordNetLemmatizer()\n",
+    "nltk.download('wordnet')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['run', 'ran', 'chang', 'chang', 'gonzo', 'wash', 'wa']"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "def stem_and_lemmatize(l):\n",
     "    \"\"\"\n",
@@ -158,7 +286,13 @@
     "\n",
     "    Returns:\n",
     "        A list of strings after being stemmed and lemmatized.\n",
-    "    \"\"\""
+    "    \"\"\"\n",
+    "    from nltk.stem import PorterStemmer  # local import: only the lemmatizer is imported above\n",
+    "    stemmer = PorterStemmer()\n",
+    "    lemmatize = [stemmer.stem(lemmatizer.lemmatize(t)) for t in l]  # lemmatize first: WordNet needs real words\n",
+    "    return lemmatize\n",
+    "\n",
+    "stem_and_lemmatize(['run', 'ran', 'change', 'changed', 'gonzo', 'wash', 'was'])"
    ]
   },
   {
@@ -176,9 +310,50 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 8,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package stopwords to\n",
+      "[nltk_data]     /Users/gonzaloalvarez/nltk_data...\n",
+      "[nltk_data]   Package stopwords is already up-to-date!\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from nltk.corpus import stopwords\n",
+    "nltk.download('stopwords')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['running', 'bleeding']"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "def remove_stopwords(l):\n",
     "    \"\"\"\n",
@@ -189,7 +364,14 @@
     "\n",
     "    Returns:\n",
     "        A list of strings after stop words are removed.\n",
-    "    \"\"\""
+    "    \"\"\"\n",
+    "    # Filter only (no lemmatizing here); load the stop list once and match case-insensitively.\n",
+    "    stop_set = set(stopwords.words('english'))\n",
+    "    no_stops = [t for t in l if t.lower() not in stop_set]\n",
+    "    \n",
+    "    return no_stops\n",
+    "\n",
+    "remove_stopwords(['I', 'was', 'running', 'and', 'then', 'they', 'are', 'over', 'here', 'bleeding'])"
    ]
   },
   {
@@ -204,9 +386,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python [conda env:ironhack_env]",
    "language": "python",
-   "name": "python3"
+   "name": "conda-env-ironhack_env-py"
   },
   "language_info": {
    "codemirror_mode": {
@@ -218,9 +400,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.3"
+   "version": "3.8.5"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }