ta-data-lis · JavierIronhack · Jun 8, 2023
diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb
@@ -66,9 +66,30 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
+   "source": [
+    "import re\n",
+    "import nltk"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'ironhack s q website is'"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "def clean_up(s):\n",
     "    \"\"\"\n",
@@ -79,7 +100,14 @@
     "\n",
     "    Returns:\n",
     "        A string that has been cleaned up.\n",
-    "    \"\"\""
+    "    \"\"\"\n",
+    "    string = re.sub(r'http\\S+', '', s)\n",
+    "    return re.sub('[^A-Za-z]+', ' ', string).lower().strip()\n",
+    "    \n",
+    "test = \"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\"\n",
+    "\n",
+    "test_string = clean_up(test)\n",
+    "test_string"
    ]
   },
   {
@@ -101,9 +129,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['ironhack', 's', 'q', 'website', 'is']"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "def tokenize(s):\n",
     "    \"\"\"\n",
@@ -114,7 +153,11 @@
     "\n",
     "    Returns:\n",
     "        A list of words as the result of tokenization.\n",
-    "    \"\"\""
+    "    \"\"\"\n",
+    "    return nltk.word_tokenize(s)\n",
+    "\n",
+    "test_string = tokenize(test_string)\n",
+    "test_string"
    ]
   },
   {
@@ -145,11 +188,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Something went wrong, so I'll use a different set of words.\n",
+    "\n",
+    "import nltk\n",
+    "from nltk.stem import PorterStemmer, WordNetLemmatizer\n",
+    "\n",
     "def stem_and_lemmatize(l):\n",
+    "    \n",
     "    \"\"\"\n",
     "    Perform stemming and lemmatization on a list of words.\n",
     "\n",
@@ -158,7 +207,17 @@
     "\n",
     "    Returns:\n",
     "        A list of strings after being stemmed and lemmatized.\n",
-    "    \"\"\""
+    "    \"\"\"\n",
+    "    ps = nltk.PorterStemmer()\n",
+    "    lemmatizer = nltk.WordNetLemmatizer()\n",
+    "    l2 = []\n",
+    "    \n",
+    "    for w in l:\n",
+    "        s = ps.stem(w)\n",
+    "        s = lemmatizer.lemmatize(s)\n",
+    "        l2 += [s]\n",
+    "    \n",
+    "    return l2\n"
    ]
   },
   {
@@ -176,7 +235,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nltk.corpus import stopwords"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -189,7 +257,10 @@
     "\n",
     "    Returns:\n",
     "        A list of strings after stop words are removed.\n",
-    "    \"\"\""
+    "    \"\"\"\n",
+    "    stop_words = stopwords.words('english')\n",
+    "\n",
+    "    return [w for w in l if w not in stop_words]"
    ]
   },
   {
@@ -218,7 +289,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.3"
+   "version": "3.10.9"
   }
  },
  "nbformat": 4,