ta-data-lis · JMpCS · Mar 13, 2023
diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb
@@ -66,20 +66,46 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 69,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "from nltk.tokenize import word_tokenize"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
    "metadata": {},
    "outputs": [],
    "source": [
     "def clean_up(s):\n",
-    "    \"\"\"\n",
-    "    Cleans up numbers, URLs, and special characters from a string.\n",
-    "\n",
-    "    Args:\n",
-    "        s: The string to be cleaned up.\n",
-    "\n",
-    "    Returns:\n",
-    "        A string that has been cleaned up.\n",
-    "    \"\"\""
+    "    \"\"\"Return `s` lowercased with URLs, digits and special characters removed.\n",
+    "\n",
+    "    Every run of removed characters (and of whitespace) collapses into a\n",
+    "    single space, so the result is a sequence of words separated by exactly\n",
+    "    one space.\n",
+    "\n",
+    "    Args:\n",
+    "        s: The string to be cleaned up.\n",
+    "\n",
+    "    Returns:\n",
+    "        The cleaned, lowercased string without leading or trailing whitespace.\n",
+    "    \"\"\"\n",
+    "    # Drop URLs first, while '://' and '.' are still there to delimit them.\n",
+    "    # IGNORECASE so that 'HTTP://...' is removed too, not shredded into words.\n",
+    "    s = re.sub(r'http\\S+', ' ', s, flags=re.IGNORECASE)\n",
+    "    # [\\W\\d_] covers punctuation, whitespace, digits and the underscore\n",
+    "    # (which a plain \\w test would let through) in a single pass.\n",
+    "    s = re.sub(r'[\\W\\d_]+', ' ', s)\n",
+    "    return s.strip().lower()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ironhack s q website is\n"
+     ]
+    }
+   ],
+   "source": [
+    "s = \"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\\\")\"\n",
+    "clean_sentence = clean_up(s)\n",
+    "print(clean_sentence)"
    ]
   },
   {
@@ -101,20 +127,40 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 72,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nltk.tokenize import word_tokenize"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
    "metadata": {},
    "outputs": [],
    "source": [
     "def tokenize(s):\n",
-    "    \"\"\"\n",
-    "    Tokenize a string.\n",
-    "\n",
-    "    Args:\n",
-    "        s: String to be tokenized.\n",
-    "\n",
-    "    Returns:\n",
-    "        A list of words as the result of tokenization.\n",
-    "    \"\"\""
+    "    \"\"\"Split a string into word tokens using NLTK's `word_tokenize`.\n",
+    "\n",
+    "    Args:\n",
+    "        s: String to be tokenized.\n",
+    "\n",
+    "    Returns:\n",
+    "        A list of words as the result of tokenization.\n",
+    "    \"\"\"\n",
+    "    tokens = word_tokenize(s)\n",
+    "    return tokens"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 74,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['ironhack', 's', 'q', 'website', 'is']\n"
+     ]
+    }
+   ],
+   "source": [
+    "text = 'ironhack s q website is'\n",
+    "tokens = tokenize(text)\n",
+    "print(tokens)"
    ]
   },
   {
@@ -145,20 +191,42 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 75,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nltk.stem import SnowballStemmer, WordNetLemmatizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
    "metadata": {},
    "outputs": [],
    "source": [
     "def stem_and_lemmatize(l):\n",
-    "    \"\"\"\n",
-    "    Perform stemming and lemmatization on a list of words.\n",
-    "\n",
-    "    Args:\n",
-    "        l: A list of strings.\n",
-    "\n",
-    "    Returns:\n",
-    "        A list of strings after being stemmed and lemmatized.\n",
-    "    \"\"\""
+    "    \"\"\"Perform stemming and lemmatization on a list of words.\n",
+    "\n",
+    "    Each word is lemmatized with WordNet and the lemma is then stemmed with\n",
+    "    the English Snowball stemmer, giving one output word per input word.\n",
+    "\n",
+    "    Args:\n",
+    "        l: A list of strings (tokens). A plain string would be processed one\n",
+    "            character at a time, so tokenize the text first.\n",
+    "\n",
+    "    Returns:\n",
+    "        A list of strings after being stemmed and lemmatized, in input order.\n",
+    "    \"\"\"\n",
+    "    stemmer = SnowballStemmer('english')\n",
+    "    lemmatizer = WordNetLemmatizer()\n",
+    "    # Lemmatize before stemming: the lemmatizer looks words up in WordNet, so\n",
+    "    # it should see the real word rather than an already truncated stem.\n",
+    "    return [stemmer.stem(lemmatizer.lemmatize(word)) for word in l]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Feed in the token list built by tokenize(); the raw sentence `s` would be\n",
+    "# processed one character at a time. Re-run the cell to regenerate its output.\n",
+    "print(stem_and_lemmatize(tokens))"
    ]
   },
   {
@@ -176,20 +244,40 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nltk.corpus import stopwords"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
    "metadata": {},
    "outputs": [],
    "source": [
     "def remove_stopwords(l):\n",
-    "    \"\"\"\n",
-    "    Remove English stopwords from a list of strings.\n",
-    "\n",
-    "    Args:\n",
-    "        l: A list of strings.\n",
-    "\n",
-    "    Returns:\n",
-    "        A list of strings after stop words are removed.\n",
-    "    \"\"\""
+    "    \"\"\"Remove English stopwords from a list of strings.\n",
+    "\n",
+    "    Args:\n",
+    "        l: A list of strings (tokens).\n",
+    "\n",
+    "    Returns:\n",
+    "        A new list holding the words of `l`, in their original order and\n",
+    "        spelling, that are not NLTK English stopwords.\n",
+    "    \"\"\"\n",
+    "    # A set gives constant-time membership tests inside the comprehension.\n",
+    "    stop_words = set(stopwords.words('english'))\n",
+    "    # Compare in lowercase: the stopword entries are lowercase, so a\n",
+    "    # capitalised stopword such as 'I' or 'The' would otherwise slip through.\n",
+    "    return [word for word in l if word.lower() not in stop_words]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Filter the token list built by tokenize(); the raw sentence `s` would be\n",
+    "# filtered one character at a time. Re-run the cell to regenerate its output.\n",
+    "print(remove_stopwords(tokens))"
    ]
   },
   {
@@ -218,7 +306,12 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.3"
+   "version": "3.10.7"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "721db305ef1fd1fc91cdf20e400af694a949fe540ac5f48c160f31c7e384879d"
+   }
   }
  },
  "nbformat": 4,