Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 116 additions & 16 deletions your-code/challenge-1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,29 @@
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"This is a test string with a URL special characters numbers and multiple whitespaces\n"
]
}
],
"source": [
"import re\n",
"\n",
"\n",
"def clean_up(text):\n",
"    \"\"\"\n",
"    Cleans up numbers, URLs, and special characters from a string.\n",
"\n",
"    The clean-up runs in three passes:\n",
"      1. Any run of non-whitespace starting at 'http' or 'www.' is removed.\n",
"      2. Every character that is not an ASCII letter, whitespace or an\n",
"         apostrophe is replaced by a space (apostrophes are kept).\n",
"      3. Runs of whitespace are collapsed to one space and the ends trimmed.\n",
"\n",
"    Args:\n",
"        text: The string to be cleaned up.\n",
"\n",
"    Returns:\n",
"        A string that has been cleaned up.\n",
"    \"\"\"\n",
"    # URLs go first: otherwise pass 2 would leave their letters behind as words.\n",
"    text = re.sub(r'http\\S+|www\\.\\S+', '', text)\n",
"    text = re.sub(r\"[^a-zA-Z\\s']\", ' ', text)\n",
"    return re.sub(r'\\s+', ' ', text).strip()\n",
"\n",
"\n",
"test_string = \"This is a test string with a URL http://example.com, special characters #@!, numbers 123, and multiple whitespaces.\"\n",
"\n",
"cleaned_text = clean_up(test_string)\n",
"\n",
"print(cleaned_text)\n"
]
},
{
Expand All @@ -101,20 +125,62 @@
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['ironhack', 's', 'q', 'website', 'is']\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\User\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
}
],
"source": [
"import nltk\n",
"from nltk.tokenize import word_tokenize\n",
"\n",
"# word_tokenize needs the 'punkt' tokenizer models to be available locally.\n",
"nltk.download('punkt')\n",
"\n",
"\n",
"def tokenize(text):\n",
"    \"\"\"\n",
"    Tokenize a string.\n",
"\n",
"    Args:\n",
"        text: String to be tokenized.\n",
"\n",
"    Returns:\n",
"        A list of words as the result of tokenization.\n",
"    \"\"\"\n",
"    return word_tokenize(text)\n",
"\n",
"\n",
"test_string = \"ironhack s q website is\"\n",
"tokens = tokenize(test_string)\n",
"print(tokens)"
]
},
{
Expand Down Expand Up @@ -145,11 +211,24 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def stem_and_lemmatize(l):\n",
"import nltk\n",
"from nltk.stem import PorterStemmer, WordNetLemmatizer\n",
"\n",
"\n",
"\n",
"def stem_and_lemmatize(words):\n",
" \n",
" stemmer = PorterStemmer()\n",
" lemmatizer = WordNetLemmatizer()\n",
" \n",
" stemmed = [stemmer.stem(word) for word in words]\n",
" lemmatized = [lemmatizer.lemmatize(word) for word in words]\n",
" \n",
" return stemmed, lemmatized\n",
" \"\"\"\n",
" Perform stemming and lemmatization on a list of words.\n",
"\n",
Expand All @@ -176,11 +255,32 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] C:\\Users\\User\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
}
],
"source": [
"def remove_stopwords(l):\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"\n",
"nltk.download('stopwords')\n",
"\n",
"\n",
"def remove_stopwords(words):\n",
" stop_words = set(stopwords.words('english'))\n",
" \n",
" clean_words = [word for word in words if word.lower() not in stop_words]\n",
" \n",
" return clean_words\n",
" \"\"\"\n",
" Remove English stopwords from a list of strings.\n",
"\n",
Expand All @@ -204,7 +304,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -218,7 +318,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.11.5"
}
},
"nbformat": 4,
Expand Down
Loading