Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
200 changes: 153 additions & 47 deletions module-3/natural-language-processing/your-code/challenge-1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -66,20 +66,40 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def clean_up(s):\n",
" \"\"\"\n",
" Cleans up numbers, URLs, and special characters from a string.\n",
"\n",
" Args:\n",
" s: The string to be cleaned up.\n",
"import re\n",
"\n",
" Returns:\n",
" A string that has been cleaned up.\n",
" \"\"\""
"def clean_up(s):\n",
" \n",
" remover1= re.sub('http://[\\w]+\\.\\w+', '', s)\n",
" remover2= re.sub('\\d+', '', remover1).lower()\n",
" solution= re.sub('[^a-z0-9]', ' ', remover2).strip() \n",
" \n",
" return solution\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'ironhack s q website is'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clean_up(\"@Ironhack's-#Q website 776-is http://ironhack.com [(2018)]\")"
]
},
{
Expand All @@ -101,20 +121,45 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] /home/jaimegarcia/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
}
],
"source": [
"def tokenize(s):\n",
" \"\"\"\n",
" Tokenize a string.\n",
"\n",
" Args:\n",
" s: String to be tokenized.\n",
"from nltk import word_tokenize\n",
"nltk.download('punkt')\n",
"\n",
" Returns:\n",
" A list of words as the result of tokenization.\n",
" \"\"\""
"def tokenize(s):\n",
" return word_tokenize(s)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['ironhack', 's', 'q', 'website', 'is']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenize('ironhack s q website is')"
]
},
{
Expand Down Expand Up @@ -145,20 +190,50 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 26,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package wordnet to\n",
"[nltk_data] /home/jaimegarcia/nltk_data...\n",
"[nltk_data] Package wordnet is already up-to-date!\n"
]
}
],
"source": [
"def stem_and_lemmatize(l):\n",
" \"\"\"\n",
" Perform stemming and lemmatization on a list of words.\n",
"\n",
" Args:\n",
" l: A list of strings.\n",
"from nltk.stem import WordNetLemmatizer\n",
"lemmatizer= WordNetLemmatizer()\n",
"nltk.download('wordnet')\n",
"\n",
" Returns:\n",
" A list of strings after being stemmed and lemmatized.\n",
" \"\"\""
"def stem_and_lemmatize(l):\n",
" lista=[]\n",
" for i in l:\n",
" lista.append(lemmatizer.lemmatize(i))\n",
" \n",
" return lista\n"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['run', 'ran', 'change', 'changed', 'gonzo', 'wash', 'wa']"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"stem_and_lemmatize(['run', 'ran', 'change', 'changed', 'gonzo', 'wash', 'was'])"
]
},
{
Expand All @@ -176,20 +251,51 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 32,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] /home/jaimegarcia/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
}
],
"source": [
"def remove_stopwords(l):\n",
" \"\"\"\n",
" Remove English stopwords from a list of strings.\n",
"\n",
" Args:\n",
" l: A list of strings.\n",
"from nltk.corpus import stopwords\n",
"nltk.download('stopwords')\n",
"\n",
" Returns:\n",
" A list of strings after stop words are removed.\n",
" \"\"\""
"def remove_stopwords(l):\n",
" lista=[]\n",
" for i in l:\n",
" lista.append(lemmatizer.lemmatize(i))\n",
" \n",
" stopword= [i for i in lista if i not in stopwords.words('english')]\n",
" \n",
" return stopword\n"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['I', 'wa', 'running', 'bleeding']"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"remove_stopwords(['I', 'was', 'running', 'and', 'then', 'they', 'are', 'over', 'here', 'bleeding'])"
]
},
{
Expand All @@ -204,9 +310,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python [conda env:ironhack_env]",
"language": "python",
"name": "python3"
"name": "conda-env-ironhack_env-py"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -218,9 +324,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
Loading