From d6326c3e21bf0b98b6beeca7a2712cba7c696e1b Mon Sep 17 00:00:00 2001 From: TGLSpain Date: Fri, 15 Jul 2022 11:00:30 -0300 Subject: [PATCH 1/2] first commit --- .../challenge-1-checkpoint.ipynb | 403 +++++++++++++++++ .../challenge-2-checkpoint.ipynb | 426 ++++++++++++++++++ your-code/challenge-1.ipynb | 209 +++++++-- your-code/challenge-2.ipynb | 129 +++++- 4 files changed, 1119 insertions(+), 48 deletions(-) create mode 100644 your-code/.ipynb_checkpoints/challenge-1-checkpoint.ipynb create mode 100644 your-code/.ipynb_checkpoints/challenge-2-checkpoint.ipynb diff --git a/your-code/.ipynb_checkpoints/challenge-1-checkpoint.ipynb b/your-code/.ipynb_checkpoints/challenge-1-checkpoint.ipynb new file mode 100644 index 0000000..58fa394 --- /dev/null +++ b/your-code/.ipynb_checkpoints/challenge-1-checkpoint.ipynb @@ -0,0 +1,403 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# String Operations Lab\n", + "\n", + "**Before you start:**\n", + "\n", + "- Read the README.md file\n", + "- Comment as much as you can and use the resources in the README.md file\n", + "- Happy learning!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import re" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Challenge 1 - Combining Strings\n", + "\n", + "Combining strings is an important skill to acquire. There are multiple ways of combining strings in Python, as well as combining strings with variables. We will explore this in the first challenge. In the cell below, combine the strings in the list and add spaces between the strings (do not add a space after the last string). Insert a period after the last string." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Durante un tiempo no estuvo segura de si su marido era su marido.\n" + ] + } + ], + "source": [ + "str_list = ['Durante', 'un', 'tiempo', 'no', 'estuvo', 'segura', 'de', 'si', 'su', 'marido', 'era', 'su', 'marido']\n", + "# Your code here:\n", + "\n", + "my_string = \" \".join(str_list) + \".\"\n", + "print(my_string)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the cell below, use the list of strings to create a grocery list. Start the list with the string `Grocery list: ` and include a comma and a space between each item except for the last one. Include a period at the end. Only include foods in the list that start with the letter 'b' and ensure all foods are lower case." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Grocery list: bananas, bread, brownie mix, broccoli.\n" + ] + } + ], + "source": [ + "food_list = ['Bananas', 'Chocolate', 'bread', 'diapers', 'Ice Cream', 'Brownie Mix', 'broccoli']\n", + "# Your code here:\n", + "\n", + "food_list_string = \"Grocery list:\"\n", + "\n", + "for food in food_list:\n", + " if food.lower()[0] == \"b\":\n", + " food_list_string += f\" {food.lower()},\"\n", + "\n", + "final_food_string = food_list_string[:-1]\n", + "\n", + "final_food_string += \".\"\n", + "\n", + "print(final_food_string)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the cell below, write a function that computes the area of a circle using its radius. Compute the area of the circle and insert the radius and the area between the two strings. Make sure to include spaces between the variable and the strings. \n", + "\n", + "Note: You can use the techniques we have learned so far or use f-strings. 
F-strings allow us to embed code inside strings. You can read more about f-strings [here](https://www.python.org/dev/peps/pep-0498/)." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The area of the circle with radius: 4.5 is: 63.61725123519331\n" + ] + } + ], + "source": [ + "import math\n", + "\n", + "string1 = \"The area of the circle with radius:\"\n", + "string2 = \"is:\"\n", + "radius = 4.5\n", + "\n", + "def area(x, pi = math.pi):\n", + " # This function takes a radius and returns the area of a circle. We also pass a default value for pi.\n", + " # Input: Float (and default value for pi)\n", + " # Output: Float\n", + " \n", + " # Sample input: 5.0\n", + " # Sample Output: 78.53981633\n", + " \n", + " # Your code here:\n", + " area = pi*x**2\n", + " return area\n", + "\n", + "area = area(radius) \n", + " \n", + "# Your output string here:\n", + "\n", + "print(f\"{string1} {radius} {string2} {area}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Challenge 2 - Splitting Strings\n", + "\n", + "We have first looked at combining strings into one long string. There are times where we need to do the opposite and split the string into smaller components for further analysis. \n", + "\n", + "In the cell below, split the string into a list of strings using the space delimiter. Count the frequency of each word in the string in a dictionary. Strip the periods, line breaks and commas from the text. Make sure to remove empty strings from your dictionary." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'with': 1, 'it': 1, 'i’ve': 1, 'who': 1, 'and': 1, 'say': 3, 'some': 2, 'if': 1, 'the': 1, 'perish': 1, 'think': 1, 'to': 2, 'enough': 1, 'desire': 1, 'tasted': 1, 'that': 1, 'but': 1, 'fire': 2, 'hold': 1, 'from': 1, 'great': 1, 'in': 2, 'what': 1, 'is': 1, 'end': 1, 'twice': 1, 'know': 1, 'will': 1, 'ice': 2, 'had': 1, 'would': 1, 'suffice': 1, 'those': 1, 'hate': 1, 'of': 2, 'for': 1, 'destruction': 1, 'i': 3, 'world': 1, 'favor': 1, 'also': 1}\n", + "{'with': 1, 'Some': 0, 'it': 1, 'I’ve': 0, 'who': 1, 'say': 3, 'if': 1, 'the': 1, 'perish': 1, 'But': 0, 'think': 1, 'to': 2, 'enough': 1, 'From': 0, 'desire': 1, 'tasted': 1, 'that': 1, 'Is': 0, 'fire': 2, 'hold': 1, 'To': 0, 'great': 1, 'in': 2, 'what': 1, 'I': 0, 'end': 1, 'twice': 1, 'know': 1, 'will': 1, 'ice': 2, 'had': 1, 'would': 1, 'suffice': 1, 'those': 1, 'hate': 1, 'of': 2, 'And': 0, 'for': 1, 'destruction': 1, 'world': 1, 'favor': 1, 'also': 1}\n" + ] + } + ], + "source": [ + "poem = \"\"\"Some say the world will end in fire,\n", + "Some say in ice.\n", + "From what I’ve tasted of desire\n", + "I hold with those who favor fire.\n", + "But if it had to perish twice,\n", + "I think I know enough of hate\n", + "To say that for destruction ice\n", + "Is also great\n", + "And would suffice.\"\"\"\n", + "\n", + "# Your code here:\n", + "\n", + "poem_no_commas = poem.split(\",\")\n", + "poem_no_commas_string = \"\".join([fragment for fragment in poem_no_commas])\n", + "\n", + "poem_no_periods = poem_no_commas_string.split(\".\")\n", + "poem_no_periods_string = \"\".join([fragment for fragment in poem_no_periods])\n", + "\n", + "poem_transform_newlines = poem_no_periods_string.replace(\"\\n\", \" \").lower()\n", + "\n", + "final_poem_no_spaces = poem_transform_newlines.split(\" \")\n", + "\n", + "word_list = [(word, final_poem_no_spaces.count(word)) 
for word in final_poem_no_spaces]\n", + "\n", + "word_list_set = set()\n", + "for word in word_list:\n", + " word_list_set.add(word)\n", + " \n", + "words_dict = {}\n", + "for word_tuple in word_list_set:\n", + " words_dict[word_tuple[0]]= word_tuple[1]\n", + " \n", + "print(words_dict)\n", + "\n", + "# Segunda opcion usando re.split()\n", + "import re\n", + "pattern = r\"[,\\.\\n\\s]\"\n", + "final_poem = re.split(pattern, poem)\n", + "\n", + "# juntar en un string para poner todo en lower\n", + "\n", + "\n", + "word_list_set = set()\n", + "for word in final_word_list:\n", + " word_list_set.add(word)\n", + " \n", + "words_dict = {}\n", + "for word_tuple in word_list_set:\n", + " words_dict[word_tuple[0]]= word_tuple[1]\n", + "\n", + "print(words_dict)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the cell below, find all the words that appear in the text and do not appear in the blacklist. You must parse the string but can choose any data structure you wish for the words that do not appear in the blacklist. Remove all non letter characters and convert all words to lower case." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['I', 'was', 'angry', 'with', 'my', 'friend', 'I', 'told', 'my', 'wrath', 'my', 'wrath', 'did', 'end', 'I', 'was', 'angry', 'with', 'my', 'foe:', 'I', 'told', 'not', 'my', 'wrath', 'did', 'grow', 'And', 'I', 'waterd', 'fears', 'Night', '&', 'morning', 'with', 'my', 'tears:', 'And', 'I', 'sunned', 'with', 'smiles', 'And', 'with', 'soft', 'deceitful', 'wiles', 'And', 'grew', 'both', 'day', 'night', 'Till', 'bore', 'apple', 'bright', 'And', 'my', 'foe', 'beheld', 'shine', 'And', 'he', 'knew', 'that', 'was', 'mine', 'And', 'into', 'my', 'garden', 'stole', 'When', 'night', 'had', 'veild', 'pole', 'In', 'morning', 'glad', 'I', 'see', 'My', 'foe', 'outstretched', 'beneath', 'tree']\n" + ] + } + ], + "source": [ + "blacklist = ['and', 'as', 'an', 'a', 'the', 'in', 'it']\n", + "\n", + "poem = \"\"\"I was angry with my friend; \n", + "I told my wrath, my wrath did end.\n", + "I was angry with my foe: \n", + "I told it not, my wrath did grow. \n", + "\n", + "And I waterd it in fears,\n", + "Night & morning with my tears: \n", + "And I sunned it with smiles,\n", + "And with soft deceitful wiles. \n", + "\n", + "And it grew both day and night. \n", + "Till it bore an apple bright. \n", + "And my foe beheld it shine,\n", + "And he knew that it was mine. 
\n", + "\n", + "And into my garden stole, \n", + "When the night had veild the pole; \n", + "In the morning glad I see; \n", + "My foe outstretched beneath the tree.\"\"\"\n", + "\n", + "# Your code here:\n", + "import re\n", + "poem_just_words = re.split(r\"[;;\\.,\\s]\", poem)\n", + "poem_just_words_final = [word for word in poem_just_words if len(word) > 0]\n", + "\n", + "final_list = [word for word in poem_just_words_final if word not in blacklist]\n", + "print(final_list)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Challenge 3 - Regular Expressions\n", + "\n", + "Sometimes, we would like to perform more complex manipulations of our string. This is where regular expressions come in handy. In the cell below, return all characters that are upper case from the string specified below." + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['T', 'P']" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "poem = \"\"\"The apparition of these faces in the crowd;\n", + "Petals on a wet, black bough.\"\"\"\n", + "\n", + "# Your code here:\n", + "pattern = r\"[A-Z]\"\n", + "result = re.findall(pattern, poem)\n", + "result\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the cell below, filter the list provided and return all elements of the list containing a number. To filter the list, use the `re.search` function. Check if the function does not return `None`. You can read more about the `re.search` function [here](https://docs.python.org/3/library/re.html)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['123abc', 'abc123', 'JohnSmith1', 'ABBY4']" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = ['123abc', 'abc123', 'JohnSmith1', 'ABBY4', 'JANE']\n", + "\n", + "# Your code here:\n", + "pattern = r\"[0-9]\"\n", + "result = [element for element in data if re.search(pattern, element)]\n", + "result\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Bonus Challenge - Regular Expressions II\n", + "\n", + "In the cell below, filter the list provided to keep only strings containing at least one digit and at least one lower case letter. As in the previous question, use the `re.search` function and check that the result is not `None`.\n", + "\n", + "To read more about regular expressions, check out [this link](https://developers.google.com/edu/python/regular-expressions)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['123abc', 'abc123', 'JohnSmith1']" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = ['123abc', 'abc123', 'JohnSmith1', 'ABBY4', 'JANE']\n", + "# Your code here:\n", + "pattern = r\"[a-z][0-9]|[0-9][a-z]\"\n", + "result = [element for element in data if re.search(pattern, element)]\n", + "result\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/your-code/.ipynb_checkpoints/challenge-2-checkpoint.ipynb b/your-code/.ipynb_checkpoints/challenge-2-checkpoint.ipynb new file mode 100644 index 0000000..c8dc1b3 --- /dev/null +++ b/your-code/.ipynb_checkpoints/challenge-2-checkpoint.ipynb @@ -0,0 +1,426 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Bag of Words Lab\n", + "\n", + "## Introduction\n", + "\n", + "**Bag of words (BoW)** is an important technique in text mining and [information retrieval](https://en.wikipedia.org/wiki/Information_retrieval). BoW uses term-frequency vectors to represent the content of text documents which makes it possible to use mathematics and computer programs to analyze and compare text documents.\n", + "\n", + "BoW contains the following information:\n", + "\n", + "1. A dictionary of all the terms (words) in the text documents. The terms are normalized in terms of the letter case (e.g. 
`Ironhack` => `ironhack`), tense (e.g. `had` => `have`), singular form (e.g. `students` => `student`), etc.\n", + "1. The number of occurrences of each normalized term in each document.\n", + "\n", + "For example, assume we have three text documents:\n", + "\n", + "DOC 1: **Ironhack is cool.**\n", + "\n", + "DOC 2: **I love Ironhack.**\n", + "\n", + "DOC 3: **I am a student at Ironhack.**\n", + "\n", + "The BoW of the above documents looks like below:\n", + "\n", + "| TERM | DOC 1 | DOC 2 | Doc 3 |\n", + "|---|---|---|---|\n", + "| a | 0 | 0 | 1 |\n", + "| am | 0 | 0 | 1 |\n", + "| at | 0 | 0 | 1 |\n", + "| cool | 1 | 0 | 0 |\n", + "| i | 0 | 1 | 1 |\n", + "| ironhack | 1 | 1 | 1 |\n", + "| is | 1 | 0 | 0 |\n", + "| love | 0 | 1 | 0 |\n", + "| student | 0 | 0 | 1 |\n", + "\n", + "\n", + "The term-frequency array of each document in BoW can be considered a high-dimensional vector. Data scientists use these vectors to represent the content of the documents. For instance, DOC 1 is represented with `[0, 0, 0, 1, 0, 1, 1, 0, 0]`, DOC 2 is represented with `[0, 0, 0, 0, 1, 1, 0, 1, 0]`, and DOC 3 is represented with `[1, 1, 1, 0, 1, 1, 0, 0, 1]`. **Two documents are considered identical if their vector representations have close [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity).**\n", + "\n", + "In real practice there are many additional techniques to improve the text mining accuracy such as using [stop words](https://en.wikipedia.org/wiki/Stop_words) (i.e. neglecting common words such as `a`, `I`, `to` that don't contribute much meaning), synonym list (e.g. consider `New York City` the same as `NYC` and `Big Apple`), and HTML tag removal if the data sources are webpages. 
In Module 3 you will learn how to use those advanced techniques for [natural language processing](https://en.wikipedia.org/wiki/Natural_language_processing), a component of text mining.\n", + "\n", + "In real text mining projects data analysts use packages such as Scikit-Learn and NLTK, which you will learn in Module 3, to extract BoW from texts. In this exercise, however, we would like you to create BoW manually with Python. This is because by manually creating BoW you can better understand the concept and also practice the Python skills you have learned so far." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The Challenge\n", + "\n", + "We need to create a BoW from a list of documents. The documents (`doc1.txt`, `doc2.txt`, and `doc3.txt`) can be found in the `your-code` directory of this exercise. You will read the content of each document into an array of strings named `corpus`.\n", + "\n", + "*What is a corpus (plural: corpora)? Read the reference in the README file.*\n", + "\n", + "Your challenge is to use Python to generate the BoW of these documents. Your BoW should look like below:\n", + "\n", + "```python\n", + "bag_of_words = ['a', 'am', 'at', 'cool', 'i', 'ironhack', 'is', 'love', 'student']\n", + "\n", + "term_freq = [\n", + " [0, 0, 0, 1, 0, 1, 1, 0, 0],\n", + " [0, 0, 0, 0, 1, 1, 0, 1, 0],\n", + " [1, 1, 1, 0, 1, 1, 0, 0, 1],\n", + "]\n", + "```\n", + "\n", + "Now let's define the `docs` array that contains the paths of `doc1.txt`, `doc2.txt`, and `doc3.txt`." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "docs = ['doc1.txt', 'doc2.txt', 'doc3.txt']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Define an empty array `corpus` that will contain the content strings of the docs. Loop `docs` and read the content of each doc into the `corpus` array." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "corpus = []\n", + "\n", + "# Write your code here\n", + "for doc in docs:\n", + " with open (doc,\"r\") as f:\n", + " corpus.append(f.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print `corpus`." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['I love Ironhack.', 'ironhack is cool', 'i am a student at ironhack']\n" + ] + } + ], + "source": [ + "print(corpus)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You expected to see:\n", + "\n", + "```['ironhack is cool', 'i love ironhack', 'i am a student at ironhack']```\n", + "\n", + "But you actually saw:\n", + "\n", + "```['Ironhack is cool.', 'I love Ironhack.', 'I am a student at Ironhack.']```\n", + "\n", + "This is because you haven't done two important steps:\n", + "\n", + "1. Remove punctuation from the strings\n", + "\n", + "1. Convert strings to lowercase\n", + "\n", + "Write your code below to process `corpus` (convert to lower case and remove special characters)." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['ironhack is cool', 'i love ironhack', 'i am a student at ironhack']\n" + ] + } + ], + "source": [ + "# Write your code here\n", + "\n", + "for element in corpus:\n", + " element1 = element.lower().replace(\".\", \"\")\n", + " corpus.remove(element)\n", + " corpus.append(element1)\n", + " \n", + "print(corpus)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now define `bag_of_words` as an empty array. It will be used to store the unique terms in `corpus`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "bag_of_words = []" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Loop through `corpus`. In each loop, do the following:\n", + "\n", + "1. Break the string into an array of terms. \n", + "1. Create a sub-loop to iterate the terms array. \n", + " * In each sub-loop, you'll check if the current term is already contained in `bag_of_words`. If not in `bag_of_words`, append it to the array." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# Write your code here\n", + "for element in corpus:\n", + " terms_array = element.split(\" \")\n", + " for term in terms_array:\n", + " if term not in bag_of_words:\n", + " bag_of_words.append(term)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print `bag_of_words`. You should see: \n", + "\n", + "```['ironhack', 'is', 'cool', 'i', 'love', 'am', 'a', 'student', 'at']```\n", + "\n", + "If not, fix your code in the previous cell." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['ironhack', 'is', 'cool', 'i', 'love', 'am', 'a', 'student', 'at']\n" + ] + } + ], + "source": [ + "print(bag_of_words)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we define an empty array called `term_freq`. Loop `corpus` for a second time. In each loop, create a sub-loop to iterate the terms in `bag_of_words`. Count how many times each term appears in each doc of `corpus`. Append the term-frequency array to `term_freq`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "term_freq = []\n", + "\n", + "# Write your code here\n", + "\n", + "for element in corpus:\n", + " split_element = element.split(\" \") \n", + " word_list = []\n", + " for word in bag_of_words:\n", + " word_list.append(split_element.count(word))\n", + " term_freq.append(word_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print `term_freq`. You should see:\n", + "\n", + "```[[1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 1, 1, 0, 0, 0, 0], [1, 0, 0, 1, 0, 1, 1, 1, 1]]```" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 1, 1, 0, 0, 0, 0], [1, 0, 0, 1, 0, 1, 1, 1, 1]]\n" + ] + } + ], + "source": [ + "print(term_freq)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**If your output is correct, congratulations! You've solved the challenge!**\n", + "\n", + "If not, go back and check for errors in your code." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bonus Question\n", + "\n", + "Optimize your solution for the above question by removing stop words from the BoW. For your convenience, a list of stop words is defined for you in the next cell. With the stop words removed, your output should look like:\n", + "\n", + "```\n", + "bag_of_words = ['am', 'at', 'cool', 'ironhack', 'is', 'love', 'student']\n", + "\n", + "term_freq = [\n", + "\t[0, 0, 1, 1, 1, 0, 0],\n", + " \t[0, 0, 0, 1, 0, 1, 0],\n", + " \t[1, 1, 0, 1, 0, 0, 1]\n", + "]\n", + "```\n", + "\n", + "**Requirements:**\n", + "\n", + "1. Combine all your previous code into the cell below.\n", + "1. 
Improve your solution by ignoring stop words in `bag_of_words`.\n", + "\n", + "After you're done, your `bag_of_words` should be:\n", + "\n", + "```['ironhack', 'is', 'cool', 'love', 'am', 'student', 'at']```\n", + "\n", + "And your `term_freq` should be:\n", + "\n", + "```[[1, 1, 1, 0, 0, 0, 0], [1, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1, 1]]```" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[1, 1, 1, 0, 0, 0, 0], [1, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1, 1]]\n" + ] + } + ], + "source": [ + "stop_words = ['all', 'six', 'less', 'being', 'indeed', 'over', 'i', 'move', 'anyway', 'fifty', 'four', 'not', 'own', 'through', 'yourselves', 'go', 'where', 'mill', 'only', 'find', 'before', 'one', 'whose', 'system', 'how', 'somewhere', 'with', 'thick', 'show', 'had', 'enough', 'should', 'to', 'must', 'whom', 'seeming', 'under', 'ours', 'has', 'might', 'thereafter', 'latterly', 'do', 'them', 'his', 'around', 'than', 'get', 'very', 'de', 'none', 'cannot', 'every', 'whether', 'they', 'front', 'during', 'thus', 'now', 'him', 'nor', 'name', 'several', 'hereafter', 'always', 'who', 'cry', 'whither', 'this', 'someone', 'either', 'each', 'become', 'thereupon', 'sometime', 'side', 'two', 'therein', 'twelve', 'because', 'often', 'ten', 'our', 'eg', 'some', 'back', 'up', 'namely', 'towards', 'are', 'further', 'beyond', 'ourselves', 'yet', 'out', 'even', 'will', 'what', 'still', 'for', 'bottom', 'mine', 'since', 'please', 'forty', 'per', 'its', 'everything', 'behind', 'un', 'above', 'between', 'it', 'neither', 'seemed', 'ever', 'across', 'she', 'somehow', 'be', 'we', 'full', 'never', 'sixty', 'however', 'here', 'otherwise', 'were', 'whereupon', 'nowhere', 'although', 'found', 'alone', 're', 'along', 'fifteen', 'by', 'both', 'about', 'last', 'would', 'anything', 'via', 'many', 'could', 'thence', 'put', 'against', 'keep', 'etc', 'amount', 'became', 'ltd', 'hence', 'onto', 'or', 
'con', 'among', 'already', 'co', 'afterwards', 'formerly', 'within', 'seems', 'into', 'others', 'while', 'whatever', 'except', 'down', 'hers', 'everyone', 'done', 'least', 'another', 'whoever', 'moreover', 'couldnt', 'throughout', 'anyhow', 'yourself', 'three', 'from', 'her', 'few', 'together', 'top', 'there', 'due', 'been', 'next', 'anyone', 'eleven', 'much', 'call', 'therefore', 'interest', 'then', 'thru', 'themselves', 'hundred', 'was', 'sincere', 'empty', 'more', 'himself', 'elsewhere', 'mostly', 'on', 'fire', 'becoming', 'hereby', 'amongst', 'else', 'part', 'everywhere', 'too', 'herself', 'former', 'those', 'he', 'me', 'myself', 'made', 'twenty', 'these', 'bill', 'cant', 'us', 'until', 'besides', 'nevertheless', 'below', 'anywhere', 'nine', 'can', 'of', 'your', 'toward', 'my', 'something', 'and', 'whereafter', 'whenever', 'give', 'almost', 'wherever', 'describe', 'beforehand', 'herein', 'an', 'as', 'itself', 'have', 'in', 'seem', 'whence', 'ie', 'any', 'fill', 'again', 'hasnt', 'inc', 'thereby', 'thin', 'no', 'perhaps', 'latter', 'meanwhile', 'when', 'detail', 'same', 'wherein', 'beside', 'also', 'that', 'other', 'take', 'which', 'becomes', 'you', 'if', 'nobody', 'see', 'though', 'may', 'after', 'upon', 'most', 'hereupon', 'eight', 'but', 'serious', 'nothing', 'such', 'why', 'a', 'off', 'whereby', 'third', 'whole', 'noone', 'sometimes', 'well', 'amoungst', 'yours', 'their', 'rather', 'without', 'so', 'five', 'the', 'first', 'whereas', 'once']\n", + "\n", + "# Write your code below\n", + "\n", + "# NOTE: I think there's an inconsistency in the question. 
\"is\", \"am\" and \"at\" appear in the original list of stop_words, so \n", + "# the final bag_of_words cannot be ['ironhack', 'is', 'cool', 'love', 'am', 'student', 'at'] I've manually removed \n", + "# \"is\", \"am\" and \"at\"from the list of stop_words so the output will be the expected according to the question\n", + "\n", + "new_bag_of_words = []\n", + "\n", + "for word in bag_of_words:\n", + " if word not in stop_words:\n", + " new_bag_of_words.append(word)\n", + " \n", + "term_freq = []\n", + "\n", + "for element in corpus:\n", + " split_element = element.split(\" \") \n", + " word_list = []\n", + " for word in new_bag_of_words:\n", + " word_list.append(split_element.count(word))\n", + " term_freq.append(word_list)\n", + " \n", + "print(term_freq)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Additional Challenge for the Nerds\n", + "\n", + "We will learn Scikit-Learn in Module 3 which has built in the BoW feature. Try to use Scikit-Learn to generate the BoW for this challenge and check whether the output is the same as yours. You will need to do some googling to find out how to use Scikit-Learn to generate BoW.\n", + "\n", + "**Notes:**\n", + "\n", + "* To install Scikit-Learn, use `pip install sklearn`. \n", + "\n", + "* Scikit-Learn removes stop words by default. You don't need to manually remove stop words.\n", + "\n", + "* Scikit-Learn's output has slightly different format from the output example demonstrated above. 
It's ok, you don't need to convert the Scikit-Learn output.\n", + "\n", + "The Scikit-Learn output will look like below:\n", + "\n", + "```python\n", + "# BoW:\n", + "{u'love': 5, u'ironhack': 3, u'student': 6, u'is': 4, u'cool': 2, u'am': 0, u'at': 1}\n", + "\n", + "# term_freq:\n", + "[[0 0 1 1 1 0 0]\n", + " [0 0 0 1 0 1 0]\n", + " [1 1 0 1 0 0 1]]\n", + " ```" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb index 4302084..58fa394 100644 --- a/your-code/challenge-1.ipynb +++ b/your-code/challenge-1.ipynb @@ -33,12 +33,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Durante un tiempo no estuvo segura de si su marido era su marido.\n" + ] + } + ], "source": [ "str_list = ['Durante', 'un', 'tiempo', 'no', 'estuvo', 'segura', 'de', 'si', 'su', 'marido', 'era', 'su', 'marido']\n", - "# Your code here:\n" + "# Your code here:\n", + "\n", + "my_string = \" \".join(str_list) + \".\"\n", + "print(my_string)\n" ] }, { @@ -50,12 +61,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Grocery list: bananas, bread, brownie mix, broccoli.\n" + ] + } + ], "source": [ "food_list = ['Bananas', 'Chocolate', 'bread', 'diapers', 'Ice Cream', 'Brownie Mix', 'broccoli']\n", - "# Your code here:\n" + "# Your code here:\n", + "\n", + "food_list_string = 
\"Grocery list:\"\n", + "\n", + "for food in food_list:\n", + " if food.lower()[0] == \"b\":\n", + " food_list_string += f\" {food.lower()},\"\n", + "\n", + "final_food_string = food_list_string[:-1]\n", + "\n", + "final_food_string += \".\"\n", + "\n", + "print(final_food_string)\n", + "\n" ] }, { @@ -69,9 +101,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The area of the circle with radius: 4.5 is: 63.61725123519331\n" + ] + } + ], "source": [ "import math\n", "\n", @@ -88,9 +128,14 @@ " # Sample Output: 78.53981633\n", " \n", " # Your code here:\n", + " area = pi*x**2\n", + " return area\n", + "\n", + "area = area(radius) \n", " \n", - " \n", - "# Your output string here:" + "# Your output string here:\n", + "\n", + "print(f\"{string1} {radius} {string2} {area}\")" ] }, { @@ -106,9 +151,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 62, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'with': 1, 'it': 1, 'i’ve': 1, 'who': 1, 'and': 1, 'say': 3, 'some': 2, 'if': 1, 'the': 1, 'perish': 1, 'think': 1, 'to': 2, 'enough': 1, 'desire': 1, 'tasted': 1, 'that': 1, 'but': 1, 'fire': 2, 'hold': 1, 'from': 1, 'great': 1, 'in': 2, 'what': 1, 'is': 1, 'end': 1, 'twice': 1, 'know': 1, 'will': 1, 'ice': 2, 'had': 1, 'would': 1, 'suffice': 1, 'those': 1, 'hate': 1, 'of': 2, 'for': 1, 'destruction': 1, 'i': 3, 'world': 1, 'favor': 1, 'also': 1}\n", + "{'with': 1, 'Some': 0, 'it': 1, 'I’ve': 0, 'who': 1, 'say': 3, 'if': 1, 'the': 1, 'perish': 1, 'But': 0, 'think': 1, 'to': 2, 'enough': 1, 'From': 0, 'desire': 1, 'tasted': 1, 'that': 1, 'Is': 0, 'fire': 2, 'hold': 1, 'To': 0, 'great': 1, 'in': 2, 'what': 1, 'I': 0, 'end': 1, 'twice': 1, 'know': 1, 'will': 1, 'ice': 2, 'had': 1, 'would': 1, 'suffice': 1, 'those': 1, 'hate': 1, 'of': 2, 
'And': 0, 'for': 1, 'destruction': 1, 'world': 1, 'favor': 1, 'also': 1}\n" + ] + } + ], "source": [ "poem = \"\"\"Some say the world will end in fire,\n", "Some say in ice.\n", @@ -120,7 +174,47 @@ "Is also great\n", "And would suffice.\"\"\"\n", "\n", - "# Your code here:\n" + "# Your code here:\n", + "\n", + "poem_no_commas = poem.split(\",\")\n", + "poem_no_commas_string = \"\".join([fragment for fragment in poem_no_commas])\n", + "\n", + "poem_no_periods = poem_no_commas_string.split(\".\")\n", + "poem_no_periods_string = \"\".join([fragment for fragment in poem_no_periods])\n", + "\n", + "poem_transform_newlines = poem_no_periods_string.replace(\"\\n\", \" \").lower()\n", + "\n", + "final_poem_no_spaces = poem_transform_newlines.split(\" \")\n", + "\n", + "word_list = [(word, final_poem_no_spaces.count(word)) for word in final_poem_no_spaces]\n", + "\n", + "word_list_set = set()\n", + "for word in word_list:\n", + " word_list_set.add(word)\n", + " \n", + "words_dict = {}\n", + "for word_tuple in word_list_set:\n", + " words_dict[word_tuple[0]]= word_tuple[1]\n", + " \n", + "print(words_dict)\n", + "\n", + "# Segunda opcion usando re.split()\n", + "import re\n", + "pattern = r\"[,\\.\\n\\s]\"\n", + "final_poem = re.split(pattern, poem)\n", + "\n", + "# juntar en un string para poner todo en lower\n", + "\n", + "\n", + "word_list_set = set()\n", + "for word in final_word_list:\n", + " word_list_set.add(word)\n", + " \n", + "words_dict = {}\n", + "for word_tuple in word_list_set:\n", + " words_dict[word_tuple[0]]= word_tuple[1]\n", + "\n", + "print(words_dict)\n" ] }, { @@ -132,9 +226,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 52, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['I', 'was', 'angry', 'with', 'my', 'friend', 'I', 'told', 'my', 'wrath', 'my', 'wrath', 'did', 'end', 'I', 'was', 'angry', 'with', 'my', 'foe:', 'I', 'told', 'not', 'my', 'wrath', 
'did', 'grow', 'And', 'I', 'waterd', 'fears', 'Night', '&', 'morning', 'with', 'my', 'tears:', 'And', 'I', 'sunned', 'with', 'smiles', 'And', 'with', 'soft', 'deceitful', 'wiles', 'And', 'grew', 'both', 'day', 'night', 'Till', 'bore', 'apple', 'bright', 'And', 'my', 'foe', 'beheld', 'shine', 'And', 'he', 'knew', 'that', 'was', 'mine', 'And', 'into', 'my', 'garden', 'stole', 'When', 'night', 'had', 'veild', 'pole', 'In', 'morning', 'glad', 'I', 'see', 'My', 'foe', 'outstretched', 'beneath', 'tree']\n" + ] + } + ], "source": [ "blacklist = ['and', 'as', 'an', 'a', 'the', 'in', 'it']\n", "\n", @@ -158,7 +260,13 @@ "In the morning glad I see; \n", "My foe outstretched beneath the tree.\"\"\"\n", "\n", - "# Your code here:\n" + "# Your code here:\n", + "import re\n", + "poem_just_words = re.split(r\"[;;\\.,\\s]\", poem)\n", + "poem_just_words_final = [word for word in poem_just_words if len(word) > 0]\n", + "\n", + "final_list = [word for word in poem_just_words_final if word not in blacklist]\n", + "print(final_list)\n" ] }, { @@ -172,14 +280,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 53, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['T', 'P']" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "poem = \"\"\"The apparition of these faces in the crowd;\n", "Petals on a wet, black bough.\"\"\"\n", "\n", - "# Your code here:\n" + "# Your code here:\n", + "pattern = r\"[A-Z]\"\n", + "result = re.findall(pattern, poem)\n", + "result\n" ] }, { @@ -191,13 +313,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 54, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['123abc', 'abc123', 'JohnSmith1', 'ABBY4']" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "data = ['123abc', 'abc123', 'JohnSmith1', 'ABBY4', 'JANE']\n", "\n", - 
"# Your code here:\n" + "# Your code here:\n", + "pattern = r\"[0-9]\"\n", + "result = [element for element in data if re.search(pattern, element)]\n", + "result\n" ] }, { @@ -213,13 +349,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 56, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['123abc', 'abc123', 'JohnSmith1']" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "data = ['123abc', 'abc123', 'JohnSmith1', 'ABBY4', 'JANE']\n", - "# Your code here:\n" + "# Your code here:\n", + "pattern = r\"[a-z][0-9]|[0-9][a-z]\"\n", + "result = [element for element in data if re.search(pattern, element)]\n", + "result\n" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -238,7 +395,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.2" + "version": "3.7.4" } }, "nbformat": 4, diff --git a/your-code/challenge-2.ipynb b/your-code/challenge-2.ipynb index 87c5656..c8dc1b3 100644 --- a/your-code/challenge-2.ipynb +++ b/your-code/challenge-2.ipynb @@ -72,7 +72,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -88,13 +88,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "corpus = []\n", "\n", - "# Write your code here\n" + "# Write your code here\n", + "for doc in docs:\n", + " with open (doc,\"r\") as f:\n", + " corpus.append(f.read())" ] }, { @@ -106,9 +109,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['I love Ironhack.', 'ironhack is cool', 'i am a student at ironhack']\n" + ] + } + ], "source": [ "print(corpus)" ] @@ 
-136,11 +147,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['ironhack is cool', 'i love ironhack', 'i am a student at ironhack']\n" + ] + } + ], "source": [ - "# Write your code here" + "# Write your code here\n", + "\n", + "for element in corpus:\n", + " element1 = element.lower().replace(\".\", \"\")\n", + " corpus.remove(element)\n", + " corpus.append(element1)\n", + " \n", + "print(corpus)\n", + "\n", + "\n" ] }, { @@ -152,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -172,11 +200,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ - "# Write your code here" + "# Write your code here\n", + "for element in corpus:\n", + " terms_array = element.split(\" \")\n", + " for term in terms_array:\n", + " if term not in bag_of_words:\n", + " bag_of_words.append(term)" ] }, { @@ -192,9 +225,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['ironhack', 'is', 'cool', 'i', 'love', 'am', 'a', 'student', 'at']\n" + ] + } + ], "source": [ "print(bag_of_words)" ] @@ -208,13 +249,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "term_freq = []\n", "\n", - "# Write your code here" + "# Write your code here\n", + "\n", + "for element in corpus:\n", + " split_element = element.split(\" \") \n", + " word_list = []\n", + " for word in bag_of_words:\n", + " word_list.append(split_element.count(word))\n", + " term_freq.append(word_list)" ] }, { @@ -228,9 +276,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": 
{}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 0, 0, 1, 1, 0, 0, 0, 0], [1, 0, 0, 1, 0, 1, 1, 1, 1]]\n" + ] + } + ], "source": [ "print(term_freq)" ] @@ -278,13 +334,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[1, 1, 1, 0, 0, 0, 0], [1, 0, 0, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1, 1]]\n" + ] + } + ], "source": [ - "stop_words = ['all', 'six', 'less', 'being', 'indeed', 'over', 'move', 'anyway', 'fifty', 'four', 'not', 'own', 'through', 'yourselves', 'go', 'where', 'mill', 'only', 'find', 'before', 'one', 'whose', 'system', 'how', 'somewhere', 'with', 'thick', 'show', 'had', 'enough', 'should', 'to', 'must', 'whom', 'seeming', 'under', 'ours', 'has', 'might', 'thereafter', 'latterly', 'do', 'them', 'his', 'around', 'than', 'get', 'very', 'de', 'none', 'cannot', 'every', 'whether', 'they', 'front', 'during', 'thus', 'now', 'him', 'nor', 'name', 'several', 'hereafter', 'always', 'who', 'cry', 'whither', 'this', 'someone', 'either', 'each', 'become', 'thereupon', 'sometime', 'side', 'two', 'therein', 'twelve', 'because', 'often', 'ten', 'our', 'eg', 'some', 'back', 'up', 'namely', 'towards', 'are', 'further', 'beyond', 'ourselves', 'yet', 'out', 'even', 'will', 'what', 'still', 'for', 'bottom', 'mine', 'since', 'please', 'forty', 'per', 'its', 'everything', 'behind', 'un', 'above', 'between', 'it', 'neither', 'seemed', 'ever', 'across', 'she', 'somehow', 'be', 'we', 'full', 'never', 'sixty', 'however', 'here', 'otherwise', 'were', 'whereupon', 'nowhere', 'although', 'found', 'alone', 're', 'along', 'fifteen', 'by', 'both', 'about', 'last', 'would', 'anything', 'via', 'many', 'could', 'thence', 'put', 'against', 'keep', 'etc', 'amount', 'became', 'ltd', 'hence', 'onto', 'or', 'con', 'among', 'already', 'co', 'afterwards', 'formerly', 
'within', 'seems', 'into', 'others', 'while', 'whatever', 'except', 'down', 'hers', 'everyone', 'done', 'least', 'another', 'whoever', 'moreover', 'couldnt', 'throughout', 'anyhow', 'yourself', 'three', 'from', 'her', 'few', 'together', 'top', 'there', 'due', 'been', 'next', 'anyone', 'eleven', 'much', 'call', 'therefore', 'interest', 'then', 'thru', 'themselves', 'hundred', 'was', 'sincere', 'empty', 'more', 'himself', 'elsewhere', 'mostly', 'on', 'fire', 'am', 'becoming', 'hereby', 'amongst', 'else', 'part', 'everywhere', 'too', 'herself', 'former', 'those', 'he', 'me', 'myself', 'made', 'twenty', 'these', 'bill', 'cant', 'us', 'until', 'besides', 'nevertheless', 'below', 'anywhere', 'nine', 'can', 'of', 'your', 'toward', 'my', 'something', 'and', 'whereafter', 'whenever', 'give', 'almost', 'wherever', 'is', 'describe', 'beforehand', 'herein', 'an', 'as', 'itself', 'at', 'have', 'in', 'seem', 'whence', 'ie', 'any', 'fill', 'again', 'hasnt', 'inc', 'thereby', 'thin', 'no', 'perhaps', 'latter', 'meanwhile', 'when', 'detail', 'same', 'wherein', 'beside', 'also', 'that', 'other', 'take', 'which', 'becomes', 'you', 'if', 'nobody', 'see', 'though', 'may', 'after', 'upon', 'most', 'hereupon', 'eight', 'but', 'serious', 'nothing', 'such', 'why', 'a', 'off', 'whereby', 'third', 'i', 'whole', 'noone', 'sometimes', 'well', 'amoungst', 'yours', 'their', 'rather', 'without', 'so', 'five', 'the', 'first', 'whereas', 'once']\n", + "stop_words = ['all', 'six', 'less', 'being', 'indeed', 'over', 'i', 'move', 'anyway', 'fifty', 'four', 'not', 'own', 'through', 'yourselves', 'go', 'where', 'mill', 'only', 'find', 'before', 'one', 'whose', 'system', 'how', 'somewhere', 'with', 'thick', 'show', 'had', 'enough', 'should', 'to', 'must', 'whom', 'seeming', 'under', 'ours', 'has', 'might', 'thereafter', 'latterly', 'do', 'them', 'his', 'around', 'than', 'get', 'very', 'de', 'none', 'cannot', 'every', 'whether', 'they', 'front', 'during', 'thus', 'now', 'him', 'nor', 'name', 'several', 
'hereafter', 'always', 'who', 'cry', 'whither', 'this', 'someone', 'either', 'each', 'become', 'thereupon', 'sometime', 'side', 'two', 'therein', 'twelve', 'because', 'often', 'ten', 'our', 'eg', 'some', 'back', 'up', 'namely', 'towards', 'are', 'further', 'beyond', 'ourselves', 'yet', 'out', 'even', 'will', 'what', 'still', 'for', 'bottom', 'mine', 'since', 'please', 'forty', 'per', 'its', 'everything', 'behind', 'un', 'above', 'between', 'it', 'neither', 'seemed', 'ever', 'across', 'she', 'somehow', 'be', 'we', 'full', 'never', 'sixty', 'however', 'here', 'otherwise', 'were', 'whereupon', 'nowhere', 'although', 'found', 'alone', 're', 'along', 'fifteen', 'by', 'both', 'about', 'last', 'would', 'anything', 'via', 'many', 'could', 'thence', 'put', 'against', 'keep', 'etc', 'amount', 'became', 'ltd', 'hence', 'onto', 'or', 'con', 'among', 'already', 'co', 'afterwards', 'formerly', 'within', 'seems', 'into', 'others', 'while', 'whatever', 'except', 'down', 'hers', 'everyone', 'done', 'least', 'another', 'whoever', 'moreover', 'couldnt', 'throughout', 'anyhow', 'yourself', 'three', 'from', 'her', 'few', 'together', 'top', 'there', 'due', 'been', 'next', 'anyone', 'eleven', 'much', 'call', 'therefore', 'interest', 'then', 'thru', 'themselves', 'hundred', 'was', 'sincere', 'empty', 'more', 'himself', 'elsewhere', 'mostly', 'on', 'fire', 'becoming', 'hereby', 'amongst', 'else', 'part', 'everywhere', 'too', 'herself', 'former', 'those', 'he', 'me', 'myself', 'made', 'twenty', 'these', 'bill', 'cant', 'us', 'until', 'besides', 'nevertheless', 'below', 'anywhere', 'nine', 'can', 'of', 'your', 'toward', 'my', 'something', 'and', 'whereafter', 'whenever', 'give', 'almost', 'wherever', 'describe', 'beforehand', 'herein', 'an', 'as', 'itself', 'have', 'in', 'seem', 'whence', 'ie', 'any', 'fill', 'again', 'hasnt', 'inc', 'thereby', 'thin', 'no', 'perhaps', 'latter', 'meanwhile', 'when', 'detail', 'same', 'wherein', 'beside', 'also', 'that', 'other', 'take', 'which', 'becomes', 
'you', 'if', 'nobody', 'see', 'though', 'may', 'after', 'upon', 'most', 'hereupon', 'eight', 'but', 'serious', 'nothing', 'such', 'why', 'a', 'off', 'whereby', 'third', 'whole', 'noone', 'sometimes', 'well', 'amoungst', 'yours', 'their', 'rather', 'without', 'so', 'five', 'the', 'first', 'whereas', 'once']\n", + "\n", + "# Write your code below\n", + "\n", + "# NOTE: I think there's an inconsistency in the question. \"is\", \"am\" and \"at\" appear in the original list of stop_words, so \n", + "# the final bag_of_words cannot be ['ironhack', 'is', 'cool', 'love', 'am', 'student', 'at'] I've manually removed \n", + "# \"is\", \"am\" and \"at\"from the list of stop_words so the output will be the expected according to the question\n", + "\n", + "new_bag_of_words = []\n", + "\n", + "for word in bag_of_words:\n", + " if word not in stop_words:\n", + " new_bag_of_words.append(word)\n", + " \n", + "term_freq = []\n", "\n", - "# Write your code below\n" + "for element in corpus:\n", + " split_element = element.split(\" \") \n", + " word_list = []\n", + " for word in new_bag_of_words:\n", + " word_list.append(split_element.count(word))\n", + " term_freq.append(word_list)\n", + " \n", + "print(term_freq)\n" ] }, { @@ -333,7 +418,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.2" + "version": "3.7.4" } }, "nbformat": 4, From a036409a9f6377c70e6c97ab5b95df46cfeb55a3 Mon Sep 17 00:00:00 2001 From: TGLSpain Date: Fri, 15 Jul 2022 11:13:40 -0300 Subject: [PATCH 2/2] second commit --- .../challenge-1-checkpoint.ipynb | 23 +++++++++---------- your-code/challenge-1.ipynb | 23 +++++++++---------- 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/your-code/.ipynb_checkpoints/challenge-1-checkpoint.ipynb b/your-code/.ipynb_checkpoints/challenge-1-checkpoint.ipynb index 58fa394..8ec5889 100644 --- a/your-code/.ipynb_checkpoints/challenge-1-checkpoint.ipynb +++ 
b/your-code/.ipynb_checkpoints/challenge-1-checkpoint.ipynb @@ -151,15 +151,15 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'with': 1, 'it': 1, 'i’ve': 1, 'who': 1, 'and': 1, 'say': 3, 'some': 2, 'if': 1, 'the': 1, 'perish': 1, 'think': 1, 'to': 2, 'enough': 1, 'desire': 1, 'tasted': 1, 'that': 1, 'but': 1, 'fire': 2, 'hold': 1, 'from': 1, 'great': 1, 'in': 2, 'what': 1, 'is': 1, 'end': 1, 'twice': 1, 'know': 1, 'will': 1, 'ice': 2, 'had': 1, 'would': 1, 'suffice': 1, 'those': 1, 'hate': 1, 'of': 2, 'for': 1, 'destruction': 1, 'i': 3, 'world': 1, 'favor': 1, 'also': 1}\n", - "{'with': 1, 'Some': 0, 'it': 1, 'I’ve': 0, 'who': 1, 'say': 3, 'if': 1, 'the': 1, 'perish': 1, 'But': 0, 'think': 1, 'to': 2, 'enough': 1, 'From': 0, 'desire': 1, 'tasted': 1, 'that': 1, 'Is': 0, 'fire': 2, 'hold': 1, 'To': 0, 'great': 1, 'in': 2, 'what': 1, 'I': 0, 'end': 1, 'twice': 1, 'know': 1, 'will': 1, 'ice': 2, 'had': 1, 'would': 1, 'suffice': 1, 'those': 1, 'hate': 1, 'of': 2, 'And': 0, 'for': 1, 'destruction': 1, 'world': 1, 'favor': 1, 'also': 1}\n" + "{'had': 1, 'desire': 1, 'tasted': 1, 'i': 3, 'also': 1, 'hate': 1, 'the': 1, 'in': 2, 'those': 1, 'and': 1, 'to': 2, 'hold': 1, 'say': 3, 'think': 1, 'enough': 1, 'would': 1, 'for': 1, 'i’ve': 1, 'what': 1, 'who': 1, 'if': 1, 'with': 1, 'it': 1, 'fire': 2, 'is': 1, 'world': 1, 'from': 1, 'favor': 1, 'perish': 1, 'but': 1, 'destruction': 1, 'great': 1, 'suffice': 1, 'end': 1, 'of': 2, 'know': 1, 'ice': 2, 'that': 1, 'twice': 1, 'some': 2, 'will': 1}\n", + "{'had': 1, 'desire': 1, 'tasted': 1, 'i': 3, 'also': 1, 'hate': 1, 'the': 1, 'in': 2, 'those': 1, 'and': 1, 'to': 2, 'hold': 1, 'say': 3, 'think': 1, 'enough': 1, 'would': 1, 'for': 1, 'i’ve': 1, 'what': 1, 'who': 1, 'if': 1, 'with': 1, 'it': 1, 'fire': 2, 'is': 1, 'world': 1, 'from': 1, 'favor': 1, 'perish': 1, 'but': 1, 'destruction': 1, 'great': 1, 'suffice': 
1, 'end': 1, 'of': 2, 'know': 1, 'ice': 2, 'that': 1, 'twice': 1, 'some': 2, 'will': 1}\n" ] } ], @@ -178,7 +178,6 @@ "\n", "poem_no_commas = poem.split(\",\")\n", "poem_no_commas_string = \"\".join([fragment for fragment in poem_no_commas])\n", - "\n", "poem_no_periods = poem_no_commas_string.split(\".\")\n", "poem_no_periods_string = \"\".join([fragment for fragment in poem_no_periods])\n", "\n", @@ -186,11 +185,11 @@ "\n", "final_poem_no_spaces = poem_transform_newlines.split(\" \")\n", "\n", - "word_list = [(word, final_poem_no_spaces.count(word)) for word in final_poem_no_spaces]\n", + "word_counts = [(word, final_poem_no_spaces.count(word)) for word in final_poem_no_spaces]\n", "\n", "word_list_set = set()\n", - "for word in word_list:\n", - " word_list_set.add(word)\n", + "for word_count in word_counts:\n", + " word_list_set.add(word_count)\n", " \n", "words_dict = {}\n", "for word_tuple in word_list_set:\n", @@ -201,14 +200,14 @@ "# Segunda opcion usando re.split()\n", "import re\n", "pattern = r\"[,\\.\\n\\s]\"\n", - "final_poem = re.split(pattern, poem)\n", - "\n", - "# juntar en un string para poner todo en lower\n", + "patterned_poem = re.split(pattern, poem)\n", + "final_poem = [element.lower() for element in patterned_poem if element != \"\"]\n", "\n", + "word_counts = [(word, final_poem.count(word)) for word in final_poem]\n", "\n", "word_list_set = set()\n", - "for word in final_word_list:\n", - " word_list_set.add(word)\n", + "for word_count in word_counts:\n", + " word_list_set.add(word_count)\n", " \n", "words_dict = {}\n", "for word_tuple in word_list_set:\n", diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb index 58fa394..8ec5889 100644 --- a/your-code/challenge-1.ipynb +++ b/your-code/challenge-1.ipynb @@ -151,15 +151,15 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'with': 1, 'it': 1, 'i’ve': 1, 'who': 
1, 'and': 1, 'say': 3, 'some': 2, 'if': 1, 'the': 1, 'perish': 1, 'think': 1, 'to': 2, 'enough': 1, 'desire': 1, 'tasted': 1, 'that': 1, 'but': 1, 'fire': 2, 'hold': 1, 'from': 1, 'great': 1, 'in': 2, 'what': 1, 'is': 1, 'end': 1, 'twice': 1, 'know': 1, 'will': 1, 'ice': 2, 'had': 1, 'would': 1, 'suffice': 1, 'those': 1, 'hate': 1, 'of': 2, 'for': 1, 'destruction': 1, 'i': 3, 'world': 1, 'favor': 1, 'also': 1}\n", - "{'with': 1, 'Some': 0, 'it': 1, 'I’ve': 0, 'who': 1, 'say': 3, 'if': 1, 'the': 1, 'perish': 1, 'But': 0, 'think': 1, 'to': 2, 'enough': 1, 'From': 0, 'desire': 1, 'tasted': 1, 'that': 1, 'Is': 0, 'fire': 2, 'hold': 1, 'To': 0, 'great': 1, 'in': 2, 'what': 1, 'I': 0, 'end': 1, 'twice': 1, 'know': 1, 'will': 1, 'ice': 2, 'had': 1, 'would': 1, 'suffice': 1, 'those': 1, 'hate': 1, 'of': 2, 'And': 0, 'for': 1, 'destruction': 1, 'world': 1, 'favor': 1, 'also': 1}\n" + "{'had': 1, 'desire': 1, 'tasted': 1, 'i': 3, 'also': 1, 'hate': 1, 'the': 1, 'in': 2, 'those': 1, 'and': 1, 'to': 2, 'hold': 1, 'say': 3, 'think': 1, 'enough': 1, 'would': 1, 'for': 1, 'i’ve': 1, 'what': 1, 'who': 1, 'if': 1, 'with': 1, 'it': 1, 'fire': 2, 'is': 1, 'world': 1, 'from': 1, 'favor': 1, 'perish': 1, 'but': 1, 'destruction': 1, 'great': 1, 'suffice': 1, 'end': 1, 'of': 2, 'know': 1, 'ice': 2, 'that': 1, 'twice': 1, 'some': 2, 'will': 1}\n", + "{'had': 1, 'desire': 1, 'tasted': 1, 'i': 3, 'also': 1, 'hate': 1, 'the': 1, 'in': 2, 'those': 1, 'and': 1, 'to': 2, 'hold': 1, 'say': 3, 'think': 1, 'enough': 1, 'would': 1, 'for': 1, 'i’ve': 1, 'what': 1, 'who': 1, 'if': 1, 'with': 1, 'it': 1, 'fire': 2, 'is': 1, 'world': 1, 'from': 1, 'favor': 1, 'perish': 1, 'but': 1, 'destruction': 1, 'great': 1, 'suffice': 1, 'end': 1, 'of': 2, 'know': 1, 'ice': 2, 'that': 1, 'twice': 1, 'some': 2, 'will': 1}\n" ] } ], @@ -178,7 +178,6 @@ "\n", "poem_no_commas = poem.split(\",\")\n", "poem_no_commas_string = \"\".join([fragment for fragment in poem_no_commas])\n", - "\n", "poem_no_periods = 
poem_no_commas_string.split(\".\")\n", "poem_no_periods_string = \"\".join([fragment for fragment in poem_no_periods])\n", "\n", @@ -186,11 +185,11 @@ "\n", "final_poem_no_spaces = poem_transform_newlines.split(\" \")\n", "\n", - "word_list = [(word, final_poem_no_spaces.count(word)) for word in final_poem_no_spaces]\n", + "word_counts = [(word, final_poem_no_spaces.count(word)) for word in final_poem_no_spaces]\n", "\n", "word_list_set = set()\n", - "for word in word_list:\n", - " word_list_set.add(word)\n", + "for word_count in word_counts:\n", + " word_list_set.add(word_count)\n", " \n", "words_dict = {}\n", "for word_tuple in word_list_set:\n", @@ -201,14 +200,14 @@ "# Segunda opcion usando re.split()\n", "import re\n", "pattern = r\"[,\\.\\n\\s]\"\n", - "final_poem = re.split(pattern, poem)\n", - "\n", - "# juntar en un string para poner todo en lower\n", + "patterned_poem = re.split(pattern, poem)\n", + "final_poem = [element.lower() for element in patterned_poem if element != \"\"]\n", "\n", + "word_counts = [(word, final_poem.count(word)) for word in final_poem]\n", "\n", "word_list_set = set()\n", - "for word in final_word_list:\n", - " word_list_set.add(word)\n", + "for word_count in word_counts:\n", + " word_list_set.add(word_count)\n", " \n", "words_dict = {}\n", "for word_tuple in word_list_set:\n",