From 0e07b362c98d7ff3515ace51c43ff07eb192bccb Mon Sep 17 00:00:00 2001 From: Irving Palacios Date: Sun, 23 May 2021 23:18:18 -0500 Subject: [PATCH 1/3] [Proyecto Web Scrapping] Irving Palacios --- Datos_Cravioto.csv | 96 +++++++++++++++++ Intento MIERCOLES.ipynb | 229 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 325 insertions(+) create mode 100644 Datos_Cravioto.csv create mode 100644 Intento MIERCOLES.ipynb diff --git a/Datos_Cravioto.csv b/Datos_Cravioto.csv new file mode 100644 index 0000000..ee5dc9b --- /dev/null +++ b/Datos_Cravioto.csv @@ -0,0 +1,96 @@ +Articulo,Precio,URL Link +"ABRAZADERA MINI SIN FIN A INOX 10-16mm 3/8-5/8"" C/10 PZAS FIERO AB-04",$46.00, +"ABRAZADERA MINI SIN FIN A INOX 13-19mm 1/2-3/4"" C/10 PZAS FIERO AB-06",$46.00, +"ABRAZADERA SIN FIN A INOX 10-16mm 3/8-5/8"" C/10 PZAS FIERO AB-4",$76.00, +"ABRAZADERA SIN FIN A INOX 13-19mm 1/2-3/4"" C/10 PZAS FIERO AB-6",$76.00, +"ABRAZADERA SIN FIN A INOX 16-23mm 5/8-1"" C/10 PZAS FIERO AB-8",$84.00, +"ABRAZADERA SIN FIN A INOX 16-32mm 5/8-1 1/4"" C/10 PZAS FIERO AB-12",$84.00, +"ABRAZADERA SIN FIN A INOX 19-27mm 3/4-1 1/8"" C/10 PZAS FIERO AB-10",$84.00, +"ABRAZADERA SIN FIN A INOX 19-38mm 3/4-1 1/2"" C/10 PZAS FIERO AB-16",$97.00, +ACEITE LUBRICANTE P/MOTORES A 2 TIEMPOS LIQUIDO 16 Oz TRUPER ACT-2T-16,$109.00, +ACEITE LUBRICANTE P/MOTORES A 2 TIEMPOS LIQUIDO 4 Oz TRUPER ACT-2T-4,$35.00, +"ADAPTADOR RCA EXT ESPIGA GALVANIZADO 19mm 3/4""",$15.70, +"ADAPTADOR RCA EXT ESPIGA GALVANIZADO 25mm 1""",$24.17, +,$18.58, +APAGADOR DE PASO CAFÉ BAQUELITA ROYER 228,$89.00, +ARCO PRETUL APS-12 DE SOLERA,$38.23, +"BRIDA GALVANIZADA 19mm 3/4"" ARXFLUX",$32.00, +BROCHA 100mm 4 MANGO DE PLASTICO PRETUL BRP-4,$39.00, +BROCHA 125mm 5 MANGO DE PLASTICO PRETUL BRP-5,$6.00, +BROCHA 13mm 1/2 MANGO DE PLASTICO PRETUL BRP-1/2,$43.00, +BROCHA 150mm 6 MANGO DE PLASTICO PRETUL BRP-6,$8.50, +BROCHA 25mm 1 MANGO DE PLASTICO PRETUL BRP-1,$10.00, +BROCHA 38mm 1 1/2 MANGO DE PLASTICO PRETUL BRP-1 1/2,$13.50, +BROCHA 
51mm 2 MANGO DE PLASTICO PRETUL BRP-2,$16.00, +BROCHA 63mm 2 1/2 MANGO DE PLASTICO PRETUL BRP-2 1/2,$20.00, +BROCHA 76mm 3 MANGO DE PLASTICO PRETUL BRP-3,$3.79, +,$655.52, +"CAJA CUADRADA GALVANIZADA STD. 4x4 S/TAPA KNOCK OUTS 1/2""",$78.00, +CALENTADOR DE DEPOSITO 40Lts LEÑA CORONA C/18,$73.13, +CAUTIN PRETUL CAU-30P TIPO LAPIZ 30W,$20.00, +CESPOL P/LAVABO FLEXIBLE CHICO 2218 FLEXIMATIC,$20.00, +CINTA ADHESIVA CANELA 48mm x 50mts TRUPER CCA-50,$24.00, +CINTA ADHESIVA TRANSPARENTE 48mm x 50mts TRUPER CTR-50,$32.00, +"CINTA MASKING TAPE 19mm 3/4"" x 50mts TRUPER MSK-3/4",$24.23, +"CINTA MASKING TAPE 25mm 1"" x 50mts TRUPER MSK-1",$9.01, +CLAVIJA MOVIL T/BISAGRA 502 MARFIL ROYER,$13.40, +CLAVIJA SENCILLA 2P REDONDA NEGRA PVC IUSA 416,$63.00, +"CONECTOR HEMBRA/MACHO MOVIBLE P/MANGUERA NYLON 6 10mm 3/8"" EDOMEX",$12.77, +CONECTOR HEMBRA/MACHO SIST.CLICK P/MANGUERA TRUPER CLIK-SET,$49.00, +,$176.15, +CONTACTO SENCILLO 1MOD 2P NEGRO COLGANTE ROYER 504,$354.37, +"CUCHARA PARA JARDIN 6"" CON MANGO DE MADERA GTS-SH TRUPER",$24.65, +DISCO DIAMANTE RIN CONTINUO P/LOSETA 4 AUSTROMEX 1501,$32.00, +DISCO DIAMANTE RIN SEGMENTADO P/CONCRETO 7 AUSTROMEX 1507,$30.00, +FLEXICO P/LAVABO 40cm 13-13mm 1/2x1/2 MAXIFLEX 25ME RUGO,$49.00, +FLEXOMETRO PRETUL PRO-3MEB 3 MTS C/BLISTER,$47.00, +FLEXOMETRO PRETUL PRO-3MEC 3 MTS EN COLORES,$29.00, +FLEXOMETRO PRETUL PRO-5MEB 5MTS C/BLISTER,$29.00, +FLEXOMETRO PRETUL PRO-5MEC 5 MTS EN COLORES,$15.56, +GUANTES DE LATEX PARA LIMPIEZA PUÑO LARGO TALLA GRANDE GU-313 TRUPER,$128.28, +GUANTES DE LATEX PARA LIMPIEZA PUÑO LARGO TALLA MEDIANA GU-312 TRUPER,$89.00, +GUIA DE ACERO P/CABLE 20.00mt,$291.22, +,$371.78, +INTERRUPTOR DE SEGURIDAD 2x30amp 2000 ROYER,$371.78, +JUEGO DE LLAVES TORX 8 PIEZAS TIPO NAVAJA CUERPO DE LAMINA TORX-8 TRUPER,$371.78, +KIT DE ACCESORIOS P/TANQUE BAJO SIST. 
STD PVC FLUIDMASTER FM200AK,$371.78, +LIJA DE AGUA G1000 FAJILLA 25pz A-99 FANDELI 05525,$30.00, +LIJA DE AGUA G1200 FAJILLA 25pz A-99 FANDELI 00048,$505.29, +LIJA DE AGUA G1500 FAJILLA 25pz A-99 FANDELI 00049,$125.00, +LIJA DE AGUA G2000 FAJILLA 25pz A-99 FANDELI 10965,$89.00, +"LLAVE DE GAS 10MM 7/8"" LL-GA-P PRETUL PAVONADA",$309.82, +LLAVE PARA EMPOTRAR Jgo SOLDABLE P/REG S/MAN CROMO URREA 652,$596.43, +"LLAVE PERICO PRETUL PET-10PB CROMADO 10"" C/BLISTER","$1,168.19", +"LLAVE PERICO PRETUL PET-8PB CROMADO 8"" C/BLISTER","$1,692.53", +MANERAL JUEGO JGO QUEEN GRANDE URREA QG,$16.00, +,$72.00, +"MANGUERA TRAMADA VERDE P/AGUA 13mm 1/2"" X100m STV.1/2.100",$40.09, +"MANGUERA TRAMADA VERDE P/AGUA 16mm 5/8""x100m TV.5/8.100",$65.01, +"MANGUERA TRAMADA VERDE P/AGUA 19mm 3/4""x100m TV.3/4.100",$89.73, +"NAVAJA PRETUL CUT-6PB CUTTER PLASTICO 6""",$142.29, +PEGAMENTO BLANCO BOTELLA 1 Kg FURIA,$506.91, +PEGAMENTO DE CONTACTO AMARILLO P/USO GENERAL LATA .135ml FURIA,$21.37, +PEGAMENTO DE CONTACTO AMARILLO P/USO GENERAL LATA .250ml FURIA,$24.51, +PEGAMENTO DE CONTACTO AMARILLO P/USO GENERAL LATA .500ml FURIA,$192.37, +PEGAMENTO DE CONTACTO AMARILLO P/USO GENERAL LATA 1 Lts FURIA,$333.80, +PEGAMENTO DE CONTACTO AMARILLO P/USO GENERAL LATA 4 Lts FURIA,$58.31, +PEGAMENTO INSTANTANEO USO GENERAL APLICADOR 2gr KRAZY KOLA LOKA,$228.64, +PEGAMENTO INSTANTANEO USO GENERAL BOTELLA C/BROCHA 5gr KRAZY KOLA LOKA,$45.19, +,$325.00, +PEGAMENTO PARA PVC RIG TODA PRESION HIDRAULICO LATA 475ml TANGIT,$19.16, +PEGAMENTO PARA PVC RIG TODA PRESION HIDRAULICO LATA 950ml TANGIT,$47.50, +SAPO P/WC PVC ROJO FM501 FLUIDMASTER,$96.45, +SOLDADURA SOLIDA 1/2-1/2 VERDE CARRETE 3mm 450gr OMEGA,$232.86, +SOPORTE P/LAVABO UNIVERSAL ALUMINIO TRES UÑAS,$292.92, +TALACHO TRUPER TP-5MX C/PICO C/MANGO 5 LB,$338.28, +"TUERCA CONICA C/ESPIGA LATON 10mm 3/8"" NACOBRE",$88.32, +VALVULA DE AGUJA 10-10mm 3/8 x3/8 FLARE-FLARE VRJ,$85.13, +VALVULA DE DESCARGA P/TANQUE BAJO PVC FLUIDMASTER FM507A,$80.16, +"VALVULA DE 
GLOBO ROSCADA 19mm 3/4"" LATON ALTA PRESION HARPER WYMAN",$72.26, +"VALVULA DE LLENADO P/TANQUE ESTACIONARIO 32mm 1 1/4"" HARPER WYMAN IUSA",$96.74, +"VALVULA DE LLENADO PARA TANQUE ESTACIONARIO T.E.2.""A"" INGUSA",$140.71, +,$38.44, +VALVULA DE PASO 10-10mm 3/8x3/8 FLARE-FLARE C/2 TCAS HARPER WYMAN,$38.73, +VALVULA DE PASO 10-13mm 3/8x1/2 FLARE-MNPT C/1 TCA HARPER WYMAN,$12.46, +VALVULA DE PASO 10-13mm 3/8x1/2 FLARE-SOLD C/1 TCA HARPER WYMAN,$120.51, +VALVULA DE PASO 13-13mm 1/2x1/2 SOLD-SOLD S/TCAS HARPER WYMAN,$186.41, diff --git a/Intento MIERCOLES.ipynb b/Intento MIERCOLES.ipynb new file mode 100644 index 0000000..b7773ed --- /dev/null +++ b/Intento MIERCOLES.ipynb @@ -0,0 +1,229 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running the INIT\n", + "\n", + "Esto es el kickstart\n", + "\n", + "\n", + "Running the request\n", + "Esto es el quotes parser\n", + "Esto es la funcion Output_results\n", + "\n", + "\n", + "[[]]\n", + "\n", + "\n", + "Esto es despues de r\n", + "\n", + "Scrapped the page number 1 \n", + "\n", + "Running the request\n", + "Esto es el quotes parser\n", + "Esto es la funcion Output_results\n", + "\n", + "\n", + "[[], []]\n", + "\n", + "\n", + "Esto es despues de r\n", + "\n", + "Scrapped the page number 2 \n", + "\n", + "Running the request\n", + "Esto es el quotes parser\n", + "Esto es la funcion Output_results\n", + "\n", + "\n", + "[[], [], []]\n", + "\n", + "\n", + "Esto es despues de r\n", + "\n", + "Scrapped the page number 3 \n", + "\n", + "Running the request\n", + "Esto es el quotes parser\n", + "Esto es la funcion Output_results\n", + "\n", + "\n", + "[[], [], [], []]\n", + "\n", + "\n", + "Esto es despues de r\n", + "\n", + "Scrapped the page number 4 \n", + "\n", + "Running the request\n", + "Esto es el quotes parser\n", + "Esto es la funcion Output_results\n", + "\n", + "\n", + "[[], [], [], [], []]\n", + 
"\n", + "\n", + "Esto es despues de r\n", + "\n", + "Scrapped the page number 5 \n", + "\n", + "Running the request\n", + "Esto es el quotes parser\n", + "Esto es la funcion Output_results\n", + "\n", + "\n", + "[[], [], [], [], [], []]\n", + "\n", + "\n", + "Esto es despues de r\n", + "\n", + "Scrapped the page number 6 \n", + "\n", + "Running the request\n", + "Esto es el quotes parser\n", + "Esto es la funcion Output_results\n", + "\n", + "\n", + "[[], [], [], [], [], [], []]\n", + "\n", + "\n", + "Esto es despues de r\n", + "\n", + "Scrapped the page number 7 \n", + "\n", + "Running the request\n", + "Esto es el quotes parser\n", + "Esto es la funcion Output_results\n", + "\n", + "\n", + "[[], [], [], [], [], [], [], []]\n", + "\n", + "\n", + "Esto es despues de r\n", + "\n", + "Scrapped the page number 8 \n", + "\n", + "Esto es el final del kickstart\n", + "\n" + ] + } + ], + "source": [ + "import requests, sys, time, random\n", + "import pandas as pd\n", + "from bs4 import BeautifulSoup\n", + "\n", + "\n", + "name_list=[]\n", + "price_list =[]\n", + "url_list=[]\n", + "name_final=[]\n", + "\n", + "\n", + "class Euroelect:\n", + " \n", + " def __init__(self, url_pattern_1, pages_to_scrape=10, sleep_interval=-1, content_parser=None):\n", + " print(\"Running the INIT\\n\")\n", + " self.url_pattern = url_pattern_1\n", + " self.pages_to_scrape = pages_to_scrape\n", + " self.sleep_interval = sleep_interval\n", + " self.content_parser = content_parser\n", + " \n", + " \n", + " def scrape_url(self, url):\n", + " print(\"Running the request\")\n", + " response = requests.get(url)\n", + " result = self.content_parser(response.content)\n", + " self.output_results(result)\n", + "\n", + " def output_results(self, r):\n", + " \n", + " print(\"Esto es la funcion Output_results\\n\\n\")\n", + " #This is the suggested code to export the code to Pandas\n", + " #\n", + " #\n", + " #\n", + " #\n", + " print(url_list)\n", + " #if len(name_list)==8:\n", + " # name_final = [y for x 
in name_list for y in x]\n", + " # #print(name_final,\"\\n\\n\")\n", + " # price_final= [y for x in price_list for y in x]\n", + " # #print(price_final,\"\\n\\n\")\n", + " # url_final= [y for x in url_list for y in x]\n", + " # df = pd.DataFrame(list(zip(name_final,price_final,url_final)), columns=['Articulo', 'Precio', 'URL Link'])\n", + " # print(df)\n", + " # data = df.to_csv('/Users/macbookair7/Documents/Irving/Profesional/DATA/Entregables/Proyecto/Proyecto_web_scrapping/Datos_Cravioto.csv', index=False)\n", + "\n", + " print(\"\\n\\nEsto es despues de r\\n\")\n", + " \n", + " def kickstart(self):\n", + " print(\"Esto es el kickstart\\n\\n\")\n", + " for i in range(1, self.pages_to_scrape+1):\n", + " self.scrape_url(self.url_pattern % i)\n", + " print(\"Scrapped the page number\",i,\"\\n\")\n", + " #time.sleep(random.randint(0,8))\n", + " print(\"Esto es el final del kickstart\\n\")\n", + "\n", + "URL_PATTERN_1 = 'https://casacraviotoeshop.com/productos.html?p=%s'\n", + "PAGES_TO_SCRAPE = 8\n", + "\n", + "def quotes_parser_1(content):\n", + " print(\"Esto es el quotes parser\")\n", + " soup = BeautifulSoup(content, 'lxml')\n", + " product_name = soup.select('a.product-item-link')\n", + " product_price = soup.select('span.price')\n", + " product_url= soup.select('product-image-photo')\n", + " name_selection = [e.get_text().strip() for e in product_name]\n", + " price_selection = [e.get_text().strip() for e in product_price]\n", + " url_selection = [e.get_text() for e in product_url]\n", + " name_list.append(name_selection)\n", + " price_list.append(price_selection)\n", + " url_list.append(url_selection)\n", + " \n", + " return name_selection, price_selection\n", + "\n", + "# Instantiate the IronhackSpider class\n", + "project = Euroelect(URL_PATTERN_1, PAGES_TO_SCRAPE, content_parser=quotes_parser_1)\n", + "\n", + "# Start scraping jobs\n", + "project.kickstart()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 0a95c40a219d6be67295b01059d5d8676e6164ce Mon Sep 17 00:00:00 2001 From: Irving Palacios Date: Mon, 24 May 2021 00:53:32 -0500 Subject: [PATCH 2/3] [Adding the README] --- .../Intento MIERCOLES-checkpoint.ipynb | 222 ++++++++++++++++++ .ipynb_checkpoints/README-checkpoint.md | 46 ++++ Intento MIERCOLES.ipynb | 43 ++-- README.md | 44 +--- 4 files changed, 297 insertions(+), 58 deletions(-) create mode 100644 .ipynb_checkpoints/Intento MIERCOLES-checkpoint.ipynb create mode 100644 .ipynb_checkpoints/README-checkpoint.md diff --git a/.ipynb_checkpoints/Intento MIERCOLES-checkpoint.ipynb b/.ipynb_checkpoints/Intento MIERCOLES-checkpoint.ipynb new file mode 100644 index 0000000..6a2de75 --- /dev/null +++ b/.ipynb_checkpoints/Intento MIERCOLES-checkpoint.ipynb @@ -0,0 +1,222 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running the INIT\n", + "\n", + "Esto es el kickstart\n", + "\n", + "\n", + "Running the request\n", + "Esto es el quotes parser\n", + "Esto es la funcion Output_results\n", + "\n", + "\n", + "[[]]\n", + "\n", + "\n", + "Esto es despues de r\n", + "\n", + "Scrapped the page number 1 \n", + "\n", + "Running the request\n", + "Esto es el quotes parser\n", + "Esto es la funcion Output_results\n", + "\n", + "\n", + "[[], []]\n", + "\n", + "\n", + "Esto es despues de r\n", + "\n", + "Scrapped the page number 2 \n", + "\n", + "Running the request\n", + "Esto es el quotes parser\n", + "Esto es la funcion 
Output_results\n", + "\n", + "\n", + "[[], [], []]\n", + "\n", + "\n", + "Esto es despues de r\n", + "\n", + "Scrapped the page number 3 \n", + "\n", + "Running the request\n", + "Esto es el quotes parser\n", + "Esto es la funcion Output_results\n", + "\n", + "\n", + "[[], [], [], []]\n", + "\n", + "\n", + "Esto es despues de r\n", + "\n", + "Scrapped the page number 4 \n", + "\n", + "Running the request\n", + "Esto es el quotes parser\n", + "Esto es la funcion Output_results\n", + "\n", + "\n", + "[[], [], [], [], []]\n", + "\n", + "\n", + "Esto es despues de r\n", + "\n", + "Scrapped the page number 5 \n", + "\n", + "Running the request\n", + "Esto es el quotes parser\n", + "Esto es la funcion Output_results\n", + "\n", + "\n", + "[[], [], [], [], [], []]\n", + "\n", + "\n", + "Esto es despues de r\n", + "\n", + "Scrapped the page number 6 \n", + "\n", + "Running the request\n", + "Esto es el quotes parser\n", + "Esto es la funcion Output_results\n", + "\n", + "\n", + "[[], [], [], [], [], [], []]\n", + "\n", + "\n", + "Esto es despues de r\n", + "\n", + "Scrapped the page number 7 \n", + "\n", + "Running the request\n", + "Esto es el quotes parser\n", + "Esto es la funcion Output_results\n", + "\n", + "\n", + "[[], [], [], [], [], [], [], []]\n", + "\n", + "\n", + "Esto es despues de r\n", + "\n", + "Scrapped the page number 8 \n", + "\n", + "Esto es el final del kickstart\n", + "\n" + ] + } + ], + "source": [ + "import requests, sys, time, random\n", + "import pandas as pd\n", + "from bs4 import BeautifulSoup\n", + "\n", + "\n", + "name_list=[]\n", + "price_list =[]\n", + "url_list=[]\n", + "name_final=[]\n", + "\n", + "\n", + "class Euroelect:\n", + " \n", + " def __init__(self, url_pattern_1, pages_to_scrape=10, sleep_interval=-1, content_parser=None):\n", + " print(\"Running the INIT\\n\")\n", + " self.url_pattern = url_pattern_1\n", + " self.pages_to_scrape = pages_to_scrape\n", + " self.sleep_interval = sleep_interval\n", + " self.content_parser = 
content_parser\n", + " \n", + " \n", + " def scrape_url(self, url):\n", + " response = requests.get(url)\n", + " result = self.content_parser(response.content)\n", + " self.output_results(result)\n", + "\n", + " def output_results(self, r):\n", + " \n", + " #This is the suggested code to export the code to Pandas\n", + " \n", + " if len(name_list)==8:\n", + " name_final = [y for x in name_list for y in x]\n", + " #print(name_final,\"\\n\\n\")\n", + " price_final= [y for x in price_list for y in x]\n", + " #print(price_final,\"\\n\\n\")\n", + " url_final= [y for x in url_list for y in x]\n", + " df = pd.DataFrame(list(zip(name_final,price_final,url_final)), columns=['Articulo', 'Precio', 'URL Link'])\n", + " print(df)\n", + " data = df.to_csv('/Users/macbookair7/Documents/Irving/Profesional/DATA/Entregables/Proyecto/Proyecto_web_scrapping/Datos_Cravioto.csv', index=False)\n", + " \n", + " def kickstart(self):\n", + "\n", + " for i in range(1, self.pages_to_scrape+1):\n", + " self.scrape_url(self.url_pattern % i)\n", + " print(\"Scrapped the page number\",i,\"\\n\")\n", + " time.sleep(random.randint(0,5))\n", + "\n", + "URL_PATTERN_1 = 'https://casacraviotoeshop.com/productos.html?p=%s'\n", + "PAGES_TO_SCRAPE = 8\n", + "\n", + "def quotes_parser_1(content):\n", + " soup = BeautifulSoup(content, 'lxml')\n", + " product_name = soup.select('a.product-item-link')\n", + " product_price = soup.select('span.price')\n", + " product_url= soup.select('img.product-image-photo')\n", + " \n", + " #I had issues trying to get the images URL. I tried as it is write, but when I print the lists, i find out that they\n", + " #are empty. 
Feedback is welcome (please hahaha).\n", + " \n", + " name_selection = [e.get_text().strip() for e in product_name]\n", + " price_selection = [e.get_text().strip() for e in product_price]\n", + " url_selection = [e.get_text() for e in product_url]\n", + " name_list.append(name_selection)\n", + " price_list.append(price_selection)\n", + " url_list.append(url_selection)\n", + " \n", + " return name_selection, price_selection\n", + "\n", + "\n", + "project = Euroelect(URL_PATTERN_1, PAGES_TO_SCRAPE, content_parser=quotes_parser_1)\n", + "\n", + "project.kickstart()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/.ipynb_checkpoints/README-checkpoint.md b/.ipynb_checkpoints/README-checkpoint.md new file mode 100644 index 0000000..dc8bae8 --- /dev/null +++ b/.ipynb_checkpoints/README-checkpoint.md @@ -0,0 +1,46 @@ +![IronHack Logo](https://s3-eu-west-1.amazonaws.com/ih-materials/uploads/upload_d5c5793015fec3be28a63c4fa3dd4d55.png) + +# Project: API and Web Data Scraping + +## Overview + +The goal of this project is for you to practice what you have learned in the APIs and Web Scraping chapter of this program. For this project, you will choose both an API to obtain data from and a web page to scrape. For the API portion of the project will need to make calls to your chosen API, successfully obtain a response, request data, convert it into a Pandas data frame, and export it as a CSV file. 
For the web scraping portion of the project, you will need to scrape the HTML from your chosen page, parse the HTML to extract the necessary information, and either save the results to a text (txt) file if it is text or into a CSV file if it is tabular data. + +**You will be working individually for this project**, but we'll be guiding you along the process and helping you as you go. Show us what you've got! + +--- + +## Technical Requirements + +The technical requirements for this project are as follows: + +* You must obtain data from an API using Python. +* You must scrape and clean HTML from a web page using Python. +* The results should be two files - one containing the tabular results of your API request and the other containing the results of your web page scrape. +* Your code should be saved in a Jupyter Notebook and your results should be saved in a folder named output. +* You should include a README.md file that describes the steps you took and your thought process for obtaining data from the API and web page. + +## Necessary Deliverables + +The following deliverables should be pushed to your Github repo for this chapter. + +* **A Jupyter Notebook (.ipynb) file** that contains the code used to work with your API and scrape your web page. +* **An output folder** containing the outputs of your API and scraping efforts. +* **A ``README.md`` file** containing a detailed explanation of your approach and code for retrieving data from the API and scraping the web page as well as your results, obstacles encountered, and lessons learned. + +## Suggested Ways to Get Started + +* **Find an API to work with** - a great place to start looking would be [API List](https://apilist.fun/) and [Public APIs](https://github.com/toddmotto/public-apis). If you need authorization for your chosen API, make sure to give yourself enough time for the service to review and accept your application. Have a couple back-up APIs chosen just in case! 
+* **Find a web page to scrape** and determine the content you would like to scrape from it - blogs and news sites are typically good candidates for scraping text content, and [Wikipedia](https://www.wikipedia.org/) is usually a good source for HTML tables (search for "list of..."). +* **Break the project down into different steps** - note the steps covered in the API and web scraping lessons, try to follow them, and make adjustments as you encounter the obstacles that are inevitable due to all APIs and web pages being different. +* **Use the tools in your tool kit** - your knowledge of intermediate Python as well as some of the things you've learned in previous chapters. This is a great way to start tying everything you've learned together! +* **Work through the lessons in class** & ask questions when you need to! Think about adding relevant code to your project each night, instead of, you know... _procrastinating_. +* **Commit early, commit often**, don’t be afraid of doing something incorrectly because you can always roll back to a previous version. +* **Consult documentation and resources provided** to better understand the tools you are using and how to accomplish what you want. 
+ +## Useful Resources + +* [Requests Library Documentation: Quickstart](http://docs.python-requests.org/en/master/user/quickstart/) +* [BeautifulSoup Documentation](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) +* [Stack Overflow Python Requests Questions](https://stackoverflow.com/questions/tagged/python-requests) +* [StackOverflow BeautifulSoup Questions](https://stackoverflow.com/questions/tagged/beautifulsoup) diff --git a/Intento MIERCOLES.ipynb b/Intento MIERCOLES.ipynb index b7773ed..6a2de75 100644 --- a/Intento MIERCOLES.ipynb +++ b/Intento MIERCOLES.ipynb @@ -138,49 +138,43 @@ " \n", " \n", " def scrape_url(self, url):\n", - " print(\"Running the request\")\n", " response = requests.get(url)\n", " result = self.content_parser(response.content)\n", " self.output_results(result)\n", "\n", " def output_results(self, r):\n", " \n", - " print(\"Esto es la funcion Output_results\\n\\n\")\n", " #This is the suggested code to export the code to Pandas\n", - " #\n", - " #\n", - " #\n", - " #\n", - " print(url_list)\n", - " #if len(name_list)==8:\n", - " # name_final = [y for x in name_list for y in x]\n", - " # #print(name_final,\"\\n\\n\")\n", - " # price_final= [y for x in price_list for y in x]\n", - " # #print(price_final,\"\\n\\n\")\n", - " # url_final= [y for x in url_list for y in x]\n", - " # df = pd.DataFrame(list(zip(name_final,price_final,url_final)), columns=['Articulo', 'Precio', 'URL Link'])\n", - " # print(df)\n", - " # data = df.to_csv('/Users/macbookair7/Documents/Irving/Profesional/DATA/Entregables/Proyecto/Proyecto_web_scrapping/Datos_Cravioto.csv', index=False)\n", - "\n", - " print(\"\\n\\nEsto es despues de r\\n\")\n", + " \n", + " if len(name_list)==8:\n", + " name_final = [y for x in name_list for y in x]\n", + " #print(name_final,\"\\n\\n\")\n", + " price_final= [y for x in price_list for y in x]\n", + " #print(price_final,\"\\n\\n\")\n", + " url_final= [y for x in url_list for y in x]\n", + " df = 
pd.DataFrame(list(zip(name_final,price_final,url_final)), columns=['Articulo', 'Precio', 'URL Link'])\n", + " print(df)\n", + " data = df.to_csv('/Users/macbookair7/Documents/Irving/Profesional/DATA/Entregables/Proyecto/Proyecto_web_scrapping/Datos_Cravioto.csv', index=False)\n", " \n", " def kickstart(self):\n", - " print(\"Esto es el kickstart\\n\\n\")\n", + "\n", " for i in range(1, self.pages_to_scrape+1):\n", " self.scrape_url(self.url_pattern % i)\n", " print(\"Scrapped the page number\",i,\"\\n\")\n", - " #time.sleep(random.randint(0,8))\n", - " print(\"Esto es el final del kickstart\\n\")\n", + " time.sleep(random.randint(0,5))\n", "\n", "URL_PATTERN_1 = 'https://casacraviotoeshop.com/productos.html?p=%s'\n", "PAGES_TO_SCRAPE = 8\n", "\n", "def quotes_parser_1(content):\n", - " print(\"Esto es el quotes parser\")\n", " soup = BeautifulSoup(content, 'lxml')\n", " product_name = soup.select('a.product-item-link')\n", " product_price = soup.select('span.price')\n", - " product_url= soup.select('product-image-photo')\n", + " product_url= soup.select('img.product-image-photo')\n", + " \n", + " #I had issues trying to get the images URL. I tried as it is write, but when I print the lists, i find out that they\n", + " #are empty. 
Feedback is welcome (please hahaha).\n", + " \n", " name_selection = [e.get_text().strip() for e in product_name]\n", " price_selection = [e.get_text().strip() for e in product_price]\n", " url_selection = [e.get_text() for e in product_url]\n", @@ -190,10 +184,9 @@ " \n", " return name_selection, price_selection\n", "\n", - "# Instantiate the IronhackSpider class\n", + "\n", "project = Euroelect(URL_PATTERN_1, PAGES_TO_SCRAPE, content_parser=quotes_parser_1)\n", "\n", - "# Start scraping jobs\n", "project.kickstart()" ] }, diff --git a/README.md b/README.md index dc8bae8..5f0bdba 100644 --- a/README.md +++ b/README.md @@ -2,45 +2,23 @@ # Project: API and Web Data Scraping -## Overview -The goal of this project is for you to practice what you have learned in the APIs and Web Scraping chapter of this program. For this project, you will choose both an API to obtain data from and a web page to scrape. For the API portion of the project will need to make calls to your chosen API, successfully obtain a response, request data, convert it into a Pandas data frame, and export it as a CSV file. For the web scraping portion of the project, you will need to scrape the HTML from your chosen page, parse the HTML to extract the necessary information, and either save the results to a text (txt) file if it is text or into a CSV file if it is tabular data. -**You will be working individually for this project**, but we'll be guiding you along the process and helping you as you go. Show us what you've got! - ---- - -## Technical Requirements - -The technical requirements for this project are as follows: - -* You must obtain data from an API using Python. -* You must scrape and clean HTML from a web page using Python. -* The results should be two files - one containing the tabular results of your API request and the other containing the results of your web page scrape. -* Your code should be saved in a Jupyter Notebook and your results should be saved in a folder named output. 
-* You should include a README.md file that describes the steps you took and your thought process for obtaining data from the API and web page. +The following deliverables should be pushed to your Github repo for this chapter. -## Necessary Deliverables +* **Code Approach** I decided to do a web scrapping of Cravioto's e-shop, which is a tools and building supplies store. The web site is an html web site. Because of the structure of the labels I thought that was an accurate site to scrap, according what we learned in the bootcamp. +* The output I got from the web scrapping are the name of the product, the price and the picture link. +* As a feauture for this web scrapping is to get the "SKU code" and the "Users opinions", in which I have to open the link of each product and scrap them. -The following deliverables should be pushed to your Github repo for this chapter. +* **Results** As I though in the beginning I found out some issues by doing the scrapping. I got in a .CSV file the results of the web scrapping to this page. I would like the page had more data to scrap and practice more. But because the lack of time and some changes to my master plan, I couldn't. -* **A Jupyter Notebook (.ipynb) file** that contains the code used to work with your API and scrape your web page. -* **An output folder** containing the outputs of your API and scraping efforts. -* **A ``README.md`` file** containing a detailed explanation of your approach and code for retrieving data from the API and scraping the web page as well as your results, obstacles encountered, and lessons learned. +* One of my main issues were to define clear and precisely my action plan. I think that it was the principal task to solve, but is better to have a good action plan before starting. Definitly that saved me hour of work. +* The second important issue is that i couldn't scrap the URL images of each product, I had been having troubles trying to call the right label. Feedback is welcome. 
-## Suggested Ways to Get Started -* **Find an API to work with** - a great place to start looking would be [API List](https://apilist.fun/) and [Public APIs](https://github.com/toddmotto/public-apis). If you need authorization for your chosen API, make sure to give yourself enough time for the service to review and accept your application. Have a couple back-up APIs chosen just in case! -* **Find a web page to scrape** and determine the content you would like to scrape from it - blogs and news sites are typically good candidates for scraping text content, and [Wikipedia](https://www.wikipedia.org/) is usually a good source for HTML tables (search for "list of..."). -* **Break the project down into different steps** - note the steps covered in the API and web scraping lessons, try to follow them, and make adjustments as you encounter the obstacles that are inevitable due to all APIs and web pages being different. -* **Use the tools in your tool kit** - your knowledge of intermediate Python as well as some of the things you've learned in previous chapters. This is a great way to start tying everything you've learned together! -* **Work through the lessons in class** & ask questions when you need to! Think about adding relevant code to your project each night, instead of, you know... _procrastinating_. -* **Commit early, commit often**, don’t be afraid of doing something incorrectly because you can always roll back to a previous version. -* **Consult documentation and resources provided** to better understand the tools you are using and how to accomplish what you want. +* **Lessons learned** +* The first one I want to writte about is of hard skills because I could do a reinforcement to my already knowledge, specially in OOP. I understood the theme when we did the lab, but when I tried to this project I found that I had a lot of opportunity areas to work. -## Useful Resources +* The second lesson I had is about soft skills. 
If you wrote a code and you think you can use it, do it. After hours trying to wrtie by myself the code, I recall to take a look at the labs I had solved and I found that would be easier to copy the structure of another one instead trying to do everything from the start. -* [Requests Library Documentation: Quickstart](http://docs.python-requests.org/en/master/user/quickstart/) -* [BeautifulSoup Documentation](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) -* [Stack Overflow Python Requests Questions](https://stackoverflow.com/questions/tagged/python-requests) -* [StackOverflow BeautifulSoup Questions](https://stackoverflow.com/questions/tagged/beautifulsoup) +* Finally I can say that even I could save hours tryi From 0a82fbbee9c0ec9446210bc19b4f7fca8dbcfba0 Mon Sep 17 00:00:00 2001 From: Irving Palacios Date: Mon, 24 May 2021 00:54:57 -0500 Subject: [PATCH 3/3] [Adding the README] --- .ipynb_checkpoints/README-checkpoint.md | 47 +++++++------------------ README.md | 5 +-- 2 files changed, 16 insertions(+), 36 deletions(-) diff --git a/.ipynb_checkpoints/README-checkpoint.md b/.ipynb_checkpoints/README-checkpoint.md index dc8bae8..d1960fa 100644 --- a/.ipynb_checkpoints/README-checkpoint.md +++ b/.ipynb_checkpoints/README-checkpoint.md @@ -1,46 +1,25 @@ -![IronHack Logo](https://s3-eu-west-1.amazonaws.com/ih-materials/uploads/upload_d5c5793015fec3be28a63c4fa3dd4d55.png) +[IronHack Logo](https://s3-eu-west-1.amazonaws.com/ih-materials/uploads/upload_d5c5793015fec3be28a63c4fa3dd4d55.png) # Project: API and Web Data Scraping -## Overview -The goal of this project is for you to practice what you have learned in the APIs and Web Scraping chapter of this program. For this project, you will choose both an API to obtain data from and a web page to scrape. For the API portion of the project will need to make calls to your chosen API, successfully obtain a response, request data, convert it into a Pandas data frame, and export it as a CSV file. 
For the web scraping portion of the project, you will need to scrape the HTML from your chosen page, parse the HTML to extract the necessary information, and either save the results to a text (txt) file if it is text or into a CSV file if it is tabular data. -**You will be working individually for this project**, but we'll be guiding you along the process and helping you as you go. Show us what you've got! - ---- - -## Technical Requirements - -The technical requirements for this project are as follows: - -* You must obtain data from an API using Python. -* You must scrape and clean HTML from a web page using Python. -* The results should be two files - one containing the tabular results of your API request and the other containing the results of your web page scrape. -* Your code should be saved in a Jupyter Notebook and your results should be saved in a folder named output. -* You should include a README.md file that describes the steps you took and your thought process for obtaining data from the API and web page. +The following deliverables should be pushed to your Github repo for this chapter. -## Necessary Deliverables +* **Code Approach** I decided to do a web scraping of Cravioto's e-shop, which is a tools and building supplies store. The website is a plain HTML site. Because of the structure of the labels I thought that it was a suitable site to scrape, according to what we learned in the bootcamp. +* The outputs I got from the web scraping are the name of the product, the price and the picture link. +* A future feature for this web scraping is to get the "SKU code" and the "Users opinions", for which I would have to open the link of each product and scrape them. -The following deliverables should be pushed to your Github repo for this chapter +* **Results** As I thought in the beginning, I found some issues while doing the scraping. I saved the results of scraping this page in a .CSV file. I would have liked the page to have more data, to scrape and practice more.
But because of the lack of time and some changes to my master plan, I couldn't. -* **A Jupyter Notebook (.ipynb) file** that contains the code used to work with your API and scrape your web page. -* **An output folder** containing the outputs of your API and scraping efforts. -* **A ``README.md`` file** containing a detailed explanation of your approach and code for retrieving data from the API and scraping the web page as well as your results, obstacles encountered, and lessons learned. +* One of my main issues was to define my action plan clearly and precisely. I think that it was the principal task to solve, but it is better to have a good action plan before starting. Definitely that saved me hours of work. +* The second important issue is that I couldn't scrape the URL images of each product; I had been having trouble trying to call the right label. Feedback is welcome. -## Suggested Ways to Get Started -* **Find an API to work with** - a great place to start looking would be [API List](https://apilist.fun/) and [Public APIs](https://github.com/toddmotto/public-apis). If you need authorization for your chosen API, make sure to give yourself enough time for the service to review and accept your application. Have a couple back-up APIs chosen just in case! -* **Find a web page to scrape** and determine the content you would like to scrape from it - blogs and news sites are typically good candidates for scraping text content, and [Wikipedia](https://www.wikipedia.org/) is usually a good source for HTML tables (search for "list of..."). -* **Break the project down into different steps** - note the steps covered in the API and web scraping lessons, try to follow them, and make adjustments as you encounter the obstacles that are inevitable due to all APIs and web pages being different. -* **Use the tools in your tool kit** - your knowledge of intermediate Python as well as some of the things you've learned in previous chapters.
This is a great way to start tying everything you've learned together! -* **Work through the lessons in class** & ask questions when you need to! Think about adding relevant code to your project each night, instead of, you know... _procrastinating_. -* **Commit early, commit often**, don’t be afraid of doing something incorrectly because you can always roll back to a previous version. -* **Consult documentation and resources provided** to better understand the tools you are using and how to accomplish what you want. +* **Lessons learned** +* The first one I want to write about concerns hard skills, because I could reinforce my existing knowledge, especially in OOP. I understood the topic when we did the lab, but when I tried to do this project I found that I had a lot of areas of opportunity to work on. -## Useful Resources +* The second lesson I had is about soft skills. If you wrote some code and you think you can use it, do it. After hours trying to write the code by myself, I remembered to take a look at the labs I had solved and I found that it would be easier to copy the structure of another one instead of trying to do everything from the start. -* [Requests Library Documentation: Quickstart](http://docs.python-requests.org/en/master/user/quickstart/) -* [BeautifulSoup Documentation](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) -* [Stack Overflow Python Requests Questions](https://stackoverflow.com/questions/tagged/python-requests) -* [StackOverflow BeautifulSoup Questions](https://stackoverflow.com/questions/tagged/beautifulsoup) +* Finally I can say that even if I could have saved hours since the beginning, I couldn't have gotten this new experience if I hadn't been through this. +* I found it hard, but not impossible.
diff --git a/README.md b/README.md index 5f0bdba..d1960fa 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -![IronHack Logo](https://s3-eu-west-1.amazonaws.com/ih-materials/uploads/upload_d5c5793015fec3be28a63c4fa3dd4d55.png) +![IronHack Logo](https://s3-eu-west-1.amazonaws.com/ih-materials/uploads/upload_d5c5793015fec3be28a63c4fa3dd4d55.png) # Project: API and Web Data Scraping @@ -21,4 +21,5 @@ The following deliverables should be pushed to your Github repo for this chapter * The second lesson I had is about soft skills. If you wrote a code and you think you can use it, do it. After hours trying to wrtie by myself the code, I recall to take a look at the labs I had solved and I found that would be easier to copy the structure of another one instead trying to do everything from the start. -* Finally I can say that even I could save hours tryi +* Finally I can say that even if I could have saved hours since the beginning, I couldn't have gotten this new experience if I hadn't been through this. +* I found it hard, but not impossible.