Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
508 changes: 508 additions & 0 deletions .ipynb_checkpoints/Project_2_API-checkpoint.ipynb

Large diffs are not rendered by default.

394 changes: 394 additions & 0 deletions .ipynb_checkpoints/Project_2_Web_Scrapping-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,394 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "6745d9ac",
"metadata": {},
"source": [
"# Project 2: Web Scraping and API"
]
},
{
"cell_type": "markdown",
"id": "24c27440",
"metadata": {},
"source": [
"## Module imports"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "e495f6f9",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import sys\n",
"import requests\n",
"from pathlib import Path\n",
"import datetime\n",
"from time import sleep\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "markdown",
"id": "9351dfe4",
"metadata": {},
"source": [
"## Get Content"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "ad3ba4d9",
"metadata": {},
"outputs": [],
"source": [
"def get_cars_content(content):\n",
"    \"\"\"Parse one listing page and return its cars as 5-field records.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    content : str or bytes\n",
"        Raw HTML of a listing page.\n",
"\n",
"    Returns\n",
"    -------\n",
"    zip\n",
"        Iterator of ``(name, year, kilometers, place, price)`` tuples of\n",
"        strings. A year, kilometers or place value that cannot be found\n",
"        in a card's details is ``None``.\n",
"    \"\"\"\n",
"    soup = BeautifulSoup(content, 'lxml')\n",
"\n",
"    def first_match(pattern, text):\n",
"        # First regex hit (its first group if the pattern has one), else None.\n",
"        found = re.search(pattern, text)\n",
"        if found is None:\n",
"            return None\n",
"        return found.group(found.lastindex or 0).strip()\n",
"\n",
"    names, years, kilometers, places = [], [], [], []\n",
"    for car in soup.select(\".card-body\"):\n",
"        # Skip cards without a heading or a details paragraph instead of\n",
"        # raising AttributeError (presumably not car listings; confirm).\n",
"        if car.h2 is None or car.p is None:\n",
"            continue\n",
"        details = car.p.text.strip()\n",
"        # Exactly one value per card keeps every column aligned with the\n",
"        # names; flattening all regex hits across cards shifted rows\n",
"        # whenever a card had zero or several matches.\n",
"        names.append(car.h2.text.strip())\n",
"        years.append(first_match(r\"\\d{4}\", details))\n",
"        kilometers.append(first_match(r\"\\W\\d.*\\d+\\W\", details))\n",
"        places.append(first_match(r\"\\s*([\\S]+)$\", details))\n",
"\n",
"    # NOTE(review): prices are still paired with cards by position only.\n",
"    prices_info = soup.select(\".payment-total.payment-highlight\")\n",
"    prices = [price.text.strip() for price in prices_info]\n",
"\n",
"    return zip(names, years, kilometers, places, prices)"
]
},
{
"cell_type": "markdown",
"id": "5c3ea94c",
"metadata": {},
"source": [
"## Request information"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "3638a61e",
"metadata": {},
"outputs": [],
"source": [
"def request_content(url, pages=range(1, 11), delay=2, timeout=30):\n",
"    \"\"\"Download listing pages and extract the cars found on each one.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    url : str\n",
"        URL template with one ``%s`` placeholder for the page number.\n",
"    pages : iterable of int, optional\n",
"        Page numbers to request; defaults to pages 1 through 10.\n",
"    delay : float, optional\n",
"        Seconds to pause after each request.\n",
"    timeout : float, optional\n",
"        Seconds to wait for the server before ``requests`` gives up.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list of list of tuple\n",
"        One list of car records per page that was downloaded successfully.\n",
"    \"\"\"\n",
"    cars_response = []\n",
"    for page in pages:\n",
"        page_url = url % page\n",
"        print(f\"Request information: {page_url}\")\n",
"        # The pause used to be passed as the second positional argument of\n",
"        # requests.get (its `params`); it now runs as its own statement.\n",
"        response = requests.get(page_url, timeout=timeout)\n",
"        if not response.ok:\n",
"            # Do not parse an error page as if it were a listing.\n",
"            print(f\"Skipping page #{page}: HTTP status {response.status_code}\")\n",
"            sleep(delay)\n",
"            continue\n",
"        print(f\"Extracting information from page #{page}\")\n",
"        cars_response.append(list(get_cars_content(response.content)))\n",
"        sleep(delay)\n",
"        print('Information extracted successfuly')\n",
"        print(10*\"--------\")\n",
"    return cars_response"
]
},
{
"cell_type": "markdown",
"id": "bc1d8469",
"metadata": {},
"source": [
"## Main function"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "10e500ae",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def run():\n",
"    \"\"\"Scrape the Kavak listing pages and save the cars to a CSV file.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        One row per car with the columns ``Car_Name``, ``Years``,\n",
"        ``Kilometers``, ``Places`` and ``Prices``.\n",
"    \"\"\"\n",
"    base_url = \"https://www.kavak.com/mx/page-%s/compra-de-autos\"\n",
"    print(\"The information is being extracted from website\")\n",
"    sleep(2)\n",
"    cars_data = request_content(base_url)\n",
"    # Flatten the per-page lists into a single list of car records.\n",
"    cars = [car for matrix in cars_data for car in matrix]\n",
"\n",
"    print(\"The information has been completed, transforming data: \")\n",
"    print(10*\"--------\")\n",
"    sleep(3)\n",
"    filename = \"data/\" + \"car_sales_kavak.csv\"\n",
"\n",
"    print(f\"The file will be named {filename}\")\n",
"    df = pd.DataFrame(cars, columns=[\"Car_Name\", \"Years\", \"Kilometers\", \"Places\", \"Prices\"], dtype=object)\n",
"\n",
"    print(10*\"--------\")\n",
"    sleep(3)\n",
"    # to_csv does not create missing folders, so make sure data/ exists.\n",
"    Path(filename).parent.mkdir(parents=True, exist_ok=True)\n",
"    df.to_csv(filename, index=False)\n",
"    print(f\"{filename} saved.\")\n",
"    print(\"Finished.\")\n",
"    return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b0d1951a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The information is being extracted from website\n",
"Request information: https://www.kavak.com/mx/page-1/compra-de-autos\n",
"Extracting information from page #1\n",
"Information extracted successfuly\n",
"--------------------------------------------------------------------------------\n",
"Request information: https://www.kavak.com/mx/page-2/compra-de-autos\n",
"Extracting information from page #2\n",
"Information extracted successfuly\n",
"--------------------------------------------------------------------------------\n",
"Request information: https://www.kavak.com/mx/page-3/compra-de-autos\n",
"Extracting information from page #3\n",
"Information extracted successfuly\n",
"--------------------------------------------------------------------------------\n",
"Request information: https://www.kavak.com/mx/page-4/compra-de-autos\n",
"Extracting information from page #4\n",
"Information extracted successfuly\n",
"--------------------------------------------------------------------------------\n",
"Request information: https://www.kavak.com/mx/page-5/compra-de-autos\n",
"Extracting information from page #5\n",
"Information extracted successfuly\n",
"--------------------------------------------------------------------------------\n",
"Request information: https://www.kavak.com/mx/page-6/compra-de-autos\n",
"Extracting information from page #6\n",
"Information extracted successfuly\n",
"--------------------------------------------------------------------------------\n",
"Request information: https://www.kavak.com/mx/page-7/compra-de-autos\n",
"Extracting information from page #7\n",
"Information extracted successfuly\n",
"--------------------------------------------------------------------------------\n",
"Request information: https://www.kavak.com/mx/page-8/compra-de-autos\n",
"Extracting information from page #8\n",
"Information extracted successfuly\n",
"--------------------------------------------------------------------------------\n",
"Request information: https://www.kavak.com/mx/page-9/compra-de-autos\n",
"Extracting information from page #9\n",
"Information extracted successfuly\n",
"--------------------------------------------------------------------------------\n",
"Request information: https://www.kavak.com/mx/page-10/compra-de-autos\n",
"Extracting information from page #10\n",
"Information extracted successfuly\n",
"--------------------------------------------------------------------------------\n",
"The information has been completed, transforming data: \n",
"--------------------------------------------------------------------------------\n",
"The file will be named data/car_sales_kavak.csv\n",
"--------------------------------------------------------------------------------\n"
]
}
],
"source": [
"run()\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "4035eecd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Car_Name</th>\n",
" <th>Years</th>\n",
" <th>Kilometers</th>\n",
" <th>Places</th>\n",
" <th>Prices</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Kia Soul EX</td>\n",
" <td>2017</td>\n",
" <td>62,170</td>\n",
" <td>Monterrey</td>\n",
" <td>$249,999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Chevrolet Cruze LS Turbo</td>\n",
" <td>2017</td>\n",
" <td>72,910</td>\n",
" <td>Monterrey</td>\n",
" <td>$194,999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Honda Accord EXL</td>\n",
" <td>2015</td>\n",
" <td>105,360</td>\n",
" <td>Monterrey</td>\n",
" <td>$265,999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Audi Q7 Quattro S Line 3.0T</td>\n",
" <td>2012</td>\n",
" <td>111,450</td>\n",
" <td>Monterrey</td>\n",
" <td>$304,999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Infiniti Q50 Híbrido</td>\n",
" <td>2017</td>\n",
" <td>83,440</td>\n",
" <td>Monterrey</td>\n",
" <td>$396,999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>354</th>\n",
" <td>Hyundai Creta Limited</td>\n",
" <td>2019</td>\n",
" <td>109,700</td>\n",
" <td>Monterrey</td>\n",
" <td>$388,999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>355</th>\n",
" <td>Chevrolet Sonic LS (Línea anterior)</td>\n",
" <td>2016</td>\n",
" <td>95,100</td>\n",
" <td>Monterrey</td>\n",
" <td>$174,999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>356</th>\n",
" <td>Dodge Journey SE</td>\n",
" <td>2015</td>\n",
" <td>116,500</td>\n",
" <td>Monterrey</td>\n",
" <td>$254,999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>357</th>\n",
" <td>Volkswagen Passat CC 2.0T</td>\n",
" <td>2016</td>\n",
" <td>99,500</td>\n",
" <td>Monterrey</td>\n",
" <td>$312,999</td>\n",
" </tr>\n",
" <tr>\n",
" <th>358</th>\n",
" <td>Nissan Sentra Sense</td>\n",
" <td>2017</td>\n",
" <td>63,000</td>\n",
" <td>Monterrey</td>\n",
" <td>$222,999</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>359 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" Car_Name Years Kilometers Places \\\n",
"0 Kia Soul EX 2017 62,170 Monterrey \n",
"1 Chevrolet Cruze LS Turbo 2017 72,910 Monterrey \n",
"2 Honda Accord EXL 2015 105,360 Monterrey \n",
"3 Audi Q7 Quattro S Line 3.0T 2012 111,450 Monterrey \n",
"4 Infiniti Q50 Híbrido 2017 83,440 Monterrey \n",
".. ... ... ... ... \n",
"354 Hyundai Creta Limited 2019 109,700 Monterrey \n",
"355 Chevrolet Sonic LS (Línea anterior) 2016 95,100 Monterrey \n",
"356 Dodge Journey SE 2015 116,500 Monterrey \n",
"357 Volkswagen Passat CC 2.0T 2016 99,500 Monterrey \n",
"358 Nissan Sentra Sense 2017 63,000 Monterrey \n",
"\n",
" Prices \n",
"0 $249,999 \n",
"1 $194,999 \n",
"2 $265,999 \n",
"3 $304,999 \n",
"4 $396,999 \n",
".. ... \n",
"354 $388,999 \n",
"355 $174,999 \n",
"356 $254,999 \n",
"357 $312,999 \n",
"358 $222,999 \n",
"\n",
"[359 rows x 5 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('./data/car_sales_kavak.csv')\n",
"df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading