Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 177 additions & 0 deletions .ipynb_checkpoints/Properstar Scrapping-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# WARNING, the script takes a couple of minutes to run. The 3 cells need to be run, since the next cell only\n",
"# defines the function which is used in the third cell.\n",
"\n",
"from selenium import webdriver\n",
"from selenium.webdriver.chrome.options import Options\n",
"from selenium.webdriver.common.by import By\n",
"import pandas as pd\n",
"import math\n"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def get_listings(driver):\n",
" dicts_for_df = []\n",
" articles = driver.find_elements_by_tag_name(\"article\")\n",
" \n",
" for article in articles:\n",
" listed_property = {}\n",
" \n",
" body_element = article.find_element_by_css_selector(\".item-body\")\n",
" listed_property[\"title\"] = body_element.find_element_by_tag_name(\"a\").text\n",
" listed_property[\"address\"] = body_element.find_element_by_css_selector(\".item-location\").text\n",
" listed_property[\"price\"] = article.find_element_by_css_selector(\"span.title-price\").text\n",
" listed_property_url_element = article.find_element_by_css_selector(\"a.link.listing-title.stretched-link\")\n",
" listed_property[\"url\"] = listed_property_url_element.get_attribute(\"href\")\n",
" details_element = body_element.find_element_by_css_selector(\"p.item-highlights\").text\n",
" details_list = details_element.split(\"•\")\n",
" listed_property_type_list = [detail.strip() for detail in details_list if \"piso\" in detail.lower() or \"casa\" in detail.lower()]\n",
" if len(listed_property_type_list) == 1:\n",
" listed_property[\"type\"] = listed_property_type_list[0]\n",
" else:\n",
" listed_property[\"type\"] = None\n",
" listed_property_bedrooms_list = [detail.strip() for detail in details_list if \"habitación/es\" in detail.lower()]\n",
" if len(listed_property_bedrooms_list) == 1:\n",
" listed_property[\"bedrooms\"] = listed_property_bedrooms_list[0]\n",
" else:\n",
" listed_property[\"bedrooms\"] = None\n",
" listed_property_rooms_list = [detail.strip() for detail in details_list if \"habit.\" in detail.lower()]\n",
" if len(listed_property_rooms_list) == 1:\n",
" listed_property[\"rooms\"] = listed_property_rooms_list[0]\n",
" else:\n",
" listed_property[\"rooms\"] = None \n",
" listed_property_bathrooms_list = [detail.strip() for detail in details_list if \"baño\" in detail.lower()]\n",
" if len(listed_property_bathrooms_list) == 1:\n",
" listed_property[\"bathrooms\"] = listed_property_bathrooms_list[0]\n",
" else:\n",
" listed_property[\"bathrooms\"] = None\n",
" listed_property_surface_list = [detail.strip() for detail in details_list if \"m²\" in detail.lower()]\n",
" if len(listed_property_surface_list) == 1:\n",
" listed_property[\"surface\"] = listed_property_surface_list[0]\n",
" else:\n",
" listed_property[\"surface\"] = None\n",
" \n",
" dicts_for_df.append(listed_property)\n",
" return dicts_for_df "
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Changing currency\n",
"Finding and calculating total results\n",
"Total results pages are 10\n",
"Getting page results for page 1\n",
"Getting page results for page 2\n",
"Getting page results for page 3\n",
"Getting page results for page 4\n",
"Getting page results for page 5\n",
"Getting page results for page 6\n",
"Getting page results for page 7\n",
"Getting page results for page 8\n",
"Getting page results for page 9\n",
"Getting page results for page 10\n",
"finished scrapping\n",
"finished\n"
]
}
],
"source": [
"driver = webdriver.Chrome(\"chromedriver.exe\")\n",
"scrapped_url = \"https://www.properstar.es/mexico/ciudad-de-mexico-loc/alquiler/piso-casa\"\n",
"final_data_for_df = []\n",
"\n",
"driver.get(scrapped_url)\n",
"\n",
"print(\"Changing currency\")\n",
"driver.find_element_by_css_selector(\".btn.btn-dropdown-toggle\").click()\n",
"region = driver.find_element_by_css_selector(\".regional-panel\")\n",
"buttons = region.find_elements_by_tag_name(\"button\")\n",
"currency_button = buttons[1]\n",
"currency_button.click()\n",
"currencies = driver.find_elements_by_css_selector(\".currencies-group\")\n",
"currencies_group = currencies[1]\n",
"currency_buttons = currencies_group.find_elements_by_tag_name(\"button\")\n",
"mexican_pesos = currency_buttons[44]\n",
"mexican_pesos.click()\n",
"\n",
"print(\"Finding and calculating total results\")\n",
"dicts_for_df = []\n",
"total_results_element = driver.find_element_by_css_selector(\"div.total-results\")\n",
"total_results = int(total_results_element.text.replace(\" listados\", \"\"))\n",
"total_results_pages = math.ceil(total_results/20)\n",
"listing_urls = [scrapped_url + f\"?p={i}\" for i in range (2,total_results_pages + 1)]\n",
"print(f\"Total results are {total_results}\")\n",
"print(f\"Total results pages are {total_results_pages}\")\n",
"\n",
"print(\"Getting page results for page 1\")\n",
"page_listings = get_listings(driver)\n",
"final_data_for_df.append(page_listings)\n",
"\n",
"for listing_url in listing_urls:\n",
" print(f\"Getting page results for page {listing_url.split('=')[1]}\")\n",
" driver.get(listing_url)\n",
" page_listings = get_listings(driver)\n",
" final_data_for_df.append(page_listings)\n",
"\n",
"driver.close()\n",
"print(\"finished scrapping\")\n",
"\n",
"flat_final_data_for_df = [listing for listings in final_data_for_df for listing in listings]\n",
"listings_df = pd.DataFrame(flat_final_data_for_df)\n",
"listings_df.to_csv(\"output/scrapped_data.csv\") \n",
" \n",
"listings_df.head()\n",
"print(\"finished\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading