diff --git a/main.py b/main.py
index 98ae2b9..ea685dd 100644
--- a/main.py
+++ b/main.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-"""
+"""
 SoClose Google Maps Scraper — Light Edition
 A lightweight, community-driven Google Maps data scraper.
 https://github.com/SoCloseSociety/GoogleMapScraper
@@ -8,16 +8,16 @@
 python main.py -q "restaurants+paris" -o results
 python main.py -u "https://www.google.com/maps/search/..." -o results
 python main.py --from-links results_links.csv -o results
-"""
+"""
 
 import argparse
 import csv
 import logging
 import os
 import sys
-import time
-import random
-import socket
+import time
+import random
+import socket
 
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
@@ -32,7 +32,7 @@
 from webdriver_manager.chrome import ChromeDriverManager
 from bs4 import BeautifulSoup
 import pandas as pd
-
+from user_agent import generate_user_agent
 # ---------------------------------------------------------------------------
 # Configuration
 # ---------------------------------------------------------------------------
@@ -50,7 +50,6 @@
 )
 log = logging.getLogger("soclose-gmaps")
 
-
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
@@ -65,7 +64,6 @@ def check_internet(host="one.one.one.one", port=80, timeout=3):
     except OSError:
         return False
 
-
 def create_driver(headless=False):
     """Create and return a configured Chrome WebDriver instance."""
     opts = Options()
@@ -91,12 +89,10 @@ def create_driver(headless=False):
     driver.set_page_load_timeout(30)
     return driver
 
-
 def random_delay(bounds=DEFAULT_DELAY):
     """Sleep for a random duration within *bounds*."""
     time.sleep(random.uniform(*bounds))
 
-
 # ---------------------------------------------------------------------------
 # Phase 1 — Collect place links
 # ---------------------------------------------------------------------------
@@ -153,7 +149,6 @@ def collect_links(driver, url):
     log.info(f"Phase 1 complete — {len(links)} links collected.")
     return sorted(links)
 
-
 # ---------------------------------------------------------------------------
 # Phase 2 — Extract business details
 # ---------------------------------------------------------------------------
@@ -217,7 +212,6 @@ def extract_details(driver, link):
 
     return data
 
-
 def scrape_details(driver, links, output_path):
     """Iterate through all links, extract details, and save to CSV.
 
@@ -233,7 +227,7 @@ def scrape_details(driver, links, output_path):
             data = extract_details(driver, link)
             if data and data.get("name"):
                 results.append(data)
-                log.info(f" -> {data['name']}")
+                log.info(f" -> {data['name']}")
             else:
                 log.warning(" -> No data extracted")
         except WebDriverException as exc:
@@ -249,19 +243,18 @@ def scrape_details(driver, links, output_path):
     log.info(f"Phase 2 complete — {len(results)} businesses saved to {output_path}")
     return results
 
-
 # ---------------------------------------------------------------------------
 # CLI
 # ---------------------------------------------------------------------------
 
-BANNER = r"""
+BANNER = r"""
   ____        ____ _
  / ___|  ___ / ___| | ___  ___  ___
  \___ \ / _ \| |   | |/ _ \/ __|/ _ \
  ___) | (_) | |___| | (_) \__ \  __/
- |____/ \___/ \____|_|\___/|___/\___|
+ |____/ \___/ \____|_|\___/|___/\___|
 
        Google Maps Scraper — Light
-"""
+"""
 
 
 def parse_args():
@@ -298,71 +291,32 @@ def parse_args():
         metavar="CSV",
         help="Skip link collection and extract details from an existing links CSV.",
     )
+    return parser.parse_args()
 
 
-def main():
-    """Entry point."""
+if __name__ == "__main__":
     args = parse_args()
-    print(BANNER)
-
-    # --- Resolve search URL ---------------------------------------------------
-    if args.from_links:
-        if not os.path.isfile(args.from_links):
-            log.error(f"File not found: {args.from_links}")
-            sys.exit(1)
-        search_url = None
-    elif args.url:
-        search_url = args.url
-    elif args.query:
-        query = args.query.replace(" ", "+")
-        search_url = f"https://www.google.com/maps/search/{query}/"
-    else:
-        log.error("Provide either --url or --query (see --help).")
-        sys.exit(1)
-
-    # --- Internet check -------------------------------------------------------
     if not check_internet():
-        log.error("No internet connection detected. Aborting.")
+        log.error("No internet connection. Exiting...")
         sys.exit(1)
-    log.info("Starting Chrome driver ...")
 
     driver = create_driver(headless=args.headless)
     try:
-        # Phase 1 — Collect links
-        if args.from_links:
-            log.info(f"Loading links from {args.from_links}")
-            with open(args.from_links, "r", encoding="utf-8") as f:
-                reader = csv.reader(f)
-                links = [
-                    row[0] for row in reader
-                    if row and "/maps/place/" in row[0]
-                ]
+        if args.url or args.query:
+            links = collect_links(driver, args.url or f'https://www.google.com/maps/search/{args.query.replace(" ", "+")}')
+        elif args.from_links:
+            with open(args.from_links, newline='') as csvfile:
+                reader = csv.reader(csvfile)
+                links = [row[0] for row in reader]
         else:
-            links = collect_links(driver, search_url)
-            if links:
-                links_csv = f"{args.output}_links.csv"
-                pd.DataFrame({"link": links}).to_csv(links_csv, index=False)
-                log.info(f"Links saved to {links_csv}")
-
-        if not links:
-            log.warning("No links found. Nothing to scrape.")
-            return
-
-        log.info(f"Total links: {len(links)}")
+            log.error("No URL or query provided. Exiting...")
+            sys.exit(1)
 
-        # Phase 2 — Extract details
         if not args.links_only:
-            details_csv = f"{args.output}_details.csv"
-            scrape_details(driver, links, details_csv)
-
-    except KeyboardInterrupt:
-        log.info("\nInterrupted by user. Progress has been saved.")
+            output_path = f"{args.output}.csv"
+            scrape_details(driver, links, output_path)
     finally:
        driver.quit()
-        log.info("Browser closed. Done.")
-
-
-if __name__ == "__main__":
-    main()