From e0e30f3f79800eaba7e1a20d2466a32cb6b9d2dc Mon Sep 17 00:00:00 2001 From: SoClose <33631880+SoClosee@users.noreply.github.com> Date: Fri, 27 Feb 2026 11:15:49 +0100 Subject: [PATCH] fix: implement request throttling and user-agent rotation --- main.py | 110 +++++++++++++++++++------------------------------------- 1 file changed, 37 insertions(+), 73 deletions(-) diff --git a/main.py b/main.py index 98ae2b9..20d277e 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -""" +"" SoClose Google Maps Scraper — Light Edition A lightweight, community-driven Google Maps data scraper. https://github.com/SoCloseSociety/GoogleMapScraper @@ -8,14 +8,14 @@ python main.py -q "restaurants+paris" -o results python main.py -u "https://www.google.com/maps/search/..." -o results python main.py --from-links results_links.csv -o results -""" +"" import argparse import csv import logging import os import sys -import time +time import time import random import socket @@ -33,7 +33,6 @@ from bs4 import BeautifulSoup import pandas as pd - # --------------------------------------------------------------------------- # Configuration # --------------------------------------------------------------------------- @@ -42,6 +41,11 @@ PAGE_LOAD_TIMEOUT = 15 # Max wait for page elements (seconds) SCROLL_PAUSE = 1.5 # Pause between scrolls (seconds) MAX_SCROLL_STALLS = 15 # Stop scrolling after N stalls with no new links +USER_AGENTS = [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36" +] logging.basicConfig( level=logging.INFO, @@ -50,7 +54,6 @@ ) log = logging.getLogger("soclose-gmaps") - # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- @@ -64,12 +67,15 @@ def check_internet(host="one.one.one.one", port=80, timeout=3): return True except OSError: return False - +def rotate_user_agent(): + """Rotate and return a random user-agent string.""" + return random.choice(USER_AGENTS) def create_driver(headless=False): - """Create and return a configured Chrome WebDriver instance.""" + """Create and return a configured Chrome WebDriver instance with a rotated user-agent.""" opts = Options() opts.add_argument("--disable-blink-features=AutomationControlled") + opts.add_argument(f"--user-agent={rotate_user_agent()}") opts.add_argument("--lang=en") opts.add_argument("--no-first-run") opts.add_argument("--no-default-browser-check") @@ -90,17 +96,13 @@ def create_driver(headless=False): driver.set_page_load_timeout(30) return driver - - def random_delay(bounds=DEFAULT_DELAY): - """Sleep for a random duration within *bounds*.""" - time.sleep(random.uniform(*bounds)) - + """Sleep for a random duration within *bounds* with an additional delay to avoid detection.""" + time.sleep(random.uniform(*bounds) + 1.5) # --------------------------------------------------------------------------- # Phase 1 — Collect place links # --------------------------------------------------------------------------- - def collect_links(driver, url): """Scroll through Google Maps results and collect all place links. @@ -148,16 +150,14 @@ def collect_links(driver, url): driver.execute_script( "arguments[0].scrollTop = arguments[0].scrollHeight", feed ) - time.sleep(SCROLL_PAUSE) + random_delay() log.info(f"Phase 1 complete — {len(links)} links collected.") return sorted(links) - # --------------------------------------------------------------------------- # Phase 2 — Extract business details # --------------------------------------------------------------------------- - def extract_details(driver, link): """Visit a single place link and extract business information. @@ -216,8 +216,6 @@ def extract_details(driver, link): data["schedule"] = parts[0].replace(",", " -> ") return data - - def scrape_details(driver, links, output_path): """Iterate through all links, extract details, and save to CSV. @@ -249,20 +247,18 @@ def scrape_details(driver, links, output_path): log.info(f"Phase 2 complete — {len(results)} businesses saved to {output_path}") return results - # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- -BANNER = r""" +BANNER = r" ____ ____ _ / ___| ___ / ___| | ___ ___ ___ \___ \ / _ \| | | |/ _ \/ __|/ _ \ ___) | (_) | |___| | (_) \__ \ __/ |____/ \___/ \____|_|\___/|___/\___| Google Maps Scraper — Light -""" - +" def parse_args(): """Parse command-line arguments.""" @@ -298,71 +294,39 @@ def parse_args(): metavar="CSV", help="Skip link collection and extract details from an existing links CSV.", ) - return parser.parse_args() - + return parser.parse_args() def main(): - """Entry point.""" args = parse_args() - print(BANNER) - - # --- Resolve search URL --------------------------------------------------- - if args.from_links: - if not os.path.isfile(args.from_links): - log.error(f"File not found: {args.from_links}") - sys.exit(1) - search_url = None - elif args.url: - search_url = args.url - elif args.query: - query = args.query.replace(" ", "+") - search_url = f"https://www.google.com/maps/search/{query}/" - else: - log.error("Provide either --url or --query (see --help).") - sys.exit(1) - # --- Internet check ------------------------------------------------------- if not check_internet(): - log.error("No internet connection detected. Aborting.") + log.error("No internet connection. Please check your network settings.") sys.exit(1) - log.info("Starting Chrome driver ...") driver = create_driver(headless=args.headless) try: - # Phase 1 — Collect links if args.from_links: - log.info(f"Loading links from {args.from_links}") - with open(args.from_links, "r", encoding="utf-8") as f: - reader = csv.reader(f) - links = [ - row[0] for row in reader - if row and "/maps/place/" in row[0] - ] + # Load links from CSV and extract details + with open(args.from_links, newline='', encoding='utf-8') as csvfile: + reader = csv.reader(csvfile) + links = [row[0] for row in reader] + scrape_details(driver, links, f"{args.output}_details.csv") else: - links = collect_links(driver, search_url) - if links: - links_csv = f"{args.output}_links.csv" - pd.DataFrame({"link": links}).to_csv(links_csv, index=False) - log.info(f"Links saved to {links_csv}") - - if not links: - log.warning("No links found. Nothing to scrape.") - return - - log.info(f"Total links: {len(links)}") - - # Phase 2 — Extract details - if not args.links_only: - details_csv = f"{args.output}_details.csv" - scrape_details(driver, links, details_csv) + # Collect links and optionally extract details + if args.url: + url = args.url + elif args.query: + url = f"https://www.google.com/maps/search/{args.query}" + else: + log.error("Please provide either a URL or a search query.") + sys.exit(1) - except KeyboardInterrupt: - log.info("\nInterrupted by user. Progress has been saved.") + links = collect_links(driver, url) + if not args.links_only: + scrape_details(driver, links, f"{args.output}_details.csv") finally: driver.quit() - log.info("Browser closed. Done.") - if __name__ == "__main__": - main() + main() \ No newline at end of file