From 63c4cc993cff3a767c68156184791aff158bae26 Mon Sep 17 00:00:00 2001 From: EvanUp Date: Mon, 10 Feb 2025 15:24:54 -0500 Subject: [PATCH 001/101] switching from requests to selenium --- .python-version | 1 + README.md | 21 +++++--- WebSearcher/searchers.py | 103 +++++++++++++++++++++++++++++---------- pyproject.toml | 2 + scripts/demo_search.py | 1 + tests/selenium_test.py | 7 +++ 6 files changed, 102 insertions(+), 33 deletions(-) create mode 100644 .python-version create mode 100644 tests/selenium_test.py diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..c84ccce --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.10.5 diff --git a/README.md b/README.md index 31f1e97..a611527 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,10 @@ ## Tools for conducting and parsing web searches [![PyPI version](https://badge.fury.io/py/WebSearcher.svg)](https://badge.fury.io/py/WebSearcher) +NOTE: In 0.5.*, we moved scraping to selenium + This package provides tools for conducting algorithm audits of web search and -includes a scraper built on `requests` with tools for geolocating, conducting, +includes a scraper built on `selenium` with tools for geolocating, conducting, and saving searches. It also includes a modular parser built on `BeautifulSoup` for decomposing a SERP into list of components with categorical classifications and position-based specifications. @@ -104,13 +106,14 @@ drwxr-xr-x 2 user user 4.0K 2024-11-11 10:55 html/ -rw-r--r-- 1 user user 990K 2024-11-11 10:55 serps.json ``` -### Step by Step +### Step by Step Example search and parse pipeline: ```python import WebSearcher as ws se = ws.SearchEngine() # 1. Initialize collector +se.launch_chromedriver(headless=False) # 2. Launch undetected chromedriver window se.search('immigration news') # 2. Conduct a search se.parse_results() # 3. Parse search results se.save_serp(append_to='serps.json') # 4. Save HTML and metadata @@ -153,14 +156,20 @@ vars(se) 'log': } ``` -#### 2. Conduct a Search +#### 2. Launch undetected chromedriver window +We've switched to using [undetected chrome](https://github.com/ultrafunkamsterdam/undetected-chromedriver) to scrape search results. You'll need to ensure that your chromedriver is up-to-date. All cookies are deleted following each search.launch_chromedriver accepts 3 optional arguments. The defaults are: + +se.launch_chromedriver(headless = False, use_subprocess = False, chromedriver_path = '') + + +#### 3. Conduct a Search ```python se.search('immigration news') # 2024-08-19 14:09:18.502 | INFO | WebSearcher.searchers | 200 | immigration news ``` -#### 3. Parse Search Results +#### 4. Parse Search Results The example below is primarily for parsing search results as you collect HTML. See `ws.parse_serp(html)` for parsing existing HTML data. @@ -185,7 +194,7 @@ se.results[0] ``` -#### 4. Save HTML and Metadata +#### 5. Save HTML and Metadata Recommended: Append html and meta data as lines to a json file for larger or ongoing collections. @@ -200,7 +209,7 @@ Alternative: Save individual html files in a directory, named by a provided or ( se.save_serp(save_dir='./serps') ``` -#### 5. Save Parsed Results +#### 6. Save Parsed Results Save to a json lines file. diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 13e2444..4f45bd1 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -4,12 +4,19 @@ from . import utils from . 
import logger from .models import BaseSERP +# selenium updates +import undetected_chromedriver as uc +from selenium.webdriver.common.by import By +from selenium.webdriver.common.keys import Keys +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC import os import time import brotli import requests import subprocess + from datetime import datetime, timezone from typing import Any, Dict, Optional @@ -17,20 +24,30 @@ WS_VERSION = metadata.version('WebSearcher') # Default headers to send with requests (i.e. device fingerprint) -DEFAULT_HEADERS = { - 'Host': 'www.google.com', - 'Referer': 'https://www.google.com/', - 'Accept': '*/*', - 'Accept-Encoding': 'gzip,deflate,br', - 'Accept-Language': 'en-US,en;q=0.5', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0', -} + +#DEFAULT_HEADERS = { +# 'Host': 'www.google.com', +# 'Referer': 'https://www.google.com/', +# 'Accept': '*/*', +# 'Accept-Encoding': 'gzip,deflate,br', +# 'Accept-Language': 'en-US,en;q=0.5', +# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0', +#} + +#chromedriver_path = "/opt/homebrew/Caskroom/chromedriver/133.0.6943.53" +#driver = uc.Chrome(chromedriver_path = chromedriver_path) +#driver.get('https://www.google.com') +#search_box = driver.find_element(By.ID, "APjFqb") +#search_box.send_keys("how climate change works") +#search_box.send_keys(Keys.RETURN) +#html_content = driver.page_source + class SearchEngine: """Collect Search Engine Results Pages (SERPs)""" def __init__(self, - headers: Dict[str, str] = DEFAULT_HEADERS, + headers: Dict[str, str] = None, sesh: Optional[requests.Session] = None, ssh_tunnel: Optional[subprocess.Popen] = None, unzip: bool = True, @@ -54,8 +71,9 @@ def __init__(self, # Initialize data storage self.version: str = WS_VERSION self.base_url: str = 'https://www.google.com/search' - self.headers: Dict[str, str] = headers - self.sesh: requests.Session = sesh if sesh else wu.start_sesh(headers=self.headers) + self.headers: Dict[str, str] = None + #self.sesh: requests.Session = sesh if sesh else wu.start_sesh(headers=self.headers) + self.sesh = None self.ssh_tunnel: subprocess.Popen = ssh_tunnel self.unzip: bool = unzip self.params: Dict[str, Any] = {} @@ -83,6 +101,11 @@ def __init__(self, file_level=log_level, ).start(__name__) + def launch_chromedriver(self, headless = False, use_subprocess = False, chromedriver_path = ''): + self.headless = headless + self.use_subprocess = use_subprocess + self.chromedriver_path = chromedriver_path + self._init_chromedriver() def search(self, qry: str, location: str = None, num_results: int = None, serp_id: str = '', crawl_id: str = ''): """Conduct a search and save HTML @@ -95,9 +118,8 @@ def search(self, qry: str, location: str = None, num_results: int = None, serp_i crawl_id (str, optional): An identifier for this crawl """ self._prepare_search(qry=qry, location=location, num_results=num_results) - self._conduct_search(serp_id=serp_id, crawl_id=crawl_id) - self._handle_response() - + self._conduct_chromedriver_search(serp_id=serp_id, crawl_id=crawl_id) + #self._handle_response() def _prepare_search(self, qry: str, location: str = None, num_results: int = None): """Prepare a search URL and metadata for the given query and location""" @@ -111,23 +133,41 @@ def _prepare_search(self, qry: str, location: str = None, num_results: int = Non if self.loc and self.loc != 'None': self.params['uule'] = 
locations.get_location_id(canonical_name=self.loc) + def _init_chromedriver(self): + print('launching...') + if self.chromedriver_path == '': + #optionally: headless=True, use_subprocess=True + self.driver = uc.Chrome(headless = self.headless, subprocess = self.use_subprocess) + else: + self.driver = uc.Chrome(headless = self.headless, subprocess = self.use_subprocess, chromedriver_path = self.chromedriver_path) + #chromedriver_path = "/opt/homebrew/Caskroom/chromedriver/133.0.6943.53" + time.sleep(2) + self.driver.get('https://www.google.com') + time.sleep(2) - def _conduct_search(self, serp_id: str = '', crawl_id: str = ''): + def _conduct_chromedriver_search(self, serp_id: str = '', crawl_id: str = ''): """Send a search request and handle errors""" - self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() self.serp_id = serp_id if serp_id else utils.hash_id(self.qry + self.loc + self.timestamp) self.crawl_id = crawl_id try: - self._send_request() - except requests.exceptions.ConnectionError: - self.log.exception(f'SERP | Connection error | {self.serp_id}') - self._reset_ssh_tunnel() - except requests.exceptions.Timeout: - self.log.exception(f'SERP | Timeout error | {self.serp_id}') - except Exception: + self._send_chromedriver_request() + except: self.log.exception(f'SERP | Unknown error | {self.serp_id}') + self.driver.delete_all_cookies() + def _send_chromedriver_request(self): + search_box = self.driver.find_element(By.ID, "APjFqb") + search_box.send_keys(self.qry) + search_box.send_keys(Keys.RETURN) + + # wait for the page to load + WebDriverWait(self.driver, 10).until( + EC.presence_of_element_located((By.ID, "search")) + ) + time.sleep(2) #including a sleep to allow the page to fully load + self.html = self.driver.page_source + self.url = self.driver.current_url def _send_request(self): self.url = f"{self.base_url}?{wu.join_url_quote(self.params)}" @@ -144,7 +184,6 @@ def _reset_ssh_tunnel(self): self.log.info(f'SERP | Restarted SSH tunnel | {self.serp_id}') time.sleep(10) # Allow time to establish connection - def _handle_response(self): try: if self.unzip: @@ -155,7 +194,6 @@ def _handle_response(self): except Exception: self.log.exception(f'Response handling error') - def _unzip_html(self): """Unzip brotli zipped html @@ -199,8 +237,8 @@ def prepare_serp_save(self): loc=self.loc, url=self.url, html=self.html, - response_code=self.response.status_code, - user_agent=self.headers['User-Agent'], + response_code= 0,#self.response.status_code, + user_agent='',#self.headers['User-Agent'], timestamp=self.timestamp, serp_id=self.serp_id, crawl_id=self.crawl_id, @@ -264,3 +302,14 @@ def save_results(self, save_dir: str = "", append_to: str = ""): utils.write_lines(self.results, fp) else: self.log.info(f'No parsed results for serp_id: {self.serp_id}') + + + + +#chromedriver_path = "/opt/homebrew/Caskroom/chromedriver/133.0.6943.53" +#ws = SearchEngine(chromedriver_path=chromedriver_path) +#ws.launch_chromedriver() +#qry = 'how climate change works' +#ws.search(qry) +#ws.parse_results() +#ws.results \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 57b0870..8a710ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,8 @@ dependencies = [ "brotli>=1.1.0", "pydantic>=2.9.2", "pandas>=2.2.3", + "undetected-chromedriver>=3.5.5", + "selenium>=4.9.0", ] [project.urls] diff --git a/scripts/demo_search.py b/scripts/demo_search.py index 9de4dde..94bfb68 100644 --- a/scripts/demo_search.py +++ b/scripts/demo_search.py @@ -29,6 +29,7 @@ def 
main(): # Search, parse, and save se = ws.SearchEngine() # Initialize searcher + se.launch_chromedriver(headless =False) # Launch browser se.search(args.query) # Conduct Search se.parse_results() # Parse Results se.save_serp(append_to=fp_serps) # Save SERP to json (html + metadata) diff --git a/tests/selenium_test.py b/tests/selenium_test.py new file mode 100644 index 0000000..f5fe929 --- /dev/null +++ b/tests/selenium_test.py @@ -0,0 +1,7 @@ +import WebSearcher as ws +se = ws.SearchEngine() # 1. Initialize collector +se.launch_chromedriver(headless = False) # 2. Launch undetected chromedriver window +se.search('immigration news') # 2. Conduct a search +se.parse_results() # 3. Parse search results +se.save_serp(append_to='serps.json') # 4. Save HTML and metadata +se.save_results(append_to='results.json') # 5. Save parsed results From 1d5e2554f002423a1e67882f6a9248bb640ea181 Mon Sep 17 00:00:00 2001 From: EvanUp Date: Mon, 17 Feb 2025 15:35:39 -0500 Subject: [PATCH 002/101] added code to expand ai overview text and urls --- WebSearcher/searchers.py | 43 +++++++++++++++++++++++++++++++++++++--- tests/selenium_test.py | 4 ++++ 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 4f45bd1..9f23f42 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -10,6 +10,7 @@ from selenium.webdriver.common.keys import Keys from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC +from selenium.common.exceptions import NoSuchElementException import os import time @@ -107,7 +108,7 @@ def launch_chromedriver(self, headless = False, use_subprocess = False, chromedr self.chromedriver_path = chromedriver_path self._init_chromedriver() - def search(self, qry: str, location: str = None, num_results: int = None, serp_id: str = '', crawl_id: str = ''): + def search(self, qry: str, ai_expand = False, location: str = None, num_results: int = None, serp_id: str = '', crawl_id: str = ''): """Conduct a search and save HTML Args: @@ -118,9 +119,10 @@ def search(self, qry: str, location: str = None, num_results: int = None, serp_i crawl_id (str, optional): An identifier for this crawl """ self._prepare_search(qry=qry, location=location, num_results=num_results) - self._conduct_chromedriver_search(serp_id=serp_id, crawl_id=crawl_id) + self._conduct_chromedriver_search(serp_id=serp_id, crawl_id=crawl_id, ai_expand=ai_expand) #self._handle_response() + def _prepare_search(self, qry: str, location: str = None, num_results: int = None): """Prepare a search URL and metadata for the given query and location""" self.qry = str(qry) @@ -145,7 +147,14 @@ def _init_chromedriver(self): self.driver.get('https://www.google.com') time.sleep(2) - def _conduct_chromedriver_search(self, serp_id: str = '', crawl_id: str = ''): + def _check_ai_expand(self): + try: + self.driver.find_element(By.XPATH, "//div[@jsname='rPRdsc' and @role='button']") + return True + except NoSuchElementException: + return False + + def _conduct_chromedriver_search(self, serp_id: str = '', crawl_id: str = '', ai_expand = False): """Send a search request and handle errors""" self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() self.serp_id = serp_id if serp_id else utils.hash_id(self.qry + self.loc + self.timestamp) @@ -154,10 +163,38 @@ def _conduct_chromedriver_search(self, serp_id: str = '', crawl_id: str = ''): self._send_chromedriver_request() except: self.log.exception(f'SERP | Unknown 
error | {self.serp_id}') + + ## Look for AI overview box and click on it + if ai_expand: + ai_button = self._check_ai_expand() + if ai_button: + try: + show_more_button = WebDriverWait(self.driver, 1).until( + EC.element_to_be_clickable((By.XPATH, "//div[@jsname='rPRdsc' and @role='button']")) + ) + show_more_button.click() + if show_more_button is not None: + try: + # Wait for additional content to load + time.sleep(2) + + show_all_button = WebDriverWait(self.driver, 1).until( + EC.element_to_be_clickable((By.XPATH, '//div[contains(@class, "trEk7e") and @role="button"]')) + ) + show_all_button.click() + except: + pass + except: + pass + self.html = self.driver.page_source + else: + pass + self.driver.delete_all_cookies() def _send_chromedriver_request(self): search_box = self.driver.find_element(By.ID, "APjFqb") + search_box.clear() search_box.send_keys(self.qry) search_box.send_keys(Keys.RETURN) diff --git a/tests/selenium_test.py b/tests/selenium_test.py index f5fe929..8aa06c1 100644 --- a/tests/selenium_test.py +++ b/tests/selenium_test.py @@ -5,3 +5,7 @@ se.parse_results() # 3. Parse search results se.save_serp(append_to='serps.json') # 4. Save HTML and metadata se.save_results(append_to='results.json') # 5. Save parsed results + + +#import pandas as pd +#df = pd.DataFrame(se.results) # 6. Display results in a pandas dataframe \ No newline at end of file From 002386bd315decae3134a0a8f3b144a7209e9c33 Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Feb 2025 13:11:04 -0800 Subject: [PATCH 003/101] update: add lang arg to search using hl url param --- WebSearcher/searchers.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 13e2444..b8e1737 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -63,6 +63,7 @@ def __init__(self, # Initialize search details self.qry: str = None self.loc: str = None + self.lang: str = None self.num_results = None self.url: str = None self.timestamp: str = None @@ -84,7 +85,14 @@ def __init__(self, ).start(__name__) - def search(self, qry: str, location: str = None, num_results: int = None, serp_id: str = '', crawl_id: str = ''): + def search(self, + qry: str, + location: str = None, + lang: str = None, + num_results: int = None, + serp_id: str = '', + crawl_id: str = '' + ): """Conduct a search and save HTML Args: @@ -94,20 +102,23 @@ def search(self, qry: str, location: str = None, num_results: int = None, serp_i serp_id (str, optional): A unique identifier for this SERP crawl_id (str, optional): An identifier for this crawl """ - self._prepare_search(qry=qry, location=location, num_results=num_results) + self._prepare_search(qry=qry, location=location, lang=lang, num_results=num_results) self._conduct_search(serp_id=serp_id, crawl_id=crawl_id) self._handle_response() - def _prepare_search(self, qry: str, location: str = None, num_results: int = None): + def _prepare_search(self, qry: str, location: str = None, lang: str = None, num_results: int = None): """Prepare a search URL and metadata for the given query and location""" self.qry = str(qry) self.loc = str(location) if location else '' + self.lang = lang self.num_results = num_results self.params = {} self.params['q'] = wu.encode_param_value(self.qry) if self.num_results: self.params['num'] = self.num_results + if self.lang: + self.params['hl'] = self.lang if self.loc and self.loc != 'None': self.params['uule'] = locations.get_location_id(canonical_name=self.loc) From 
4bf035ba6e8f916f9c4dc087d1f5c3bbaa88d190 Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Feb 2025 13:11:19 -0800 Subject: [PATCH 004/101] version: 0.5.1.dev0 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index 61bd55b..dd7c0c6 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.5.0" +__version__ = "0.5.1.dev0" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index 57b0870..c121063 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.5.0" +version = "0.5.1.dev0" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From 834769bd1bdd95cec2601bc7e7da8d2b7dd2fde1 Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Feb 2025 14:51:18 -0800 Subject: [PATCH 005/101] update: add lang to output --- WebSearcher/searchers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index b8e1737..779e64a 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -208,6 +208,7 @@ def prepare_serp_save(self): self.serp = BaseSERP( qry=self.qry, loc=self.loc, + lang=self.lang, url=self.url, html=self.html, response_code=self.response.status_code, From e09030b00cdc3b8a5e7f6373c57aa753b601f17c Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Feb 2025 14:57:30 -0800 Subject: [PATCH 006/101] update: add language to serp model --- WebSearcher/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/WebSearcher/models.py b/WebSearcher/models.py index c3f617a..a85d7c1 100644 --- a/WebSearcher/models.py +++ b/WebSearcher/models.py @@ -17,6 +17,7 @@ class BaseResult(BaseModel): class BaseSERP(BaseModel): qry: str # Search query loc: Optional[str] = None # Location if set, "Canonical Name" + lang: Optional[str] = None # Language if set url: str # URL of SERP html: str # Raw HTML of SERP timestamp: str # Timestamp of crawl From 9377d753326f4324cfc83e19f20b1ba279eec85f Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Feb 2025 14:58:02 -0800 Subject: [PATCH 007/101] version: 0.5.1.dev1 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index dd7c0c6..c9ad3af 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.5.1.dev0" +__version__ = "0.5.1.dev1" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index c121063..ee93f21 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.5.1.dev0" +version = "0.5.1.dev1" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. 
Robertson", email = ""}] keywords = ["web", "search", "parser"] From 19a94f916f4c390dd74e7710394e8de850e59f34 Mon Sep 17 00:00:00 2001 From: mariaelissat <166256195+mariaelissat@users.noreply.github.com> Date: Sat, 1 Mar 2025 15:16:12 -0500 Subject: [PATCH 008/101] =?UTF-8?q?Update=20header=5Ftext=20en=20espa?= =?UTF-8?q?=C3=B1ol=20v2.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebSearcher/classifiers/header_text.py | 57 +++++++++++++++----------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/WebSearcher/classifiers/header_text.py b/WebSearcher/classifiers/header_text.py index dd0062f..5bc2cb9 100644 --- a/WebSearcher/classifiers/header_text.py +++ b/WebSearcher/classifiers/header_text.py @@ -51,51 +51,62 @@ def _get_header_level_mapping(level) -> dict: # WS type -> header level 2 text (e.g.,
<h2>title</h2>
) TYPE_TO_H2_MAPPING = { - "directions": ["Directions"], + "directions": ["Directions", "Ubicaciones"], "discussions_and_forums": ["Discussions and forums"], "general": ["Complementary Results", - "Resultados de la Web", "Web Result with Site Links", - "Web results"], - "images": ["Images"], - "jobs": ["Jobs"], + "Web results", "Resultados de la Web", + "AI-powered overview", "Visión general creada por IA", + "Things to know", "Cosas que debes saber"], + "images": ["Images", "Imágenes"], + "jobs": ["Jobs", "Empleos"], "knowledge": ["Calculator Result", - "Featured snippet from the web", - "Finance Results", + "Featured snippet from the web", "Fragmento destacado", + "Finance Results", "Resumen de Mercado", "From sources across the web", "Knowledge Result", "Resultado de traducci\u00f3n", "Sports Results", + "Table", "Posiciones", + "Stat Leaders", "Líderes de estadísticas", + "Teams", "Equipos", + "Players", "Jugadores", "Translation Result", "Unit Converter", - "Weather Result"], - "local_news": ["Local news"], + "Weather Result", "Clima" + "Artworks", "Obras de arte", + "Songs", "Canciones" + "Albums", "Álbumes", + "What people are saying", + "About", "Información", + "Profiles", "Perfiles"], + "local_news": ["Local news", "Noticias Locales"], "local_results": [ "Local Results", "Locations", - "Places", + "Places", "Sitios" "Businesses", "locations", ], "map_results": ["Map Results", - "Choice Hotels"], + "Choice Hotels", "Hoteles", "Hotel"], "omitted_notice": ["Notices about Filtered Results"], - "people_also_ask": ["People also ask"], + "people_also_ask": ["People also ask", "Más preguntas"], "perspectives": ["Perspectives & opinions", "Perspectives"], "searches_related": ["Additional searches", - "More searches", + "More searches", "Ver más", "Other searches", - "People also search for", + "People also search for", "También se buscó", "Related", "Related searches", "Related to this search", "Searches related to"], - "top_stories": ["Top stories", - "News", + "top_stories": ["Top stories", "Noticias Destacadas", "Noticias Principales", + "News", "Noticias", "Market news"], "twitter": ["Twitter Results"], - "videos": ["Videos"] + "videos": ["Videos", "Videos"] } # WS type -> header level 2 text (e.g.,
<h3>title</h3>
) @@ -104,13 +115,13 @@ def _get_header_level_mapping(level) -> dict: "latest_from": ["Latest from"], "products": ["Popular products"], "news_quotes": ["Quotes in the news"], - "recipes": ["Recipes"], + "recipes": ["Recipes", "Recetas"], "searches_related": ["Related searches"], - "scholarly_articles": ["Scholarly articles for"], - "top_stories": ["Top stories"], - "videos": ["Videos"], - "view_more_news": ["View more news"], - "view_more_videos": ["View more videos"] + "scholarly_articles": ["Scholarly articles for", "Artículos académicos para"], + "top_stories": ["Top stories", "Noticias destacadas", "Noticias Principales"], + "videos": ["Videos", "Videos"], + "view_more_news": ["View more news", "Más noticias", "Ver más"], + "view_more_videos": ["View more videos", "Más videos", "Ver más"] } # Invert from {label: [text, ...]} to [{text: label}, ...] From 3010788f8baf584dd8ca57cbd49d5c0890f7435c Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 6 Mar 2025 01:42:14 -0800 Subject: [PATCH 009/101] update: null arg handling --- WebSearcher/searchers.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 779e64a..3c4e3d6 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -10,6 +10,7 @@ import brotli import requests import subprocess +import pandas as pd from datetime import datetime, timezone from typing import Any, Dict, Optional @@ -110,16 +111,16 @@ def search(self, def _prepare_search(self, qry: str, location: str = None, lang: str = None, num_results: int = None): """Prepare a search URL and metadata for the given query and location""" self.qry = str(qry) - self.loc = str(location) if location else '' - self.lang = lang + self.loc = str(location) if not pd.isnull(location) else '' + self.lang = str(lang) if not pd.isnull(lang) else '' self.num_results = num_results self.params = {} self.params['q'] = wu.encode_param_value(self.qry) if self.num_results: self.params['num'] = self.num_results - if self.lang: + if self.lang and self.lang not in {'None', 'nan'}: self.params['hl'] = self.lang - if self.loc and self.loc != 'None': + if self.loc and self.loc not in {'None', 'nan'}: self.params['uule'] = locations.get_location_id(canonical_name=self.loc) From e547b888a8f344345c970c6fadb84171892c3c86 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 6 Mar 2025 01:42:27 -0800 Subject: [PATCH 010/101] version: 0.5.1.dev4 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index c9ad3af..9e83a10 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.5.1.dev1" +__version__ = "0.5.1.dev4" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index ee93f21..ac2f6d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.5.1.dev1" +version = "0.5.1.dev4" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. 
Robertson", email = ""}] keywords = ["web", "search", "parser"] From 1ae28138382dfa3bca360e9230d65d8b5aa802e2 Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 7 Mar 2025 09:23:43 -0800 Subject: [PATCH 011/101] fix: canonical name to uule converter with protobuf --- WebSearcher/locations.py | 83 ++++++++++++++++++++++++++++++---------- WebSearcher/searchers.py | 2 +- poetry.lock | 22 ++++++++++- pyproject.toml | 1 + 4 files changed, 85 insertions(+), 23 deletions(-) diff --git a/WebSearcher/locations.py b/WebSearcher/locations.py index 16b3ca2..c9fcbe1 100644 --- a/WebSearcher/locations.py +++ b/WebSearcher/locations.py @@ -2,39 +2,80 @@ import io import csv import base64 -import string import zipfile import requests -from bs4 import BeautifulSoup +from google.protobuf.internal import decoder, encoder # poetry add protobuf +from typing import Dict, Union, Any from . import logger from . import webutils as wu log = logger.Logger().start(__name__) -def get_location_id(canonical_name: str) -> str: - """Get location ID for URL parameter 'uule' - - Returns the url parameter for a given location's Canonical Name. - See download_locations to obtain a csv of locations and their canonical names. +def convert_canonical_name_to_uule(canon_name: str) -> str: + """ + Get UULE parameter based on a location's canonical name. + Args: canon_name: Canonical name of the location + Returns: UULE parameter for Google search + """ + fields = {1: 2, 2: 32, 4: canon_name} + encoded_string = encode_protobuf_string(fields) + return f'w+{encoded_string}' - Credit for figuring this out goes to the author of the PHP version: - https://github.com/512banque/uule-grabber/blob/master/uule.php - Args: - canonical_name (str): The "Canoncial Name" for a location. Use - download_locations to obtain file containing all options. Column name - is usually something like "Canonical Name" or "Canonical.Name". - - Returns: - str: The uule parameter key for a given location's Canonical Name. +def encode_protobuf_string(fields: Dict[int, Union[str, int]]) -> str: + """ + Encode a dictionary of field numbers and values into a base64-encoded protobuf string. + Args: fields: A dictionary where keys are protobuf field numbers and values are the data to encode + Returns: A base64-encoded protobuf message string + """ + encoded = bytearray() # Buffer to store encoded bytes + + for field_number, value in fields.items(): + wire_type = 2 if isinstance(value, str) else 0 # Determine wire type based on value type + tag = field_number << 3 | wire_type # Combine field number and wire type into tag + encoded.extend(encoder._VarintBytes(tag)) # Encode the tag into bytes + + # Encode the value based on wire type + if wire_type == 0: + encoded.extend(encoder._VarintBytes(value)) # Encode the integer as varint + if wire_type == 2: + value = value.encode('utf-8') # Convert string to bytes + encoded.extend(encoder._VarintBytes(len(value))) # Add length prefix + encoded.extend(value) # Add the actual bytes + return base64.b64encode(bytes(encoded)).decode('utf-8') # Convert to base64 and decode to string + + +def decode_protobuf_string(encoded_string: str) -> Dict[int, Any]: + """ + Decode a base64-encoded protobuf string into a dictionary of field numbers and values. 
+ Args: encoded_string: A base64-encoded protobuf message + Returns: dictionary where keys are protobuf field numbers and values are the decoded values """ - uule_key = string.ascii_uppercase+string.ascii_lowercase+string.digits - uule_key = uule_key + '-_' + uule_key + '-_' # Double length, repeating - key = uule_key[len(canonical_name)] - b64 = base64.b64encode(canonical_name.encode('utf-8')).decode('utf-8') - return f'w+CAIQICI{key}{b64}' + + pos = 0 # Position tracker for decoding + fields = {} # Dictionary to store decoded field numbers and values + + protobuf_bytes = base64.b64decode(encoded_string) # Convert to protobuf bytes + while pos < len(protobuf_bytes): + + # Get field number and wire type + tag, pos_new = decoder._DecodeVarint(protobuf_bytes, pos) # Each protobuf field starts with a varint tag + field_number, wire_type = tag >> 3, tag & 7 # Extract field number and wire type from tag + + # Decode value based on wire type (0: varint, 2: length-delimited; others not supported) + if wire_type == 0: + value, pos_new = decoder._DecodeVarint(protobuf_bytes, pos_new) # Get the varint value and new position + elif wire_type == 2: + length, pos_start = decoder._DecodeVarint(protobuf_bytes, pos_new) # Get length and starting position + value = protobuf_bytes[pos_start:pos_start + length] # Extract data based on the length + pos_new = pos_start + length # Update the new position + value = value.decode('utf-8') # Assume UTF-8 encoding for strings + + fields[field_number] = value # Store the field number and value in the dictionary + pos = pos_new # Move to the next field using the updated position + return fields def download_locations( diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 3c4e3d6..01ca73f 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -121,7 +121,7 @@ def _prepare_search(self, qry: str, location: str = None, lang: str = None, num_ if self.lang and self.lang not in {'None', 'nan'}: self.params['hl'] = self.lang if self.loc and self.loc not in {'None', 'nan'}: - self.params['uule'] = locations.get_location_id(canonical_name=self.loc) + self.params['uule'] = locations.convert_canonical_name_to_uule(self.loc) def _conduct_search(self, serp_id: str = '', crawl_id: str = ''): diff --git a/poetry.lock b/poetry.lock index 57ccb86..d657798 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1220,6 +1220,26 @@ files = [ [package.dependencies] wcwidth = "*" +[[package]] +name = "protobuf" +version = "6.30.0" +description = "" +optional = false +python-versions = ">=3.9" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "protobuf-6.30.0-cp310-abi3-win32.whl", hash = "sha256:7337d76d8efe65ee09ee566b47b5914c517190196f414e5418fa236dfd1aed3e"}, + {file = "protobuf-6.30.0-cp310-abi3-win_amd64.whl", hash = "sha256:9b33d51cc95a7ec4f407004c8b744330b6911a37a782e2629c67e1e8ac41318f"}, + {file = "protobuf-6.30.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:52d4bb6fe76005860e1d0b8bfa126f5c97c19cc82704961f60718f50be16942d"}, + {file = "protobuf-6.30.0-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:7940ab4dfd60d514b2e1d3161549ea7aed5be37d53bafde16001ac470a3e202b"}, + {file = "protobuf-6.30.0-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:d79bf6a202a536b192b7e8d295d7eece0c86fbd9b583d147faf8cfeff46bf598"}, + {file = "protobuf-6.30.0-cp39-cp39-win32.whl", hash = "sha256:bb35ad251d222f03d6c4652c072dfee156be0ef9578373929c1a7ead2bd5492c"}, + {file = 
"protobuf-6.30.0-cp39-cp39-win_amd64.whl", hash = "sha256:501810e0eba1d327e783fde47cc767a563b0f1c292f1a3546d4f2b8c3612d4d0"}, + {file = "protobuf-6.30.0-py3-none-any.whl", hash = "sha256:e5ef216ea061b262b8994cb6b7d6637a4fb27b3fb4d8e216a6040c0b93bd10d7"}, + {file = "protobuf-6.30.0.tar.gz", hash = "sha256:852b675d276a7d028f660da075af1841c768618f76b90af771a8e2c29e6f5965"}, +] + [[package]] name = "psutil" version = "6.1.1" @@ -1940,4 +1960,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">=3.9" -content-hash = "9928a0553f056ecc96916fb2d6c4adeca729ec9f5c69ef72322077610def4d88" +content-hash = "aae03414bd510dcc398d4b52bd96660021224dfbf78564b91a1235d3e851a582" diff --git a/pyproject.toml b/pyproject.toml index ac2f6d2..aedbc1c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "brotli>=1.1.0", "pydantic>=2.9.2", "pandas>=2.2.3", + "protobuf (>=6.30.0,<7.0.0)", ] [project.urls] From bddd4a1d927a58de88299a67bf073a41d720612c Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 7 Mar 2025 09:26:13 -0800 Subject: [PATCH 012/101] update: more specific dir name for geotargets csv download --- scripts/demo_locations.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/demo_locations.py b/scripts/demo_locations.py index e75870e..655f142 100644 --- a/scripts/demo_locations.py +++ b/scripts/demo_locations.py @@ -5,7 +5,7 @@ import WebSearcher as ws # Retrieve and save latest location data -data_dir = 'data/locations' +data_dir = 'data/google_locations' os.makedirs(data_dir, exist_ok=True) ws.download_locations(data_dir) @@ -116,4 +116,5 @@ dir_html = os.path.join("data", 'html') os.makedirs(dir_html, exist_ok=True) +se.save_search(append_to=os.path.join(dir_html, "searches.json")) se.save_serp(save_dir=dir_html) From 74a1487169b48dce73ed750625a2d2817bff861a Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 7 Mar 2025 09:29:54 -0800 Subject: [PATCH 013/101] version: 0.5.1.dev5 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index 9e83a10..69bccf6 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.5.1.dev4" +__version__ = "0.5.1.dev5" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index aedbc1c..dd08db5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.5.1.dev4" +version = "0.5.1.dev5" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. 
Robertson", email = ""}] keywords = ["web", "search", "parser"] From e18eed58db021be98defb34e10f11a445a3bce1e Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 7 Mar 2025 10:19:44 -0800 Subject: [PATCH 014/101] version: 0.5.1 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index 69bccf6..10b1c51 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.5.1.dev5" +__version__ = "0.5.1" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index dd08db5..b66d25d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.5.1.dev5" +version = "0.5.1" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From 8ffbf34a171245888740f2b7d1d0982049818ec7 Mon Sep 17 00:00:00 2001 From: "Ronald E. Robertson" Date: Fri, 7 Mar 2025 10:34:28 -0800 Subject: [PATCH 015/101] Update WebSearcher/classifiers/header_text.py fix missing commas Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- WebSearcher/classifiers/header_text.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/classifiers/header_text.py b/WebSearcher/classifiers/header_text.py index 5bc2cb9..8a573a2 100644 --- a/WebSearcher/classifiers/header_text.py +++ b/WebSearcher/classifiers/header_text.py @@ -73,9 +73,9 @@ def _get_header_level_mapping(level) -> dict: "Players", "Jugadores", "Translation Result", "Unit Converter", - "Weather Result", "Clima" + "Weather Result", "Clima", "Artworks", "Obras de arte", - "Songs", "Canciones" + "Songs", "Canciones", "Albums", "Álbumes", "What people are saying", "About", "Información", From a136095af2a9da7efb59d647486e97bb2d0013ac Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 7 Mar 2025 10:48:34 -0800 Subject: [PATCH 016/101] update: formatting, drop repeated Video labels --- WebSearcher/classifiers/header_text.py | 87 +++++++++++++++----------- 1 file changed, 49 insertions(+), 38 deletions(-) diff --git a/WebSearcher/classifiers/header_text.py b/WebSearcher/classifiers/header_text.py index 8a573a2..8c0c3c7 100644 --- a/WebSearcher/classifiers/header_text.py +++ b/WebSearcher/classifiers/header_text.py @@ -51,35 +51,41 @@ def _get_header_level_mapping(level) -> dict: # WS type -> header level 2 text (e.g.,
<h2>title</h2>
) TYPE_TO_H2_MAPPING = { - "directions": ["Directions", "Ubicaciones"], + "directions": ["Directions", + "Ubicaciones"], "discussions_and_forums": ["Discussions and forums"], "general": ["Complementary Results", "Web Result with Site Links", - "Web results", "Resultados de la Web", - "AI-powered overview", "Visión general creada por IA", - "Things to know", "Cosas que debes saber"], - "images": ["Images", "Imágenes"], - "jobs": ["Jobs", "Empleos"], + "Web results", + "Resultados de la Web", + "AI-powered overview", + "Visión general creada por IA", + "Things to know", + "Cosas que debes saber"], + "images": ["Images", + "Imágenes"], + "jobs": ["Jobs", + "Empleos"], "knowledge": ["Calculator Result", - "Featured snippet from the web", "Fragmento destacado", - "Finance Results", "Resumen de Mercado", - "From sources across the web", - "Knowledge Result", - "Resultado de traducci\u00f3n", - "Sports Results", - "Table", "Posiciones", - "Stat Leaders", "Líderes de estadísticas", - "Teams", "Equipos", - "Players", "Jugadores", - "Translation Result", - "Unit Converter", - "Weather Result", "Clima", - "Artworks", "Obras de arte", - "Songs", "Canciones", - "Albums", "Álbumes", - "What people are saying", - "About", "Información", - "Profiles", "Perfiles"], + "Featured snippet from the web", "Fragmento destacado", + "Finance Results", "Resumen de Mercado", + "From sources across the web", + "Knowledge Result", + "Resultado de traducci\u00f3n", + "Sports Results", + "Table", "Posiciones", + "Stat Leaders", "Líderes de estadísticas", + "Teams", "Equipos", + "Players", "Jugadores", + "Translation Result", + "Unit Converter", + "Weather Result", "Clima", + "Artworks", "Obras de arte", + "Songs", "Canciones", + "Albums", "Álbumes", + "What people are saying", + "About", "Información", + "Profiles", "Perfiles"], "local_news": ["Local news", "Noticias Locales"], "local_results": [ "Local Results", @@ -89,24 +95,29 @@ def _get_header_level_mapping(level) -> dict: "locations", ], "map_results": ["Map Results", - "Choice Hotels", "Hoteles", "Hotel"], + "Choice Hotels", + "Hoteles", + "Hotel"], "omitted_notice": ["Notices about Filtered Results"], "people_also_ask": ["People also ask", "Más preguntas"], "perspectives": ["Perspectives & opinions", - "Perspectives"], + "Perspectives"], "searches_related": ["Additional searches", - "More searches", "Ver más", - "Other searches", - "People also search for", "También se buscó", - "Related", - "Related searches", - "Related to this search", - "Searches related to"], - "top_stories": ["Top stories", "Noticias Destacadas", "Noticias Principales", - "News", "Noticias", + "More searches", "Ver más", + "Other searches", + "People also search for", "También se buscó", + "Related", + "Related searches", + "Related to this search", + "Searches related to"], + "top_stories": ["Top stories", + "Noticias Destacadas", + "Noticias Principales", + "News", + "Noticias", "Market news"], "twitter": ["Twitter Results"], - "videos": ["Videos", "Videos"] + "videos": ["Videos"] } # WS type -> header level 2 text (e.g.,
<h3>title</h3>
) @@ -119,7 +130,7 @@ def _get_header_level_mapping(level) -> dict: "searches_related": ["Related searches"], "scholarly_articles": ["Scholarly articles for", "Artículos académicos para"], "top_stories": ["Top stories", "Noticias destacadas", "Noticias Principales"], - "videos": ["Videos", "Videos"], + "videos": ["Videos"], "view_more_news": ["View more news", "Más noticias", "Ver más"], "view_more_videos": ["View more videos", "Más videos", "Ver más"] } From f701a9eb5dd7f6bd65c2eea0d3b8d9e8416e61db Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 7 Mar 2025 10:51:11 -0800 Subject: [PATCH 017/101] version: 0.5.2.dev0 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index 10b1c51..fa705da 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.5.1" +__version__ = "0.5.2.dev0" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index b66d25d..2bca1b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.5.1" +version = "0.5.2.dev0" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From 0730e6bbe37f91caa6dda6261d9c90e0b978dec0 Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 9 Mar 2025 11:11:05 -0700 Subject: [PATCH 018/101] version: 0.5.2 --- README.md | 19 +++++++++++++++---- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 31f1e97..12d23f0 100644 --- a/README.md +++ b/README.md @@ -8,17 +8,26 @@ and saving searches. It also includes a modular parser built on `BeautifulSoup` for decomposing a SERP into list of components with categorical classifications and position-based specifications. -## Recent Update +## Recent Updates -`0.5.0` - poetry v2 +Below are some details about recent updates. For a longer list, see the [Update Log](#update-log). -For a longer list of updates, see the [Update Log](#update-log). + +`0.5.2` +- Added support for Spanish component headers by text +- Pull request [#74](https://github.com/gitronald/WebSearcher/pull/74) + +`0.5.1` +- Fixed canonical name -> UULE converter using `protobuf`, see [this gist](https://gist.github.com/gitronald/66cac42194ea2d489ff3a1e32651e736) for details +- Added lang arg to specify language in se.search, uses hl URL param and does not change Accept-Language request header (which defaults to en-US), but works in tests. +- Fixed null location/language arg input handling (again) +- Pull Request [#76](https://github.com/gitronald/WebSearcher/pull/76) ## Table of Contents - [WebSearcher](#websearcher) - [Tools for conducting and parsing web searches](#tools-for-conducting-and-parsing-web-searches) - - [Recent Update](#recent-update) + - [Recent Updates](#recent-updates) - [Table of Contents](#table-of-contents) - [Getting Started](#getting-started) - [Usage](#usage) @@ -261,6 +270,8 @@ pytest -k "1684837514.html" --- ## Update Log +`0.5.0` +- configuration now using poetry v2 `0.4.9` - last version with poetry v1, future versions (`>=0.5.0`) will use [poetry v2](https://python-poetry.org/blog/announcing-poetry-2.0.1/) configs. 
diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index fa705da..8cec324 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.5.2.dev0" +__version__ = "0.5.2" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index 2bca1b4..eb43408 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.5.2.dev0" +version = "0.5.2" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From 7b4b95c818b36cc7f5d55fce2c9cbdfa7d49bfe6 Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 9 Mar 2025 11:20:39 -0700 Subject: [PATCH 019/101] version: 0.6.0.dev0 --- README.md | 15 +++++++++------ WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index b4f01cb..214e1bc 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,6 @@ ## Tools for conducting and parsing web searches [![PyPI version](https://badge.fury.io/py/WebSearcher.svg)](https://badge.fury.io/py/WebSearcher) -NOTE: In 0.5.*, we moved scraping to selenium - This package provides tools for conducting algorithm audits of web search and includes a scraper built on `selenium` with tools for geolocating, conducting, and saving searches. It also includes a modular parser built on `BeautifulSoup` @@ -15,6 +13,10 @@ and position-based specifications. Below are some details about recent updates. For a longer list, see the [Update Log](#update-log). +`0.6.0` +- method for collecting data with selenium; requests no longer works without a redirect +- Pull request [#72](https://github.com/gitronald/WebSearcher/pull/72) + `0.5.2` - Added support for Spanish component headers by text - Pull request [#74](https://github.com/gitronald/WebSearcher/pull/74) @@ -36,10 +38,11 @@ Below are some details about recent updates. For a longer list, see the [Update - [Example Search Script](#example-search-script) - [Step by Step](#step-by-step) - [1. Initialize Collector](#1-initialize-collector) - - [2. Conduct a Search](#2-conduct-a-search) - - [3. Parse Search Results](#3-parse-search-results) - - [4. Save HTML and Metadata](#4-save-html-and-metadata) - - [5. Save Parsed Results](#5-save-parsed-results) + - [2. Launch undetected chromedriver window](#2-launch-undetected-chromedriver-window) + - [3. Conduct a Search](#3-conduct-a-search) + - [4. Parse Search Results](#4-parse-search-results) + - [5. Save HTML and Metadata](#5-save-html-and-metadata) + - [6. Save Parsed Results](#6-save-parsed-results) - [Localization](#localization) - [Contributing](#contributing) - [Repair or Enhance a Parser](#repair-or-enhance-a-parser) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index 8cec324..e03cd22 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.5.2" +__version__ = "0.6.0.dev0" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index 68001ee..e521987 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.5.2" +version = "0.6.0.dev0" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. 
Robertson", email = ""}] keywords = ["web", "search", "parser"] From acea0222933ae3987a6a80c79d02e0505f04050f Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 9 Mar 2025 13:52:25 -0700 Subject: [PATCH 020/101] update: dedupe args, add version_main for chromedriver launch --- WebSearcher/searchers.py | 18 ++++++++++++------ tests/selenium_test.py | 3 +-- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 0d46686..b29f8f3 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -62,9 +62,8 @@ def __init__(self, # Initialize data storage self.version: str = WS_VERSION self.base_url: str = 'https://www.google.com/search' - self.headers: Dict[str, str] = None - self.sesh: requests.Session = sesh if sesh else wu.start_sesh(headers=self.headers) - self.sesh = None + self.headers: Dict[str, str] = headers or DEFAULT_HEADERS + self.sesh: requests.Session = sesh or wu.start_sesh(headers=self.headers) self.ssh_tunnel: subprocess.Popen = ssh_tunnel self.unzip: bool = unzip self.params: Dict[str, Any] = {} @@ -93,10 +92,17 @@ def __init__(self, file_level=log_level, ).start(__name__) - def launch_chromedriver(self, headless = False, use_subprocess = False, chromedriver_path = ''): + def launch_chromedriver( + self, + headless: bool = False, + version_main: int = 133, + use_subprocess: bool = False, + chromedriver_path: str = '' + ) -> None: self.headless = headless self.use_subprocess = use_subprocess self.chromedriver_path = chromedriver_path + self.version_main = version_main self._init_chromedriver() def search(self, @@ -147,9 +153,9 @@ def _init_chromedriver(self): print('launching...') if self.chromedriver_path == '': #optionally: headless=True, use_subprocess=True - self.driver = uc.Chrome(headless = self.headless, subprocess = self.use_subprocess) + self.driver = uc.Chrome(headless = self.headless, subprocess = self.use_subprocess, version_main = self.version_main) else: - self.driver = uc.Chrome(headless = self.headless, subprocess = self.use_subprocess, chromedriver_path = self.chromedriver_path) + self.driver = uc.Chrome(headless = self.headless, subprocess = self.use_subprocess, chromedriver_path = self.chromedriver_path, version_main = self.version_main) #chromedriver_path = "/opt/homebrew/Caskroom/chromedriver/133.0.6943.53" time.sleep(2) self.driver.get('https://www.google.com') diff --git a/tests/selenium_test.py b/tests/selenium_test.py index 8aa06c1..855f573 100644 --- a/tests/selenium_test.py +++ b/tests/selenium_test.py @@ -1,11 +1,10 @@ import WebSearcher as ws se = ws.SearchEngine() # 1. Initialize collector -se.launch_chromedriver(headless = False) # 2. Launch undetected chromedriver window +se.launch_chromedriver(headless=False, version_main=133) # 2. Launch undetected chromedriver window se.search('immigration news') # 2. Conduct a search se.parse_results() # 3. Parse search results se.save_serp(append_to='serps.json') # 4. Save HTML and metadata se.save_results(append_to='results.json') # 5. Save parsed results - #import pandas as pd #df = pd.DataFrame(se.results) # 6. 
Display results in a pandas dataframe \ No newline at end of file From 80403f2f137dd6532cdf0f272e4a3f7cc080ce73 Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 9 Mar 2025 14:20:01 -0700 Subject: [PATCH 021/101] update: poetry lock file --- poetry.lock | 298 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 292 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index d657798..225ef6d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -43,6 +43,27 @@ files = [ astroid = ["astroid (>=2,<4)"] test = ["astroid (>=2,<4)", "pytest", "pytest-cov", "pytest-xdist"] +[[package]] +name = "attrs" +version = "25.1.0" +description = "Classes Without Boilerplate" +optional = false +python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "attrs-25.1.0-py3-none-any.whl", hash = "sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a"}, + {file = "attrs-25.1.0.tar.gz", hash = "sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e"}, +] + +[package.extras] +benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"] +tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] + [[package]] name = "beautifulsoup4" version = "4.13.1" @@ -222,8 +243,7 @@ version = "1.17.1" description = "Foreign Function Interface for Python calling C code." 
optional = false python-versions = ">=3.8" -groups = ["dev"] -markers = "python_version <= \"3.11\" and implementation_name == \"pypy\" or python_version >= \"3.12\" and implementation_name == \"pypy\"" +groups = ["main", "dev"] files = [ {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"}, {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"}, @@ -293,6 +313,7 @@ files = [ {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"}, {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"}, ] +markers = {main = "python_version <= \"3.11\" and os_name == \"nt\" and implementation_name != \"pypy\" or python_version >= \"3.12\" and os_name == \"nt\" and implementation_name != \"pypy\"", dev = "python_version <= \"3.11\" and implementation_name == \"pypy\" or python_version >= \"3.12\" and implementation_name == \"pypy\""} [package.dependencies] pycparser = "*" @@ -488,7 +509,7 @@ version = "1.2.2" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" -groups = ["dev"] +groups = ["main", "dev"] markers = "python_version < \"3.11\"" files = [ {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, @@ -532,6 +553,19 @@ docs = ["furo (>=2024.8.6)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3) testing = ["covdefaults (>=2.3)", "coverage (>=7.6.10)", "diff-cover (>=9.2.1)", "pytest (>=8.3.4)", "pytest-asyncio (>=0.25.2)", "pytest-cov (>=6)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.28.1)"] typing = ["typing-extensions (>=4.12.2)"] +[[package]] +name = "h11" +version = "0.14.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +optional = false +python-versions = ">=3.7" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, + {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, +] + [[package]] name = "idna" version = "3.10" @@ -1035,6 +1069,22 @@ files = [ {file = "numpy-2.2.2.tar.gz", hash = "sha256:ed6906f61834d687738d25988ae117683705636936cc605be0bb208b23df4d8f"}, ] +[[package]] +name = "outcome" +version = "1.3.0.post0" +description = "Capture the outcome of Python function calls." 
+optional = false +python-versions = ">=3.7" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "outcome-1.3.0.post0-py2.py3-none-any.whl", hash = "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b"}, + {file = "outcome-1.3.0.post0.tar.gz", hash = "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8"}, +] + +[package.dependencies] +attrs = ">=19.2.0" + [[package]] name = "packaging" version = "24.2" @@ -1307,12 +1357,12 @@ version = "2.22" description = "C parser in Python" optional = false python-versions = ">=3.8" -groups = ["dev"] -markers = "python_version <= \"3.11\" and implementation_name == \"pypy\" or python_version >= \"3.12\" and implementation_name == \"pypy\"" +groups = ["main", "dev"] files = [ {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, ] +markers = {main = "python_version <= \"3.11\" and os_name == \"nt\" and implementation_name != \"pypy\" or python_version >= \"3.12\" and os_name == \"nt\" and implementation_name != \"pypy\"", dev = "python_version <= \"3.11\" and implementation_name == \"pypy\" or python_version >= \"3.12\" and implementation_name == \"pypy\""} [[package]] name = "pydantic" @@ -1466,6 +1516,20 @@ files = [ [package.extras] windows-terminal = ["colorama (>=0.4.6)"] +[[package]] +name = "pysocks" +version = "1.7.1" +description = "A Python SOCKS client module. See https://github.com/Anorov/PySocks for more information." +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "PySocks-1.7.1-py27-none-any.whl", hash = "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299"}, + {file = "PySocks-1.7.1-py3-none-any.whl", hash = "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5"}, + {file = "PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"}, +] + [[package]] name = "pytest" version = "8.3.4" @@ -1710,6 +1774,27 @@ files = [ [package.dependencies] requests = ">=1.0.0" +[[package]] +name = "selenium" +version = "4.29.0" +description = "Official Python bindings for Selenium WebDriver" +optional = false +python-versions = ">=3.9" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "selenium-4.29.0-py3-none-any.whl", hash = "sha256:ce5d26f1ddc1111641113653af33694c13947dd36c2df09cdd33f554351d372e"}, + {file = "selenium-4.29.0.tar.gz", hash = "sha256:3a62f7ec33e669364a6c0562a701deb69745b569c50d55f1a912bf8eb33358ba"}, +] + +[package.dependencies] +certifi = ">=2021.10.8" +trio = ">=0.17,<1.0" +trio-websocket = ">=0.9,<1.0" +typing_extensions = ">=4.9,<5.0" +urllib3 = {version = ">=1.26,<3", extras = ["socks"]} +websocket-client = ">=1.8,<2.0" + [[package]] name = "six" version = "1.17.0" @@ -1723,6 +1808,32 @@ files = [ {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, ] +[[package]] +name = "sniffio" +version = "1.3.1" +description = "Sniff out which async library your code is running under" +optional = false +python-versions = ">=3.7" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = 
[ + {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, + {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, +] + +[[package]] +name = "sortedcontainers" +version = "2.4.0" +description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" +optional = false +python-versions = "*" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, + {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, +] + [[package]] name = "soupsieve" version = "2.6" @@ -1878,6 +1989,47 @@ files = [ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,<8.2)", "pytest-mock", "pytest-mypy-testing"] +[[package]] +name = "trio" +version = "0.29.0" +description = "A friendly Python library for async concurrency and I/O" +optional = false +python-versions = ">=3.9" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "trio-0.29.0-py3-none-any.whl", hash = "sha256:d8c463f1a9cc776ff63e331aba44c125f423a5a13c684307e828d930e625ba66"}, + {file = "trio-0.29.0.tar.gz", hash = "sha256:ea0d3967159fc130acb6939a0be0e558e364fee26b5deeecc893a6b08c361bdf"}, +] + +[package.dependencies] +attrs = ">=23.2.0" +cffi = {version = ">=1.14", markers = "os_name == \"nt\" and implementation_name != \"pypy\""} +exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} +idna = "*" +outcome = "*" +sniffio = ">=1.3.0" +sortedcontainers = "*" + +[[package]] +name = "trio-websocket" +version = "0.12.2" +description = "WebSocket library for Trio" +optional = false +python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "trio_websocket-0.12.2-py3-none-any.whl", hash = "sha256:df605665f1db533f4a386c94525870851096a223adcb97f72a07e8b4beba45b6"}, + {file = "trio_websocket-0.12.2.tar.gz", hash = "sha256:22c72c436f3d1e264d0910a3951934798dcc5b00ae56fc4ee079d46c7cf20fae"}, +] + +[package.dependencies] +exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} +outcome = ">=1.2.0" +trio = ">=0.11" +wsproto = ">=0.14" + [[package]] name = "typing-extensions" version = "4.12.2" @@ -1904,6 +2056,23 @@ files = [ {file = "tzdata-2025.1.tar.gz", hash = "sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694"}, ] +[[package]] +name = "undetected-chromedriver" +version = "3.5.5" +description = "('Selenium.webdriver.Chrome replacement with compatiblity for Brave, and other Chromium based browsers.', 'Not triggered by CloudFlare/Imperva/hCaptcha and such.', 'NOTE: results may vary due to many factors. 
No guarantees are given, except for ongoing efforts in understanding detection algorithms.')" +optional = false +python-versions = "*" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "undetected-chromedriver-3.5.5.tar.gz", hash = "sha256:9f945e1435005247abe17de316bcfda85b284a4177fd5f25167c78ced33b65ec"}, +] + +[package.dependencies] +requests = "*" +selenium = ">=4.9.0" +websockets = "*" + [[package]] name = "urllib3" version = "2.3.0" @@ -1917,6 +2086,9 @@ files = [ {file = "urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d"}, ] +[package.dependencies] +pysocks = {version = ">=1.5.6,<1.5.7 || >1.5.7,<2.0", optional = true, markers = "extra == \"socks\""} + [package.extras] brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] h2 = ["h2 (>=4,<5)"] @@ -1936,6 +2108,120 @@ files = [ {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, ] +[[package]] +name = "websocket-client" +version = "1.8.0" +description = "WebSocket client for Python with low level API options" +optional = false +python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "websocket_client-1.8.0-py3-none-any.whl", hash = "sha256:17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526"}, + {file = "websocket_client-1.8.0.tar.gz", hash = "sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da"}, +] + +[package.extras] +docs = ["Sphinx (>=6.0)", "myst-parser (>=2.0.0)", "sphinx-rtd-theme (>=1.1.0)"] +optional = ["python-socks", "wsaccel"] +test = ["websockets"] + +[[package]] +name = "websockets" +version = "15.0.1" +description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" +optional = false +python-versions = ">=3.9" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "websockets-15.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d63efaa0cd96cf0c5fe4d581521d9fa87744540d4bc999ae6e08595a1014b45b"}, + {file = "websockets-15.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ac60e3b188ec7574cb761b08d50fcedf9d77f1530352db4eef1707fe9dee7205"}, + {file = "websockets-15.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5756779642579d902eed757b21b0164cd6fe338506a8083eb58af5c372e39d9a"}, + {file = "websockets-15.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fdfe3e2a29e4db3659dbd5bbf04560cea53dd9610273917799f1cde46aa725e"}, + {file = "websockets-15.0.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c2529b320eb9e35af0fa3016c187dffb84a3ecc572bcee7c3ce302bfeba52bf"}, + {file = "websockets-15.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac1e5c9054fe23226fb11e05a6e630837f074174c4c2f0fe442996112a6de4fb"}, + {file = "websockets-15.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5df592cd503496351d6dc14f7cdad49f268d8e618f80dce0cd5a36b93c3fc08d"}, + {file = "websockets-15.0.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0a34631031a8f05657e8e90903e656959234f3a04552259458aac0b0f9ae6fd9"}, + {file = "websockets-15.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3d00075aa65772e7ce9e990cab3ff1de702aa09be3940d1dc88d5abf1ab8a09c"}, + {file = 
"websockets-15.0.1-cp310-cp310-win32.whl", hash = "sha256:1234d4ef35db82f5446dca8e35a7da7964d02c127b095e172e54397fb6a6c256"}, + {file = "websockets-15.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:39c1fec2c11dc8d89bba6b2bf1556af381611a173ac2b511cf7231622058af41"}, + {file = "websockets-15.0.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:823c248b690b2fd9303ba00c4f66cd5e2d8c3ba4aa968b2779be9532a4dad431"}, + {file = "websockets-15.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678999709e68425ae2593acf2e3ebcbcf2e69885a5ee78f9eb80e6e371f1bf57"}, + {file = "websockets-15.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d50fd1ee42388dcfb2b3676132c78116490976f1300da28eb629272d5d93e905"}, + {file = "websockets-15.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d99e5546bf73dbad5bf3547174cd6cb8ba7273062a23808ffea025ecb1cf8562"}, + {file = "websockets-15.0.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66dd88c918e3287efc22409d426c8f729688d89a0c587c88971a0faa2c2f3792"}, + {file = "websockets-15.0.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8dd8327c795b3e3f219760fa603dcae1dcc148172290a8ab15158cf85a953413"}, + {file = "websockets-15.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8fdc51055e6ff4adeb88d58a11042ec9a5eae317a0a53d12c062c8a8865909e8"}, + {file = "websockets-15.0.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:693f0192126df6c2327cce3baa7c06f2a117575e32ab2308f7f8216c29d9e2e3"}, + {file = "websockets-15.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:54479983bd5fb469c38f2f5c7e3a24f9a4e70594cd68cd1fa6b9340dadaff7cf"}, + {file = "websockets-15.0.1-cp311-cp311-win32.whl", hash = "sha256:16b6c1b3e57799b9d38427dda63edcbe4926352c47cf88588c0be4ace18dac85"}, + {file = "websockets-15.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:27ccee0071a0e75d22cb35849b1db43f2ecd3e161041ac1ee9d2352ddf72f065"}, + {file = "websockets-15.0.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:3e90baa811a5d73f3ca0bcbf32064d663ed81318ab225ee4f427ad4e26e5aff3"}, + {file = "websockets-15.0.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:592f1a9fe869c778694f0aa806ba0374e97648ab57936f092fd9d87f8bc03665"}, + {file = "websockets-15.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0701bc3cfcb9164d04a14b149fd74be7347a530ad3bbf15ab2c678a2cd3dd9a2"}, + {file = "websockets-15.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8b56bdcdb4505c8078cb6c7157d9811a85790f2f2b3632c7d1462ab5783d215"}, + {file = "websockets-15.0.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0af68c55afbd5f07986df82831c7bff04846928ea8d1fd7f30052638788bc9b5"}, + {file = "websockets-15.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dee438fed052b52e4f98f76c5790513235efaa1ef7f3f2192c392cd7c91b65"}, + {file = "websockets-15.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d5f6b181bb38171a8ad1d6aa58a67a6aa9d4b38d0f8c5f496b9e42561dfc62fe"}, + {file = "websockets-15.0.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5d54b09eba2bada6011aea5375542a157637b91029687eb4fdb2dab11059c1b4"}, + {file = "websockets-15.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3be571a8b5afed347da347bfcf27ba12b069d9d7f42cb8c7028b5e98bbb12597"}, + {file = 
"websockets-15.0.1-cp312-cp312-win32.whl", hash = "sha256:c338ffa0520bdb12fbc527265235639fb76e7bc7faafbb93f6ba80d9c06578a9"}, + {file = "websockets-15.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcd5cf9e305d7b8338754470cf69cf81f420459dbae8a3b40cee57417f4614a7"}, + {file = "websockets-15.0.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ee443ef070bb3b6ed74514f5efaa37a252af57c90eb33b956d35c8e9c10a1931"}, + {file = "websockets-15.0.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5a939de6b7b4e18ca683218320fc67ea886038265fd1ed30173f5ce3f8e85675"}, + {file = "websockets-15.0.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:746ee8dba912cd6fc889a8147168991d50ed70447bf18bcda7039f7d2e3d9151"}, + {file = "websockets-15.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:595b6c3969023ecf9041b2936ac3827e4623bfa3ccf007575f04c5a6aa318c22"}, + {file = "websockets-15.0.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c714d2fc58b5ca3e285461a4cc0c9a66bd0e24c5da9911e30158286c9b5be7f"}, + {file = "websockets-15.0.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f3c1e2ab208db911594ae5b4f79addeb3501604a165019dd221c0bdcabe4db8"}, + {file = "websockets-15.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:229cf1d3ca6c1804400b0a9790dc66528e08a6a1feec0d5040e8b9eb14422375"}, + {file = "websockets-15.0.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:756c56e867a90fb00177d530dca4b097dd753cde348448a1012ed6c5131f8b7d"}, + {file = "websockets-15.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:558d023b3df0bffe50a04e710bc87742de35060580a293c2a984299ed83bc4e4"}, + {file = "websockets-15.0.1-cp313-cp313-win32.whl", hash = "sha256:ba9e56e8ceeeedb2e080147ba85ffcd5cd0711b89576b83784d8605a7df455fa"}, + {file = "websockets-15.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:e09473f095a819042ecb2ab9465aee615bd9c2028e4ef7d933600a8401c79561"}, + {file = "websockets-15.0.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5f4c04ead5aed67c8a1a20491d54cdfba5884507a48dd798ecaf13c74c4489f5"}, + {file = "websockets-15.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:abdc0c6c8c648b4805c5eacd131910d2a7f6455dfd3becab248ef108e89ab16a"}, + {file = "websockets-15.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a625e06551975f4b7ea7102bc43895b90742746797e2e14b70ed61c43a90f09b"}, + {file = "websockets-15.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d591f8de75824cbb7acad4e05d2d710484f15f29d4a915092675ad3456f11770"}, + {file = "websockets-15.0.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:47819cea040f31d670cc8d324bb6435c6f133b8c7a19ec3d61634e62f8d8f9eb"}, + {file = "websockets-15.0.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac017dd64572e5c3bd01939121e4d16cf30e5d7e110a119399cf3133b63ad054"}, + {file = "websockets-15.0.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4a9fac8e469d04ce6c25bb2610dc535235bd4aa14996b4e6dbebf5e007eba5ee"}, + {file = "websockets-15.0.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:363c6f671b761efcb30608d24925a382497c12c506b51661883c3e22337265ed"}, + {file = "websockets-15.0.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2034693ad3097d5355bfdacfffcbd3ef5694f9718ab7f29c29689a9eae841880"}, + {file = 
"websockets-15.0.1-cp39-cp39-win32.whl", hash = "sha256:3b1ac0d3e594bf121308112697cf4b32be538fb1444468fb0a6ae4feebc83411"}, + {file = "websockets-15.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:b7643a03db5c95c799b89b31c036d5f27eeb4d259c798e878d6937d71832b1e4"}, + {file = "websockets-15.0.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0c9e74d766f2818bb95f84c25be4dea09841ac0f734d1966f415e4edfc4ef1c3"}, + {file = "websockets-15.0.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1009ee0c7739c08a0cd59de430d6de452a55e42d6b522de7aa15e6f67db0b8e1"}, + {file = "websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76d1f20b1c7a2fa82367e04982e708723ba0e7b8d43aa643d3dcd404d74f1475"}, + {file = "websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f29d80eb9a9263b8d109135351caf568cc3f80b9928bccde535c235de55c22d9"}, + {file = "websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b359ed09954d7c18bbc1680f380c7301f92c60bf924171629c5db97febb12f04"}, + {file = "websockets-15.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:cad21560da69f4ce7658ca2cb83138fb4cf695a2ba3e475e0559e05991aa8122"}, + {file = "websockets-15.0.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7f493881579c90fc262d9cdbaa05a6b54b3811c2f300766748db79f098db9940"}, + {file = "websockets-15.0.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:47b099e1f4fbc95b701b6e85768e1fcdaf1630f3cbe4765fa216596f12310e2e"}, + {file = "websockets-15.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:67f2b6de947f8c757db2db9c71527933ad0019737ec374a8a6be9a956786aaf9"}, + {file = "websockets-15.0.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d08eb4c2b7d6c41da6ca0600c077e93f5adcfd979cd777d747e9ee624556da4b"}, + {file = "websockets-15.0.1-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b826973a4a2ae47ba357e4e82fa44a463b8f168e1ca775ac64521442b19e87f"}, + {file = "websockets-15.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:21c1fa28a6a7e3cbdc171c694398b6df4744613ce9b36b1a498e816787e28123"}, + {file = "websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f"}, + {file = "websockets-15.0.1.tar.gz", hash = "sha256:82544de02076bafba038ce055ee6412d68da13ab47f0c60cab827346de828dee"}, +] + +[[package]] +name = "wsproto" +version = "1.2.0" +description = "WebSockets state-machine based protocol implementation" +optional = false +python-versions = ">=3.7.0" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "wsproto-1.2.0-py3-none-any.whl", hash = "sha256:b9acddd652b585d75b20477888c56642fdade28bdfd3579aa24a4d2c037dd736"}, + {file = "wsproto-1.2.0.tar.gz", hash = "sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065"}, +] + +[package.dependencies] +h11 = ">=0.9.0,<1" + [[package]] name = "zipp" version = "3.21.0" @@ -1960,4 +2246,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">=3.9" -content-hash = "aae03414bd510dcc398d4b52bd96660021224dfbf78564b91a1235d3e851a582" +content-hash = "d71e1b8f0d0886b2f716c19310371fb54f9216a14c38d50327a4f42283c08523" From 
465c5538f8adc61147f8e5f25d8508befd940df1 Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 9 Mar 2025 14:20:14 -0700 Subject: [PATCH 022/101] update: reorg selenium code --- WebSearcher/searchers.py | 123 +++++++++++++++++++-------------------- 1 file changed, 61 insertions(+), 62 deletions(-) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index b29f8f3..7c50dc2 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -92,19 +92,6 @@ def __init__(self, file_level=log_level, ).start(__name__) - def launch_chromedriver( - self, - headless: bool = False, - version_main: int = 133, - use_subprocess: bool = False, - chromedriver_path: str = '' - ) -> None: - self.headless = headless - self.use_subprocess = use_subprocess - self.chromedriver_path = chromedriver_path - self.version_main = version_main - self._init_chromedriver() - def search(self, qry: str, location: str = None, @@ -127,9 +114,9 @@ def search(self, """ self._prepare_search(qry=qry, location=location, lang=lang, num_results=num_results) + if method == 'selenium': self._conduct_chromedriver_search(serp_id=serp_id, crawl_id=crawl_id, ai_expand=ai_expand) - elif method == 'requests': self._conduct_search(serp_id=serp_id, crawl_id=crawl_id) self._handle_response() @@ -149,63 +136,29 @@ def _prepare_search(self, qry: str, location: str = None, lang: str = None, num_ if self.loc and self.loc not in {'None', 'nan'}: self.params['uule'] = locations.convert_canonical_name_to_uule(self.loc) + def launch_chromedriver( + self, + headless: bool = False, + version_main: int = 133, + use_subprocess: bool = False, + chromedriver_path: str = '' + ) -> None: + self.headless = headless + self.use_subprocess = use_subprocess + self.chromedriver_path = chromedriver_path + self.version_main = version_main + self._init_chromedriver() + def _init_chromedriver(self): - print('launching...') + self.log.info(f'SERP | Launching ChromeDriver | headless: {self.headless} | subprocess: {self.use_subprocess} | version: {self.version_main}') if self.chromedriver_path == '': - #optionally: headless=True, use_subprocess=True self.driver = uc.Chrome(headless = self.headless, subprocess = self.use_subprocess, version_main = self.version_main) else: self.driver = uc.Chrome(headless = self.headless, subprocess = self.use_subprocess, chromedriver_path = self.chromedriver_path, version_main = self.version_main) - #chromedriver_path = "/opt/homebrew/Caskroom/chromedriver/133.0.6943.53" time.sleep(2) self.driver.get('https://www.google.com') time.sleep(2) - def _check_ai_expand(self): - try: - self.driver.find_element(By.XPATH, "//div[@jsname='rPRdsc' and @role='button']") - return True - except NoSuchElementException: - return False - - def _conduct_chromedriver_search(self, serp_id: str = '', crawl_id: str = '', ai_expand = False): - """Send a search request and handle errors""" - self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() - self.serp_id = serp_id if serp_id else utils.hash_id(self.qry + self.loc + self.timestamp) - self.crawl_id = crawl_id - try: - self._send_chromedriver_request() - except: - self.log.exception(f'SERP | Unknown error | {self.serp_id}') - - ## Look for AI overview box and click on it - if ai_expand: - ai_button = self._check_ai_expand() - if ai_button: - try: - show_more_button = WebDriverWait(self.driver, 1).until( - EC.element_to_be_clickable((By.XPATH, "//div[@jsname='rPRdsc' and @role='button']")) - ) - show_more_button.click() - if show_more_button is not None: - try: - # Wait for 
additional content to load - time.sleep(2) - - show_all_button = WebDriverWait(self.driver, 1).until( - EC.element_to_be_clickable((By.XPATH, '//div[contains(@class, "trEk7e") and @role="button"]')) - ) - show_all_button.click() - except: - pass - except: - pass - self.html = self.driver.page_source - else: - pass - - self.driver.delete_all_cookies() - def _send_chromedriver_request(self): search_box = self.driver.find_element(By.ID, "APjFqb") search_box.clear() @@ -227,6 +180,52 @@ def _send_request(self): log_msg = f"{log_msg} | {self.loc}" if self.loc else log_msg self.log.info(log_msg) + def _conduct_chromedriver_search(self, serp_id: str = '', crawl_id: str = '', ai_expand = False): + """Send a search request and handle errors""" + self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + self.serp_id = serp_id if serp_id else utils.hash_id(self.qry + self.loc + self.timestamp) + self.crawl_id = crawl_id + try: + self._send_chromedriver_request() + self.html = self.driver.page_source + except: + self.log.exception(f'SERP | Chromedriver error | {self.serp_id}') + + if ai_expand: + self._expand_ai_overview() # Expand AI overview box by clicking it + self.driver.delete_all_cookies() + + def _expand_ai_overview(self): + show_more_button_xpath = "//div[@jsname='rPRdsc' and @role='button']" + show_all_button_xpath = '//div[contains(@class, "trEk7e") and @role="button"]' + + try: + self.driver.find_element(By.XPATH, show_more_button_xpath) + show_more_button_exists = True + except NoSuchElementException: + show_more_button_exists = False + + if show_more_button_exists: + try: + show_more_button = WebDriverWait(self.driver, 1).until( + EC.element_to_be_clickable((By.XPATH, show_more_button_xpath)) + ) + if show_more_button is not None: + show_more_button.click() + try: + time.sleep(2) # Wait for additional content to load + show_all_button = WebDriverWait(self.driver, 1).until( + EC.element_to_be_clickable((By.XPATH, show_all_button_xpath)) + ) + show_all_button.click() + except Exception: + pass + + # Overwrite html with expanded content + self.html = self.driver.page_source + + except Exception: + pass def _reset_ssh_tunnel(self): if self.ssh_tunnel: From ea6eebc9850319447cff8cfd516f90e6887e0b54 Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 9 Mar 2025 14:21:27 -0700 Subject: [PATCH 023/101] update: specify args, headless not working locally --- tests/selenium_test.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/selenium_test.py b/tests/selenium_test.py index 855f573..d5e644e 100644 --- a/tests/selenium_test.py +++ b/tests/selenium_test.py @@ -1,6 +1,11 @@ import WebSearcher as ws + +#chromedriver_path = "/opt/homebrew/Caskroom/chromedriver/133.0.6943.53" + se = ws.SearchEngine() # 1. Initialize collector -se.launch_chromedriver(headless=False, version_main=133) # 2. Launch undetected chromedriver window +se.launch_chromedriver(headless=False, # 2. Launch undetected_chromedriver window + use_subprocess=False, + version_main=133) se.search('immigration news') # 2. Conduct a search se.parse_results() # 3. Parse search results se.save_serp(append_to='serps.json') # 4. 
Save HTML and metadata From 82bf5078235bc613a1d947fbe2ed8598c654df48 Mon Sep 17 00:00:00 2001 From: gitronald Date: Mon, 10 Mar 2025 17:23:25 -0700 Subject: [PATCH 024/101] update: collection code and selenium test --- README.md | 64 +++------- WebSearcher/logger.py | 4 +- WebSearcher/searchers.py | 264 ++++++++++++++++++++++++++++----------- poetry.lock | 114 ++++++++++++++++- pyproject.toml | 1 + tests/selenium_test.py | 52 ++++++-- 6 files changed, 368 insertions(+), 131 deletions(-) diff --git a/README.md b/README.md index 214e1bc..677c408 100644 --- a/README.md +++ b/README.md @@ -38,11 +38,10 @@ Below are some details about recent updates. For a longer list, see the [Update - [Example Search Script](#example-search-script) - [Step by Step](#step-by-step) - [1. Initialize Collector](#1-initialize-collector) - - [2. Launch undetected chromedriver window](#2-launch-undetected-chromedriver-window) - - [3. Conduct a Search](#3-conduct-a-search) - - [4. Parse Search Results](#4-parse-search-results) - - [5. Save HTML and Metadata](#5-save-html-and-metadata) - - [6. Save Parsed Results](#6-save-parsed-results) + - [2. Conduct a Search](#2-conduct-a-search) + - [3. Parse Search Results](#3-parse-search-results) + - [4. Save HTML and Metadata](#4-save-html-and-metadata) + - [5. Save Parsed Results](#5-save-parsed-results) - [Localization](#localization) - [Contributing](#contributing) - [Repair or Enhance a Parser](#repair-or-enhance-a-parser) @@ -125,7 +124,6 @@ Example search and parse pipeline: ```python import WebSearcher as ws se = ws.SearchEngine() # 1. Initialize collector -se.launch_chromedriver(headless=False) # 2. Launch undetected chromedriver window se.search('immigration news') # 2. Conduct a search se.parse_results() # 3. Parse search results se.save_serp(append_to='serps.json') # 4. Save HTML and metadata @@ -138,50 +136,26 @@ se.save_results(append_to='results.json') # 5. Save parsed results ```python import WebSearcher as ws -# Initialize collector with optional defaults (headers, logs, ssh tunnels) -se = ws.SearchEngine() - -# Show collector settings -vars(se) -{'version': '0.4.1', - 'base_url': 'https://www.google.com/search', - 'headers': {'Host': 'www.google.com', - 'Referer': 'https://www.google.com/', - 'Accept': '*/*', - 'Accept-Encoding': 'gzip,deflate,br', - 'Accept-Language': 'en-US,en;q=0.5', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0'}, - 'sesh': , - 'ssh_tunnel': None, - 'unzip': True, - 'params': {}, - 'qry': None, - 'loc': None, - 'num_results': None, - 'url': None, - 'timestamp': None, - 'serp_id': None, - 'crawl_id': None, - 'response': None, - 'html': None, - 'results': [], - 'log': } -``` - -#### 2. Launch undetected chromedriver window -We've switched to using [undetected chrome](https://github.com/ultrafunkamsterdam/undetected-chromedriver) to scrape search results. You'll need to ensure that your chromedriver is up-to-date. All cookies are deleted following each search.launch_chromedriver accepts 3 optional arguments. The defaults are: - -se.launch_chromedriver(headless = False, use_subprocess = False, chromedriver_path = '') - +# Initialize collector with method and other settings +se = ws.SearchEngine( + method="selenium", + selenium_config = { + "headless": False, + "use_subprocess": False, + "driver_executable_path": "", + "version_main": 133, + } +) +``` -#### 3. Conduct a Search +#### 2. 
Conduct a Search ```python se.search('immigration news') # 2024-08-19 14:09:18.502 | INFO | WebSearcher.searchers | 200 | immigration news ``` -#### 4. Parse Search Results +#### 3. Parse Search Results The example below is primarily for parsing search results as you collect HTML. See `ws.parse_serp(html)` for parsing existing HTML data. @@ -206,7 +180,7 @@ se.results[0] ``` -#### 5. Save HTML and Metadata +#### 4. Save HTML and Metadata Recommended: Append html and meta data as lines to a json file for larger or ongoing collections. @@ -221,7 +195,7 @@ Alternative: Save individual html files in a directory, named by a provided or ( se.save_serp(save_dir='./serps') ``` -#### 6. Save Parsed Results +#### 5. Save Parsed Results Save to a json lines file. diff --git a/WebSearcher/logger.py b/WebSearcher/logger.py index 48ff44f..147000d 100644 --- a/WebSearcher/logger.py +++ b/WebSearcher/logger.py @@ -85,7 +85,9 @@ def __init__(self, 'urllib3': {'level': 'WARNING'}, 'asyncio': {'level': 'INFO'}, 'chardet.charsetprober': {'level': 'INFO'}, - 'parso': {'level': 'INFO'} # Fix for ipython autocomplete bug + 'parso': {'level': 'INFO'}, # Fix for ipython autocomplete bug + 'undetected_chromedriver': {'level': 'WARNING'}, + 'uc': {'level': 'WARNING'}, } self.log_config = { diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 7c50dc2..e2330fb 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -7,12 +7,15 @@ import os import time +import json import brotli import requests import subprocess import pandas as pd +from enum import Enum +from typing import Any, Dict, Optional, Union from datetime import datetime, timezone -from typing import Any, Dict, Optional +from dataclasses import dataclass, field # selenium updates import undetected_chromedriver as uc @@ -35,39 +38,118 @@ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0', } +class SearchMethod(Enum): + REQUESTS = "requests" + SELENIUM = "selenium" + +@dataclass +class BaseConfig: + """Common search configuration + + Attributes: + log_fp (str, optional): A file to log function process output to + log_mode (str, optional): Write over the log file or append to it + log_level (str, optional): The file logging level + + """ + log_fp: str = '' + log_mode: str = 'a+' + log_level: str = 'INFO' + +@dataclass +class SeleniumConfig: + """Selenium-specific configuration + + Attributes: + headless (bool): Whether to run the browser in headless mode + version_main (int): The main version of the ChromeDriver to use + use_subprocess (bool): Whether to use subprocess for ChromeDriver + driver_executable_path (str): Path to the ChromeDriver executable + + """ + headless: bool = False + version_main: int = 133 + use_subprocess: bool = False + driver_executable_path: str = '' + +@dataclass +class RequestsConfig: + """Requests-specific configuration + + Attributes: + headers (Dict[str, str]): Headers to send with requests + sesh (Optional[requests.Session]): A `requests.Session` object + ssh_tunnel (Optional[subprocess.Popen]): An SSH tunnel subprocess from `webutils` + unzip (bool): Unzip brotli zipped html responses + + """ + headers: Dict[str, str] = field(default_factory=lambda: DEFAULT_HEADERS) + sesh: Optional[requests.Session] = None + ssh_tunnel: Optional[subprocess.Popen] = None + unzip: bool = True + +@dataclass +class SearchConfig: + """Combined search engine configuration + + Attributes: + method (Union[str, SearchMethod]): The method to use for searching, either 'requests' 
or 'selenium' + base (BaseConfig): Common search configuration + selenium (SeleniumConfig): Selenium-specific configuration + requests (RequestsConfig): Requests-specific configuration + + """ + method: Union[str, SearchMethod] = SearchMethod.SELENIUM + base: BaseConfig = field(default_factory=BaseConfig) + selenium: SeleniumConfig = field(default_factory=SeleniumConfig) + requests: RequestsConfig = field(default_factory=RequestsConfig) + + class SearchEngine: """Collect Search Engine Results Pages (SERPs)""" def __init__(self, - headers: Dict[str, str] = None, - sesh: Optional[requests.Session] = None, - ssh_tunnel: Optional[subprocess.Popen] = None, - unzip: bool = True, - log_fp: str = '', - log_mode: str = 'a+', - log_level: str ='INFO', + method: Union[str, SearchMethod] = SearchMethod.SELENIUM, + base_config: Union[dict, BaseConfig] = None, + selenium_config: Union[dict, SeleniumConfig] = None, + requests_config: Union[dict, RequestsConfig] = None ) -> None: - """Initialize a `requests.Session` to conduct searches through or - pass an existing one with an optional SSH tunnel. - - Args: - headers (dict, optional): Headers to send with requests. - unzip (bool, optional): Unzip brotli zipped html responses. - sesh (None, optional): A `requests.Session` object. - ssh_tunnel (None, optional): An SSH tunnel subprocess from `webutils`. - log_fp (str, optional): A file to log function process output to. - log_mode (str, optional): Write over the log file or append to it. - log_level (str, optional): The file logging level. + """Initialize the search engine + + Args: + method (Union[str, SearchMethod], optional): The method to use for searching, either 'requests' or 'selenium'. Defaults to SearchMethod.SELENIUM. + base_config (Union[dict, BaseConfig], optional): Common search configuration. Defaults to None. + selenium_config (Union[dict, SeleniumConfig], optional): Selenium-specific configuration. Defaults to None. + requests_config (Union[dict, RequestsConfig], optional): Requests-specific configuration. Defaults to None. 
""" - # Initialize data storage + # Convert string method to enum if needed + if isinstance(method, str): + method = SearchMethod(method.lower()) + + # Handle config objects/dicts + def isdict(config): + return isinstance(config, dict) + base = BaseConfig(**base_config) if isdict(base_config) else base_config or BaseConfig() + selenium = SeleniumConfig(**selenium_config) if isdict(selenium_config) else selenium_config or SeleniumConfig() + requests = RequestsConfig(**requests_config) if isdict(requests_config) else requests_config or RequestsConfig() + self.config = SearchConfig( + method=method, + base=base, + selenium=selenium, + requests=requests + ) + + # Initialize common attributes self.version: str = WS_VERSION self.base_url: str = 'https://www.google.com/search' - self.headers: Dict[str, str] = headers or DEFAULT_HEADERS - self.sesh: requests.Session = sesh or wu.start_sesh(headers=self.headers) - self.ssh_tunnel: subprocess.Popen = ssh_tunnel - self.unzip: bool = unzip self.params: Dict[str, Any] = {} + # Initialize method-specific attributes + if self.config.method == SearchMethod.SELENIUM: + self.driver = None + else: + self.config.requests.sesh = self.config.requests.sesh or wu.start_sesh(headers=self.config.requests.headers) + # Initialize search details self.qry: str = None self.loc: str = None @@ -77,6 +159,8 @@ def __init__(self, self.timestamp: str = None self.serp_id: str = None self.crawl_id: str = None + + # Initialize search outputs self.response: requests.Response = None self.html: str = None self.results: list = [] @@ -85,11 +169,11 @@ def __init__(self, # Set a log file, prints to console by default self.log = logger.Logger( - console=True if not log_fp else False, - console_level=log_level, - file_name=log_fp, - file_mode=log_mode, - file_level=log_level, + console=True if not self.config.base.log_fp else False, + console_level=self.config.base.log_level, + file_name=self.config.base.log_fp, + file_mode=self.config.base.log_mode, + file_level=self.config.base.log_level, ).start(__name__) def search(self, @@ -97,7 +181,6 @@ def search(self, location: str = None, lang: str = None, num_results: int = None, - method: str = 'selenium', ai_expand: bool = False, serp_id: str = '', crawl_id: str = '' @@ -114,10 +197,10 @@ def search(self, """ self._prepare_search(qry=qry, location=location, lang=lang, num_results=num_results) - - if method == 'selenium': + if self.config.method == SearchMethod.SELENIUM: + self._init_chromedriver() self._conduct_chromedriver_search(serp_id=serp_id, crawl_id=crawl_id, ai_expand=ai_expand) - elif method == 'requests': + elif self.config.method == SearchMethod.REQUESTS: self._conduct_search(serp_id=serp_id, crawl_id=crawl_id) self._handle_response() @@ -135,48 +218,55 @@ def _prepare_search(self, qry: str, location: str = None, lang: str = None, num_ self.params['hl'] = self.lang if self.loc and self.loc not in {'None', 'nan'}: self.params['uule'] = locations.convert_canonical_name_to_uule(self.loc) + self.url = f"{self.base_url}?{wu.join_url_quote(self.params)}" - def launch_chromedriver( - self, - headless: bool = False, - version_main: int = 133, - use_subprocess: bool = False, - chromedriver_path: str = '' - ) -> None: - self.headless = headless - self.use_subprocess = use_subprocess - self.chromedriver_path = chromedriver_path - self.version_main = version_main - self._init_chromedriver() - - def _init_chromedriver(self): - self.log.info(f'SERP | Launching ChromeDriver | headless: {self.headless} | subprocess: {self.use_subprocess} | 
version: {self.version_main}') - if self.chromedriver_path == '': - self.driver = uc.Chrome(headless = self.headless, subprocess = self.use_subprocess, version_main = self.version_main) - else: - self.driver = uc.Chrome(headless = self.headless, subprocess = self.use_subprocess, chromedriver_path = self.chromedriver_path, version_main = self.version_main) + # ========================================================================== + # Selenium method + + def _init_chromedriver(self) -> None: + """Initialize Chrome driver with selenium-specific config""" + self.log.debug(f'SERP | init uc chromedriver | kwargs: {self.config.selenium.__dict__}') + self.driver = uc.Chrome(**self.config.selenium.__dict__) + self.user_agent = self.driver.execute_script('return navigator.userAgent') + self.response_code = None + + # Log version information + self.browser_info = { + 'browser_id': "", + 'browser_name': self.driver.capabilities['browserName'], + 'browser_version': self.driver.capabilities['browserVersion'], + 'driver_version': self.driver.capabilities['chrome']['chromedriverVersion'].split(' ')[0], + } + self.browser_info['browser_id'] = utils.hash_id(json.dumps(self.browser_info)) + self.log.debug(json.dumps(self.browser_info, indent=4)) + + def _send_chromedriver_typed_query(self): + """Send a typed query to the search box""" time.sleep(2) self.driver.get('https://www.google.com') time.sleep(2) - - def _send_chromedriver_request(self): search_box = self.driver.find_element(By.ID, "APjFqb") search_box.clear() search_box.send_keys(self.qry) search_box.send_keys(Keys.RETURN) + + def _send_chromedriver_request(self): + """Use a prepared URL to conduct a search""" + + time.sleep(2) + self.driver.get(self.url) + time.sleep(2) # wait for the page to load WebDriverWait(self.driver, 10).until( EC.presence_of_element_located((By.ID, "search")) ) time.sleep(2) #including a sleep to allow the page to fully load + self.html = self.driver.page_source self.url = self.driver.current_url - - def _send_request(self): - self.url = f"{self.base_url}?{wu.join_url_quote(self.params)}" - self.response = self.sesh.get(self.url, timeout=10) - log_msg = f"{self.response.status_code} | {self.qry}" + self.response_code = 0 + log_msg = f"{self.response_code} | {self.qry}" log_msg = f"{log_msg} | {self.loc}" if self.loc else log_msg self.log.info(log_msg) @@ -222,21 +312,51 @@ def _expand_ai_overview(self): pass # Overwrite html with expanded content - self.html = self.driver.page_source + new_html = self.driver.page_source + self.log.debug(f'SERP | overwriting expanded content | len diff: {len(new_html) - len(self.html)}') + self.html = new_html except Exception: pass + # ========================================================================== + # Requests method + + def _conduct_search(self, serp_id: str = '', crawl_id: str = ''): + """Send a search request and handle errors""" + + self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + self.serp_id = serp_id if serp_id else utils.hash_id(self.qry + self.loc + self.timestamp) + self.crawl_id = crawl_id + self.user_agent = self.config.requests.headers['User-Agent'] + + try: + self._send_request() + except requests.exceptions.ConnectionError: + self.log.exception(f'SERP | Connection error | {self.serp_id}') + self._reset_ssh_tunnel() + except requests.exceptions.Timeout: + self.log.exception(f'SERP | Timeout error | {self.serp_id}') + except Exception: + self.log.exception(f'SERP | Unknown error | {self.serp_id}') + + def _send_request(self): + 
self.response = self.config.requests.sesh.get(self.url, timeout=10) + self.response_code = self.response.status_code + log_msg = f"{self.response_code} | {self.qry}" + log_msg = f"{log_msg} | {self.loc}" if self.loc else log_msg + self.log.info(log_msg) + def _reset_ssh_tunnel(self): - if self.ssh_tunnel: - self.ssh_tunnel.tunnel.kill() - self.ssh_tunnel.open_tunnel() + if self.config.requests.ssh_tunnel: + self.config.requests.ssh_tunnel.tunnel.kill() + self.config.requests.ssh_tunnel.open_tunnel() self.log.info(f'SERP | Restarted SSH tunnel | {self.serp_id}') time.sleep(10) # Allow time to establish connection def _handle_response(self): try: - if self.unzip: + if self.config.requests.unzip: self._unzip_html() else: self.html = self.response.content @@ -244,14 +364,11 @@ def _handle_response(self): except Exception: self.log.exception(f'Response handling error') - def _unzip_html(self): + def _unzip_html(self) -> None: """Unzip brotli zipped html Can allow zipped responses by setting the header `"Accept-Encoding"`. Zipped reponses are the default because it is more efficient. - - Returns: - str: Decompressed html """ rcontent = self.response.content @@ -263,6 +380,9 @@ def _unzip_html(self): self.log.exception(f'unzip error | serp_id : {self.serp_id}') self.html = rcontent + # ========================================================================== + # Parsing + def parse_results(self): """Parse a SERP - see parsers.py""" @@ -281,6 +401,9 @@ def parse_serp_features(self): except Exception: self.log.exception(f'Feature extraction error | serp_id : {self.serp_id}') + # ========================================================================== + # Saving + def prepare_serp_save(self): self.serp = BaseSERP( qry=self.qry, @@ -288,8 +411,8 @@ def prepare_serp_save(self): lang=self.lang, url=self.url, html=self.html, - response_code=0 if not self.response else self.response.status_code, - user_agent='' if not self.response else self.headers['User-Agent'], + response_code=self.response_code, + user_agent=self.user_agent, timestamp=self.timestamp, serp_id=self.serp_id, crawl_id=self.crawl_id, @@ -354,4 +477,3 @@ def save_results(self, save_dir: str = "", append_to: str = ""): else: self.log.info(f'No parsed results for serp_id: {self.serp_id}') - diff --git a/poetry.lock b/poetry.lock index 225ef6d..9ca06df 100644 --- a/poetry.lock +++ b/poetry.lock @@ -421,6 +421,22 @@ files = [ {file = "charset_normalizer-3.4.1.tar.gz", hash = "sha256:44251f18cd68a75b56585dd00dae26183e102cd5e0f9f1466e6df5da2ed64ea3"}, ] +[[package]] +name = "click" +version = "8.1.8" +description = "Composable command line interface toolkit" +optional = false +python-versions = ">=3.7" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"}, + {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + [[package]] name = "colorama" version = "0.4.6" @@ -428,7 +444,7 @@ description = "Cross-platform colored terminal text." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" groups = ["dev"] -markers = "python_version <= \"3.11\" and sys_platform == \"win32\" or python_version >= \"3.12\" and sys_platform == \"win32\"" +markers = "python_version <= \"3.11\" and sys_platform == \"win32\" or python_version <= \"3.11\" and platform_system == \"Windows\" or python_version >= \"3.12\" and sys_platform == \"win32\" or python_version >= \"3.12\" and platform_system == \"Windows\"" files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, @@ -918,6 +934,32 @@ html5 = ["html5lib"] htmlsoup = ["BeautifulSoup4"] source = ["Cython (>=3.0.11)"] +[[package]] +name = "markdown-it-py" +version = "3.0.0" +description = "Python port of markdown-it. Markdown parsing, done right!" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, + {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, +] + +[package.dependencies] +mdurl = ">=0.1,<1.0" + +[package.extras] +benchmarking = ["psutil", "pytest", "pytest-benchmark"] +code-style = ["pre-commit (>=3.0,<4.0)"] +compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] +linkify = ["linkify-it-py (>=1,<3)"] +plugins = ["mdit-py-plugins"] +profiling = ["gprof2dot"] +rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] + [[package]] name = "matplotlib-inline" version = "0.1.7" @@ -934,6 +976,19 @@ files = [ [package.dependencies] traitlets = "*" +[[package]] +name = "mdurl" +version = "0.1.2" +description = "Markdown URL utilities" +optional = false +python-versions = ">=3.7" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, + {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, +] + [[package]] name = "nest-asyncio" version = "1.6.0" @@ -1774,6 +1829,27 @@ files = [ [package.dependencies] requests = ">=1.0.0" +[[package]] +name = "rich" +version = "13.9.4" +description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" +optional = false +python-versions = ">=3.8.0" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"}, + {file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"}, +] + +[package.dependencies] +markdown-it-py = ">=2.2.0" +pygments = ">=2.13.0,<3.0.0" +typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.11\""} + +[package.extras] +jupyter = ["ipywidgets (>=7.5.1,<9)"] + [[package]] name = 
"selenium" version = "4.29.0" @@ -1795,6 +1871,19 @@ typing_extensions = ">=4.9,<5.0" urllib3 = {version = ">=1.26,<3", extras = ["socks"]} websocket-client = ">=1.8,<2.0" +[[package]] +name = "shellingham" +version = "1.5.4" +description = "Tool to Detect Surrounding Shell" +optional = false +python-versions = ">=3.7" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686"}, + {file = "shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de"}, +] + [[package]] name = "six" version = "1.17.0" @@ -2030,6 +2119,25 @@ outcome = ">=1.2.0" trio = ">=0.11" wsproto = ">=0.14" +[[package]] +name = "typer" +version = "0.15.2" +description = "Typer, build great CLIs. Easy to code. Based on Python type hints." +optional = false +python-versions = ">=3.7" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "typer-0.15.2-py3-none-any.whl", hash = "sha256:46a499c6107d645a9c13f7ee46c5d5096cae6f5fc57dd11eccbbb9ae3e44ddfc"}, + {file = "typer-0.15.2.tar.gz", hash = "sha256:ab2fab47533a813c49fe1f16b1a370fd5819099c00b119e0633df65f22144ba5"}, +] + +[package.dependencies] +click = ">=8.0.0" +rich = ">=10.11.0" +shellingham = ">=1.3.0" +typing-extensions = ">=3.7.4.3" + [[package]] name = "typing-extensions" version = "4.12.2" @@ -2037,11 +2145,11 @@ description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" groups = ["main", "dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, ] -markers = {main = "python_version <= \"3.11\" or python_version >= \"3.12\"", dev = "python_version < \"3.10\""} [[package]] name = "tzdata" @@ -2246,4 +2354,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">=3.9" -content-hash = "d71e1b8f0d0886b2f716c19310371fb54f9216a14c38d50327a4f42283c08523" +content-hash = "1afa3bf7c3d9ce06c3cf91b77da72e8f7bf4d543351120cdfe00bedb1286df6b" diff --git a/pyproject.toml b/pyproject.toml index e521987..ab2c164 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ optional = true pytest = "^8.3.4" syrupy = "^4.8.1" ipykernel = "^6.29.5" +typer = "^0.15.2" [build-system] requires = ["poetry-core>=2.0.0,<3.0.0"] diff --git a/tests/selenium_test.py b/tests/selenium_test.py index d5e644e..f6924b7 100644 --- a/tests/selenium_test.py +++ b/tests/selenium_test.py @@ -1,15 +1,45 @@ +import typer import WebSearcher as ws -#chromedriver_path = "/opt/homebrew/Caskroom/chromedriver/133.0.6943.53" +# driver_executable_path locations: +# /opt/homebrew/Caskroom/chromedriver/133.0.6943.53 # Mac +# /opt/google/chrome/google-chrome # Google Chrome 134.0.6998.88 | permissions error +# ~/.local/share/undetected_chromedriver/undetected_chromedriver # ChromeDriver 133.0.6943.141 -se = ws.SearchEngine() # 1. Initialize collector -se.launch_chromedriver(headless=False, # 2. Launch undetected_chromedriver window - use_subprocess=False, - version_main=133) -se.search('immigration news') # 2. Conduct a search -se.parse_results() # 3. 
Parse search results -se.save_serp(append_to='serps.json') # 4. Save HTML and metadata -se.save_results(append_to='results.json') # 5. Save parsed results +app = typer.Typer() -#import pandas as pd -#df = pd.DataFrame(se.results) # 6. Display results in a pandas dataframe \ No newline at end of file +@app.command() +def main( + query: str = typer.Argument(..., help="Search query to use"), + method: str = typer.Argument("selenium", help="Search method to use: 'selenium' or 'requests'"), + headless: bool = typer.Option(False, help="Run browser in headless mode"), + use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), + version_main: int = typer.Option(133, help="Main version of Chrome to use"), + ai_expand: bool = typer.Option(False, help="Expand AI overviews if present"), + driver_executable_path: str = typer.Option(None, help="Path to ChromeDriver executable"), + output_prefix: str = typer.Option("output", help="Prefix for output files") +) -> None: + + typer.echo(f"query: {query}\nmethod: {method}") + selenium_config = { + "headless": headless, + "use_subprocess": use_subprocess, + "driver_executable_path": driver_executable_path, + "version_main": version_main, + } + + se = ws.SearchEngine( + method=method, + selenium_config=selenium_config + ) + + se.search(qry=query, ai_expand=ai_expand) + se.parse_results() + + # Save results with the specified prefix + se.save_serp(append_to=f'{output_prefix}_serps.json') + se.save_search(append_to=f'{output_prefix}_searches.json') + se.save_results(append_to=f'{output_prefix}_results.json') + +if __name__ == "__main__": + app() \ No newline at end of file From 5ca2da7a774b07291197047c06e6f1cf8f03b5ab Mon Sep 17 00:00:00 2001 From: gitronald Date: Mon, 10 Mar 2025 23:16:39 -0700 Subject: [PATCH 025/101] update: save method variable along with metadata --- WebSearcher/models.py | 1 + WebSearcher/searchers.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/WebSearcher/models.py b/WebSearcher/models.py index a85d7c1..2a564e6 100644 --- a/WebSearcher/models.py +++ b/WebSearcher/models.py @@ -26,4 +26,5 @@ class BaseSERP(BaseModel): serp_id: str # Search Engine Results Page (SERP) ID crawl_id: str # Crawl ID for grouping SERPs version: str # WebSearcher version + method: str # Search method used diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index e2330fb..2ea604c 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -161,7 +161,7 @@ def isdict(config): self.crawl_id: str = None # Initialize search outputs - self.response: requests.Response = None + self.response = None # type: Optional[requests.Response] self.html: str = None self.results: list = [] self.serp_features: dict = {} @@ -417,6 +417,7 @@ def prepare_serp_save(self): serp_id=self.serp_id, crawl_id=self.crawl_id, version=self.version, + method=self.config.method.value ).model_dump() def save_serp(self, save_dir: str = "", append_to: str = ""): From b950989631d138f936299f32c75837529cab1593 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 11 Mar 2025 09:30:36 -0700 Subject: [PATCH 026/101] update: handle null links in tw result --- WebSearcher/component_parsers/twitter_result.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/WebSearcher/component_parsers/twitter_result.py b/WebSearcher/component_parsers/twitter_result.py index f9742a4..e2814c7 100644 --- a/WebSearcher/component_parsers/twitter_result.py +++ b/WebSearcher/component_parsers/twitter_result.py @@ -1,3 +1,5 @@ 
+from ..webutils import get_text, get_link + def parse_twitter_result(cmpt, sub_rank=0) -> list: """Parse a Twitter single result component @@ -28,7 +30,7 @@ def parse_twitter_result(cmpt, sub_rank=0) -> list: # Get snippet text, timestamp, and tweet url body, timestamp_url = cmpt.find('div', {'class':'tw-res'}).children - parsed['text'] = body.text - parsed['timestamp'] = timestamp_url.find('span').text - parsed['details'] = timestamp_url.find('a')['href'] + parsed['text'] = get_text(body) + parsed['timestamp'] = get_text(timestamp_url, 'span') + parsed['details'] = get_link(timestamp_url) return [parsed] \ No newline at end of file From dc00990cdc8505e760a499d58684f095431de73a Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 11 Mar 2025 09:37:40 -0700 Subject: [PATCH 027/101] version: 0.6.0.dev1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ab2c164..2e7b240 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.0.dev0" +version = "0.6.0.dev1" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From a7cfd5adb8459267117fff6372e0db02ac8ccb62 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 11 Mar 2025 10:37:41 -0700 Subject: [PATCH 028/101] update: move driver init to search, add driver cleanup --- WebSearcher/searchers.py | 24 ++++++++++++++++++++++-- tests/selenium_test.py | 1 + 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 2ea604c..4cd86d5 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -198,7 +198,6 @@ def search(self, self._prepare_search(qry=qry, location=location, lang=lang, num_results=num_results) if self.config.method == SearchMethod.SELENIUM: - self._init_chromedriver() self._conduct_chromedriver_search(serp_id=serp_id, crawl_id=crawl_id, ai_expand=ai_expand) elif self.config.method == SearchMethod.REQUESTS: self._conduct_search(serp_id=serp_id, crawl_id=crawl_id) @@ -272,6 +271,9 @@ def _send_chromedriver_request(self): def _conduct_chromedriver_search(self, serp_id: str = '', crawl_id: str = '', ai_expand = False): """Send a search request and handle errors""" + if not self.driver: + self._init_chromedriver() + self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() self.serp_id = serp_id if serp_id else utils.hash_id(self.qry + self.loc + self.timestamp) self.crawl_id = crawl_id @@ -282,10 +284,11 @@ def _conduct_chromedriver_search(self, serp_id: str = '', crawl_id: str = '', ai self.log.exception(f'SERP | Chromedriver error | {self.serp_id}') if ai_expand: - self._expand_ai_overview() # Expand AI overview box by clicking it + self._expand_ai_overview() self.driver.delete_all_cookies() def _expand_ai_overview(self): + """Expand AI overview box by clicking it""" show_more_button_xpath = "//div[@jsname='rPRdsc' and @role='button']" show_all_button_xpath = '//div[contains(@class, "trEk7e") and @role="button"]' @@ -319,6 +322,23 @@ def _expand_ai_overview(self): except Exception: pass + def cleanup(self): + """Clean up resources, particularly Selenium's browser instance + + Returns: + bool: True if cleanup was successful or not needed, False if cleanup failed + """ + if self.config.method == SearchMethod.SELENIUM and hasattr(self, 'driver') and self.driver: + try: + self.driver.quit() + self.driver = None + 
self.log.debug(f'Browser successfully closed') + return True + except Exception as e: + self.log.warning(f'Failed to close browser: {e}') + return False + return True + # ========================================================================== # Requests method diff --git a/tests/selenium_test.py b/tests/selenium_test.py index f6924b7..1476a32 100644 --- a/tests/selenium_test.py +++ b/tests/selenium_test.py @@ -40,6 +40,7 @@ def main( se.save_serp(append_to=f'{output_prefix}_serps.json') se.save_search(append_to=f'{output_prefix}_searches.json') se.save_results(append_to=f'{output_prefix}_results.json') + se.cleanup() if __name__ == "__main__": app() \ No newline at end of file From 2a86a5abb2550ee705baafed0027478ee1a2ea98 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 11 Mar 2025 10:41:32 -0700 Subject: [PATCH 029/101] version: 0.6.0.dev2 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index e03cd22..e2af3af 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.0.dev0" +__version__ = "0.6.0.dev2" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index 2e7b240..b6c9c65 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.0.dev1" +version = "0.6.0.dev2" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From 76fa069afb7646977f03aa780df0507fc146653e Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 19 Mar 2025 16:04:32 -0700 Subject: [PATCH 030/101] update: add parse both features and results options --- WebSearcher/parsers.py | 71 ++++++++++++++++++++++++++++------------ WebSearcher/searchers.py | 11 +++++-- 2 files changed, 59 insertions(+), 23 deletions(-) diff --git a/WebSearcher/parsers.py b/WebSearcher/parsers.py index 1e4f283..692171b 100644 --- a/WebSearcher/parsers.py +++ b/WebSearcher/parsers.py @@ -5,12 +5,23 @@ import re from bs4 import BeautifulSoup -from typing import Union, List, Dict +from typing import Union, List, Dict, Tuple -def parse_serp(serp: Union[str, BeautifulSoup]) -> List[Dict]: - """Parse a Search Engine Result Page (SERP)""" - +def parse_serp( + serp: Union[str, BeautifulSoup], + extract_features: bool = False + ) -> Union[List[Dict], Tuple[List[Dict], Dict]]: + """Parse a Search Engine Result Page (SERP) + + Args: + serp (Union[str, BeautifulSoup]): The HTML content of the SERP or a BeautifulSoup object + extract_features (bool, optional): Whether to also extract SERP features. Defaults to False. + + Returns: + Union[List[Dict], Tuple[List[Dict], Dict]]: If extract_features is False, returns a list of result components. + If extract_features is True, returns a tuple of (results, features). 
+    """
     # Extract components
     soup = webutils.make_soup(serp)
     extractor = Extractor(soup)
@@ -22,18 +33,38 @@
         cmpt.classify_component()
         cmpt.parse_component()
 
-    return component_list.export_component_results()
+    results = component_list.export_component_results()
+
+    if extract_features:
+        # Extract features from the same soup object to avoid parsing twice
+        features = FeatureExtractor.extract_features(soup)
+        return results, features
+
+    return results
 
 
 class FeatureExtractor:
     @staticmethod
-    def extract_features(html: str) -> dict:
-        rx_estimate = re.compile(r'<div id="result-stats">.*?</div>')
-        rx_language = re.compile(r'<html[^>]*\slang="([^"]+)"')
-        rx_no_results = re.compile(r"Your search - .*? - did not match any documents\.")
+    def extract_features(html_or_soup: Union[str, BeautifulSoup]) -> dict:
+        """Extract SERP features from HTML or a BeautifulSoup object
+        
+        Args:
+            html_or_soup (Union[str, BeautifulSoup]): The HTML content or a BeautifulSoup object
+        
+        Returns:
+            dict: The extracted features
+        """
+        output = {}
+        if isinstance(html_or_soup, BeautifulSoup):
+            soup = html_or_soup
+            html = str(soup)
+        else:
+            html = html_or_soup
+            soup = webutils.make_soup(html)
 
         # Extract result estimate count and time
+        rx_estimate = re.compile(r'<div id="result-stats">.*?</div>')
         match = rx_estimate.search(html)
         result_estimate_div = match.group(0) if match else None
         if result_estimate_div is None:
@@ -46,23 +77,21 @@ def extract_features(html: str) -> dict:
             output["result_estimate_time"] = float(time_match.group(1)) if time_match else None
 
         # Extract language
+        rx_language = re.compile(r'<html[^>]*\slang="([^"]+)"')
         match = rx_language.search(html)
         output['language'] = match.group(1) if match else None
 
         # No results notice
+        rx_no_results = re.compile(r"Your search - .*? - did not match any documents\.")
         match = rx_no_results.search(html)
         output['notice_no_results'] = bool(match)
 
-        # Shortened query notice
-        pattern = "(and any subsequent words) was ignored because we limit queries to 32 words."
-        output['notice_shortened_query'] = (pattern in html)
-
-        # Server error notice
-        pattern = "We're sorry but it appears that there has been an internal server error while processing your request."
-        output['notice_server_error'] = (pattern in html)
-
-        # Infinity scroll button
-        pattern = 'More results'
-        output['infinity_scroll'] = (pattern in html)
-
+        string_match_dict = {
+            'notice_shortened_query': "(and any subsequent words) was ignored because we limit queries to 32 words.",
+            'notice_server_error': "We're sorry but it appears that there has been an internal server error while processing your request.",
+            'infinity_scroll': 'More results'
+        }
+        for key, pattern in string_match_dict.items():
+            output[key] = (pattern in html)
+
         return output
 
diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py
index 4cd86d5..525c968 100644
--- a/WebSearcher/searchers.py
+++ b/WebSearcher/searchers.py
@@ -403,9 +403,17 @@ def _unzip_html(self) -> None:
     # ==========================================================================
     # Parsing
 
+    def parse_all(self):
+        """Parse results and extract SERP features in a single pass"""
+        assert self.html, "No HTML found"
+        try:
+            # Use the enhanced parse_serp function to get both results and features in one pass
+            self.results, self.serp_features = parsers.parse_serp(self.html, extract_features=True)
+        except Exception:
+            self.log.exception(f'Combined parsing error | serp_id : {self.serp_id}')
+
     def parse_results(self):
         """Parse a SERP - see parsers.py"""
-        
         assert self.html, "No HTML found"
         try:
             self.results = parsers.parse_serp(self.html)
@@ -414,7 +422,6 @@
 
     def parse_serp_features(self):
         """Extract SERP features - see parsers.py"""
-        
         assert self.html, "No HTML found"
         try:
             self.serp_features = parsers.FeatureExtractor.extract_features(self.html)

From 6270f5d8054731f024d7d9821170f2f0176daaba Mon Sep 17 00:00:00 2001
From: gitronald
Date: Wed, 19 Mar 2025 16:05:10 -0700
Subject: [PATCH 031/101] version: 0.6.0.dev3

---
 WebSearcher/__init__.py | 2 +-
 pyproject.toml          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py
index e2af3af..e3e41b6 100644
--- a/WebSearcher/__init__.py
+++ b/WebSearcher/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.6.0.dev2"
+__version__ = "0.6.0.dev3"
 from .searchers import SearchEngine
 from .parsers import parse_serp, FeatureExtractor
 from .extractors import Extractor
diff --git a/pyproject.toml b/pyproject.toml
index b6c9c65..e8525f5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "WebSearcher"
-version = "0.6.0.dev2"
+version = "0.6.0.dev3"
 description = "Tools for conducting, collecting, and parsing web search"
 authors = [{name = "Ronald E. 
Robertson", email = ""}] keywords = ["web", "search", "parser"] From 6752498ad4077b4286ef41dcb2cb265c23b8fc72 Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Mar 2025 08:57:39 -0700 Subject: [PATCH 032/101] version: 0.6.0.dev4 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index e3e41b6..d33ee46 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.0.dev3" +__version__ = "0.6.0.dev4" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index e8525f5..2b85ba6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.0.dev3" +version = "0.6.0.dev4" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From 165f9e36a4770b081787d71e10c571fd8c73b16b Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Mar 2025 10:25:47 -0700 Subject: [PATCH 033/101] update: condense args, use currently reliable default query --- tests/selenium_test.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/tests/selenium_test.py b/tests/selenium_test.py index 1476a32..46afb15 100644 --- a/tests/selenium_test.py +++ b/tests/selenium_test.py @@ -10,29 +10,26 @@ @app.command() def main( - query: str = typer.Argument(..., help="Search query to use"), + query: str = typer.Argument("why is the sky blue?", help="Search query to use"), method: str = typer.Argument("selenium", help="Search method to use: 'selenium' or 'requests'"), headless: bool = typer.Option(False, help="Run browser in headless mode"), use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), version_main: int = typer.Option(133, help="Main version of Chrome to use"), - ai_expand: bool = typer.Option(False, help="Expand AI overviews if present"), - driver_executable_path: str = typer.Option(None, help="Path to ChromeDriver executable"), + ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), + driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), output_prefix: str = typer.Option("output", help="Prefix for output files") ) -> None: typer.echo(f"query: {query}\nmethod: {method}") - selenium_config = { - "headless": headless, - "use_subprocess": use_subprocess, - "driver_executable_path": driver_executable_path, - "version_main": version_main, - } - se = ws.SearchEngine( method=method, - selenium_config=selenium_config + selenium_config={ + "headless": headless, + "use_subprocess": use_subprocess, + "driver_executable_path": driver_executable_path, + "version_main": version_main, + } ) - se.search(qry=query, ai_expand=ai_expand) se.parse_results() From b77c413bb9abfbc5426436a7032fa6a23e3e1cd3 Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Mar 2025 10:28:13 -0700 Subject: [PATCH 034/101] update: use pydantic models for configs and defaults --- WebSearcher/models.py | 45 +++++++++++- WebSearcher/searchers.py | 144 ++++++++++++++------------------------- 2 files changed, 93 insertions(+), 96 deletions(-) diff --git a/WebSearcher/models.py b/WebSearcher/models.py index 2a564e6..efaadbd 100644 --- a/WebSearcher/models.py +++ b/WebSearcher/models.py @@ -1,5 +1,8 @@ -from pydantic 
import BaseModel -from typing import Any, Optional +from pydantic import BaseModel, Field +from typing import Any, Optional, Dict, Union +import subprocess +import requests +from enum import Enum class BaseResult(BaseModel): @@ -28,3 +31,41 @@ class BaseSERP(BaseModel): version: str # WebSearcher version method: str # Search method used + +class LogConfig(BaseModel): + log_fp: str = '' + log_mode: str = 'a+' + log_level: str = 'INFO' + + +class SeleniumConfig(BaseModel): + headless: bool = False + version_main: int = 133 + use_subprocess: bool = False + driver_executable_path: str = "" + + +class RequestsConfig(BaseModel): + model_config = {"arbitrary_types_allowed": True} + headers: Dict[str, str] = Field(default_factory=lambda: { + 'Host': 'www.google.com', + 'Referer': 'https://www.google.com/', + 'Accept': '*/*', + 'Accept-Encoding': 'gzip,deflate,br', + 'Accept-Language': 'en-US,en;q=0.5', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0', + }) + sesh: Optional[requests.Session] = None + ssh_tunnel: Optional[subprocess.Popen] = None + unzip: bool = True + + +class SearchMethod(Enum): + REQUESTS = "requests" + SELENIUM = "selenium" + +class SearchConfig(BaseModel): + method: Union[str, SearchMethod] = SearchMethod.SELENIUM + base: LogConfig = Field(default_factory=LogConfig) + selenium: SeleniumConfig = Field(default_factory=SeleniumConfig) + requests: RequestsConfig = Field(default_factory=RequestsConfig) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 525c968..f693b62 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -3,19 +3,16 @@ from . import webutils as wu from . import utils from . import logger -from .models import BaseSERP +from .models import LogConfig, SeleniumConfig, RequestsConfig, SearchConfig, SearchMethod, BaseSERP import os import time import json import brotli import requests -import subprocess import pandas as pd -from enum import Enum from typing import Any, Dict, Optional, Union from datetime import datetime, timezone -from dataclasses import dataclass, field # selenium updates import undetected_chromedriver as uc @@ -28,88 +25,11 @@ from importlib import metadata WS_VERSION = metadata.version('WebSearcher') -# Default headers to send with requests (i.e. 
device fingerprint) -DEFAULT_HEADERS = { - 'Host': 'www.google.com', - 'Referer': 'https://www.google.com/', - 'Accept': '*/*', - 'Accept-Encoding': 'gzip,deflate,br', - 'Accept-Language': 'en-US,en;q=0.5', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0', -} - -class SearchMethod(Enum): - REQUESTS = "requests" - SELENIUM = "selenium" - -@dataclass -class BaseConfig: - """Common search configuration - - Attributes: - log_fp (str, optional): A file to log function process output to - log_mode (str, optional): Write over the log file or append to it - log_level (str, optional): The file logging level - - """ - log_fp: str = '' - log_mode: str = 'a+' - log_level: str = 'INFO' - -@dataclass -class SeleniumConfig: - """Selenium-specific configuration - - Attributes: - headless (bool): Whether to run the browser in headless mode - version_main (int): The main version of the ChromeDriver to use - use_subprocess (bool): Whether to use subprocess for ChromeDriver - driver_executable_path (str): Path to the ChromeDriver executable - - """ - headless: bool = False - version_main: int = 133 - use_subprocess: bool = False - driver_executable_path: str = '' - -@dataclass -class RequestsConfig: - """Requests-specific configuration - - Attributes: - headers (Dict[str, str]): Headers to send with requests - sesh (Optional[requests.Session]): A `requests.Session` object - ssh_tunnel (Optional[subprocess.Popen]): An SSH tunnel subprocess from `webutils` - unzip (bool): Unzip brotli zipped html responses - - """ - headers: Dict[str, str] = field(default_factory=lambda: DEFAULT_HEADERS) - sesh: Optional[requests.Session] = None - ssh_tunnel: Optional[subprocess.Popen] = None - unzip: bool = True - -@dataclass -class SearchConfig: - """Combined search engine configuration - - Attributes: - method (Union[str, SearchMethod]): The method to use for searching, either 'requests' or 'selenium' - base (BaseConfig): Common search configuration - selenium (SeleniumConfig): Selenium-specific configuration - requests (RequestsConfig): Requests-specific configuration - - """ - method: Union[str, SearchMethod] = SearchMethod.SELENIUM - base: BaseConfig = field(default_factory=BaseConfig) - selenium: SeleniumConfig = field(default_factory=SeleniumConfig) - requests: RequestsConfig = field(default_factory=RequestsConfig) - - class SearchEngine: """Collect Search Engine Results Pages (SERPs)""" def __init__(self, method: Union[str, SearchMethod] = SearchMethod.SELENIUM, - base_config: Union[dict, BaseConfig] = None, + base_config: Union[dict, LogConfig] = None, selenium_config: Union[dict, SeleniumConfig] = None, requests_config: Union[dict, RequestsConfig] = None ) -> None: @@ -117,7 +37,7 @@ def __init__(self, Args: method (Union[str, SearchMethod], optional): The method to use for searching, either 'requests' or 'selenium'. Defaults to SearchMethod.SELENIUM. - base_config (Union[dict, BaseConfig], optional): Common search configuration. Defaults to None. + base_config (Union[dict, LogConfig], optional): Common search configuration. Defaults to None. selenium_config (Union[dict, SeleniumConfig], optional): Selenium-specific configuration. Defaults to None. requests_config (Union[dict, RequestsConfig], optional): Requests-specific configuration. Defaults to None. 
""" @@ -129,7 +49,7 @@ def __init__(self, # Handle config objects/dicts def isdict(config): return isinstance(config, dict) - base = BaseConfig(**base_config) if isdict(base_config) else base_config or BaseConfig() + base = LogConfig(**base_config) if isdict(base_config) else base_config or LogConfig() selenium = SeleniumConfig(**selenium_config) if isdict(selenium_config) else selenium_config or SeleniumConfig() requests = RequestsConfig(**requests_config) if isdict(requests_config) else requests_config or RequestsConfig() self.config = SearchConfig( @@ -197,11 +117,7 @@ def search(self, """ self._prepare_search(qry=qry, location=location, lang=lang, num_results=num_results) - if self.config.method == SearchMethod.SELENIUM: - self._conduct_chromedriver_search(serp_id=serp_id, crawl_id=crawl_id, ai_expand=ai_expand) - elif self.config.method == SearchMethod.REQUESTS: - self._conduct_search(serp_id=serp_id, crawl_id=crawl_id) - self._handle_response() + self._conduct_search(serp_id=serp_id, crawl_id=crawl_id, ai_expand=ai_expand) def _prepare_search(self, qry: str, location: str = None, lang: str = None, num_results: int = None): """Prepare a search URL and metadata for the given query and location""" @@ -219,6 +135,12 @@ def _prepare_search(self, qry: str, location: str = None, lang: str = None, num_ self.params['uule'] = locations.convert_canonical_name_to_uule(self.loc) self.url = f"{self.base_url}?{wu.join_url_quote(self.params)}" + def _conduct_search(self, serp_id:str = '', crawl_id:str = '', ai_expand:bool = False): + if self.config.method == SearchMethod.SELENIUM: + self._conduct_search_chromedriver(serp_id=serp_id, crawl_id=crawl_id, ai_expand=ai_expand) + elif self.config.method == SearchMethod.REQUESTS: + self._conduct_search_requests(serp_id=serp_id, crawl_id=crawl_id) + # ========================================================================== # Selenium method @@ -269,7 +191,7 @@ def _send_chromedriver_request(self): log_msg = f"{log_msg} | {self.loc}" if self.loc else log_msg self.log.info(log_msg) - def _conduct_chromedriver_search(self, serp_id: str = '', crawl_id: str = '', ai_expand = False): + def _conduct_search_chromedriver(self, serp_id: str = '', crawl_id: str = '', ai_expand = False): """Send a search request and handle errors""" if not self.driver: self._init_chromedriver() @@ -280,12 +202,19 @@ def _conduct_chromedriver_search(self, serp_id: str = '', crawl_id: str = '', ai try: self._send_chromedriver_request() self.html = self.driver.page_source - except: - self.log.exception(f'SERP | Chromedriver error | {self.serp_id}') + except Exception as e: + self.log.exception(f'SERP | Chromedriver error | {self.serp_id}: {str(e)}') if ai_expand: self._expand_ai_overview() - self.driver.delete_all_cookies() + + # Only delete cookies, don't close the driver here + # The driver will be closed when cleanup() is called + if self.driver: + try: + self.driver.delete_all_cookies() + except Exception as e: + self.log.warning(f"Failed to delete cookies: {str(e)}") def _expand_ai_overview(self): """Expand AI overview box by clicking it""" @@ -330,19 +259,44 @@ def cleanup(self): """ if self.config.method == SearchMethod.SELENIUM and hasattr(self, 'driver') and self.driver: try: + # Try a more thorough cleanup + try: + self.driver.delete_all_cookies() + except Exception: + pass + + try: + # Close all tabs/windows + original_handle = self.driver.current_window_handle + for handle in self.driver.window_handles: + self.driver.switch_to.window(handle) + self.driver.close() + except 
Exception: + pass + + # Finally quit the driver self.driver.quit() self.driver = None self.log.debug(f'Browser successfully closed') return True except Exception as e: self.log.warning(f'Failed to close browser: {e}') + # Force driver to be None so we create a fresh instance next time + self.driver = None return False return True + + def __del__(self): + """Destructor to ensure browser is closed when object is garbage collected""" + try: + self.cleanup() + except Exception: + pass # ========================================================================== # Requests method - def _conduct_search(self, serp_id: str = '', crawl_id: str = ''): + def _conduct_search_requests(self, serp_id: str = '', crawl_id: str = ''): """Send a search request and handle errors""" self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() @@ -359,6 +313,8 @@ def _conduct_search(self, serp_id: str = '', crawl_id: str = ''): self.log.exception(f'SERP | Timeout error | {self.serp_id}') except Exception: self.log.exception(f'SERP | Unknown error | {self.serp_id}') + finally: + self._handle_response() def _send_request(self): self.response = self.config.requests.sesh.get(self.url, timeout=10) From 9b925ae2356c64762bbcd8f8fe27ba87d361f6f1 Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Mar 2025 11:10:07 -0700 Subject: [PATCH 035/101] update: model directory with multiple files, new BaseConfig model --- WebSearcher/components.py | 2 +- WebSearcher/models/__init__.py | 0 WebSearcher/{models.py => models/configs.py} | 46 ++++++-------------- WebSearcher/models/data.py | 28 ++++++++++++ WebSearcher/searchers.py | 18 +++----- tests/selenium_test.py | 19 ++++---- 6 files changed, 58 insertions(+), 55 deletions(-) create mode 100644 WebSearcher/models/__init__.py rename WebSearcher/{models.py => models/configs.py} (51%) create mode 100644 WebSearcher/models/data.py diff --git a/WebSearcher/components.py b/WebSearcher/components.py index 13449c9..d757a65 100644 --- a/WebSearcher/components.py +++ b/WebSearcher/components.py @@ -1,4 +1,4 @@ -from .models import BaseResult +from .models.data import BaseResult from .classifiers import ClassifyMain, ClassifyFooter, ClassifyHeaderComponent from .component_parsers import main_parser_dict, footer_parser_dict, header_parser_dict from .component_parsers import parse_unknown, parse_not_implemented diff --git a/WebSearcher/models/__init__.py b/WebSearcher/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/WebSearcher/models.py b/WebSearcher/models/configs.py similarity index 51% rename from WebSearcher/models.py rename to WebSearcher/models/configs.py index efaadbd..becf6c3 100644 --- a/WebSearcher/models.py +++ b/WebSearcher/models/configs.py @@ -1,51 +1,33 @@ from pydantic import BaseModel, Field -from typing import Any, Optional, Dict, Union +from typing import Dict, Optional, Union import subprocess import requests from enum import Enum - -class BaseResult(BaseModel): - sub_rank: int = 0 - type: str = 'unclassified' - sub_type: Optional[str] = None - title: Optional[str] = None - url: Optional[str] = None - text: Optional[str] = None - cite: Optional[str] = None - details: Optional[Any] = None - error: Optional[str] = None - - -class BaseSERP(BaseModel): - qry: str # Search query - loc: Optional[str] = None # Location if set, "Canonical Name" - lang: Optional[str] = None # Language if set - url: str # URL of SERP - html: str # Raw HTML of SERP - timestamp: str # Timestamp of crawl - response_code: int # HTTP response code - 
user_agent: str # User agent used for the crawl - serp_id: str # Search Engine Results Page (SERP) ID - crawl_id: str # Crawl ID for grouping SERPs - version: str # WebSearcher version - method: str # Search method used - - -class LogConfig(BaseModel): +class BaseConfig(BaseModel): + """Base class for all configuration classes""" + + @classmethod + def create(cls, config=None): + """Create a config instance from a dictionary or existing instance""" + if isinstance(config, dict): + return cls(**config) + return config or cls() + +class LogConfig(BaseConfig): log_fp: str = '' log_mode: str = 'a+' log_level: str = 'INFO' -class SeleniumConfig(BaseModel): +class SeleniumConfig(BaseConfig): headless: bool = False version_main: int = 133 use_subprocess: bool = False driver_executable_path: str = "" -class RequestsConfig(BaseModel): +class RequestsConfig(BaseConfig): model_config = {"arbitrary_types_allowed": True} headers: Dict[str, str] = Field(default_factory=lambda: { 'Host': 'www.google.com', diff --git a/WebSearcher/models/data.py b/WebSearcher/models/data.py new file mode 100644 index 0000000..8c7571b --- /dev/null +++ b/WebSearcher/models/data.py @@ -0,0 +1,28 @@ +from pydantic import BaseModel +from typing import Any, Optional + +class BaseResult(BaseModel): + sub_rank: int = 0 + type: str = 'unclassified' + sub_type: Optional[str] = None + title: Optional[str] = None + url: Optional[str] = None + text: Optional[str] = None + cite: Optional[str] = None + details: Optional[Any] = None + error: Optional[str] = None + + +class BaseSERP(BaseModel): + qry: str # Search query + loc: Optional[str] = None # Location if set, "Canonical Name" + lang: Optional[str] = None # Language if set + url: str # URL of SERP + html: str # Raw HTML of SERP + timestamp: str # Timestamp of crawl + response_code: int # HTTP response code + user_agent: str # User agent used for the crawl + serp_id: str # Search Engine Results Page (SERP) ID + crawl_id: str # Crawl ID for grouping SERPs + version: str # WebSearcher version + method: str # Search method used diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index f693b62..06a3f69 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -3,7 +3,8 @@ from . import webutils as wu from . import utils from . 
import logger -from .models import LogConfig, SeleniumConfig, RequestsConfig, SearchConfig, SearchMethod, BaseSERP +from .models.configs import LogConfig, SeleniumConfig, RequestsConfig, SearchConfig, SearchMethod +from .models.data import BaseSERP import os import time @@ -46,17 +47,10 @@ def __init__(self, if isinstance(method, str): method = SearchMethod(method.lower()) - # Handle config objects/dicts - def isdict(config): - return isinstance(config, dict) - base = LogConfig(**base_config) if isdict(base_config) else base_config or LogConfig() - selenium = SeleniumConfig(**selenium_config) if isdict(selenium_config) else selenium_config or SeleniumConfig() - requests = RequestsConfig(**requests_config) if isdict(requests_config) else requests_config or RequestsConfig() - self.config = SearchConfig( - method=method, - base=base, - selenium=selenium, - requests=requests + self.config = SearchConfig(method=method, + base=LogConfig(base_config), + selenium=SeleniumConfig(selenium_config), + requests=RequestsConfig(requests_config) ) # Initialize common attributes diff --git a/tests/selenium_test.py b/tests/selenium_test.py index 46afb15..0318fd2 100644 --- a/tests/selenium_test.py +++ b/tests/selenium_test.py @@ -10,16 +10,15 @@ @app.command() def main( - query: str = typer.Argument("why is the sky blue?", help="Search query to use"), - method: str = typer.Argument("selenium", help="Search method to use: 'selenium' or 'requests'"), - headless: bool = typer.Option(False, help="Run browser in headless mode"), - use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), - version_main: int = typer.Option(133, help="Main version of Chrome to use"), - ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), - driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), - output_prefix: str = typer.Option("output", help="Prefix for output files") -) -> None: - + query: str = typer.Argument("why is the sky blue?", help="Search query to use"), + method: str = typer.Argument("selenium", help="Search method to use: 'selenium' or 'requests'"), + headless: bool = typer.Option(False, help="Run browser in headless mode"), + use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), + version_main: int = typer.Option(133, help="Main version of Chrome to use"), + ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), + driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), + output_prefix: str = typer.Option("output", help="Prefix for output files") + ) -> None: typer.echo(f"query: {query}\nmethod: {method}") se = ws.SearchEngine( method=method, From 4e87302bf403572c1cc48430dc430d89a73d4dae Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Mar 2025 11:40:48 -0700 Subject: [PATCH 036/101] update: use baseconfig in searchconfig --- WebSearcher/models/configs.py | 20 +++++++++++++++++++- WebSearcher/models/data.py | 1 + WebSearcher/searchers.py | 23 +++++++++++------------ 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/WebSearcher/models/configs.py b/WebSearcher/models/configs.py index becf6c3..d202881 100644 --- a/WebSearcher/models/configs.py +++ b/WebSearcher/models/configs.py @@ -4,6 +4,7 @@ import requests from enum import Enum + class BaseConfig(BaseModel): """Base class for all configuration classes""" @@ -14,6 +15,7 @@ def create(cls, config=None): return cls(**config) return config or cls() + class 
LogConfig(BaseConfig): log_fp: str = '' log_mode: str = 'a+' @@ -46,7 +48,23 @@ class SearchMethod(Enum): REQUESTS = "requests" SELENIUM = "selenium" -class SearchConfig(BaseModel): + @classmethod + def create(cls, method=None): + """Convert string to SearchMethod enum or return existing enum instance""" + if method is None: + return cls.SELENIUM + if isinstance(method, cls): + return method + if isinstance(method, str): + try: + return cls(method.lower()) + except ValueError: + valid_values = [e.value for e in cls] + raise ValueError(f"Invalid search method: {method}. Valid values are: {valid_values}") + raise TypeError(f"Expected string or SearchMethod, got {type(method)}") + + +class SearchConfig(BaseConfig): method: Union[str, SearchMethod] = SearchMethod.SELENIUM base: LogConfig = Field(default_factory=LogConfig) selenium: SeleniumConfig = Field(default_factory=SeleniumConfig) diff --git a/WebSearcher/models/data.py b/WebSearcher/models/data.py index 8c7571b..4143f8d 100644 --- a/WebSearcher/models/data.py +++ b/WebSearcher/models/data.py @@ -1,6 +1,7 @@ from pydantic import BaseModel from typing import Any, Optional + class BaseResult(BaseModel): sub_rank: int = 0 type: str = 'unclassified' diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 06a3f69..3fd4c04 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -30,9 +30,9 @@ class SearchEngine: """Collect Search Engine Results Pages (SERPs)""" def __init__(self, method: Union[str, SearchMethod] = SearchMethod.SELENIUM, - base_config: Union[dict, LogConfig] = None, - selenium_config: Union[dict, SeleniumConfig] = None, - requests_config: Union[dict, RequestsConfig] = None + base_config: Union[dict, LogConfig] = {}, + selenium_config: Union[dict, SeleniumConfig] = {}, + requests_config: Union[dict, RequestsConfig] = {} ) -> None: """Initialize the search engine @@ -43,15 +43,14 @@ def __init__(self, requests_config (Union[dict, RequestsConfig], optional): Requests-specific configuration. Defaults to None. 
""" - # Convert string method to enum if needed - if isinstance(method, str): - method = SearchMethod(method.lower()) - - self.config = SearchConfig(method=method, - base=LogConfig(base_config), - selenium=SeleniumConfig(selenium_config), - requests=RequestsConfig(requests_config) - ) + # Initialize configuration + self.version = WS_VERSION + self.config = SearchConfig.create({ + "method": SearchMethod.create(method), + "base": LogConfig.create(base_config), + "selenium": SeleniumConfig.create(selenium_config), + "requests": RequestsConfig.create(requests_config), + }) # Initialize common attributes self.version: str = WS_VERSION From d0a7aa9f93d02f18d43407e9826d78299a1ce202 Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Mar 2025 12:11:27 -0700 Subject: [PATCH 037/101] update: clean log config, header as arg --- WebSearcher/models/configs.py | 8 ++++---- WebSearcher/searchers.py | 33 +++++++++++++++++---------------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/WebSearcher/models/configs.py b/WebSearcher/models/configs.py index d202881..9cacc50 100644 --- a/WebSearcher/models/configs.py +++ b/WebSearcher/models/configs.py @@ -17,9 +17,9 @@ def create(cls, config=None): class LogConfig(BaseConfig): - log_fp: str = '' - log_mode: str = 'a+' - log_level: str = 'INFO' + fp: str = '' + mode: str = 'a' + level: str = 'INFO' class SeleniumConfig(BaseConfig): @@ -66,6 +66,6 @@ def create(cls, method=None): class SearchConfig(BaseConfig): method: Union[str, SearchMethod] = SearchMethod.SELENIUM - base: LogConfig = Field(default_factory=LogConfig) + log: LogConfig = Field(default_factory=LogConfig) selenium: SeleniumConfig = Field(default_factory=SeleniumConfig) requests: RequestsConfig = Field(default_factory=RequestsConfig) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 3fd4c04..71b2b8c 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -30,15 +30,16 @@ class SearchEngine: """Collect Search Engine Results Pages (SERPs)""" def __init__(self, method: Union[str, SearchMethod] = SearchMethod.SELENIUM, - base_config: Union[dict, LogConfig] = {}, + log_config: Union[dict, LogConfig] = {}, selenium_config: Union[dict, SeleniumConfig] = {}, - requests_config: Union[dict, RequestsConfig] = {} + requests_config: Union[dict, RequestsConfig] = {}, + headers: Dict[str, str] = None ) -> None: """Initialize the search engine Args: method (Union[str, SearchMethod], optional): The method to use for searching, either 'requests' or 'selenium'. Defaults to SearchMethod.SELENIUM. - base_config (Union[dict, LogConfig], optional): Common search configuration. Defaults to None. + log_config (Union[dict, LogConfig], optional): Common search configuration. Defaults to None. selenium_config (Union[dict, SeleniumConfig], optional): Selenium-specific configuration. Defaults to None. requests_config (Union[dict, RequestsConfig], optional): Requests-specific configuration. Defaults to None. 
""" @@ -47,18 +48,18 @@ def __init__(self, self.version = WS_VERSION self.config = SearchConfig.create({ "method": SearchMethod.create(method), - "base": LogConfig.create(base_config), + "log": LogConfig.create(log_config), "selenium": SeleniumConfig.create(selenium_config), "requests": RequestsConfig.create(requests_config), }) - # Initialize common attributes - self.version: str = WS_VERSION + # Initialize searcher self.base_url: str = 'https://www.google.com/search' self.params: Dict[str, Any] = {} - - # Initialize method-specific attributes - if self.config.method == SearchMethod.SELENIUM: + if self.config.method == SearchMethod.REQUESTS: + self.headers = headers or self.config.requests.headers + self.sesh = self.config.requests.sesh or wu.start_sesh(headers=self.headers) + elif self.config.method == SearchMethod.SELENIUM: self.driver = None else: self.config.requests.sesh = self.config.requests.sesh or wu.start_sesh(headers=self.config.requests.headers) @@ -82,11 +83,11 @@ def __init__(self, # Set a log file, prints to console by default self.log = logger.Logger( - console=True if not self.config.base.log_fp else False, - console_level=self.config.base.log_level, - file_name=self.config.base.log_fp, - file_mode=self.config.base.log_mode, - file_level=self.config.base.log_level, + console=True if not self.config.log.fp else False, + console_level=self.config.log.level, + file_name=self.config.log.fp, + file_mode=self.config.log.mode, + file_level=self.config.log.level, ).start(__name__) def search(self, @@ -295,7 +296,7 @@ def _conduct_search_requests(self, serp_id: str = '', crawl_id: str = ''): self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() self.serp_id = serp_id if serp_id else utils.hash_id(self.qry + self.loc + self.timestamp) self.crawl_id = crawl_id - self.user_agent = self.config.requests.headers['User-Agent'] + self.user_agent = self.headers['User-Agent'] try: self._send_request() @@ -310,7 +311,7 @@ def _conduct_search_requests(self, serp_id: str = '', crawl_id: str = ''): self._handle_response() def _send_request(self): - self.response = self.config.requests.sesh.get(self.url, timeout=10) + self.response = self.sesh.get(self.url, timeout=10) self.response_code = self.response.status_code log_msg = f"{self.response_code} | {self.qry}" log_msg = f"{log_msg} | {self.loc}" if self.loc else log_msg From 2fc420dfa769e118921704081a94bcff41a0e6d0 Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Mar 2025 15:05:08 -0700 Subject: [PATCH 038/101] update: use search params pydantic model --- WebSearcher/models/configs.py | 30 ++++++++++++++- WebSearcher/searchers.py | 70 ++++++++++++++--------------------- 2 files changed, 56 insertions(+), 44 deletions(-) diff --git a/WebSearcher/models/configs.py b/WebSearcher/models/configs.py index 9cacc50..c784141 100644 --- a/WebSearcher/models/configs.py +++ b/WebSearcher/models/configs.py @@ -1,9 +1,12 @@ -from pydantic import BaseModel, Field -from typing import Dict, Optional, Union +from pydantic import BaseModel, Field, computed_field +from typing import Dict, Optional, Union, Any import subprocess import requests from enum import Enum +from .. import webutils as wu +from .. 
import locations + class BaseConfig(BaseModel): """Base class for all configuration classes""" @@ -15,6 +18,29 @@ def create(cls, config=None): return cls(**config) return config or cls() +class SearchParams(BaseConfig): + qry: str = '' + num_results: Optional[int] = None + lang: Optional[str] = None + loc: Optional[str] = None + base_url: str = "https://www.google.com/search" + + @computed_field + def url_params(self) -> Dict[str, Any]: + params = {'q': wu.encode_param_value(self.qry)} + opt_params = { + 'num': self.num_results, + 'hl': self.lang, + 'uule': locations.convert_canonical_name_to_uule(self.loc) if self.loc else None, + } + opt_params = {k: v for k, v in opt_params.items() if v and v not in {'None', 'nan'}} + params.update(opt_params) + return params + + @computed_field + def url(self) -> str: + """Return the fully formed URL with parameters.""" + return f"{self.base_url}?{wu.join_url_quote(self.url_params)}" class LogConfig(BaseConfig): fp: str = '' diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 71b2b8c..f16331f 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -1,9 +1,8 @@ from . import parsers -from . import locations from . import webutils as wu from . import utils from . import logger -from .models.configs import LogConfig, SeleniumConfig, RequestsConfig, SearchConfig, SearchMethod +from .models.configs import LogConfig, SeleniumConfig, RequestsConfig, SearchConfig, SearchMethod, SearchParams from .models.data import BaseSERP import os @@ -12,7 +11,7 @@ import brotli import requests import pandas as pd -from typing import Any, Dict, Optional, Union +from typing import Dict, Optional, Union from datetime import datetime, timezone # selenium updates @@ -54,22 +53,16 @@ def __init__(self, }) # Initialize searcher - self.base_url: str = 'https://www.google.com/search' - self.params: Dict[str, Any] = {} if self.config.method == SearchMethod.REQUESTS: self.headers = headers or self.config.requests.headers self.sesh = self.config.requests.sesh or wu.start_sesh(headers=self.headers) elif self.config.method == SearchMethod.SELENIUM: self.driver = None - else: - self.config.requests.sesh = self.config.requests.sesh or wu.start_sesh(headers=self.config.requests.headers) + + self.search_params = SearchParams.create() + # Initialize search details - self.qry: str = None - self.loc: str = None - self.lang: str = None - self.num_results = None - self.url: str = None self.timestamp: str = None self.serp_id: str = None self.crawl_id: str = None @@ -113,21 +106,13 @@ def search(self, self._prepare_search(qry=qry, location=location, lang=lang, num_results=num_results) self._conduct_search(serp_id=serp_id, crawl_id=crawl_id, ai_expand=ai_expand) - def _prepare_search(self, qry: str, location: str = None, lang: str = None, num_results: int = None): - """Prepare a search URL and metadata for the given query and location""" - self.qry = str(qry) - self.loc = str(location) if not pd.isnull(location) else '' - self.lang = str(lang) if not pd.isnull(lang) else '' - self.num_results = num_results - self.params = {} - self.params['q'] = wu.encode_param_value(self.qry) - if self.num_results: - self.params['num'] = self.num_results - if self.lang and self.lang not in {'None', 'nan'}: - self.params['hl'] = self.lang - if self.loc and self.loc not in {'None', 'nan'}: - self.params['uule'] = locations.convert_canonical_name_to_uule(self.loc) - self.url = f"{self.base_url}?{wu.join_url_quote(self.params)}" + def _prepare_search(self, qry: str, location: str, 
lang: str, num_results: int): + self.search_params = SearchParams.create({ + 'qry': str(qry), + 'loc': str(location) if not pd.isnull(location) else '', + 'lang': str(lang) if not pd.isnull(lang) else '', + 'num_results': num_results, + }) def _conduct_search(self, serp_id:str = '', crawl_id:str = '', ai_expand:bool = False): if self.config.method == SearchMethod.SELENIUM: @@ -162,14 +147,14 @@ def _send_chromedriver_typed_query(self): time.sleep(2) search_box = self.driver.find_element(By.ID, "APjFqb") search_box.clear() - search_box.send_keys(self.qry) + search_box.send_keys(self.search_params.qry) search_box.send_keys(Keys.RETURN) def _send_chromedriver_request(self): """Use a prepared URL to conduct a search""" time.sleep(2) - self.driver.get(self.url) + self.driver.get(self.search_params.url) time.sleep(2) # wait for the page to load @@ -179,19 +164,19 @@ def _send_chromedriver_request(self): time.sleep(2) #including a sleep to allow the page to fully load self.html = self.driver.page_source - self.url = self.driver.current_url + self.selenium_url = self.driver.current_url self.response_code = 0 - log_msg = f"{self.response_code} | {self.qry}" - log_msg = f"{log_msg} | {self.loc}" if self.loc else log_msg + log_msg = f"{self.response_code} | {self.search_params.qry}" + log_msg = f"{log_msg} | {self.search_params.loc}" if self.search_params.loc else log_msg self.log.info(log_msg) def _conduct_search_chromedriver(self, serp_id: str = '', crawl_id: str = '', ai_expand = False): """Send a search request and handle errors""" if not self.driver: self._init_chromedriver() - self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() - self.serp_id = serp_id if serp_id else utils.hash_id(self.qry + self.loc + self.timestamp) + str_to_hash = self.search_params.qry + self.search_params.loc + self.timestamp + self.serp_id = serp_id if serp_id else utils.hash_id(str_to_hash) self.crawl_id = crawl_id try: self._send_chromedriver_request() @@ -294,7 +279,8 @@ def _conduct_search_requests(self, serp_id: str = '', crawl_id: str = ''): """Send a search request and handle errors""" self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() - self.serp_id = serp_id if serp_id else utils.hash_id(self.qry + self.loc + self.timestamp) + str_to_hash = self.search_params.qry + self.search_params.loc + self.timestamp + self.serp_id = serp_id if serp_id else utils.hash_id(str_to_hash) self.crawl_id = crawl_id self.user_agent = self.headers['User-Agent'] @@ -311,10 +297,10 @@ def _conduct_search_requests(self, serp_id: str = '', crawl_id: str = ''): self._handle_response() def _send_request(self): - self.response = self.sesh.get(self.url, timeout=10) + self.response = self.sesh.get(self.search_params.url, timeout=10) self.response_code = self.response.status_code - log_msg = f"{self.response_code} | {self.qry}" - log_msg = f"{log_msg} | {self.loc}" if self.loc else log_msg + log_msg = f"{self.response_code} | {self.search_params.qry}" + log_msg = f"{log_msg} | {self.search_params.loc}" if self.search_params.loc else log_msg self.log.info(log_msg) def _reset_ssh_tunnel(self): @@ -383,10 +369,10 @@ def parse_serp_features(self): def prepare_serp_save(self): self.serp = BaseSERP( - qry=self.qry, - loc=self.loc, - lang=self.lang, - url=self.url, + qry=self.search_params.qry, + loc=self.search_params.loc, + lang=self.search_params.lang, + url=self.search_params.url, html=self.html, response_code=self.response_code, user_agent=self.user_agent, From 
083282fb71c386fd1942be19c1f8e6d571cc43b4 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 27 Mar 2025 08:49:26 -0700 Subject: [PATCH 039/101] update: move selenium to new searchers dir --- WebSearcher/searchers.py | 224 +++++++-------------- WebSearcher/searchers/__init__.py | 0 WebSearcher/searchers/selenium_searcher.py | 176 ++++++++++++++++ 3 files changed, 247 insertions(+), 153 deletions(-) create mode 100644 WebSearcher/searchers/__init__.py create mode 100644 WebSearcher/searchers/selenium_searcher.py diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index f16331f..70e954f 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -2,6 +2,7 @@ from . import webutils as wu from . import utils from . import logger +from .searchers.selenium_searcher import SeleniumDriver from .models.configs import LogConfig, SeleniumConfig, RequestsConfig, SearchConfig, SearchMethod, SearchParams from .models.data import BaseSERP @@ -14,13 +15,7 @@ from typing import Dict, Optional, Union from datetime import datetime, timezone -# selenium updates -import undetected_chromedriver as uc -from selenium.webdriver.common.by import By -from selenium.webdriver.common.keys import Keys -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import NoSuchElementException +# selenium imports no longer needed here as they're in selenium_utils.py from importlib import metadata WS_VERSION = metadata.version('WebSearcher') @@ -52,15 +47,43 @@ def __init__(self, "requests": RequestsConfig.create(requests_config), }) + # Set a log file, prints to console by default + self.log = logger.Logger( + console=True if not self.config.log.fp else False, + console_level=self.config.log.level, + file_name=self.config.log.fp, + file_mode=self.config.log.mode, + file_level=self.config.log.level, + ).start(__name__) + # Initialize searcher if self.config.method == SearchMethod.REQUESTS: self.headers = headers or self.config.requests.headers self.sesh = self.config.requests.sesh or wu.start_sesh(headers=self.headers) elif self.config.method == SearchMethod.SELENIUM: - self.driver = None + self.selenium_driver = SeleniumDriver(config=self.config.selenium, logger=self.log) + self.selenium_driver.driver = None self.search_params = SearchParams.create() + # Initialize search details + self.serp = { + 'version': self.version, + 'method': self.config.method.value, + 'crawl_id': None, + 'serp_id': None, + 'qry': None, + 'loc': None, + 'lang': None, + 'url': None, + 'response_code': None, + 'user_agent': None, + 'timestamp': None, + 'serp_id': None, + 'html': None, + 'results': [], + 'features': {}, + } # Initialize search details self.timestamp: str = None @@ -74,15 +97,6 @@ def __init__(self, self.serp_features: dict = {} self.serp: dict = {} - # Set a log file, prints to console by default - self.log = logger.Logger( - console=True if not self.config.log.fp else False, - console_level=self.config.log.level, - file_name=self.config.log.fp, - file_mode=self.config.log.mode, - file_level=self.config.log.level, - ).start(__name__) - def search(self, qry: str, location: str = None, @@ -123,154 +137,38 @@ def _conduct_search(self, serp_id:str = '', crawl_id:str = '', ai_expand:bool = # ========================================================================== # Selenium method - def _init_chromedriver(self) -> None: - """Initialize Chrome driver with selenium-specific config""" - self.log.debug(f'SERP | init uc 
chromedriver | kwargs: {self.config.selenium.__dict__}') - self.driver = uc.Chrome(**self.config.selenium.__dict__) - self.user_agent = self.driver.execute_script('return navigator.userAgent') - self.response_code = None - - # Log version information - self.browser_info = { - 'browser_id': "", - 'browser_name': self.driver.capabilities['browserName'], - 'browser_version': self.driver.capabilities['browserVersion'], - 'driver_version': self.driver.capabilities['chrome']['chromedriverVersion'].split(' ')[0], - } - self.browser_info['browser_id'] = utils.hash_id(json.dumps(self.browser_info)) - self.log.debug(json.dumps(self.browser_info, indent=4)) - - def _send_chromedriver_typed_query(self): - """Send a typed query to the search box""" - time.sleep(2) - self.driver.get('https://www.google.com') - time.sleep(2) - search_box = self.driver.find_element(By.ID, "APjFqb") - search_box.clear() - search_box.send_keys(self.search_params.qry) - search_box.send_keys(Keys.RETURN) - - def _send_chromedriver_request(self): - """Use a prepared URL to conduct a search""" - - time.sleep(2) - self.driver.get(self.search_params.url) - time.sleep(2) - - # wait for the page to load - WebDriverWait(self.driver, 10).until( - EC.presence_of_element_located((By.ID, "search")) - ) - time.sleep(2) #including a sleep to allow the page to fully load - - self.html = self.driver.page_source - self.selenium_url = self.driver.current_url - self.response_code = 0 - log_msg = f"{self.response_code} | {self.search_params.qry}" - log_msg = f"{log_msg} | {self.search_params.loc}" if self.search_params.loc else log_msg - self.log.info(log_msg) - def _conduct_search_chromedriver(self, serp_id: str = '', crawl_id: str = '', ai_expand = False): """Send a search request and handle errors""" - if not self.driver: - self._init_chromedriver() + if not self.selenium_driver.driver: + self.selenium_driver.init_driver() + + self.crawl_id = crawl_id self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() str_to_hash = self.search_params.qry + self.search_params.loc + self.timestamp self.serp_id = serp_id if serp_id else utils.hash_id(str_to_hash) - self.crawl_id = crawl_id + try: - self._send_chromedriver_request() - self.html = self.driver.page_source + response_data = self.selenium_driver.send_request(self.search_params.url) + self.html = response_data['html'] + self.selenium_url = response_data['url'] + self.response_code = response_data['response_code'] + self.user_agent = self.selenium_driver.user_agent + + log_msg = f"{self.response_code} | {self.search_params.qry}" + log_msg = f"{log_msg} | {self.search_params.loc}" if self.search_params.loc else log_msg + self.log.info(log_msg) + except Exception as e: self.log.exception(f'SERP | Chromedriver error | {self.serp_id}: {str(e)}') if ai_expand: - self._expand_ai_overview() + expanded_html = self.selenium_driver.expand_ai_overview() + if expanded_html: + self.log.debug(f'SERP | overwriting expanded content | len diff: {len(expanded_html) - len(self.html)}') + self.html = expanded_html # Only delete cookies, don't close the driver here - # The driver will be closed when cleanup() is called - if self.driver: - try: - self.driver.delete_all_cookies() - except Exception as e: - self.log.warning(f"Failed to delete cookies: {str(e)}") - - def _expand_ai_overview(self): - """Expand AI overview box by clicking it""" - show_more_button_xpath = "//div[@jsname='rPRdsc' and @role='button']" - show_all_button_xpath = '//div[contains(@class, "trEk7e") and @role="button"]' - - 
try: - self.driver.find_element(By.XPATH, show_more_button_xpath) - show_more_button_exists = True - except NoSuchElementException: - show_more_button_exists = False - - if show_more_button_exists: - try: - show_more_button = WebDriverWait(self.driver, 1).until( - EC.element_to_be_clickable((By.XPATH, show_more_button_xpath)) - ) - if show_more_button is not None: - show_more_button.click() - try: - time.sleep(2) # Wait for additional content to load - show_all_button = WebDriverWait(self.driver, 1).until( - EC.element_to_be_clickable((By.XPATH, show_all_button_xpath)) - ) - show_all_button.click() - except Exception: - pass - - # Overwrite html with expanded content - new_html = self.driver.page_source - self.log.debug(f'SERP | overwriting expanded content | len diff: {len(new_html) - len(self.html)}') - self.html = new_html - - except Exception: - pass - - def cleanup(self): - """Clean up resources, particularly Selenium's browser instance - - Returns: - bool: True if cleanup was successful or not needed, False if cleanup failed - """ - if self.config.method == SearchMethod.SELENIUM and hasattr(self, 'driver') and self.driver: - try: - # Try a more thorough cleanup - try: - self.driver.delete_all_cookies() - except Exception: - pass - - try: - # Close all tabs/windows - original_handle = self.driver.current_window_handle - for handle in self.driver.window_handles: - self.driver.switch_to.window(handle) - self.driver.close() - except Exception: - pass - - # Finally quit the driver - self.driver.quit() - self.driver = None - self.log.debug(f'Browser successfully closed') - return True - except Exception as e: - self.log.warning(f'Failed to close browser: {e}') - # Force driver to be None so we create a fresh instance next time - self.driver = None - return False - return True - - def __del__(self): - """Destructor to ensure browser is closed when object is garbage collected""" - try: - self.cleanup() - except Exception: - pass + self.selenium_driver.delete_cookies() # ========================================================================== # Requests method @@ -441,3 +339,23 @@ def save_results(self, save_dir: str = "", append_to: str = ""): else: self.log.info(f'No parsed results for serp_id: {self.serp_id}') + def cleanup(self): + """Clean up resources, particularly Selenium's browser instance + + Returns: + bool: True if cleanup was successful or not needed, False if cleanup failed + """ + if self.config.method == SearchMethod.SELENIUM and hasattr(self, 'selenium_driver'): + result = self.selenium_driver.cleanup() + if result: + self.selenium_driver.driver = None # Update the reference + return result + return True + + def __del__(self): + """Destructor to ensure browser is closed when object is garbage collected""" + try: + self.cleanup() + except Exception: + pass + diff --git a/WebSearcher/searchers/__init__.py b/WebSearcher/searchers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/WebSearcher/searchers/selenium_searcher.py b/WebSearcher/searchers/selenium_searcher.py new file mode 100644 index 0000000..3d5f5d6 --- /dev/null +++ b/WebSearcher/searchers/selenium_searcher.py @@ -0,0 +1,176 @@ +import time +import json +from typing import Dict, Optional, Any + +import undetected_chromedriver as uc +from selenium.webdriver.common.by import By +from selenium.webdriver.common.keys import Keys +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.common.exceptions import 
NoSuchElementException + +from .. import utils +from ..models.configs import SeleniumConfig + + +class SeleniumDriver: + """Handle Selenium-based web interactions for search engines""" + + def __init__(self, config: SeleniumConfig, logger): + """Initialize a Selenium driver with the given configuration + + Args: + config (SeleniumConfig): Configuration for Selenium + logger: Logger instance + """ + self.config = config + self.log = logger + self.driver = None + self.user_agent = None + self.response_code = None + self.browser_info = {} + + def init_driver(self) -> None: + """Initialize Chrome driver with selenium-specific config""" + self.log.debug(f'SERP | init uc chromedriver | kwargs: {self.config.__dict__}') + self.driver = uc.Chrome(**self.config.__dict__) + self.user_agent = self.driver.execute_script('return navigator.userAgent') + self.response_code = None + + # Log version information + self.browser_info = { + 'browser_id': "", + 'browser_name': self.driver.capabilities['browserName'], + 'browser_version': self.driver.capabilities['browserVersion'], + 'driver_version': self.driver.capabilities['chrome']['chromedriverVersion'].split(' ')[0], + } + self.browser_info['browser_id'] = utils.hash_id(json.dumps(self.browser_info)) + self.log.debug(json.dumps(self.browser_info, indent=4)) + + def send_typed_query(self, query: str): + """Send a typed query to the search box""" + time.sleep(2) + self.driver.get('https://www.google.com') + time.sleep(2) + search_box = self.driver.find_element(By.ID, "APjFqb") + search_box.clear() + search_box.send_keys(query) + search_box.send_keys(Keys.RETURN) + + def send_request(self, url: str) -> Dict[str, Any]: + """Use a prepared URL to conduct a search + + Args: + url (str): The URL to request + + Returns: + Dict[str, Any]: Dictionary containing response data + """ + time.sleep(2) + self.driver.get(url) + time.sleep(2) + + # wait for the page to load + WebDriverWait(self.driver, 10).until( + EC.presence_of_element_located((By.ID, "search")) + ) + time.sleep(2) #including a sleep to allow the page to fully load + + html = self.driver.page_source + selenium_url = self.driver.current_url + self.response_code = 0 + + return { + 'html': html, + 'url': selenium_url, + 'response_code': self.response_code, + } + + def expand_ai_overview(self): + """Expand AI overview box by clicking it + + Returns: + str: Updated HTML if expansion occurred, None otherwise + """ + show_more_button_xpath = "//div[@jsname='rPRdsc' and @role='button']" + show_all_button_xpath = '//div[contains(@class, "trEk7e") and @role="button"]' + + try: + self.driver.find_element(By.XPATH, show_more_button_xpath) + show_more_button_exists = True + except NoSuchElementException: + show_more_button_exists = False + + if show_more_button_exists: + try: + show_more_button = WebDriverWait(self.driver, 1).until( + EC.element_to_be_clickable((By.XPATH, show_more_button_xpath)) + ) + if show_more_button is not None: + show_more_button.click() + try: + time.sleep(2) # Wait for additional content to load + show_all_button = WebDriverWait(self.driver, 1).until( + EC.element_to_be_clickable((By.XPATH, show_all_button_xpath)) + ) + show_all_button.click() + except Exception: + pass + + # Return expanded content + return self.driver.page_source + + except Exception: + pass + + return None + + def cleanup(self) -> bool: + """Clean up resources, particularly Selenium's browser instance + + Returns: + bool: True if cleanup was successful or not needed, False if cleanup failed + """ + if self.driver: + try: + 
# Try a more thorough cleanup + try: + self.driver.delete_all_cookies() + except Exception: + pass + + try: + # Close all tabs/windows + original_handle = self.driver.current_window_handle + for handle in self.driver.window_handles: + self.driver.switch_to.window(handle) + self.driver.close() + except Exception: + pass + + # Finally quit the driver + self.driver.quit() + self.driver = None + self.log.debug(f'Browser successfully closed') + return True + except Exception as e: + self.log.warning(f'Failed to close browser: {e}') + # Force driver to be None so we create a fresh instance next time + self.driver = None + return False + return True + + def delete_cookies(self): + """Delete all cookies from the browser""" + if self.driver: + try: + self.driver.delete_all_cookies() + except Exception as e: + self.log.warning(f"Failed to delete cookies: {str(e)}") + + def __del__(self): + """Destructor to ensure browser is closed when object is garbage collected""" + try: + self.cleanup() + except Exception: + pass From 7ae02c6bfe71e54a9c1ab6d75c96a6e9d4b04665 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 27 Mar 2025 08:49:33 -0700 Subject: [PATCH 040/101] update: model docs --- WebSearcher/models/configs.py | 29 ++++++++++++++---- WebSearcher/models/data.py | 56 +++++++++++++++++++++-------------- 2 files changed, 57 insertions(+), 28 deletions(-) diff --git a/WebSearcher/models/configs.py b/WebSearcher/models/configs.py index c784141..0f48176 100644 --- a/WebSearcher/models/configs.py +++ b/WebSearcher/models/configs.py @@ -19,12 +19,24 @@ def create(cls, config=None): return config or cls() class SearchParams(BaseConfig): - qry: str = '' - num_results: Optional[int] = None - lang: Optional[str] = None - loc: Optional[str] = None - base_url: str = "https://www.google.com/search" + """ + Contains parameters for a search request and utility methods for URL generation. + This class stores search query parameters and provides methods to convert + them into properly formatted URL parameters and complete search URLs. + """ + qry: str = Field('', description="The search query text") + num_results: Optional[int] = Field(None, description="Number of results to return") + lang: Optional[str] = Field(None, description="Language code (e.g., 'en')") + loc: Optional[str] = Field(None, description="Location in Canonical Name format") + base_url: str = Field("https://www.google.com/search", description="Base search engine URL") + + """ + Generates a dictionary of URL parameters based on the search parameters. + + Converts the search parameters to a dictionary format suitable for URL encoding, + removing any None values and handling special parameters like location. + """ @computed_field def url_params(self) -> Dict[str, Any]: params = {'q': wu.encode_param_value(self.qry)} @@ -37,9 +49,14 @@ def url_params(self) -> Dict[str, Any]: params.update(opt_params) return params + """ + Returns the fully formed search URL with all parameters. + + Combines the base URL with the encoded parameters to create a complete, + properly escaped search URL. 
+ """ @computed_field def url(self) -> str: - """Return the fully formed URL with parameters.""" return f"{self.base_url}?{wu.join_url_quote(self.url_params)}" class LogConfig(BaseConfig): diff --git a/WebSearcher/models/data.py b/WebSearcher/models/data.py index 4143f8d..b141668 100644 --- a/WebSearcher/models/data.py +++ b/WebSearcher/models/data.py @@ -1,29 +1,41 @@ -from pydantic import BaseModel +from pydantic import BaseModel, Field from typing import Any, Optional class BaseResult(BaseModel): - sub_rank: int = 0 - type: str = 'unclassified' - sub_type: Optional[str] = None - title: Optional[str] = None - url: Optional[str] = None - text: Optional[str] = None - cite: Optional[str] = None - details: Optional[Any] = None - error: Optional[str] = None + """ + Represents a single search result item extracted from a SERP. + + Contains the structured data of one search result including its rank, + type, title, URL, and other metadata. + """ + sub_rank: int = Field(0, description="Position within a results component") + type: str = Field('unclassified', description="Result type (general, ad, etc.)") + sub_type: Optional[str] = Field(None, description="Result sub-type (e.g., header, item)") + title: Optional[str] = Field(None, description="Title of the search result") + url: Optional[str] = Field(None, description="URL of the search result") + text: Optional[str] = Field(None, description="Snippet text from the search result") + cite: Optional[str] = Field(None, description="Citation or source information") + details: Optional[Any] = Field(None, description="Additional structured details specific to result type") + error: Optional[str] = Field(None, description="Error message if result parsing failed") class BaseSERP(BaseModel): - qry: str # Search query - loc: Optional[str] = None # Location if set, "Canonical Name" - lang: Optional[str] = None # Language if set - url: str # URL of SERP - html: str # Raw HTML of SERP - timestamp: str # Timestamp of crawl - response_code: int # HTTP response code - user_agent: str # User agent used for the crawl - serp_id: str # Search Engine Results Page (SERP) ID - crawl_id: str # Crawl ID for grouping SERPs - version: str # WebSearcher version - method: str # Search method used + """ + Represents a complete Search Engine Results Page (SERP). + + Contains all data related to a single search query including the query itself, + raw HTML response, metadata about the request, and identifiers for tracking. 
+ """ + qry: str = Field(..., description="Search query") + loc: Optional[str] = Field(None, description="Location if set, in Canonical Name format") + lang: Optional[str] = Field(None, description="Language code if set") + url: str = Field(..., description="URL of the SERP") + html: str = Field(..., description="Raw HTML of the SERP") + timestamp: str = Field(..., description="ISO format timestamp of the crawl") + response_code: int = Field(..., description="HTTP response code") + user_agent: str = Field(..., description="User agent used for the request") + serp_id: str = Field(..., description="Unique identifier for this SERP") + crawl_id: str = Field(..., description="Identifier for grouping related SERPs") + version: str = Field(..., description="WebSearcher version used") + method: str = Field(..., description="Search method used (selenium/requests)") From b725ace767dde113b8dae32cc52346c23f56b6ee Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 27 Mar 2025 10:27:04 -0700 Subject: [PATCH 041/101] add: searches directory for diff methods --- WebSearcher/models/configs.py | 55 +------- WebSearcher/models/searches.py | 76 +++++++++++ .../{searchers => search_methods}/__init__.py | 0 .../selenium_searcher.py | 64 ++++----- WebSearcher/searchers.py | 127 +++++++----------- 5 files changed, 153 insertions(+), 169 deletions(-) create mode 100644 WebSearcher/models/searches.py rename WebSearcher/{searchers => search_methods}/__init__.py (100%) rename WebSearcher/{searchers => search_methods}/selenium_searcher.py (82%) diff --git a/WebSearcher/models/configs.py b/WebSearcher/models/configs.py index 0f48176..c64ee30 100644 --- a/WebSearcher/models/configs.py +++ b/WebSearcher/models/configs.py @@ -1,12 +1,8 @@ -from pydantic import BaseModel, Field, computed_field -from typing import Dict, Optional, Union, Any -import subprocess import requests +import subprocess from enum import Enum - -from .. import webutils as wu -from .. import locations - +from typing import Dict, Optional, Union +from pydantic import BaseModel, Field class BaseConfig(BaseModel): """Base class for all configuration classes""" @@ -18,60 +14,17 @@ def create(cls, config=None): return cls(**config) return config or cls() -class SearchParams(BaseConfig): - """ - Contains parameters for a search request and utility methods for URL generation. - - This class stores search query parameters and provides methods to convert - them into properly formatted URL parameters and complete search URLs. - """ - qry: str = Field('', description="The search query text") - num_results: Optional[int] = Field(None, description="Number of results to return") - lang: Optional[str] = Field(None, description="Language code (e.g., 'en')") - loc: Optional[str] = Field(None, description="Location in Canonical Name format") - base_url: str = Field("https://www.google.com/search", description="Base search engine URL") - - """ - Generates a dictionary of URL parameters based on the search parameters. - - Converts the search parameters to a dictionary format suitable for URL encoding, - removing any None values and handling special parameters like location. 
- """ - @computed_field - def url_params(self) -> Dict[str, Any]: - params = {'q': wu.encode_param_value(self.qry)} - opt_params = { - 'num': self.num_results, - 'hl': self.lang, - 'uule': locations.convert_canonical_name_to_uule(self.loc) if self.loc else None, - } - opt_params = {k: v for k, v in opt_params.items() if v and v not in {'None', 'nan'}} - params.update(opt_params) - return params - - """ - Returns the fully formed search URL with all parameters. - - Combines the base URL with the encoded parameters to create a complete, - properly escaped search URL. - """ - @computed_field - def url(self) -> str: - return f"{self.base_url}?{wu.join_url_quote(self.url_params)}" - class LogConfig(BaseConfig): fp: str = '' mode: str = 'a' level: str = 'INFO' - class SeleniumConfig(BaseConfig): headless: bool = False version_main: int = 133 use_subprocess: bool = False driver_executable_path: str = "" - class RequestsConfig(BaseConfig): model_config = {"arbitrary_types_allowed": True} headers: Dict[str, str] = Field(default_factory=lambda: { @@ -86,7 +39,6 @@ class RequestsConfig(BaseConfig): ssh_tunnel: Optional[subprocess.Popen] = None unzip: bool = True - class SearchMethod(Enum): REQUESTS = "requests" SELENIUM = "selenium" @@ -106,7 +58,6 @@ def create(cls, method=None): raise ValueError(f"Invalid search method: {method}. Valid values are: {valid_values}") raise TypeError(f"Expected string or SearchMethod, got {type(method)}") - class SearchConfig(BaseConfig): method: Union[str, SearchMethod] = SearchMethod.SELENIUM log: LogConfig = Field(default_factory=LogConfig) diff --git a/WebSearcher/models/searches.py b/WebSearcher/models/searches.py new file mode 100644 index 0000000..3570823 --- /dev/null +++ b/WebSearcher/models/searches.py @@ -0,0 +1,76 @@ +from pydantic import Field, computed_field +from typing import Dict, Optional, Any, List +from datetime import datetime, timezone + +from ..utils import hash_id +from ..import webutils as wu +from ..import locations +from .configs import BaseConfig + + +class SearchParams(BaseConfig): + """Contains parameters for a search request and utility methods for URL generation""" + qry: str = Field('', description="The search query text") + num_results: Optional[int] = Field(None, description="Number of results to return") + lang: Optional[str] = Field(None, description="Language code (e.g., 'en')") + loc: Optional[str] = Field(None, description="Location in Canonical Name format") + base_url: str = Field("https://www.google.com/search", description="Base search engine URL") + + @computed_field + def url_params(self) -> Dict[str, Any]: + """Generates a dictionary of URL parameters based on the search parameters""" + params = {'q': wu.encode_param_value(self.qry)} + opt_params = { + 'num': self.num_results, + 'hl': self.lang, + 'uule': locations.convert_canonical_name_to_uule(self.loc) if self.loc else None, + } + opt_params = {k: v for k, v in opt_params.items() if v and v not in {'None', 'nan'}} + params.update(opt_params) + return params + + @computed_field + def url(self) -> str: + """Returns the fully formed search URL with all parameters""" + return f"{self.base_url}?{wu.join_url_quote(self.url_params)}" + + @computed_field + def serp_id(self) -> str: + """Computes a unique SERP ID based on query, location, and timestamp""" + timestamp = datetime.now().isoformat() + return hash_id(f"{self.qry}{self.loc}{timestamp}") + + def to_dict_output(self) -> Dict[str, Any]: + """Outputs the variables needed for SERPDetails as a dictionary""" + timestamp = 
datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + return { + "qry": self.qry, + "loc": self.loc, + "lang": self.lang, + "url": self.url, + "serp_id": hash_id(f"{self.qry}{self.loc}{timestamp}"), + "timestamp": timestamp, + } + + +class SERPDetails(BaseConfig): + """ + Contains details about a Search Engine Results Page (SERP). + + This class stores all the information related to a SERP, including + search parameters, response data, parsed results and features. + """ + version: str = Field(None, description="WebSearcher version") + method: str = Field(None, description="Search method used (requests or selenium)") + crawl_id: Optional[str] = Field(None, description="ID for the crawl session") + serp_id: Optional[str] = Field(None, description="Unique ID for this SERP") + qry: Optional[str] = Field(None, description="Search query") + loc: Optional[str] = Field(None, description="Location used for search") + lang: Optional[str] = Field(None, description="Language used for search") + url: Optional[str] = Field(None, description="Full search URL") + response_code: Optional[int] = Field(None, description="HTTP response code") + user_agent: Optional[str] = Field(None, description="User agent used for request") + timestamp: Optional[str] = Field(None, description="ISO timestamp of search") + html: Optional[str] = Field(None, description="Raw HTML response") + results: List[Dict[str, Any]] = Field(default_factory=list, description="Parsed search results") + features: Dict[str, Any] = Field(default_factory=dict, description="Extracted SERP features") diff --git a/WebSearcher/searchers/__init__.py b/WebSearcher/search_methods/__init__.py similarity index 100% rename from WebSearcher/searchers/__init__.py rename to WebSearcher/search_methods/__init__.py diff --git a/WebSearcher/searchers/selenium_searcher.py b/WebSearcher/search_methods/selenium_searcher.py similarity index 82% rename from WebSearcher/searchers/selenium_searcher.py rename to WebSearcher/search_methods/selenium_searcher.py index 3d5f5d6..3529c6f 100644 --- a/WebSearcher/searchers/selenium_searcher.py +++ b/WebSearcher/search_methods/selenium_searcher.py @@ -12,7 +12,6 @@ from .. 
import utils from ..models.configs import SeleniumConfig - class SeleniumDriver: """Handle Selenium-based web interactions for search engines""" @@ -26,16 +25,12 @@ def __init__(self, config: SeleniumConfig, logger): self.config = config self.log = logger self.driver = None - self.user_agent = None - self.response_code = None self.browser_info = {} def init_driver(self) -> None: """Initialize Chrome driver with selenium-specific config""" self.log.debug(f'SERP | init uc chromedriver | kwargs: {self.config.__dict__}') self.driver = uc.Chrome(**self.config.__dict__) - self.user_agent = self.driver.execute_script('return navigator.userAgent') - self.response_code = None # Log version information self.browser_info = { @@ -43,6 +38,7 @@ def init_driver(self) -> None: 'browser_name': self.driver.capabilities['browserName'], 'browser_version': self.driver.capabilities['browserVersion'], 'driver_version': self.driver.capabilities['chrome']['chromedriverVersion'].split(' ')[0], + 'user_agent': self.driver.execute_script('return navigator.userAgent'), } self.browser_info['browser_id'] = utils.hash_id(json.dumps(self.browser_info)) self.log.debug(json.dumps(self.browser_info, indent=4)) @@ -58,40 +54,34 @@ def send_typed_query(self, query: str): search_box.send_keys(Keys.RETURN) def send_request(self, url: str) -> Dict[str, Any]: - """Use a prepared URL to conduct a search - - Args: - url (str): The URL to request - - Returns: - Dict[str, Any]: Dictionary containing response data - """ - time.sleep(2) - self.driver.get(url) - time.sleep(2) - - # wait for the page to load - WebDriverWait(self.driver, 10).until( - EC.presence_of_element_located((By.ID, "search")) - ) - time.sleep(2) #including a sleep to allow the page to fully load + """Visit a URL with selenium and save HTML response""" + + try: + self.driver.get(url) + time.sleep(2) + WebDriverWait(self.driver, 10).until( + EC.presence_of_element_located((By.ID, "search")) + ) + time.sleep(2) + response_output = { + 'html': self.driver.page_source, + 'url': self.driver.current_url, + 'user_agent': self.browser_info['user_agent'], + 'response_code': 200, + } + except Exception as e: + self.log.exception(f'SERP | Chromedriver error | {str(e)}') + response_output = { + 'html': '', + 'url': '', + 'user_agent': self.browser_info['user_agent'], + 'response_code': 0, + } + finally: + return response_output - html = self.driver.page_source - selenium_url = self.driver.current_url - self.response_code = 0 - - return { - 'html': html, - 'url': selenium_url, - 'response_code': self.response_code, - } - def expand_ai_overview(self): - """Expand AI overview box by clicking it - - Returns: - str: Updated HTML if expansion occurred, None otherwise - """ + """Expand AI overview box by clicking it""" show_more_button_xpath = "//div[@jsname='rPRdsc' and @role='button']" show_all_button_xpath = '//div[contains(@class, "trEk7e") and @role="button"]' diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 70e954f..bce7a45 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -2,21 +2,19 @@ from . import webutils as wu from . import utils from . 
import logger -from .searchers.selenium_searcher import SeleniumDriver -from .models.configs import LogConfig, SeleniumConfig, RequestsConfig, SearchConfig, SearchMethod, SearchParams +from .search_methods.selenium_searcher import SeleniumDriver +from .models.configs import LogConfig, SeleniumConfig, RequestsConfig, SearchConfig, SearchMethod +from .models.searches import SearchParams, SERPDetails from .models.data import BaseSERP import os import time -import json import brotli import requests import pandas as pd from typing import Dict, Optional, Union from datetime import datetime, timezone -# selenium imports no longer needed here as they're in selenium_utils.py - from importlib import metadata WS_VERSION = metadata.version('WebSearcher') @@ -40,6 +38,7 @@ def __init__(self, # Initialize configuration self.version = WS_VERSION + self.method = method.value if isinstance(method, SearchMethod) else method self.config = SearchConfig.create({ "method": SearchMethod.create(method), "log": LogConfig.create(log_config), @@ -64,38 +63,18 @@ def __init__(self, self.selenium_driver = SeleniumDriver(config=self.config.selenium, logger=self.log) self.selenium_driver.driver = None + # Initialize search params and output self.search_params = SearchParams.create() - - # Initialize search details - self.serp = { - 'version': self.version, - 'method': self.config.method.value, - 'crawl_id': None, - 'serp_id': None, - 'qry': None, - 'loc': None, - 'lang': None, - 'url': None, - 'response_code': None, - 'user_agent': None, - 'timestamp': None, - 'serp_id': None, - 'html': None, - 'results': [], - 'features': {}, - } - - # Initialize search details - self.timestamp: str = None - self.serp_id: str = None - self.crawl_id: str = None + self.serp_template = SERPDetails.create({'version': self.version, 'method': self.config.method.value}) # Initialize search outputs - self.response = None # type: Optional[requests.Response] - self.html: str = None + self._response = { + "url": None, + "response_code": None, + "html": None, + } self.results: list = [] self.serp_features: dict = {} - self.serp: dict = {} def search(self, qry: str, @@ -103,7 +82,6 @@ def search(self, lang: str = None, num_results: int = None, ai_expand: bool = False, - serp_id: str = '', crawl_id: str = '' ): """Conduct a search and save HTML @@ -118,7 +96,7 @@ def search(self, """ self._prepare_search(qry=qry, location=location, lang=lang, num_results=num_results) - self._conduct_search(serp_id=serp_id, crawl_id=crawl_id, ai_expand=ai_expand) + self._conduct_search(crawl_id=crawl_id, ai_expand=ai_expand) def _prepare_search(self, qry: str, location: str, lang: str, num_results: int): self.search_params = SearchParams.create({ @@ -128,44 +106,29 @@ def _prepare_search(self, qry: str, location: str, lang: str, num_results: int): 'num_results': num_results, }) - def _conduct_search(self, serp_id:str = '', crawl_id:str = '', ai_expand:bool = False): + def _conduct_search(self, crawl_id: str = '', ai_expand: bool = False): if self.config.method == SearchMethod.SELENIUM: - self._conduct_search_chromedriver(serp_id=serp_id, crawl_id=crawl_id, ai_expand=ai_expand) + self._conduct_search_chromedriver(crawl_id=crawl_id, ai_expand=ai_expand) elif self.config.method == SearchMethod.REQUESTS: - self._conduct_search_requests(serp_id=serp_id, crawl_id=crawl_id) + self._conduct_search_requests(crawl_id=crawl_id) # ========================================================================== # Selenium method - def _conduct_search_chromedriver(self, serp_id: str = 
'', crawl_id: str = '', ai_expand = False): + def _conduct_search_chromedriver(self, crawl_id: str = '', ai_expand = False): """Send a search request and handle errors""" if not self.selenium_driver.driver: self.selenium_driver.init_driver() - - self.crawl_id = crawl_id - self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() - str_to_hash = self.search_params.qry + self.search_params.loc + self.timestamp - self.serp_id = serp_id if serp_id else utils.hash_id(str_to_hash) - - try: - response_data = self.selenium_driver.send_request(self.search_params.url) - self.html = response_data['html'] - self.selenium_url = response_data['url'] - self.response_code = response_data['response_code'] - self.user_agent = self.selenium_driver.user_agent - - log_msg = f"{self.response_code} | {self.search_params.qry}" - log_msg = f"{log_msg} | {self.search_params.loc}" if self.search_params.loc else log_msg - self.log.info(log_msg) - - except Exception as e: - self.log.exception(f'SERP | Chromedriver error | {self.serp_id}: {str(e)}') + response_output = self.selenium_driver.send_request(self.search_params.url) + serp = self.search_params.to_dict_output() | response_output + self.serp = BaseSERP(version=self.version, method=self.method, crawl_id=crawl_id, **serp).model_dump() + self.log.info(" | ".join([f"{k}: {self.serp[k]}" for k in {'response_code','qry','loc'} if self.serp[k]])) if ai_expand: expanded_html = self.selenium_driver.expand_ai_overview() if expanded_html: - self.log.debug(f'SERP | overwriting expanded content | len diff: {len(expanded_html) - len(self.html)}') - self.html = expanded_html + self.log.debug(f"SERP | expanded html | len diff: {len(expanded_html) - len(self.serp['html'])}") + self.serp['html'] = expanded_html # Only delete cookies, don't close the driver here self.selenium_driver.delete_cookies() @@ -175,7 +138,7 @@ def _conduct_search_chromedriver(self, serp_id: str = '', crawl_id: str = '', ai def _conduct_search_requests(self, serp_id: str = '', crawl_id: str = ''): """Send a search request and handle errors""" - + self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() str_to_hash = self.search_params.qry + self.search_params.loc + self.timestamp self.serp_id = serp_id if serp_id else utils.hash_id(str_to_hash) @@ -239,26 +202,26 @@ def _unzip_html(self) -> None: def parse_all(self): """Parse results and extract SERP features in a single pass""" - assert self.html, "No HTML found" + assert self.serp['html'], "No HTML found" try: # Use the enhanced parse_serp function to get both results and features in one pass - self.results, self.serp_features = parsers.parse_serp(self.html, extract_features=True) + self.results, self.serp_features = parsers.parse_serp(self.serp['html'], extract_features=True) except Exception: self.log.exception(f'Combined parsing error | serp_id : {self.serp_id}') def parse_results(self): """Parse a SERP - see parsers.py""" - assert self.html, "No HTML found" + assert self.serp['html'], "No HTML found" try: - self.results = parsers.parse_serp(self.html) + self.results = parsers.parse_serp(self.serp['html']) except Exception: self.log.exception(f'Parsing error | serp_id : {self.serp_id}') def parse_serp_features(self): """Extract SERP features - see parsers.py""" - assert self.html, "No HTML found" + assert self.serp['html'], "No HTML found" try: - self.serp_features = parsers.FeatureExtractor.extract_features(self.html) + self.serp_features = parsers.FeatureExtractor.extract_features(self.serp['html']) except 
Exception: self.log.exception(f'Feature extraction error | serp_id : {self.serp_id}') @@ -267,16 +230,16 @@ def parse_serp_features(self): def prepare_serp_save(self): self.serp = BaseSERP( - qry=self.search_params.qry, - loc=self.search_params.loc, - lang=self.search_params.lang, - url=self.search_params.url, - html=self.html, - response_code=self.response_code, - user_agent=self.user_agent, - timestamp=self.timestamp, - serp_id=self.serp_id, - crawl_id=self.crawl_id, + qry=self.serp['qry'], + loc=self.serp['loc'], + lang=self.serp['lang'], + url=self.serp['url'], + html=self.serp['html'], + response_code=self.serp['response_code'], + user_agent=self.serp['user_agent'], + timestamp=self.serp['timestamp'], + serp_id=self.serp['serp_id'], + crawl_id=self.serp['crawl_id'], version=self.version, method=self.config.method.value ).model_dump() @@ -288,7 +251,7 @@ def save_serp(self, save_dir: str = "", append_to: str = ""): save_dir (str, optional): Save results as `save_dir/{serp_id}.html` append_to (str, optional): Append results to this file path """ - assert self.html, "No HTML found" + assert self.serp['html'], "No HTML found" assert save_dir or append_to, "Must provide a save_dir or append_to file path" if append_to: @@ -298,7 +261,7 @@ def save_serp(self, save_dir: str = "", append_to: str = ""): else: fp = os.path.join(save_dir, f'{self.serp_id}.html') with open(fp, 'w') as outfile: - outfile.write(self.html) + outfile.write(self.serp['html']) def save_search(self, append_to: str = ""): """Save search metadata (excludes HTML) to file @@ -306,7 +269,7 @@ def save_search(self, append_to: str = ""): Args: append_to (str, optional): Append results to this file path """ - assert self.html, "No HTML found" + assert self.serp['html'], "No HTML found" assert append_to, "Must provide an append_to file path" if not self.serp: @@ -330,7 +293,11 @@ def save_results(self, save_dir: str = "", append_to: str = ""): if self.results: if append_to: - result_metadata = {'crawl_id': self.crawl_id, 'serp_id': self.serp_id, 'version': self.version} + result_metadata = { + 'crawl_id': self.serp["crawl_id"], + 'serp_id': self.serp["serp_id"], + 'version': self.version + } results_output = [{**result, **result_metadata} for result in self.results] utils.write_lines(results_output, append_to) else: From 6d4642f187d6519334527814faf45895310248f4 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 27 Mar 2025 14:16:31 -0700 Subject: [PATCH 042/101] add: file for requests code, update outputs --- WebSearcher/models/data.py | 2 +- WebSearcher/models/searches.py | 25 +- WebSearcher/parsers.py | 10 +- .../search_methods/requests_searcher.py | 91 +++++++ WebSearcher/searchers.py | 230 ++++++------------ tests/selenium_test.py | 23 +- 6 files changed, 182 insertions(+), 199 deletions(-) create mode 100644 WebSearcher/search_methods/requests_searcher.py diff --git a/WebSearcher/models/data.py b/WebSearcher/models/data.py index b141668..45c4bef 100644 --- a/WebSearcher/models/data.py +++ b/WebSearcher/models/data.py @@ -1,5 +1,5 @@ from pydantic import BaseModel, Field -from typing import Any, Optional +from typing import Any, Optional, List, Dict class BaseResult(BaseModel): diff --git a/WebSearcher/models/searches.py b/WebSearcher/models/searches.py index 3570823..6884ec2 100644 --- a/WebSearcher/models/searches.py +++ b/WebSearcher/models/searches.py @@ -40,7 +40,7 @@ def serp_id(self) -> str: timestamp = datetime.now().isoformat() return hash_id(f"{self.qry}{self.loc}{timestamp}") - def to_dict_output(self) -> 
Dict[str, Any]: + def to_serp_output(self) -> Dict[str, Any]: """Outputs the variables needed for SERPDetails as a dictionary""" timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() return { @@ -51,26 +51,3 @@ def to_dict_output(self) -> Dict[str, Any]: "serp_id": hash_id(f"{self.qry}{self.loc}{timestamp}"), "timestamp": timestamp, } - - -class SERPDetails(BaseConfig): - """ - Contains details about a Search Engine Results Page (SERP). - - This class stores all the information related to a SERP, including - search parameters, response data, parsed results and features. - """ - version: str = Field(None, description="WebSearcher version") - method: str = Field(None, description="Search method used (requests or selenium)") - crawl_id: Optional[str] = Field(None, description="ID for the crawl session") - serp_id: Optional[str] = Field(None, description="Unique ID for this SERP") - qry: Optional[str] = Field(None, description="Search query") - loc: Optional[str] = Field(None, description="Location used for search") - lang: Optional[str] = Field(None, description="Language used for search") - url: Optional[str] = Field(None, description="Full search URL") - response_code: Optional[int] = Field(None, description="HTTP response code") - user_agent: Optional[str] = Field(None, description="User agent used for request") - timestamp: Optional[str] = Field(None, description="ISO timestamp of search") - html: Optional[str] = Field(None, description="Raw HTML response") - results: List[Dict[str, Any]] = Field(default_factory=list, description="Parsed search results") - features: Dict[str, Any] = Field(default_factory=dict, description="Extracted SERP features") diff --git a/WebSearcher/parsers.py b/WebSearcher/parsers.py index 692171b..7f5fb0f 100644 --- a/WebSearcher/parsers.py +++ b/WebSearcher/parsers.py @@ -32,14 +32,14 @@ def parse_serp( for cmpt in component_list: cmpt.classify_component() cmpt.parse_component() - results = component_list.export_component_results() if extract_features: - # Extract features from the same soup object to avoid parsing twice - features = FeatureExtractor.extract_features(soup) - return results, features - + return { + "features": FeatureExtractor.extract_features(soup), + "results": results + } + return results diff --git a/WebSearcher/search_methods/requests_searcher.py b/WebSearcher/search_methods/requests_searcher.py new file mode 100644 index 0000000..7b0ad62 --- /dev/null +++ b/WebSearcher/search_methods/requests_searcher.py @@ -0,0 +1,91 @@ +import time +import brotli +import requests +from typing import Dict, Optional, Any +from datetime import datetime, timezone + +from .. 
import utils + +class RequestsSearcher: + """Handle Requests-based web interactions for search engines""" + + def __init__(self, config, headers, logger): + """Initialize a Requests searcher with the given configuration + + Args: + config: RequestsConfig instance + headers: Dictionary of HTTP headers + logger: Logger instance + """ + self.config = config + self.headers = headers + self.log = logger + self.sesh = self.config.sesh or self._start_session() + + def _start_session(self): + """Start a new requests session with the configured headers""" + session = requests.Session() + session.headers.update(self.headers) + return session + + def send_request(self, search_params) -> Dict[str, Any]: + """Send a request and handle the response + + Args: + search_params: SearchParams instance + serp_id: Optional SERP ID + crawl_id: Optional crawl ID + + Returns: + Dictionary with response data + """ + + response_data = { + 'html': '', + 'url': search_params.url, + 'user_agent': self.headers.get('User-Agent'), + 'response_code': 0, + } + + try: + response = self.sesh.get(search_params.url, timeout=10) + response_data['html'] = self._handle_response_content(response) + response_data['response_code'] = response.status_code + except requests.exceptions.ConnectionError: + self.log.exception(f'Requests | Connection error') + self._reset_ssh_tunnel() + except requests.exceptions.Timeout: + self.log.exception(f'Requests | Timeout error') + except Exception: + self.log.exception(f'Requests | Unknown error') + finally: + return response_data + + def _handle_response_content(self, response): + try: + if self.config.unzip: + html = self._unzip_html(response.content) + else: + html = response.content + return html.decode('utf-8', 'ignore') + except Exception: + self.log.exception(f'Response handling error') + return response.content + + def _unzip_html(self, content) -> bytes: + """Unzip brotli zipped html""" + try: + return brotli.decompress(content) + except brotli.error: + return content + except Exception: + self.log.exception(f'unzip error') + return content + + def _reset_ssh_tunnel(self): + """Reset the SSH tunnel if configured""" + if self.config.ssh_tunnel: + self.config.ssh_tunnel.tunnel.kill() + self.config.ssh_tunnel.open_tunnel() + self.log.info(f'SERP | Restarted SSH tunnel') + time.sleep(10) # Allow time to establish connection diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index bce7a45..30ac74d 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -1,19 +1,15 @@ from . import parsers -from . import webutils as wu from . import utils from . 
import logger from .search_methods.selenium_searcher import SeleniumDriver +from .search_methods.requests_searcher import RequestsSearcher from .models.configs import LogConfig, SeleniumConfig, RequestsConfig, SearchConfig, SearchMethod -from .models.searches import SearchParams, SERPDetails +from .models.searches import SearchParams from .models.data import BaseSERP import os -import time -import brotli -import requests import pandas as pd -from typing import Dict, Optional, Union -from datetime import datetime, timezone +from typing import Dict, Union from importlib import metadata WS_VERSION = metadata.version('WebSearcher') @@ -58,23 +54,15 @@ def __init__(self, # Initialize searcher if self.config.method == SearchMethod.REQUESTS: self.headers = headers or self.config.requests.headers - self.sesh = self.config.requests.sesh or wu.start_sesh(headers=self.headers) + self.requests_searcher = RequestsSearcher(config=self.config.requests, headers=self.headers, logger=self.log) elif self.config.method == SearchMethod.SELENIUM: self.selenium_driver = SeleniumDriver(config=self.config.selenium, logger=self.log) self.selenium_driver.driver = None # Initialize search params and output self.search_params = SearchParams.create() - self.serp_template = SERPDetails.create({'version': self.version, 'method': self.config.method.value}) + self.parsed = {'results': [], 'features': {}} - # Initialize search outputs - self._response = { - "url": None, - "response_code": None, - "html": None, - } - self.results: list = [] - self.serp_features: dict = {} def search(self, qry: str, @@ -91,7 +79,6 @@ def search(self, location (str, optional): A location's Canonical Name num_results (int, optional): The number of results to return ai_expand: (bool, optional): Whether to use selenium to expand AI overviews - serp_id (str, optional): A unique identifier for this SERP crawl_id (str, optional): An identifier for this crawl """ @@ -119,131 +106,66 @@ def _conduct_search_chromedriver(self, crawl_id: str = '', ai_expand = False): """Send a search request and handle errors""" if not self.selenium_driver.driver: self.selenium_driver.init_driver() + + # Conduct search + serp_output = self.search_params.to_serp_output() response_output = self.selenium_driver.send_request(self.search_params.url) - serp = self.search_params.to_dict_output() | response_output - self.serp = BaseSERP(version=self.version, method=self.method, crawl_id=crawl_id, **serp).model_dump() + serp_output.update(response_output) + + # Store output + self.serp = BaseSERP( + version=self.version, + method=self.method, + crawl_id=crawl_id, + **serp_output + ).model_dump() self.log.info(" | ".join([f"{k}: {self.serp[k]}" for k in {'response_code','qry','loc'} if self.serp[k]])) + # Expand AI overview if ai_expand: expanded_html = self.selenium_driver.expand_ai_overview() if expanded_html: self.log.debug(f"SERP | expanded html | len diff: {len(expanded_html) - len(self.serp['html'])}") self.serp['html'] = expanded_html - - # Only delete cookies, don't close the driver here + + # Delete cookies self.selenium_driver.delete_cookies() # ========================================================================== # Requests method - def _conduct_search_requests(self, serp_id: str = '', crawl_id: str = ''): - """Send a search request and handle errors""" - - self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() - str_to_hash = self.search_params.qry + self.search_params.loc + self.timestamp - self.serp_id = serp_id if serp_id else 
utils.hash_id(str_to_hash) - self.crawl_id = crawl_id - self.user_agent = self.headers['User-Agent'] - - try: - self._send_request() - except requests.exceptions.ConnectionError: - self.log.exception(f'SERP | Connection error | {self.serp_id}') - self._reset_ssh_tunnel() - except requests.exceptions.Timeout: - self.log.exception(f'SERP | Timeout error | {self.serp_id}') - except Exception: - self.log.exception(f'SERP | Unknown error | {self.serp_id}') - finally: - self._handle_response() - - def _send_request(self): - self.response = self.sesh.get(self.search_params.url, timeout=10) - self.response_code = self.response.status_code - log_msg = f"{self.response_code} | {self.search_params.qry}" - log_msg = f"{log_msg} | {self.search_params.loc}" if self.search_params.loc else log_msg - self.log.info(log_msg) - - def _reset_ssh_tunnel(self): - if self.config.requests.ssh_tunnel: - self.config.requests.ssh_tunnel.tunnel.kill() - self.config.requests.ssh_tunnel.open_tunnel() - self.log.info(f'SERP | Restarted SSH tunnel | {self.serp_id}') - time.sleep(10) # Allow time to establish connection - - def _handle_response(self): - try: - if self.config.requests.unzip: - self._unzip_html() - else: - self.html = self.response.content - self.html = self.html.decode('utf-8', 'ignore') - except Exception: - self.log.exception(f'Response handling error') - - def _unzip_html(self) -> None: - """Unzip brotli zipped html - - Can allow zipped responses by setting the header `"Accept-Encoding"`. - Zipped reponses are the default because it is more efficient. - """ - - rcontent = self.response.content - try: - self.html = brotli.decompress(rcontent) - except brotli.error: - self.html = rcontent - except Exception: - self.log.exception(f'unzip error | serp_id : {self.serp_id}') - self.html = rcontent + def _conduct_search_requests(self, crawl_id: str = ''): + """Send a search request using the requests library""" + + # Conduct search + serp_output = self.search_params.to_serp_output() + serp_output['version'] = self.version + serp_output['method'] = self.method + serp_output['crawl_id'] = crawl_id + response_output = self.requests_searcher.send_request(self.search_params) + serp_output.update(response_output) + self.serp = BaseSERP(**serp_output).model_dump() + self.log.info(" | ".join([f"{k}: {self.serp[k]}" for k in {'response_code','qry','loc'} if self.serp[k]])) # ========================================================================== # Parsing - def parse_all(self): - """Parse results and extract SERP features in a single pass""" - assert self.serp['html'], "No HTML found" + def parse_serp(self, extract_features=True): try: - # Use the enhanced parse_serp function to get both results and features in one pass - self.results, self.serp_features = parsers.parse_serp(self.serp['html'], extract_features=True) + metadata = {k:v for k,v in self.serp.items() if k not in ['html']} + parsed = parsers.parse_serp(self.serp['html'], extract_features=extract_features) + self.parsed = metadata | parsed except Exception: - self.log.exception(f'Combined parsing error | serp_id : {self.serp_id}') + self.log.exception(f'Parsing error | serp_id : {self.serp["serp_id"]}') def parse_results(self): - """Parse a SERP - see parsers.py""" - assert self.serp['html'], "No HTML found" - try: - self.results = parsers.parse_serp(self.serp['html']) - except Exception: - self.log.exception(f'Parsing error | serp_id : {self.serp_id}') - - def parse_serp_features(self): - """Extract SERP features - see parsers.py""" - assert 
self.serp['html'], "No HTML found" - try: - self.serp_features = parsers.FeatureExtractor.extract_features(self.serp['html']) - except Exception: - self.log.exception(f'Feature extraction error | serp_id : {self.serp_id}') + """Backwards compatibility for parsing results""" + self.parse_serp() + self.results = self.parsed['results'] # ========================================================================== # Saving - def prepare_serp_save(self): - self.serp = BaseSERP( - qry=self.serp['qry'], - loc=self.serp['loc'], - lang=self.serp['lang'], - url=self.serp['url'], - html=self.serp['html'], - response_code=self.serp['response_code'], - user_agent=self.serp['user_agent'], - timestamp=self.serp['timestamp'], - serp_id=self.serp['serp_id'], - crawl_id=self.serp['crawl_id'], - version=self.version, - method=self.config.method.value - ).model_dump() - def save_serp(self, save_dir: str = "", append_to: str = ""): """Save SERP to file @@ -251,35 +173,35 @@ def save_serp(self, save_dir: str = "", append_to: str = ""): save_dir (str, optional): Save results as `save_dir/{serp_id}.html` append_to (str, optional): Append results to this file path """ - assert self.serp['html'], "No HTML found" - assert save_dir or append_to, "Must provide a save_dir or append_to file path" - - if append_to: - self.prepare_serp_save() + if not save_dir and not append_to: + self.log.warning("Must provide a save_dir or append_to file path to save a SERP") + return + elif append_to: utils.write_lines([self.serp], append_to) - - else: - fp = os.path.join(save_dir, f'{self.serp_id}.html') + elif save_dir: + fp = os.path.join(save_dir, f'{self.serp["serp_id"]}.html') with open(fp, 'w') as outfile: outfile.write(self.serp['html']) - def save_search(self, append_to: str = ""): - """Save search metadata (excludes HTML) to file - - Args: - append_to (str, optional): Append results to this file path - """ - assert self.serp['html'], "No HTML found" - assert append_to, "Must provide an append_to file path" - - if not self.serp: - self.prepare_serp_save() + def save_parsed(self, save_dir: str = "", append_to: str = ""): + """Save parsed SERP to file""" + if not save_dir and not append_to: + self.log.warning("Must provide a save_dir or append_to file path to save parsed SERP") + return + if not self.parsed: + self.log.warning("No parsed SERP available to save") + return - if not self.serp_features: - self.parse_serp_features() + fp = append_to if append_to else os.path.join(save_dir, 'parsed.json') + utils.write_lines([self.parsed], fp) + + def save_search(self, append_to: str = ""): + """Save SERP metadata (excludes HTML) to file""" + if not append_to: + self.log.warning("Must provide an append_to file path to save SERP metadata") + return self.serp_metadata = {k: v for k, v in self.serp.items() if k != 'html'} - self.serp_metadata.update(self.serp_features) utils.write_lines([self.serp_metadata], append_to) def save_results(self, save_dir: str = "", append_to: str = ""): @@ -289,22 +211,18 @@ def save_results(self, save_dir: str = "", append_to: str = ""): save_dir (str, optional): Save results as `save_dir/results/{serp_id}.json` append_to (bool, optional): Append results to this file path """ - assert save_dir or append_to, "Must provide a save_dir or append_to file path" - - if self.results: - if append_to: - result_metadata = { - 'crawl_id': self.serp["crawl_id"], - 'serp_id': self.serp["serp_id"], - 'version': self.version - } - results_output = [{**result, **result_metadata} for result in self.results] - 
utils.write_lines(results_output, append_to) - else: - fp = os.path.join(save_dir, 'results', f'{self.serp_id}.json') - utils.write_lines(self.results, fp) - else: - self.log.info(f'No parsed results for serp_id: {self.serp_id}') + if not save_dir and not append_to: + self.log.warning("Must provide a save_dir or append_to file path to save results") + return + if not self.parsed["results"]: + self.log.warning(f'No parsed results to save') + return + + # Add metadta to results + result_metadata = {k: self.serp[k] for k in ['crawl_id', 'serp_id', 'version']} + results_output = [{**result, **result_metadata} for result in self.parsed["results"]] + fp = append_to if append_to else os.path.join(save_dir, 'results.json') + utils.write_lines(results_output, fp) def cleanup(self): """Clean up resources, particularly Selenium's browser instance diff --git a/tests/selenium_test.py b/tests/selenium_test.py index 0318fd2..7540fb2 100644 --- a/tests/selenium_test.py +++ b/tests/selenium_test.py @@ -10,15 +10,15 @@ @app.command() def main( - query: str = typer.Argument("why is the sky blue?", help="Search query to use"), - method: str = typer.Argument("selenium", help="Search method to use: 'selenium' or 'requests'"), - headless: bool = typer.Option(False, help="Run browser in headless mode"), - use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), - version_main: int = typer.Option(133, help="Main version of Chrome to use"), - ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), - driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), - output_prefix: str = typer.Option("output", help="Prefix for output files") - ) -> None: + query: str = typer.Argument("why is the sky blue?", help="Search query to use"), + method: str = typer.Argument("selenium", help="Search method to use: 'selenium' or 'requests'"), + headless: bool = typer.Option(False, help="Run browser in headless mode"), + use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), + version_main: int = typer.Option(133, help="Main version of Chrome to use"), + ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), + driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), + output_prefix: str = typer.Option("output", help="Prefix for output files") +) -> None: typer.echo(f"query: {query}\nmethod: {method}") se = ws.SearchEngine( method=method, @@ -31,11 +31,8 @@ def main( ) se.search(qry=query, ai_expand=ai_expand) se.parse_results() - - # Save results with the specified prefix se.save_serp(append_to=f'{output_prefix}_serps.json') - se.save_search(append_to=f'{output_prefix}_searches.json') - se.save_results(append_to=f'{output_prefix}_results.json') + se.save_parsed(append_to=f'{output_prefix}_parsed.json') se.cleanup() if __name__ == "__main__": From bd2b76eca850ab302d84c2f7049d20c20fee775c Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 28 Mar 2025 09:21:34 -0700 Subject: [PATCH 043/101] version: 0.6.0.dev5 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index d33ee46..0aad1e5 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.0.dev4" +__version__ = "0.6.0.dev5" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git 
a/pyproject.toml b/pyproject.toml index 2b85ba6..9309f37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.0.dev4" +version = "0.6.0.dev5" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From 0c959b945254c358834b5e5ec0c95e4de49b95e0 Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 28 Mar 2025 09:22:33 -0700 Subject: [PATCH 044/101] update: cleaner selenium cleanup --- .../search_methods/selenium_searcher.py | 29 +++++++++---------- WebSearcher/searchers.py | 23 +-------------- tests/selenium_test.py | 2 +- 3 files changed, 16 insertions(+), 38 deletions(-) diff --git a/WebSearcher/search_methods/selenium_searcher.py b/WebSearcher/search_methods/selenium_searcher.py index 3529c6f..3b5c8dc 100644 --- a/WebSearcher/search_methods/selenium_searcher.py +++ b/WebSearcher/search_methods/selenium_searcher.py @@ -123,21 +123,8 @@ def cleanup(self) -> bool: """ if self.driver: try: - # Try a more thorough cleanup - try: - self.driver.delete_all_cookies() - except Exception: - pass - - try: - # Close all tabs/windows - original_handle = self.driver.current_window_handle - for handle in self.driver.window_handles: - self.driver.switch_to.window(handle) - self.driver.close() - except Exception: - pass - + self.delete_cookies() + self.close_all_windows() # Finally quit the driver self.driver.quit() self.driver = None @@ -150,6 +137,18 @@ def cleanup(self) -> bool: return False return True + def close_all_windows(self): + try: + # Close all tabs/windows + original_handle = self.driver.current_window_handle + for handle in self.driver.window_handles: + self.driver.switch_to.window(handle) + self.driver.close() + self.driver.switch_to.window(original_handle) + self.driver.close() + except Exception: + pass + def delete_cookies(self): """Delete all cookies from the browser""" if self.driver: diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 30ac74d..3604426 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -218,29 +218,8 @@ def save_results(self, save_dir: str = "", append_to: str = ""): self.log.warning(f'No parsed results to save') return - # Add metadta to results + # Add metadata to results result_metadata = {k: self.serp[k] for k in ['crawl_id', 'serp_id', 'version']} results_output = [{**result, **result_metadata} for result in self.parsed["results"]] fp = append_to if append_to else os.path.join(save_dir, 'results.json') utils.write_lines(results_output, fp) - - def cleanup(self): - """Clean up resources, particularly Selenium's browser instance - - Returns: - bool: True if cleanup was successful or not needed, False if cleanup failed - """ - if self.config.method == SearchMethod.SELENIUM and hasattr(self, 'selenium_driver'): - result = self.selenium_driver.cleanup() - if result: - self.selenium_driver.driver = None # Update the reference - return result - return True - - def __del__(self): - """Destructor to ensure browser is closed when object is garbage collected""" - try: - self.cleanup() - except Exception: - pass - diff --git a/tests/selenium_test.py b/tests/selenium_test.py index 7540fb2..7711c6c 100644 --- a/tests/selenium_test.py +++ b/tests/selenium_test.py @@ -32,8 +32,8 @@ def main( se.search(qry=query, ai_expand=ai_expand) se.parse_results() se.save_serp(append_to=f'{output_prefix}_serps.json') + se.save_search(append_to=f'{output_prefix}_searches.json') 
se.save_parsed(append_to=f'{output_prefix}_parsed.json') - se.cleanup() if __name__ == "__main__": app() \ No newline at end of file From 82ef0dbe11324aac72badece2a3e9d9847fbff65 Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 28 Mar 2025 09:23:40 -0700 Subject: [PATCH 045/101] update: consistent logging and serp handling --- WebSearcher/searchers.py | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 3604426..36367ac 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -82,10 +82,6 @@ def search(self, crawl_id (str, optional): An identifier for this crawl """ - self._prepare_search(qry=qry, location=location, lang=lang, num_results=num_results) - self._conduct_search(crawl_id=crawl_id, ai_expand=ai_expand) - - def _prepare_search(self, qry: str, location: str, lang: str, num_results: int): self.search_params = SearchParams.create({ 'qry': str(qry), 'loc': str(location) if not pd.isnull(location) else '', @@ -93,7 +89,6 @@ def _prepare_search(self, qry: str, location: str, lang: str, num_results: int): 'num_results': num_results, }) - def _conduct_search(self, crawl_id: str = '', ai_expand: bool = False): if self.config.method == SearchMethod.SELENIUM: self._conduct_search_chromedriver(crawl_id=crawl_id, ai_expand=ai_expand) elif self.config.method == SearchMethod.REQUESTS: @@ -109,17 +104,13 @@ def _conduct_search_chromedriver(self, crawl_id: str = '', ai_expand = False): # Conduct search serp_output = self.search_params.to_serp_output() + serp_output['version'] = self.version + serp_output['method'] = self.method + serp_output['crawl_id'] = crawl_id response_output = self.selenium_driver.send_request(self.search_params.url) serp_output.update(response_output) - - # Store output - self.serp = BaseSERP( - version=self.version, - method=self.method, - crawl_id=crawl_id, - **serp_output - ).model_dump() - self.log.info(" | ".join([f"{k}: {self.serp[k]}" for k in {'response_code','qry','loc'} if self.serp[k]])) + self.serp = BaseSERP(**serp_output).model_dump() + self.log.info(" | ".join([f"{self.serp[k]}" for k in {'response_code','qry','loc'} if self.serp[k]])) # Expand AI overview if ai_expand: @@ -145,16 +136,16 @@ def _conduct_search_requests(self, crawl_id: str = ''): response_output = self.requests_searcher.send_request(self.search_params) serp_output.update(response_output) self.serp = BaseSERP(**serp_output).model_dump() - self.log.info(" | ".join([f"{k}: {self.serp[k]}" for k in {'response_code','qry','loc'} if self.serp[k]])) + self.log.info(" | ".join([f"{self.serp[k]}" for k in {'qry','response_code','loc'} if self.serp[k]])) # ========================================================================== # Parsing def parse_serp(self, extract_features=True): try: - metadata = {k:v for k,v in self.serp.items() if k not in ['html']} + parsed_metadata = {k:v for k,v in self.serp.items() if k in ['crawl_id', 'serp_id', 'version', 'method']} parsed = parsers.parse_serp(self.serp['html'], extract_features=extract_features) - self.parsed = metadata | parsed + self.parsed = parsed_metadata | parsed except Exception: self.log.exception(f'Parsing error | serp_id : {self.serp["serp_id"]}') From 3fd8e19cc0914d34d3fa00bdc765e733b8f4088c Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 28 Mar 2025 10:05:26 -0700 Subject: [PATCH 046/101] update: simplify search logic, use SearchParams, ai expand logic in selenium file --- WebSearcher/models/configs.py | 6 ++ 
.../search_methods/requests_searcher.py | 23 +++--- .../search_methods/selenium_searcher.py | 38 ++++++---- WebSearcher/searchers.py | 73 +++++-------------- 4 files changed, 58 insertions(+), 82 deletions(-) diff --git a/WebSearcher/models/configs.py b/WebSearcher/models/configs.py index c64ee30..429d872 100644 --- a/WebSearcher/models/configs.py +++ b/WebSearcher/models/configs.py @@ -39,6 +39,12 @@ class RequestsConfig(BaseConfig): ssh_tunnel: Optional[subprocess.Popen] = None unzip: bool = True + def update_headers(self, new_headers: Dict[str, str]) -> None: + """Update the headers dictionary with new values.""" + self.headers.update(new_headers) + + + class SearchMethod(Enum): REQUESTS = "requests" SELENIUM = "selenium" diff --git a/WebSearcher/search_methods/requests_searcher.py b/WebSearcher/search_methods/requests_searcher.py index 7b0ad62..1b12450 100644 --- a/WebSearcher/search_methods/requests_searcher.py +++ b/WebSearcher/search_methods/requests_searcher.py @@ -1,15 +1,15 @@ import time import brotli import requests -from typing import Dict, Optional, Any -from datetime import datetime, timezone +from typing import Dict, Any -from .. import utils +from ..models.configs import RequestsConfig +from ..models.searches import SearchParams class RequestsSearcher: """Handle Requests-based web interactions for search engines""" - def __init__(self, config, headers, logger): + def __init__(self, config: RequestsConfig, logger): """Initialize a Requests searcher with the given configuration Args: @@ -18,17 +18,16 @@ def __init__(self, config, headers, logger): logger: Logger instance """ self.config = config - self.headers = headers self.log = logger self.sesh = self.config.sesh or self._start_session() def _start_session(self): """Start a new requests session with the configured headers""" session = requests.Session() - session.headers.update(self.headers) + session.headers.update(self.config.headers) return session - def send_request(self, search_params) -> Dict[str, Any]: + def send_request(self, search_params: SearchParams) -> Dict[str, Any]: """Send a request and handle the response Args: @@ -40,17 +39,17 @@ def send_request(self, search_params) -> Dict[str, Any]: Dictionary with response data """ - response_data = { + response_output = { 'html': '', 'url': search_params.url, - 'user_agent': self.headers.get('User-Agent'), + 'user_agent': self.config.headers.get('User-Agent'), 'response_code': 0, } try: response = self.sesh.get(search_params.url, timeout=10) - response_data['html'] = self._handle_response_content(response) - response_data['response_code'] = response.status_code + response_output['html'] = self._handle_response_content(response) + response_output['response_code'] = response.status_code except requests.exceptions.ConnectionError: self.log.exception(f'Requests | Connection error') self._reset_ssh_tunnel() @@ -59,7 +58,7 @@ def send_request(self, search_params) -> Dict[str, Any]: except Exception: self.log.exception(f'Requests | Unknown error') finally: - return response_data + return response_output def _handle_response_content(self, response): try: diff --git a/WebSearcher/search_methods/selenium_searcher.py b/WebSearcher/search_methods/selenium_searcher.py index 3b5c8dc..d78fe44 100644 --- a/WebSearcher/search_methods/selenium_searcher.py +++ b/WebSearcher/search_methods/selenium_searcher.py @@ -1,6 +1,6 @@ import time import json -from typing import Dict, Optional, Any +from typing import Dict, Any import undetected_chromedriver as uc from 
selenium.webdriver.common.by import By @@ -11,6 +11,7 @@ from .. import utils from ..models.configs import SeleniumConfig +from ..models.searches import SearchParams class SeleniumDriver: """Handle Selenium-based web interactions for search engines""" @@ -53,31 +54,38 @@ def send_typed_query(self, query: str): search_box.send_keys(query) search_box.send_keys(Keys.RETURN) - def send_request(self, url: str) -> Dict[str, Any]: + def send_request(self, search_params: SearchParams, ai_expand: bool = False) -> Dict[str, Any]: """Visit a URL with selenium and save HTML response""" + response_output = { + 'html': '', + 'url': search_params.url, + 'user_agent': self.browser_info['user_agent'], + 'response_code': 0, + } + try: - self.driver.get(url) + self.driver.get(search_params.url) time.sleep(2) WebDriverWait(self.driver, 10).until( EC.presence_of_element_located((By.ID, "search")) ) time.sleep(2) - response_output = { - 'html': self.driver.page_source, - 'url': self.driver.current_url, - 'user_agent': self.browser_info['user_agent'], - 'response_code': 200, - } + response_output['html'] = self.driver.page_source + response_output['url'] = self.driver.current_url + response_output['response_code'] = 200 + + # Expand AI overview if requested + if ai_expand: + expanded_html = self.expand_ai_overview() + if expanded_html: + self.log.debug(f"SERP | expanded html | len diff: {len(expanded_html) - len(self.serp['html'])}") + response_output['html'] = expanded_html + except Exception as e: self.log.exception(f'SERP | Chromedriver error | {str(e)}') - response_output = { - 'html': '', - 'url': '', - 'user_agent': self.browser_info['user_agent'], - 'response_code': 0, - } finally: + self.delete_cookies() return response_output def expand_ai_overview(self): diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 36367ac..8566d17 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -21,7 +21,7 @@ def __init__(self, log_config: Union[dict, LogConfig] = {}, selenium_config: Union[dict, SeleniumConfig] = {}, requests_config: Union[dict, RequestsConfig] = {}, - headers: Dict[str, str] = None + crawl_id: str = '', ) -> None: """Initialize the search engine @@ -33,7 +33,6 @@ def __init__(self, """ # Initialize configuration - self.version = WS_VERSION self.method = method.value if isinstance(method, SearchMethod) else method self.config = SearchConfig.create({ "method": SearchMethod.create(method), @@ -41,7 +40,12 @@ def __init__(self, "selenium": SeleniumConfig.create(selenium_config), "requests": RequestsConfig.create(requests_config), }) - + self.session_data = { + "method": self.config.method.value, + "version": WS_VERSION, + "crawl_id": crawl_id, + } + # Set a log file, prints to console by default self.log = logger.Logger( console=True if not self.config.log.fp else False, @@ -51,14 +55,6 @@ def __init__(self, file_level=self.config.log.level, ).start(__name__) - # Initialize searcher - if self.config.method == SearchMethod.REQUESTS: - self.headers = headers or self.config.requests.headers - self.requests_searcher = RequestsSearcher(config=self.config.requests, headers=self.headers, logger=self.log) - elif self.config.method == SearchMethod.SELENIUM: - self.selenium_driver = SeleniumDriver(config=self.config.selenium, logger=self.log) - self.selenium_driver.driver = None - # Initialize search params and output self.search_params = SearchParams.create() self.parsed = {'results': [], 'features': {}} @@ -70,7 +66,7 @@ def search(self, lang: str = None, num_results: int = 
None, ai_expand: bool = False, - crawl_id: str = '' + headers: Dict[str, str] = {}, ): """Conduct a search and save HTML @@ -90,58 +86,25 @@ def search(self, }) if self.config.method == SearchMethod.SELENIUM: - self._conduct_search_chromedriver(crawl_id=crawl_id, ai_expand=ai_expand) - elif self.config.method == SearchMethod.REQUESTS: - self._conduct_search_requests(crawl_id=crawl_id) - - # ========================================================================== - # Selenium method - - def _conduct_search_chromedriver(self, crawl_id: str = '', ai_expand = False): - """Send a search request and handle errors""" - if not self.selenium_driver.driver: + self.selenium_driver = SeleniumDriver(config=self.config.selenium, logger=self.log) self.selenium_driver.init_driver() + self.response_output = self.selenium_driver.send_request(self.search_params, ai_expand=ai_expand) + + elif self.config.method == SearchMethod.REQUESTS: + self.config.requests.update_headers(headers) + self.requests_searcher = RequestsSearcher(config=self.config.requests, logger=self.log) + self.response_output = self.requests_searcher.send_request(self.search_params) - # Conduct search serp_output = self.search_params.to_serp_output() - serp_output['version'] = self.version - serp_output['method'] = self.method - serp_output['crawl_id'] = crawl_id - response_output = self.selenium_driver.send_request(self.search_params.url) - serp_output.update(response_output) + serp_output.update(self.session_data) + serp_output.update(self.response_output) self.serp = BaseSERP(**serp_output).model_dump() self.log.info(" | ".join([f"{self.serp[k]}" for k in {'response_code','qry','loc'} if self.serp[k]])) - # Expand AI overview - if ai_expand: - expanded_html = self.selenium_driver.expand_ai_overview() - if expanded_html: - self.log.debug(f"SERP | expanded html | len diff: {len(expanded_html) - len(self.serp['html'])}") - self.serp['html'] = expanded_html - - # Delete cookies - self.selenium_driver.delete_cookies() - - # ========================================================================== - # Requests method - - def _conduct_search_requests(self, crawl_id: str = ''): - """Send a search request using the requests library""" - - # Conduct search - serp_output = self.search_params.to_serp_output() - serp_output['version'] = self.version - serp_output['method'] = self.method - serp_output['crawl_id'] = crawl_id - response_output = self.requests_searcher.send_request(self.search_params) - serp_output.update(response_output) - self.serp = BaseSERP(**serp_output).model_dump() - self.log.info(" | ".join([f"{self.serp[k]}" for k in {'qry','response_code','loc'} if self.serp[k]])) - # ========================================================================== # Parsing - def parse_serp(self, extract_features=True): + def parse_serp(self, extract_features: bool = True): try: parsed_metadata = {k:v for k,v in self.serp.items() if k in ['crawl_id', 'serp_id', 'version', 'method']} parsed = parsers.parse_serp(self.serp['html'], extract_features=extract_features) From bdb5975a0f3460940d8681338cb1d81d7114dd79 Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 28 Mar 2025 10:13:08 -0700 Subject: [PATCH 047/101] update: drop python version file, use python>=3.10 in pyproject --- .python-version | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .python-version diff --git a/.python-version b/.python-version deleted file mode 100644 index c84ccce..0000000 --- a/.python-version +++ /dev/null @@ -1 +0,0 @@ -3.10.5 From 
d5c753925b4c5e1a88f0debb16757366a2426fa1 Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 28 Mar 2025 10:24:44 -0700 Subject: [PATCH 048/101] fix: selenium output reference --- WebSearcher/search_methods/selenium_searcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/WebSearcher/search_methods/selenium_searcher.py b/WebSearcher/search_methods/selenium_searcher.py index d78fe44..d8315ca 100644 --- a/WebSearcher/search_methods/selenium_searcher.py +++ b/WebSearcher/search_methods/selenium_searcher.py @@ -79,7 +79,8 @@ def send_request(self, search_params: SearchParams, ai_expand: bool = False) -> if ai_expand: expanded_html = self.expand_ai_overview() if expanded_html: - self.log.debug(f"SERP | expanded html | len diff: {len(expanded_html) - len(self.serp['html'])}") + len_diff = len(expanded_html) - len(response_output['html']) + self.log.debug(f"SERP | expanded html | len diff: {len_diff}") response_output['html'] = expanded_html except Exception as e: From 64ee056fb3843345de1c755efa4dc224edd64f04 Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 28 Mar 2025 11:35:23 -0700 Subject: [PATCH 049/101] update: demo scripts --- scripts/demo_search.py | 71 +++++++++++++++++++-------------- scripts/demo_searches.py | 85 ++++++++++++++++++++++++++-------------- tests/selenium_test.py | 39 ------------------ 3 files changed, 97 insertions(+), 98 deletions(-) delete mode 100644 tests/selenium_test.py diff --git a/scripts/demo_search.py b/scripts/demo_search.py index 94bfb68..3debcaf 100644 --- a/scripts/demo_search.py +++ b/scripts/demo_search.py @@ -2,45 +2,58 @@ """ import os -import argparse +import typer import pandas as pd import WebSearcher as ws -pd.set_option('display.width', 120, +pd.set_option('display.width', 160, 'display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', 40) -def main(): - # Settings - parser = argparse.ArgumentParser() - parser.add_argument("-q", "--query", type=str, help="A search query", required=True) - parser.add_argument("-d", "--data_dir", type=str, help="Directory to save data", - default=os.path.join("data", f"demo-ws-v{ws.__version__}")) - args = parser.parse_args() - print(f'WebSearcher v{ws.__version__}\nSearch Query: {args.query}\nOutput Dir: {args.data_dir}\n') +DEFAULT_DATA_DIR = os.path.join("data", f"demo-ws-v{ws.__version__}") +app = typer.Typer() + +@app.command() +def main( + query: str = typer.Argument("why is the sky blue?", help="Search query to use"), + method: str = typer.Argument("selenium", help="Search method to use: 'selenium' or 'requests'"), + data_dir: str = typer.Option(DEFAULT_DATA_DIR, help="Prefix for output files"), + headless: bool = typer.Option(False, help="Run browser in headless mode"), + use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), + version_main: int = typer.Option(133, help="Main version of Chrome to use"), + ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), + driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), +) -> None: + # Filepaths - fp_serps = os.path.join(args.data_dir, 'serps.json') - fp_results = os.path.join(args.data_dir, 'results.json') - fp_searches = os.path.join(args.data_dir, 'searches.json') - dir_html = os.path.join(args.data_dir, 'html') - os.makedirs(dir_html, exist_ok=True) - - # Search, parse, and save - se = ws.SearchEngine() # Initialize searcher - se.launch_chromedriver(headless =False) # Launch browser - se.search(args.query) # Conduct Search - 
se.parse_results() # Parse Results - se.save_serp(append_to=fp_serps) # Save SERP to json (html + metadata) - se.save_results(append_to=fp_results) # Save results to json - se.save_serp(save_dir=dir_html) # Save SERP html to dir (no metadata) - se.save_search(append_to=fp_searches) # Save search metadata + extracted features + fps = {k: os.path.join(data_dir, f"{k}.json") for k in ["serps", "parsed", "searches"]} + os.makedirs(data_dir, exist_ok=True) + print(f'WebSearcher v{ws.__version__}\nSearch Query: {query}\nOutput Dir: {data_dir}\n') + + # Setup search engine + se = ws.SearchEngine( + method=method, + selenium_config={ + "headless": headless, + "use_subprocess": use_subprocess, + "driver_executable_path": driver_executable_path, + "version_main": version_main, + } + ) + + # Search and parse + se.search(query, ai_expand=ai_expand) # Conduct Search + se.parse_results() # Parse Results + se.save_serp(append_to=fps['serps']) # Save SERP to json (html + metadata) + se.save_search(append_to=fps['searches']) # Save search metadata to json + se.save_parsed(append_to=fps['parsed']) # Save results/features to json # Convert results to dataframe and print select columns - if se.results: - results = pd.DataFrame(se.results) - print(results[['type', 'title', 'url']]) + if se.parsed["results"]: + results = pd.DataFrame(se.parsed["results"]) + print(results[['type', 'sub_type', 'title', 'url']]) if __name__ == "__main__": - main() \ No newline at end of file + app() \ No newline at end of file diff --git a/scripts/demo_searches.py b/scripts/demo_searches.py index 6c96341..82eee67 100644 --- a/scripts/demo_searches.py +++ b/scripts/demo_searches.py @@ -3,37 +3,62 @@ import os import time +import typer import pandas as pd import WebSearcher as ws -pd.set_option('display.width', 120, - 'display.max_colwidth', 40, +pd.set_option('display.width', 160, 'display.max_rows', None, - 'display.max_columns', None) - -# Filepaths -data_dir = os.path.join("data", f"demo-ws-v{ws.__version__}") -fp_serps = os.path.join(data_dir, 'serps.json') -fp_results = os.path.join(data_dir, 'results.json') -dir_html = os.path.join(data_dir, 'html') -os.makedirs(dir_html, exist_ok=True) - -# Load query list from file, from: https://ahrefs.com/blog/top-google-searches/ -fp_queries = 'data/tests/top_searches_google_2020-04.tsv' -top_list = pd.read_csv(fp_queries, sep='\t') -queries = top_list['keyword'] - -# Search, parse, and save -for qry in queries: - se = ws.SearchEngine() # Initialize searcher - se.search(qry) # Conduct Search - se.parse_results() # Parse Results - se.save_serp(append_to=fp_serps) # Save SERP to json (html + metadata) - se.save_results(append_to=fp_results) # Save results to json - se.save_serp(save_dir=dir_html) # Save SERP html to dir (no metadata) - - # Convert results to dataframe and print select columns - if se.results: - results = pd.DataFrame(se.results) - print(results[['type', 'title', 'url']]) - time.sleep(30) + 'display.max_columns', None, + 'display.max_colwidth', 40) + +DEFAULT_DATA_DIR = os.path.join("data", f"demo-ws-v{ws.__version__}") + +app = typer.Typer() + +@app.command() +def main( + method: str = typer.Argument("selenium", help="Search method to use: 'selenium' or 'requests'"), + data_dir: str = typer.Option(DEFAULT_DATA_DIR, help="Prefix for output files"), + headless: bool = typer.Option(False, help="Run browser in headless mode"), + use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), + version_main: int = typer.Option(133, help="Main version of 
Chrome to use"), + ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), + driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), +) -> None: + + # Filepaths + fps = {k: os.path.join(data_dir, f"{k}.json") for k in ["serps", "parsed", "searches"]} + os.makedirs(data_dir, exist_ok=True) + + # Load query list from file, from: https://ahrefs.com/blog/top-google-searches/ + fp_queries = 'data/tests/top_searches_google_2020-04.tsv' + top_list = pd.read_csv(fp_queries, sep='\t') + queries = top_list['keyword'] + + for qry in queries: + + # Setup search engine + se = ws.SearchEngine( + method=method, + selenium_config={ + "headless": headless, + "use_subprocess": use_subprocess, + "driver_executable_path": driver_executable_path, + "version_main": version_main, + } + ) + + # Search, parse, and save + se.search(qry, ai_expand=ai_expand) # Conduct Search + se.parse_results() # Parse Results + se.save_serp(append_to=fps['serps']) # Save SERP to json (html + metadata) + se.save_search(append_to=fps['searches']) # Save search to json (metadata only) + se.save_parsed(append_to=fps['parsed']) # Save parsed results and SERP features to json + + # Convert results to dataframe and print select columns + if se.parsed["results"]: + results = pd.DataFrame(se.parsed["results"]) + print(results[['type', 'sub_type', 'title', 'url']]) + + time.sleep(30) diff --git a/tests/selenium_test.py b/tests/selenium_test.py deleted file mode 100644 index 7711c6c..0000000 --- a/tests/selenium_test.py +++ /dev/null @@ -1,39 +0,0 @@ -import typer -import WebSearcher as ws - -# driver_executable_path locations: -# /opt/homebrew/Caskroom/chromedriver/133.0.6943.53 # Mac -# /opt/google/chrome/google-chrome # Google Chrome 134.0.6998.88 | permissions error -# ~/.local/share/undetected_chromedriver/undetected_chromedriver # ChromeDriver 133.0.6943.141 - -app = typer.Typer() - -@app.command() -def main( - query: str = typer.Argument("why is the sky blue?", help="Search query to use"), - method: str = typer.Argument("selenium", help="Search method to use: 'selenium' or 'requests'"), - headless: bool = typer.Option(False, help="Run browser in headless mode"), - use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), - version_main: int = typer.Option(133, help="Main version of Chrome to use"), - ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), - driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), - output_prefix: str = typer.Option("output", help="Prefix for output files") -) -> None: - typer.echo(f"query: {query}\nmethod: {method}") - se = ws.SearchEngine( - method=method, - selenium_config={ - "headless": headless, - "use_subprocess": use_subprocess, - "driver_executable_path": driver_executable_path, - "version_main": version_main, - } - ) - se.search(qry=query, ai_expand=ai_expand) - se.parse_results() - se.save_serp(append_to=f'{output_prefix}_serps.json') - se.save_search(append_to=f'{output_prefix}_searches.json') - se.save_parsed(append_to=f'{output_prefix}_parsed.json') - -if __name__ == "__main__": - app() \ No newline at end of file From 16ba005e9a861accb14ce27aed39c4b98b0d073c Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 1 Apr 2025 07:48:47 -0700 Subject: [PATCH 050/101] update: timestamp before request, ai expand as search param, load searcher method on se init --- WebSearcher/models/configs.py | 13 ++++---- WebSearcher/models/searches.py | 13 +++----- 
.../search_methods/requests_searcher.py | 5 +++ .../search_methods/selenium_searcher.py | 6 ++-- WebSearcher/searchers.py | 32 +++++++++++-------- 5 files changed, 40 insertions(+), 29 deletions(-) diff --git a/WebSearcher/models/configs.py b/WebSearcher/models/configs.py index 429d872..99021cf 100644 --- a/WebSearcher/models/configs.py +++ b/WebSearcher/models/configs.py @@ -2,7 +2,7 @@ import subprocess from enum import Enum from typing import Dict, Optional, Union -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, computed_field class BaseConfig(BaseModel): """Base class for all configuration classes""" @@ -35,14 +35,15 @@ class RequestsConfig(BaseConfig): 'Accept-Language': 'en-US,en;q=0.5', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0', }) - sesh: Optional[requests.Session] = None ssh_tunnel: Optional[subprocess.Popen] = None unzip: bool = True - def update_headers(self, new_headers: Dict[str, str]) -> None: - """Update the headers dictionary with new values.""" - self.headers.update(new_headers) - + @computed_field + def sesh(self) -> requests.Session: + """Create and configure a requests session with the current headers.""" + sesh = requests.Session() + sesh.headers.update(self.headers) + return sesh class SearchMethod(Enum): diff --git a/WebSearcher/models/searches.py b/WebSearcher/models/searches.py index 6884ec2..b213e3d 100644 --- a/WebSearcher/models/searches.py +++ b/WebSearcher/models/searches.py @@ -1,6 +1,6 @@ from pydantic import Field, computed_field from typing import Dict, Optional, Any, List -from datetime import datetime, timezone +from datetime import datetime from ..utils import hash_id from ..import webutils as wu @@ -15,6 +15,8 @@ class SearchParams(BaseConfig): lang: Optional[str] = Field(None, description="Language code (e.g., 'en')") loc: Optional[str] = Field(None, description="Location in Canonical Name format") base_url: str = Field("https://www.google.com/search", description="Base search engine URL") + ai_expand: bool = Field(False, description="Expand AI overviews if present") + headers: Dict[str, str] = Field(default_factory=dict, description="Custom headers") @computed_field def url_params(self) -> Dict[str, Any]: @@ -36,18 +38,13 @@ def url(self) -> str: @computed_field def serp_id(self) -> str: - """Computes a unique SERP ID based on query, location, and timestamp""" - timestamp = datetime.now().isoformat() - return hash_id(f"{self.qry}{self.loc}{timestamp}") + return hash_id(f"{self.qry}{self.loc}{datetime.now().isoformat()}") def to_serp_output(self) -> Dict[str, Any]: - """Outputs the variables needed for SERPDetails as a dictionary""" - timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() return { "qry": self.qry, "loc": self.loc, "lang": self.lang, "url": self.url, - "serp_id": hash_id(f"{self.qry}{self.loc}{timestamp}"), - "timestamp": timestamp, + "serp_id": self.serp_id, } diff --git a/WebSearcher/search_methods/requests_searcher.py b/WebSearcher/search_methods/requests_searcher.py index 1b12450..666afcb 100644 --- a/WebSearcher/search_methods/requests_searcher.py +++ b/WebSearcher/search_methods/requests_searcher.py @@ -1,6 +1,7 @@ import time import brotli import requests +from datetime import datetime, timezone from typing import Dict, Any from ..models.configs import RequestsConfig @@ -38,12 +39,16 @@ def send_request(self, search_params: SearchParams) -> Dict[str, Any]: Returns: Dictionary with response data """ + + if search_params.headers: 
+ self.sesh.headers.update(search_params.headers) response_output = { 'html': '', 'url': search_params.url, 'user_agent': self.config.headers.get('User-Agent'), 'response_code': 0, + 'timestamp': datetime.now(timezone.utc).replace(tzinfo=None).isoformat() } try: diff --git a/WebSearcher/search_methods/selenium_searcher.py b/WebSearcher/search_methods/selenium_searcher.py index d8315ca..00eb829 100644 --- a/WebSearcher/search_methods/selenium_searcher.py +++ b/WebSearcher/search_methods/selenium_searcher.py @@ -1,5 +1,6 @@ import time import json +from datetime import datetime, timezone from typing import Dict, Any import undetected_chromedriver as uc @@ -54,7 +55,7 @@ def send_typed_query(self, query: str): search_box.send_keys(query) search_box.send_keys(Keys.RETURN) - def send_request(self, search_params: SearchParams, ai_expand: bool = False) -> Dict[str, Any]: + def send_request(self, search_params: SearchParams) -> Dict[str, Any]: """Visit a URL with selenium and save HTML response""" response_output = { @@ -62,6 +63,7 @@ def send_request(self, search_params: SearchParams, ai_expand: bool = False) -> 'url': search_params.url, 'user_agent': self.browser_info['user_agent'], 'response_code': 0, + 'timestamp': datetime.now(timezone.utc).replace(tzinfo=None).isoformat() } try: @@ -76,7 +78,7 @@ def send_request(self, search_params: SearchParams, ai_expand: bool = False) -> response_output['response_code'] = 200 # Expand AI overview if requested - if ai_expand: + if search_params.ai_expand: expanded_html = self.expand_ai_overview() if expanded_html: len_diff = len(expanded_html) - len(response_output['html']) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 8566d17..72c791f 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -1,8 +1,10 @@ from . import parsers from . import utils from . import logger + from .search_methods.selenium_searcher import SeleniumDriver from .search_methods.requests_searcher import RequestsSearcher + from .models.configs import LogConfig, SeleniumConfig, RequestsConfig, SearchConfig, SearchMethod from .models.searches import SearchParams from .models.data import BaseSERP @@ -30,8 +32,9 @@ def __init__(self, log_config (Union[dict, LogConfig], optional): Common search configuration. Defaults to None. selenium_config (Union[dict, SeleniumConfig], optional): Selenium-specific configuration. Defaults to None. requests_config (Union[dict, RequestsConfig], optional): Requests-specific configuration. Defaults to None. + crawl_id (str, optional): A unique identifier for the crawl. Defaults to ''. 
""" - + # Initialize configuration self.method = method.value if isinstance(method, SearchMethod) else method self.config = SearchConfig.create({ @@ -40,12 +43,14 @@ def __init__(self, "selenium": SeleniumConfig.create(selenium_config), "requests": RequestsConfig.create(requests_config), }) + + # Initialize session data self.session_data = { "method": self.config.method.value, "version": WS_VERSION, "crawl_id": crawl_id, } - + # Set a log file, prints to console by default self.log = logger.Logger( console=True if not self.config.log.fp else False, @@ -55,6 +60,12 @@ def __init__(self, file_level=self.config.log.level, ).start(__name__) + if self.config.method == SearchMethod.SELENIUM: + self.searcher = SeleniumDriver(config=self.config.selenium, logger=self.log) + self.searcher.init_driver() + elif self.config.method == SearchMethod.REQUESTS: + self.searcher = RequestsSearcher(config=self.config.requests, logger=self.log) + # Initialize search params and output self.search_params = SearchParams.create() self.parsed = {'results': [], 'features': {}} @@ -73,28 +84,23 @@ def search(self, Args: qry (str): The search query location (str, optional): A location's Canonical Name + lang (str, optional): A language code (e.g., 'en') num_results (int, optional): The number of results to return ai_expand: (bool, optional): Whether to use selenium to expand AI overviews - crawl_id (str, optional): An identifier for this crawl + headers (Dict[str, str], optional): Custom headers to include in the request """ + self.log.warning('starting search config') self.search_params = SearchParams.create({ 'qry': str(qry), 'loc': str(location) if not pd.isnull(location) else '', 'lang': str(lang) if not pd.isnull(lang) else '', 'num_results': num_results, + 'ai_expand': ai_expand, + 'headers': headers, }) - if self.config.method == SearchMethod.SELENIUM: - self.selenium_driver = SeleniumDriver(config=self.config.selenium, logger=self.log) - self.selenium_driver.init_driver() - self.response_output = self.selenium_driver.send_request(self.search_params, ai_expand=ai_expand) - - elif self.config.method == SearchMethod.REQUESTS: - self.config.requests.update_headers(headers) - self.requests_searcher = RequestsSearcher(config=self.config.requests, logger=self.log) - self.response_output = self.requests_searcher.send_request(self.search_params) - + self.response_output = self.searcher.send_request(self.search_params) serp_output = self.search_params.to_serp_output() serp_output.update(self.session_data) serp_output.update(self.response_output) From 51ee3b2e7809c2275d1df5e587b302d0968c8e6f Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 1 Apr 2025 07:49:56 -0700 Subject: [PATCH 051/101] update: poetry lock --- poetry.lock | 214 +++++----------------------------------------------- 1 file changed, 20 insertions(+), 194 deletions(-) diff --git a/poetry.lock b/poetry.lock index 9ca06df..db4733a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. 
[[package]] name = "annotated-types" @@ -7,7 +7,6 @@ description = "Reusable constraint types to use with typing.Annotated" optional = false python-versions = ">=3.8" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"}, {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, @@ -20,7 +19,7 @@ description = "Disable App Nap on macOS >= 10.9" optional = false python-versions = ">=3.6" groups = ["dev"] -markers = "python_version <= \"3.11\" and platform_system == \"Darwin\" or python_version >= \"3.12\" and platform_system == \"Darwin\"" +markers = "platform_system == \"Darwin\"" files = [ {file = "appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c"}, {file = "appnope-0.1.4.tar.gz", hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee"}, @@ -33,7 +32,6 @@ description = "Annotate AST trees with source code positions" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2"}, {file = "asttokens-3.0.0.tar.gz", hash = "sha256:0dcd8baa8d62b0c1d118b399b2ddba3c4aff271d0d7a9e0d4c1681c79035bbc7"}, @@ -50,19 +48,18 @@ description = "Classes Without Boilerplate" optional = false python-versions = ">=3.8" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "attrs-25.1.0-py3-none-any.whl", hash = "sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a"}, {file = "attrs-25.1.0.tar.gz", hash = "sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e"}, ] [package.extras] -benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +benchmark = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"] +cov = ["cloudpickle ; platform_python_implementation == \"CPython\"", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"] +dev = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", 
"pytest-xdist[psutil]"] docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"] -tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] +tests = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"] +tests-mypy = ["mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\""] [[package]] name = "beautifulsoup4" @@ -71,7 +68,6 @@ description = "Screen-scraping library" optional = false python-versions = ">=3.7.0" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "beautifulsoup4-4.13.1-py3-none-any.whl", hash = "sha256:72465267014897bb10ca749bb632bde6c2d20f3254afd5458544bd74e6c2e6d8"}, {file = "beautifulsoup4-4.13.1.tar.gz", hash = "sha256:741c8b6903a1e4ae8ba32b9c9ae7510dab7a197fdbadcf9fcdeb0891ef5ec66a"}, @@ -95,7 +91,6 @@ description = "Python bindings for the Brotli compression library" optional = false python-versions = "*" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "Brotli-1.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e1140c64812cb9b06c922e77f1c26a75ec5e3f0fb2bf92cc8c58720dec276752"}, {file = "Brotli-1.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c8fd5270e906eef71d4a8d19b7c6a43760c6abcfcc10c9101d14eb2357418de9"}, @@ -231,7 +226,6 @@ description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe"}, {file = "certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651"}, @@ -313,7 +307,7 @@ files = [ {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"}, {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"}, ] -markers = {main = "python_version <= \"3.11\" and os_name == \"nt\" and implementation_name != \"pypy\" or python_version >= \"3.12\" and os_name == \"nt\" and implementation_name != \"pypy\"", dev = "python_version <= \"3.11\" and implementation_name == \"pypy\" or python_version >= \"3.12\" and implementation_name == \"pypy\""} +markers = {main = "os_name == \"nt\" and implementation_name != \"pypy\"", dev = "implementation_name == \"pypy\""} [package.dependencies] pycparser = "*" @@ -325,7 +319,6 @@ description = "The Real First Universal Charset Detector. 
Open, modern and activ optional = false python-versions = ">=3.7" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "charset_normalizer-3.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:91b36a978b5ae0ee86c394f5a54d6ef44db1de0815eb43de826d41d21e4af3de"}, {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7461baadb4dc00fd9e0acbe254e3d7d2112e7f92ced2adc96e54ef6501c5f176"}, @@ -428,7 +421,6 @@ description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"}, {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"}, @@ -444,7 +436,7 @@ description = "Cross-platform colored terminal text." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" groups = ["dev"] -markers = "python_version <= \"3.11\" and sys_platform == \"win32\" or python_version <= \"3.11\" and platform_system == \"Windows\" or python_version >= \"3.12\" and sys_platform == \"win32\" or python_version >= \"3.12\" and platform_system == \"Windows\"" +markers = "sys_platform == \"win32\" or platform_system == \"Windows\"" files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, @@ -457,7 +449,6 @@ description = "Jupyter Python Comm implementation, for usage in ipykernel, xeus- optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "comm-0.2.2-py3-none-any.whl", hash = "sha256:e6fb86cb70ff661ee8c9c14e7d36d6de3b4066f1441be4063df9c5009f0a64d3"}, {file = "comm-0.2.2.tar.gz", hash = "sha256:3fd7a84065306e07bea1773df6eb8282de51ba82f77c72f9c85716ab11fe980e"}, @@ -476,7 +467,6 @@ description = "An implementation of the Debug Adapter Protocol for Python" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "debugpy-1.8.12-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:a2ba7ffe58efeae5b8fad1165357edfe01464f9aef25e814e891ec690e7dd82a"}, {file = "debugpy-1.8.12-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cbbd4149c4fc5e7d508ece083e78c17442ee13b0e69bfa6bd63003e486770f45"}, @@ -513,7 +503,6 @@ description = "Decorators for Humans" optional = false python-versions = ">=3.5" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, @@ -542,14 +531,13 @@ description = "Get the currently executing AST node of a frame, and other inform optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "executing-2.2.0-py2.py3-none-any.whl", hash = 
"sha256:11387150cad388d62750327a53d3339fad4888b39a6fe233c3afbb54ecffd3aa"}, {file = "executing-2.2.0.tar.gz", hash = "sha256:5d108c028108fe2551d1a7b2e8b713341e2cb4fc0aa7dcf966fa4327a5226755"}, ] [package.extras] -tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"] +tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich ; python_version >= \"3.11\""] [[package]] name = "filelock" @@ -558,7 +546,6 @@ description = "A platform independent file lock." optional = false python-versions = ">=3.9" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "filelock-3.17.0-py3-none-any.whl", hash = "sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338"}, {file = "filelock-3.17.0.tar.gz", hash = "sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e"}, @@ -567,7 +554,7 @@ files = [ [package.extras] docs = ["furo (>=2024.8.6)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"] testing = ["covdefaults (>=2.3)", "coverage (>=7.6.10)", "diff-cover (>=9.2.1)", "pytest (>=8.3.4)", "pytest-asyncio (>=0.25.2)", "pytest-cov (>=6)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.28.1)"] -typing = ["typing-extensions (>=4.12.2)"] +typing = ["typing-extensions (>=4.12.2) ; python_version < \"3.11\""] [[package]] name = "h11" @@ -576,7 +563,6 @@ description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" optional = false python-versions = ">=3.7" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, @@ -589,7 +575,6 @@ description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.6" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, @@ -598,31 +583,6 @@ files = [ [package.extras] all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] -[[package]] -name = "importlib-metadata" -version = "8.6.1" -description = "Read metadata from Python packages" -optional = false -python-versions = ">=3.9" -groups = ["dev"] -markers = "python_version < \"3.10\"" -files = [ - {file = "importlib_metadata-8.6.1-py3-none-any.whl", hash = "sha256:02a89390c1e15fdfdc0d7c6b25cb3e62650d0494005c97d6f148bf5b9787525e"}, - {file = "importlib_metadata-8.6.1.tar.gz", hash = "sha256:310b41d755445d74569f993ccfc22838295d9fe005425094fad953d7f15c8580"}, -] - -[package.dependencies] -zipp = ">=3.20" - -[package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] -cover = ["pytest-cov"] -doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -enabler = ["pytest-enabler (>=2.2)"] -perf = ["ipython"] -test = ["flufl.flake8", "importlib_resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] -type = ["pytest-mypy"] - [[package]] name = 
"iniconfig" version = "2.0.0" @@ -630,7 +590,6 @@ description = "brain-dead simple config-ini parsing" optional = false python-versions = ">=3.7" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, @@ -643,7 +602,6 @@ description = "IPython Kernel for Jupyter" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "ipykernel-6.29.5-py3-none-any.whl", hash = "sha256:afdb66ba5aa354b09b91379bac28ae4afebbb30e8b39510c9690afb7a10421b5"}, {file = "ipykernel-6.29.5.tar.gz", hash = "sha256:f093a22c4a40f8828f8e330a9c297cb93dcab13bd9678ded6de8e5cf81c56215"}, @@ -678,7 +636,6 @@ description = "IPython: Productive Interactive Computing" optional = false python-versions = ">=3.9" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "ipython-8.18.1-py3-none-any.whl", hash = "sha256:e8267419d72d81955ec1177f8a29aaa90ac80ad647499201119e2f05e99aa397"}, {file = "ipython-8.18.1.tar.gz", hash = "sha256:ca6f079bb33457c66e233e4580ebfc4128855b4cf6370dddd73842a9563e8a27"}, @@ -695,7 +652,6 @@ prompt-toolkit = ">=3.0.41,<3.1.0" pygments = ">=2.4.0" stack-data = "*" traitlets = ">=5" -typing-extensions = {version = "*", markers = "python_version < \"3.10\""} [package.extras] all = ["black", "curio", "docrepr", "exceptiongroup", "ipykernel", "ipyparallel", "ipywidgets", "matplotlib", "matplotlib (!=3.2.0)", "nbconvert", "nbformat", "notebook", "numpy (>=1.22)", "pandas", "pickleshare", "pytest (<7)", "pytest (<7.1)", "pytest-asyncio (<0.22)", "qtconsole", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "stack-data", "testpath", "trio", "typing-extensions"] @@ -717,7 +673,6 @@ description = "An autocompletion tool for Python that can be used for text edito optional = false python-versions = ">=3.6" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9"}, {file = "jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0"}, @@ -738,14 +693,12 @@ description = "Jupyter protocol implementation and client libraries" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "jupyter_client-8.6.3-py3-none-any.whl", hash = "sha256:e8a19cc986cc45905ac3362915f410f3af85424b4c0905e94fa5f2cb08e8f23f"}, {file = "jupyter_client-8.6.3.tar.gz", hash = "sha256:35b3a0947c4a6e9d589eb97d7d4cd5e90f910ee73101611f01283732bd6d9419"}, ] [package.dependencies] -importlib-metadata = {version = ">=4.8.3", markers = "python_version < \"3.10\""} jupyter-core = ">=4.12,<5.0.dev0 || >=5.1.dev0" python-dateutil = ">=2.8.2" pyzmq = ">=23.0" @@ -754,7 +707,7 @@ traitlets = ">=5.3" [package.extras] docs = ["ipykernel", "myst-parser", "pydata-sphinx-theme", "sphinx (>=4)", "sphinx-autodoc-typehints", "sphinxcontrib-github-alt", "sphinxcontrib-spelling"] -test = ["coverage", "ipykernel (>=6.14)", "mypy", "paramiko", "pre-commit", "pytest (<8.2.0)", "pytest-cov", "pytest-jupyter[client] (>=0.4.1)", "pytest-timeout"] +test = 
["coverage", "ipykernel (>=6.14)", "mypy", "paramiko ; sys_platform == \"win32\"", "pre-commit", "pytest (<8.2.0)", "pytest-cov", "pytest-jupyter[client] (>=0.4.1)", "pytest-timeout"] [[package]] name = "jupyter-core" @@ -763,7 +716,6 @@ description = "Jupyter core package. A base package on which Jupyter projects re optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "jupyter_core-5.7.2-py3-none-any.whl", hash = "sha256:4f7315d2f6b4bcf2e3e7cb6e46772eba760ae459cd1f59d29eb57b0a01bd7409"}, {file = "jupyter_core-5.7.2.tar.gz", hash = "sha256:aa5f8d32bbf6b431ac830496da7392035d6f61b4f54872f15c4bd2a9c3f536d9"}, @@ -785,7 +737,6 @@ description = "Powerful and Pythonic XML processing library combining libxml2/li optional = false python-versions = ">=3.6" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "lxml-5.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:dd36439be765e2dde7660212b5275641edbc813e7b24668831a5c8ac91180656"}, {file = "lxml-5.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ae5fe5c4b525aa82b8076c1a59d642c17b6e8739ecf852522c6321852178119d"}, @@ -941,7 +892,6 @@ description = "Python port of markdown-it. Markdown parsing, done right!" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, @@ -967,7 +917,6 @@ description = "Inline Matplotlib backend for Jupyter" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca"}, {file = "matplotlib_inline-0.1.7.tar.gz", hash = "sha256:8423b23ec666be3d16e16b60bdd8ac4e86e840ebd1dd11a30b9f117f2fa0ab90"}, @@ -983,7 +932,6 @@ description = "Markdown URL utilities" optional = false python-versions = ">=3.7" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, @@ -996,68 +944,11 @@ description = "Patch asyncio to allow nested event loops" optional = false python-versions = ">=3.5" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"}, {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"}, ] -[[package]] -name = "numpy" -version = "2.0.2" -description = "Fundamental package for array computing in Python" -optional = false -python-versions = ">=3.9" -groups = ["main"] -markers = "python_version < \"3.11\"" -files = [ - {file = "numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece"}, - {file = "numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04"}, - {file = "numpy-2.0.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66"}, - {file = "numpy-2.0.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b"}, - {file = "numpy-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd"}, - {file = "numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318"}, - {file = "numpy-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8"}, - {file = "numpy-2.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326"}, - {file = "numpy-2.0.2-cp310-cp310-win32.whl", hash = "sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97"}, - {file = "numpy-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131"}, - {file = "numpy-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448"}, - {file = "numpy-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195"}, - {file = "numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57"}, - {file = "numpy-2.0.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a"}, - {file = "numpy-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669"}, - {file = "numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951"}, - {file = "numpy-2.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9"}, - {file = "numpy-2.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15"}, - {file = "numpy-2.0.2-cp311-cp311-win32.whl", hash = "sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4"}, - {file = "numpy-2.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc"}, - {file = "numpy-2.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b"}, - {file = "numpy-2.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e"}, - {file = "numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c"}, - {file = "numpy-2.0.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:a842d573724391493a97a62ebbb8e731f8a5dcc5d285dfc99141ca15a3302d0c"}, - {file = "numpy-2.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05e238064fc0610c840d1cf6a13bf63d7e391717d247f1bf0318172e759e692"}, - {file = "numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a"}, - {file = "numpy-2.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:96a55f64139912d61de9137f11bf39a55ec8faec288c75a54f93dfd39f7eb40c"}, - {file = "numpy-2.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec9852fb39354b5a45a80bdab5ac02dd02b15f44b3804e9f00c556bf24b4bded"}, - {file = "numpy-2.0.2-cp312-cp312-win32.whl", hash = "sha256:671bec6496f83202ed2d3c8fdc486a8fc86942f2e69ff0e986140339a63bcbe5"}, - {file = "numpy-2.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:cfd41e13fdc257aa5778496b8caa5e856dc4896d4ccf01841daee1d96465467a"}, - {file = "numpy-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9059e10581ce4093f735ed23f3b9d283b9d517ff46009ddd485f1747eb22653c"}, - {file = "numpy-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:423e89b23490805d2a5a96fe40ec507407b8ee786d66f7328be214f9679df6dd"}, - {file = "numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2b2955fa6f11907cf7a70dab0d0755159bca87755e831e47932367fc8f2f2d0b"}, - {file = "numpy-2.0.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:97032a27bd9d8988b9a97a8c4d2c9f2c15a81f61e2f21404d7e8ef00cb5be729"}, - {file = "numpy-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e795a8be3ddbac43274f18588329c72939870a16cae810c2b73461c40718ab1"}, - {file = "numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b258c385842546006213344c50655ff1555a9338e2e5e02a0756dc3e803dd"}, - {file = "numpy-2.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5fec9451a7789926bcf7c2b8d187292c9f93ea30284802a0ab3f5be8ab36865d"}, - {file = "numpy-2.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9189427407d88ff25ecf8f12469d4d39d35bee1db5d39fc5c168c6f088a6956d"}, - {file = "numpy-2.0.2-cp39-cp39-win32.whl", hash = "sha256:905d16e0c60200656500c95b6b8dca5d109e23cb24abc701d41c02d74c6b3afa"}, - {file = "numpy-2.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:a3f4ab0caa7f053f6797fcd4e1e25caee367db3112ef2b6ef82d749530768c73"}, - {file = "numpy-2.0.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7f0a0c6f12e07fa94133c8a67404322845220c06a9e80e85999afe727f7438b8"}, - {file = "numpy-2.0.2-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:312950fdd060354350ed123c0e25a71327d3711584beaef30cdaa93320c392d4"}, - {file = "numpy-2.0.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26df23238872200f63518dd2aa984cfca675d82469535dc7162dc2ee52d9dd5c"}, - {file = "numpy-2.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a46288ec55ebbd58947d31d72be2c63cbf839f0a63b49cb755022310792a3385"}, - {file = "numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78"}, -] - [[package]] name = "numpy" version = "2.2.2" @@ -1065,7 +956,6 @@ description = "Fundamental package for array computing in Python" optional = false python-versions = ">=3.10" groups = ["main"] -markers = "python_version == \"3.11\" or python_version >= \"3.12\"" files = [ {file = "numpy-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7079129b64cb78bdc8d611d1fd7e8002c0a2565da6a47c4df8062349fee90e3e"}, {file = "numpy-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2ec6c689c61df613b783aeb21f945c4cbe6c51c28cb70aae8430577ab39f163e"}, @@ -1131,7 +1021,6 @@ description = "Capture the outcome of Python function calls." 
optional = false python-versions = ">=3.7" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "outcome-1.3.0.post0-py2.py3-none-any.whl", hash = "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b"}, {file = "outcome-1.3.0.post0.tar.gz", hash = "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8"}, @@ -1147,7 +1036,6 @@ description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, @@ -1160,7 +1048,6 @@ description = "Powerful data structures for data analysis, time series, and stat optional = false python-versions = ">=3.9" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5"}, {file = "pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348"}, @@ -1248,7 +1135,6 @@ description = "A Python Parser" optional = false python-versions = ">=3.6" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18"}, {file = "parso-0.8.4.tar.gz", hash = "sha256:eb3a7b58240fb99099a345571deecc0f9540ea5f4dd2fe14c2a99d6b281ab92d"}, @@ -1265,7 +1151,7 @@ description = "Pexpect allows easy control of interactive console applications." 
optional = false python-versions = "*" groups = ["dev"] -markers = "python_version <= \"3.11\" and sys_platform != \"win32\" or python_version >= \"3.12\" and sys_platform != \"win32\"" +markers = "sys_platform != \"win32\"" files = [ {file = "pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523"}, {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"}, @@ -1281,7 +1167,6 @@ description = "A small Python package for determining appropriate platform-speci optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"}, {file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"}, @@ -1299,7 +1184,6 @@ description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, @@ -1316,7 +1200,6 @@ description = "Library for building powerful interactive command lines in Python optional = false python-versions = ">=3.8.0" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "prompt_toolkit-3.0.50-py3-none-any.whl", hash = "sha256:9b6427eb19e479d98acff65196a307c555eb567989e6d88ebbb1b509d9779198"}, {file = "prompt_toolkit-3.0.50.tar.gz", hash = "sha256:544748f3860a2623ca5cd6d2795e7a14f3d0e1c3c9728359013f79877fc89bab"}, @@ -1332,7 +1215,6 @@ description = "" optional = false python-versions = ">=3.9" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "protobuf-6.30.0-cp310-abi3-win32.whl", hash = "sha256:7337d76d8efe65ee09ee566b47b5914c517190196f414e5418fa236dfd1aed3e"}, {file = "protobuf-6.30.0-cp310-abi3-win_amd64.whl", hash = "sha256:9b33d51cc95a7ec4f407004c8b744330b6911a37a782e2629c67e1e8ac41318f"}, @@ -1352,7 +1234,6 @@ description = "Cross-platform lib for process and system monitoring in Python." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "psutil-6.1.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:9ccc4316f24409159897799b83004cb1e24f9819b0dcf9c0b68bdcb6cefee6a8"}, {file = "psutil-6.1.1-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:ca9609c77ea3b8481ab005da74ed894035936223422dc591d6772b147421f777"}, @@ -1384,7 +1265,7 @@ description = "Run a subprocess in a pseudo terminal" optional = false python-versions = "*" groups = ["dev"] -markers = "python_version <= \"3.11\" and sys_platform != \"win32\" or python_version >= \"3.12\" and sys_platform != \"win32\"" +markers = "sys_platform != \"win32\"" files = [ {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, @@ -1397,7 +1278,6 @@ description = "Safely evaluate AST nodes without side effects" optional = false python-versions = "*" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0"}, {file = "pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42"}, @@ -1417,7 +1297,7 @@ files = [ {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, ] -markers = {main = "python_version <= \"3.11\" and os_name == \"nt\" and implementation_name != \"pypy\" or python_version >= \"3.12\" and os_name == \"nt\" and implementation_name != \"pypy\"", dev = "python_version <= \"3.11\" and implementation_name == \"pypy\" or python_version >= \"3.12\" and implementation_name == \"pypy\""} +markers = {main = "os_name == \"nt\" and implementation_name != \"pypy\"", dev = "implementation_name == \"pypy\""} [[package]] name = "pydantic" @@ -1426,7 +1306,6 @@ description = "Data validation using Python type hints" optional = false python-versions = ">=3.8" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pydantic-2.10.6-py3-none-any.whl", hash = "sha256:427d664bf0b8a2b34ff5dd0f5a18df00591adcee7198fbd71981054cef37b584"}, {file = "pydantic-2.10.6.tar.gz", hash = "sha256:ca5daa827cce33de7a42be142548b0096bf05a7e7b365aebfa5f8eeec7128236"}, @@ -1439,7 +1318,7 @@ typing-extensions = ">=4.12.2" [package.extras] email = ["email-validator (>=2.0.0)"] -timezone = ["tzdata"] +timezone = ["tzdata ; python_version >= \"3.9\" and platform_system == \"Windows\""] [[package]] name = "pydantic-core" @@ -1448,7 +1327,6 @@ description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = ">=3.8" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pydantic_core-2.27.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2d367ca20b2f14095a8f4fa1210f5a7b78b8a20009ecced6b12818f455b1e9fa"}, {file = "pydantic_core-2.27.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:491a2b73db93fab69731eaee494f320faa4e093dbed776be1a829c2eb222c34c"}, @@ -1562,7 +1440,6 @@ description = "Pygments 
is a syntax highlighting package written in Python." optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"}, {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"}, @@ -1578,7 +1455,6 @@ description = "A Python SOCKS client module. See https://github.com/Anorov/PySoc optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "PySocks-1.7.1-py27-none-any.whl", hash = "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299"}, {file = "PySocks-1.7.1-py3-none-any.whl", hash = "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5"}, @@ -1592,7 +1468,6 @@ description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6"}, {file = "pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761"}, @@ -1616,7 +1491,6 @@ description = "Extensions to the standard Python datetime module" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" groups = ["main", "dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, @@ -1632,7 +1506,6 @@ description = "World timezone definitions, modern and historical" optional = false python-versions = "*" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pytz-2025.1-py2.py3-none-any.whl", hash = "sha256:89dd22dca55b46eac6eda23b2d72721bf1bdfef212645d81513ef5d03038de57"}, {file = "pytz-2025.1.tar.gz", hash = "sha256:c2db42be2a2518b28e65f9207c4d05e6ff547d1efa4086469ef855e4ab70178e"}, @@ -1645,7 +1518,7 @@ description = "Python for Window Extensions" optional = false python-versions = "*" groups = ["dev"] -markers = "python_version <= \"3.11\" and sys_platform == \"win32\" and platform_python_implementation != \"PyPy\" or python_version >= \"3.12\" and sys_platform == \"win32\" and platform_python_implementation != \"PyPy\"" +markers = "sys_platform == \"win32\" and platform_python_implementation != \"PyPy\"" files = [ {file = "pywin32-308-cp310-cp310-win32.whl", hash = "sha256:796ff4426437896550d2981b9c2ac0ffd75238ad9ea2d3bfa67a1abd546d262e"}, {file = "pywin32-308-cp310-cp310-win_amd64.whl", hash = "sha256:4fc888c59b3c0bef905ce7eb7e2106a07712015ea1c8234b703a088d46110e8e"}, @@ -1674,7 +1547,6 @@ description = "Python bindings for 0MQ" optional = false python-versions = ">=3.7" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pyzmq-26.2.1-cp310-cp310-macosx_10_15_universal2.whl", hash = "sha256:f39d1227e8256d19899d953e6e19ed2ccb689102e6d85e024da5acf410f301eb"}, {file = "pyzmq-26.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:a23948554c692df95daed595fdd3b76b420a4939d7a8a28d6d7dea9711878641"}, @@ -1797,7 +1669,6 @@ description = "Python HTTP for Humans." optional = false python-versions = ">=3.8" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, @@ -1820,7 +1691,6 @@ description = "File transport adapter for Requests" optional = false python-versions = "*" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "requests_file-2.1.0-py2.py3-none-any.whl", hash = "sha256:cf270de5a4c5874e84599fc5778303d496c10ae5e870bfa378818f35d21bda5c"}, {file = "requests_file-2.1.0.tar.gz", hash = "sha256:0f549a3f3b0699415ac04d167e9cb39bccfb730cb832b4d20be3d9867356e658"}, @@ -1836,7 +1706,6 @@ description = "Render rich text, tables, progress bars, syntax highlighting, mar optional = false python-versions = ">=3.8.0" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"}, {file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"}, @@ -1857,7 +1726,6 @@ description = "Official Python bindings for Selenium WebDriver" optional = false python-versions = ">=3.9" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "selenium-4.29.0-py3-none-any.whl", hash = "sha256:ce5d26f1ddc1111641113653af33694c13947dd36c2df09cdd33f554351d372e"}, {file = "selenium-4.29.0.tar.gz", hash = "sha256:3a62f7ec33e669364a6c0562a701deb69745b569c50d55f1a912bf8eb33358ba"}, @@ -1878,7 +1746,6 @@ description = "Tool to Detect Surrounding Shell" optional = false python-versions = ">=3.7" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686"}, {file = "shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de"}, @@ -1891,7 +1758,6 @@ description = "Python 2 and 3 compatibility utilities" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" groups = ["main", "dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"}, {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, @@ -1904,7 +1770,6 @@ description = "Sniff out which async library your code is running under" optional = false python-versions = ">=3.7" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, @@ -1917,7 +1782,6 @@ description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" optional = false python-versions = "*" groups = ["main"] -markers = "python_version <= 
\"3.11\" or python_version >= \"3.12\"" files = [ {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, @@ -1930,7 +1794,6 @@ description = "A modern CSS selector implementation for Beautiful Soup." optional = false python-versions = ">=3.8" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9"}, {file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"}, @@ -1943,7 +1806,6 @@ description = "Extract data from python stack frames and tracebacks for informat optional = false python-versions = "*" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695"}, {file = "stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9"}, @@ -1964,7 +1826,6 @@ description = "Pytest Snapshot Test Utility" optional = false python-versions = ">=3.8.1" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "syrupy-4.8.1-py3-none-any.whl", hash = "sha256:274f97cbaf44175f5e478a2f3a53559d31f41c66c6bf28131695f94ac893ea00"}, {file = "syrupy-4.8.1.tar.gz", hash = "sha256:8da8c0311e6d92de0b15767768c6ab98982b7b4a4c67083c08fbac3fbad4d44c"}, @@ -1980,7 +1841,6 @@ description = "Accurately separates a URL's subdomain, domain, and public suffix optional = false python-versions = ">=3.9" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "tldextract-5.1.3-py3-none-any.whl", hash = "sha256:78de310cc2ca018692de5ddf320f9d6bd7c5cf857d0fd4f2175f0cdf4440ea75"}, {file = "tldextract-5.1.3.tar.gz", hash = "sha256:d43c7284c23f5dc8a42fd0fee2abede2ff74cc622674e4cb07f514ab3330c338"}, @@ -2046,7 +1906,6 @@ description = "Tornado is a Python web framework and asynchronous networking lib optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "tornado-6.4.2-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:e828cce1123e9e44ae2a50a9de3055497ab1d0aeb440c5ac23064d9e44880da1"}, {file = "tornado-6.4.2-cp38-abi3-macosx_10_9_x86_64.whl", hash = "sha256:072ce12ada169c5b00b7d92a99ba089447ccc993ea2143c9ede887e0937aa803"}, @@ -2068,7 +1927,6 @@ description = "Traitlets Python configuration system" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f"}, {file = "traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7"}, @@ -2085,7 +1943,6 @@ description = "A friendly Python library for async concurrency and I/O" optional = false python-versions = ">=3.9" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "trio-0.29.0-py3-none-any.whl", hash = 
"sha256:d8c463f1a9cc776ff63e331aba44c125f423a5a13c684307e828d930e625ba66"}, {file = "trio-0.29.0.tar.gz", hash = "sha256:ea0d3967159fc130acb6939a0be0e558e364fee26b5deeecc893a6b08c361bdf"}, @@ -2107,7 +1964,6 @@ description = "WebSocket library for Trio" optional = false python-versions = ">=3.8" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "trio_websocket-0.12.2-py3-none-any.whl", hash = "sha256:df605665f1db533f4a386c94525870851096a223adcb97f72a07e8b4beba45b6"}, {file = "trio_websocket-0.12.2.tar.gz", hash = "sha256:22c72c436f3d1e264d0910a3951934798dcc5b00ae56fc4ee079d46c7cf20fae"}, @@ -2126,7 +1982,6 @@ description = "Typer, build great CLIs. Easy to code. Based on Python type hints optional = false python-versions = ">=3.7" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "typer-0.15.2-py3-none-any.whl", hash = "sha256:46a499c6107d645a9c13f7ee46c5d5096cae6f5fc57dd11eccbbb9ae3e44ddfc"}, {file = "typer-0.15.2.tar.gz", hash = "sha256:ab2fab47533a813c49fe1f16b1a370fd5819099c00b119e0633df65f22144ba5"}, @@ -2145,7 +2000,6 @@ description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" groups = ["main", "dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, @@ -2158,7 +2012,6 @@ description = "Provider of IANA time zone data" optional = false python-versions = ">=2" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "tzdata-2025.1-py2.py3-none-any.whl", hash = "sha256:7e127113816800496f027041c570f50bcd464a020098a3b6b199517772303639"}, {file = "tzdata-2025.1.tar.gz", hash = "sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694"}, @@ -2171,7 +2024,6 @@ description = "('Selenium.webdriver.Chrome replacement with compatiblity for Bra optional = false python-versions = "*" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "undetected-chromedriver-3.5.5.tar.gz", hash = "sha256:9f945e1435005247abe17de316bcfda85b284a4177fd5f25167c78ced33b65ec"}, ] @@ -2188,7 +2040,6 @@ description = "HTTP library with thread-safe connection pooling, file post, and optional = false python-versions = ">=3.9" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df"}, {file = "urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d"}, @@ -2198,7 +2049,7 @@ files = [ pysocks = {version = ">=1.5.6,<1.5.7 || >1.5.7,<2.0", optional = true, markers = "extra == \"socks\""} [package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +brotli = ["brotli (>=1.0.9) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\""] h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] @@ -2210,7 +2061,6 @@ description = "Measures the displayed width of unicode strings in a terminal" optional = false python-versions = "*" groups = ["dev"] 
-markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859"}, {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, @@ -2223,7 +2073,6 @@ description = "WebSocket client for Python with low level API options" optional = false python-versions = ">=3.8" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "websocket_client-1.8.0-py3-none-any.whl", hash = "sha256:17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526"}, {file = "websocket_client-1.8.0.tar.gz", hash = "sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da"}, @@ -2241,7 +2090,6 @@ description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" optional = false python-versions = ">=3.9" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "websockets-15.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d63efaa0cd96cf0c5fe4d581521d9fa87744540d4bc999ae6e08595a1014b45b"}, {file = "websockets-15.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ac60e3b188ec7574cb761b08d50fcedf9d77f1530352db4eef1707fe9dee7205"}, @@ -2321,7 +2169,6 @@ description = "WebSockets state-machine based protocol implementation" optional = false python-versions = ">=3.7.0" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "wsproto-1.2.0-py3-none-any.whl", hash = "sha256:b9acddd652b585d75b20477888c56642fdade28bdfd3579aa24a4d2c037dd736"}, {file = "wsproto-1.2.0.tar.gz", hash = "sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065"}, @@ -2330,28 +2177,7 @@ files = [ [package.dependencies] h11 = ">=0.9.0,<1" -[[package]] -name = "zipp" -version = "3.21.0" -description = "Backport of pathlib-compatible object wrapper for zip files" -optional = false -python-versions = ">=3.9" -groups = ["dev"] -markers = "python_version < \"3.10\"" -files = [ - {file = "zipp-3.21.0-py3-none-any.whl", hash = "sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931"}, - {file = "zipp-3.21.0.tar.gz", hash = "sha256:2c9958f6430a2040341a52eb608ed6dd93ef4392e02ffe219417c1b28b5dd1f4"}, -] - -[package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] -cover = ["pytest-cov"] -doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -enabler = ["pytest-enabler (>=2.2)"] -test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] -type = ["pytest-mypy"] - [metadata] lock-version = "2.1" -python-versions = ">=3.9" -content-hash = "1afa3bf7c3d9ce06c3cf91b77da72e8f7bf4d543351120cdfe00bedb1286df6b" +python-versions = ">=3.10" +content-hash = "19e460b385e6e3fb8901153196b5cbdcf0a318c743113b0916abc353115c9a4f" From 6ee5209eab936f071632d042207a7d725376480c Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 1 Apr 2025 10:19:42 -0700 Subject: [PATCH 052/101] update: using orjson for speed, must decode dumps to string --- .../search_methods/selenium_searcher.py | 6 +- WebSearcher/utils.py | 10 ++- poetry.lock | 80 ++++++++++++++++++- pyproject.toml | 1 + 4 files changed, 89 insertions(+), 8 deletions(-) diff --git 
a/WebSearcher/search_methods/selenium_searcher.py b/WebSearcher/search_methods/selenium_searcher.py index 00eb829..1e67025 100644 --- a/WebSearcher/search_methods/selenium_searcher.py +++ b/WebSearcher/search_methods/selenium_searcher.py @@ -1,5 +1,5 @@ import time -import json +import orjson from datetime import datetime, timezone from typing import Dict, Any @@ -42,8 +42,8 @@ def init_driver(self) -> None: 'driver_version': self.driver.capabilities['chrome']['chromedriverVersion'].split(' ')[0], 'user_agent': self.driver.execute_script('return navigator.userAgent'), } - self.browser_info['browser_id'] = utils.hash_id(json.dumps(self.browser_info)) - self.log.debug(json.dumps(self.browser_info, indent=4)) + self.browser_info['browser_id'] = utils.hash_id(orjson.dumps(self.browser_info).decode('utf-8')) + self.log.debug(orjson.dumps(self.browser_info, option=orjson.OPT_INDENT_2)) def send_typed_query(self, query: str): """Send a typed query to the search box""" diff --git a/WebSearcher/utils.py b/WebSearcher/utils.py index a12a270..dec20e9 100644 --- a/WebSearcher/utils.py +++ b/WebSearcher/utils.py @@ -1,10 +1,9 @@ import re import os -import json +import orjson import random import hashlib import itertools -from timeit import default_timer from string import ascii_letters, digits # Files ------------------------------------------------------------------------ @@ -24,7 +23,7 @@ def read_lines(fp): with open(fp, 'r') as infile: if is_json: - return [json.loads(line) for line in infile] + return [orjson.loads(line) for line in infile] else: return [line.strip() for line in infile] @@ -38,7 +37,10 @@ def write_lines(iter_data, fp, overwrite=False): with open(fp, mode) as outfile: for data in iter_data: - line_output = json.dumps(data) if is_json else data + if is_json: + line_output = orjson.dumps(data).decode('utf-8') + else: + line_output = data outfile.write(f"{line_output}\n") diff --git a/poetry.lock b/poetry.lock index db4733a..34ccb6a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1014,6 +1014,84 @@ files = [ {file = "numpy-2.2.2.tar.gz", hash = "sha256:ed6906f61834d687738d25988ae117683705636936cc605be0bb208b23df4d8f"}, ] +[[package]] +name = "orjson" +version = "3.10.16" +description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "orjson-3.10.16-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:4cb473b8e79154fa778fb56d2d73763d977be3dcc140587e07dbc545bbfc38f8"}, + {file = "orjson-3.10.16-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:622a8e85eeec1948690409a19ca1c7d9fd8ff116f4861d261e6ae2094fe59a00"}, + {file = "orjson-3.10.16-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c682d852d0ce77613993dc967e90e151899fe2d8e71c20e9be164080f468e370"}, + {file = "orjson-3.10.16-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8c520ae736acd2e32df193bcff73491e64c936f3e44a2916b548da048a48b46b"}, + {file = "orjson-3.10.16-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:134f87c76bfae00f2094d85cfab261b289b76d78c6da8a7a3b3c09d362fd1e06"}, + {file = "orjson-3.10.16-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b59afde79563e2cf37cfe62ee3b71c063fd5546c8e662d7fcfc2a3d5031a5c4c"}, + {file = "orjson-3.10.16-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:113602f8241daaff05d6fad25bd481d54c42d8d72ef4c831bb3ab682a54d9e15"}, + {file = "orjson-3.10.16-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4fc0077d101f8fab4031e6554fc17b4c2ad8fdbc56ee64a727f3c95b379e31da"}, + {file = "orjson-3.10.16-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:9c6bf6ff180cd69e93f3f50380224218cfab79953a868ea3908430bcfaf9cb5e"}, + {file = "orjson-3.10.16-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5673eadfa952f95a7cd76418ff189df11b0a9c34b1995dff43a6fdbce5d63bf4"}, + {file = "orjson-3.10.16-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5fe638a423d852b0ae1e1a79895851696cb0d9fa0946fdbfd5da5072d9bb9551"}, + {file = "orjson-3.10.16-cp310-cp310-win32.whl", hash = "sha256:33af58f479b3c6435ab8f8b57999874b4b40c804c7a36b5cc6b54d8f28e1d3dd"}, + {file = "orjson-3.10.16-cp310-cp310-win_amd64.whl", hash = "sha256:0338356b3f56d71293c583350af26f053017071836b07e064e92819ecf1aa055"}, + {file = "orjson-3.10.16-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:44fcbe1a1884f8bc9e2e863168b0f84230c3d634afe41c678637d2728ea8e739"}, + {file = "orjson-3.10.16-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78177bf0a9d0192e0b34c3d78bcff7fe21d1b5d84aeb5ebdfe0dbe637b885225"}, + {file = "orjson-3.10.16-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:12824073a010a754bb27330cad21d6e9b98374f497f391b8707752b96f72e741"}, + {file = "orjson-3.10.16-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ddd41007e56284e9867864aa2f29f3136bb1dd19a49ca43c0b4eda22a579cf53"}, + {file = "orjson-3.10.16-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0877c4d35de639645de83666458ca1f12560d9fa7aa9b25d8bb8f52f61627d14"}, + {file = "orjson-3.10.16-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9a09a539e9cc3beead3e7107093b4ac176d015bec64f811afb5965fce077a03c"}, + {file = "orjson-3.10.16-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31b98bc9b40610fec971d9a4d67bb2ed02eec0a8ae35f8ccd2086320c28526ca"}, + {file = "orjson-3.10.16-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0ce243f5a8739f3a18830bc62dc2e05b69a7545bafd3e3249f86668b2bcd8e50"}, + {file = "orjson-3.10.16-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:64792c0025bae049b3074c6abe0cf06f23c8e9f5a445f4bab31dc5ca23dbf9e1"}, + {file = "orjson-3.10.16-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ea53f7e68eec718b8e17e942f7ca56c6bd43562eb19db3f22d90d75e13f0431d"}, + {file = "orjson-3.10.16-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a741ba1a9488c92227711bde8c8c2b63d7d3816883268c808fbeada00400c164"}, + {file = "orjson-3.10.16-cp311-cp311-win32.whl", hash = "sha256:c7ed2c61bb8226384c3fdf1fb01c51b47b03e3f4536c985078cccc2fd19f1619"}, + {file = "orjson-3.10.16-cp311-cp311-win_amd64.whl", hash = "sha256:cd67d8b3e0e56222a2e7b7f7da9031e30ecd1fe251c023340b9f12caca85ab60"}, + {file = "orjson-3.10.16-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:6d3444abbfa71ba21bb042caa4b062535b122248259fdb9deea567969140abca"}, + {file = "orjson-3.10.16-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:30245c08d818fdcaa48b7d5b81499b8cae09acabb216fe61ca619876b128e184"}, + {file = "orjson-3.10.16-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0ba1d0baa71bf7579a4ccdcf503e6f3098ef9542106a0eca82395898c8a500a"}, + {file = 
"orjson-3.10.16-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eb0beefa5ef3af8845f3a69ff2a4aa62529b5acec1cfe5f8a6b4141033fd46ef"}, + {file = "orjson-3.10.16-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6daa0e1c9bf2e030e93c98394de94506f2a4d12e1e9dadd7c53d5e44d0f9628e"}, + {file = "orjson-3.10.16-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9da9019afb21e02410ef600e56666652b73eb3e4d213a0ec919ff391a7dd52aa"}, + {file = "orjson-3.10.16-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:daeb3a1ee17b69981d3aae30c3b4e786b0f8c9e6c71f2b48f1aef934f63f38f4"}, + {file = "orjson-3.10.16-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80fed80eaf0e20a31942ae5d0728849862446512769692474be5e6b73123a23b"}, + {file = "orjson-3.10.16-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:73390ed838f03764540a7bdc4071fe0123914c2cc02fb6abf35182d5fd1b7a42"}, + {file = "orjson-3.10.16-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:a22bba012a0c94ec02a7768953020ab0d3e2b884760f859176343a36c01adf87"}, + {file = "orjson-3.10.16-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5385bbfdbc90ff5b2635b7e6bebf259652db00a92b5e3c45b616df75b9058e88"}, + {file = "orjson-3.10.16-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:02c6279016346e774dd92625d46c6c40db687b8a0d685aadb91e26e46cc33e1e"}, + {file = "orjson-3.10.16-cp312-cp312-win32.whl", hash = "sha256:7ca55097a11426db80f79378e873a8c51f4dde9ffc22de44850f9696b7eb0e8c"}, + {file = "orjson-3.10.16-cp312-cp312-win_amd64.whl", hash = "sha256:86d127efdd3f9bf5f04809b70faca1e6836556ea3cc46e662b44dab3fe71f3d6"}, + {file = "orjson-3.10.16-cp313-cp313-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:148a97f7de811ba14bc6dbc4a433e0341ffd2cc285065199fb5f6a98013744bd"}, + {file = "orjson-3.10.16-cp313-cp313-macosx_15_0_arm64.whl", hash = "sha256:1d960c1bf0e734ea36d0adc880076de3846aaec45ffad29b78c7f1b7962516b8"}, + {file = "orjson-3.10.16-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a318cd184d1269f68634464b12871386808dc8b7c27de8565234d25975a7a137"}, + {file = "orjson-3.10.16-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:df23f8df3ef9223d1d6748bea63fca55aae7da30a875700809c500a05975522b"}, + {file = "orjson-3.10.16-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b94dda8dd6d1378f1037d7f3f6b21db769ef911c4567cbaa962bb6dc5021cf90"}, + {file = "orjson-3.10.16-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f12970a26666a8775346003fd94347d03ccb98ab8aa063036818381acf5f523e"}, + {file = "orjson-3.10.16-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:15a1431a245d856bd56e4d29ea0023eb4d2c8f71efe914beb3dee8ab3f0cd7fb"}, + {file = "orjson-3.10.16-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c83655cfc247f399a222567d146524674a7b217af7ef8289c0ff53cfe8db09f0"}, + {file = "orjson-3.10.16-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:fa59ae64cb6ddde8f09bdbf7baf933c4cd05734ad84dcf4e43b887eb24e37652"}, + {file = "orjson-3.10.16-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:ca5426e5aacc2e9507d341bc169d8af9c3cbe88f4cd4c1cf2f87e8564730eb56"}, + {file = "orjson-3.10.16-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:6fd5da4edf98a400946cd3a195680de56f1e7575109b9acb9493331047157430"}, + {file = 
"orjson-3.10.16-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:980ecc7a53e567169282a5e0ff078393bac78320d44238da4e246d71a4e0e8f5"}, + {file = "orjson-3.10.16-cp313-cp313-win32.whl", hash = "sha256:28f79944dd006ac540a6465ebd5f8f45dfdf0948ff998eac7a908275b4c1add6"}, + {file = "orjson-3.10.16-cp313-cp313-win_amd64.whl", hash = "sha256:fe0a145e96d51971407cb8ba947e63ead2aa915db59d6631a355f5f2150b56b7"}, + {file = "orjson-3.10.16-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:c35b5c1fb5a5d6d2fea825dec5d3d16bea3c06ac744708a8e1ff41d4ba10cdf1"}, + {file = "orjson-3.10.16-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9aac7ecc86218b4b3048c768f227a9452287001d7548500150bb75ee21bf55d"}, + {file = "orjson-3.10.16-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6e19f5102fff36f923b6dfdb3236ec710b649da975ed57c29833cb910c5a73ab"}, + {file = "orjson-3.10.16-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:17210490408eb62755a334a6f20ed17c39f27b4f45d89a38cd144cd458eba80b"}, + {file = "orjson-3.10.16-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fbbe04451db85916e52a9f720bd89bf41f803cf63b038595674691680cbebd1b"}, + {file = "orjson-3.10.16-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a966eba501a3a1f309f5a6af32ed9eb8f316fa19d9947bac3e6350dc63a6f0a"}, + {file = "orjson-3.10.16-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01e0d22f06c81e6c435723343e1eefc710e0510a35d897856766d475f2a15687"}, + {file = "orjson-3.10.16-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:7c1e602d028ee285dbd300fb9820b342b937df64d5a3336e1618b354e95a2569"}, + {file = "orjson-3.10.16-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:d230e5020666a6725629df81e210dc11c3eae7d52fe909a7157b3875238484f3"}, + {file = "orjson-3.10.16-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:0f8baac07d4555f57d44746a7d80fbe6b2c4fe2ed68136b4abb51cfec512a5e9"}, + {file = "orjson-3.10.16-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:524e48420b90fc66953e91b660b3d05faaf921277d6707e328fde1c218b31250"}, + {file = "orjson-3.10.16-cp39-cp39-win32.whl", hash = "sha256:a9f614e31423d7292dbca966a53b2d775c64528c7d91424ab2747d8ab8ce5c72"}, + {file = "orjson-3.10.16-cp39-cp39-win_amd64.whl", hash = "sha256:c338dc2296d1ed0d5c5c27dfb22d00b330555cb706c2e0be1e1c3940a0895905"}, + {file = "orjson-3.10.16.tar.gz", hash = "sha256:d2aaa5c495e11d17b9b93205f5fa196737ee3202f000aaebf028dc9a73750f10"}, +] + [[package]] name = "outcome" version = "1.3.0.post0" @@ -2180,4 +2258,4 @@ h11 = ">=0.9.0,<1" [metadata] lock-version = "2.1" python-versions = ">=3.10" -content-hash = "19e460b385e6e3fb8901153196b5cbdcf0a318c743113b0916abc353115c9a4f" +content-hash = "684e3794b5ea4541fde5a46b9bf83f67cbeedcecf4cd969dce683ffc3210b382" diff --git a/pyproject.toml b/pyproject.toml index ba8b769..488993d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ dependencies = [ "undetected-chromedriver>=3.5.5", "selenium>=4.9.0", "protobuf (>=6.30.0,<7.0.0)", + "orjson (>=3.10.16,<4.0.0)", ] [project.urls] From 48ae902d10c46d5ee5ffcbdac9b1095431073df7 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 1 Apr 2025 10:23:18 -0700 Subject: [PATCH 053/101] update: archive result collector, ignore archive --- .gitignore | 3 +- WebSearcher/result_collector.py | 99 --------------------------------- 2 files changed, 2 insertions(+), 100 deletions(-) delete mode 100644 
WebSearcher/result_collector.py diff --git a/.gitignore b/.gitignore index 10689ea..c705978 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ -.pytest_cache .venv +.archive build data @@ -9,4 +9,5 @@ notebooks *__pycache__ # Ignore test data +.pytest_cache tests/__snapshots__/* diff --git a/WebSearcher/result_collector.py b/WebSearcher/result_collector.py deleted file mode 100644 index 77041cc..0000000 --- a/WebSearcher/result_collector.py +++ /dev/null @@ -1,99 +0,0 @@ -""" Collect HTML for individual results from a SERP -""" - -import time -import requests -from . import utils -from . import webutils as wu - - -def check_valid_url(result): - """Check if result has url and url is in a valid format""" - if 'url' in result: - return True if result['url'].startswith('http') else False - else: - return False - - -def scrape_results_html(results, serp_id, log, headers, ssh_tunnel, - save_dir='.', append_to=''): - """Scrape and save all unique, non-internal URLs parsed from the SERP - - Args: - save_dir (str, optional): Save results html as `save_dir/results_html/{serp_id}.json` - append_to (str, optional): Append results html to this file path - """ - - results_html = [] - if not results: - log.info(f'No results to scrape for serp_id {serp_id}') - else: - - results_wurl = [r for r in results if check_valid_url(r)] - - if results_wurl: - - # Prepare session - keep_headers = ['User-Agent'] - headers = {k:v for k,v in headers.items() if k in keep_headers} - if ssh_tunnel: - result_sesh = wu.start_sesh(headers=headers, proxy_port=ssh_tunnel.port) - else: - result_sesh = wu.start_sesh(headers=headers) - - # Get all unique result urls - result_urls = [] - unique_urls = set() - for result in results_wurl: - # If the result has a url and we haven't seen it yet - if result['url'] and result['url'] not in unique_urls: - # Take a subset of the keys - keep_keys = {'serp_id', 'serp_rank', 'url'} - res = {k:v for k,v in result.items() if k in keep_keys} - result_urls.append(res) - unique_urls.add(result['url']) - - # Scrape results HTML - for result in result_urls: - result = scrape_result_html(result_sesh, result, log, ssh_tunnel) - results_html.append(result) - - # Save results HTML - if append_to: - # Append to aggregate file - utils.write_lines(results_html, append_to) - else: - # Save new SERP-specific file - fp = os.path.join(save_dir, 'results_html', f'{serp_id}.json') - utils.write_lines(results_html, fp) - - -def scrape_result_html(result_sesh, result, log, ssh_tunnel): - resid = f"{result['serp_id']} | {result['url']}" - - try: - r = result_sesh.get(result['url'], timeout=15) - result['html'] = r.content.decode('utf-8', 'ignore') - - except requests.exceptions.TooManyRedirects: - result['html'] = 'error_redirects' - log.exception(f"Results | RedirectsErr | {resid}") - - except requests.exceptions.Timeout: - result['html'] = 'error_timeout' - log.exception(f"Results | TimeoutErr | {resid}") - - except requests.exceptions.ConnectionError: - result['html'] = 'error_connection' - log.exception(f"Results | ConnectionErr | {resid}") - - # SSH Tunnel may have died, reset SSH session - if ssh_tunnel: - ssh_tunnel.tunnel.kill() - ssh_tunnel.open_tunnel() - log.info('Results | Restarted SSH tunnel') - time.sleep(10) # Allow time to establish connection - - except Exception: - result['html'] = 'error_unknown' - log.exception(f"Results | Collection Error | {resid}") From d076e4f41b6486b75adf66945951afffba4daa8e Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 2 Apr 2025 09:44:52 -0700 Subject: 
[PATCH 054/101] fix: downgrade log warning to debug --- WebSearcher/searchers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 72c791f..ae86474 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -90,7 +90,7 @@ def search(self, headers (Dict[str, str], optional): Custom headers to include in the request """ - self.log.warning('starting search config') + self.log.debug('starting search config') self.search_params = SearchParams.create({ 'qry': str(qry), 'loc': str(location) if not pd.isnull(location) else '', From e726acddb23292415d6877ba823cf955656b411a Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 2 Apr 2025 10:43:30 -0700 Subject: [PATCH 055/101] update: breaking change for log config, using logger kwargs --- WebSearcher/models/configs.py | 11 +++++++---- WebSearcher/searchers.py | 16 +++------------- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/WebSearcher/models/configs.py b/WebSearcher/models/configs.py index 99021cf..81e011d 100644 --- a/WebSearcher/models/configs.py +++ b/WebSearcher/models/configs.py @@ -15,9 +15,13 @@ def create(cls, config=None): return config or cls() class LogConfig(BaseConfig): - fp: str = '' - mode: str = 'a' - level: str = 'INFO' + console: bool = True + console_format: str = 'medium' + console_level: str = 'INFO' + file_name: str = '' + file_mode: str = 'a' + file_format: str = 'detailed' + file_level: str = 'INFO' class SeleniumConfig(BaseConfig): headless: bool = False @@ -45,7 +49,6 @@ def sesh(self) -> requests.Session: sesh.headers.update(self.headers) return sesh - class SearchMethod(Enum): REQUESTS = "requests" SELENIUM = "selenium" diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index ae86474..2147b39 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -35,7 +35,7 @@ def __init__(self, crawl_id (str, optional): A unique identifier for the crawl. Defaults to ''. 
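The switch to orjson in PATCH 052 above works because `orjson.dumps()` returns `bytes` rather than the `str` that `json.dumps()` produced, which is why the commit decodes before hashing the browser info and before appending JSON lines. A minimal sketch of that difference; the sample record and output file name here are illustrative only, not taken from the package:

```python
import json
import orjson

record = {"qry": "immigration news", "loc": "Boston, MA"}  # illustrative record

# json.dumps() returns str; orjson.dumps() returns bytes
assert isinstance(json.dumps(record), str)
assert isinstance(orjson.dumps(record), bytes)

# Decode before hashing or appending to a JSON-lines file
line = orjson.dumps(record).decode("utf-8")
with open("serps.json", "a") as outfile:  # illustrative file name
    outfile.write(f"{line}\n")

# Pretty-printing uses an option flag rather than an indent= argument
pretty = orjson.dumps(record, option=orjson.OPT_INDENT_2).decode("utf-8")
print(pretty)
```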
""" - # Initialize configuration + # Initialize config settings, log, and session data self.method = method.value if isinstance(method, SearchMethod) else method self.config = SearchConfig.create({ "method": SearchMethod.create(method), @@ -43,23 +43,14 @@ def __init__(self, "selenium": SeleniumConfig.create(selenium_config), "requests": RequestsConfig.create(requests_config), }) - - # Initialize session data + self.log = logger.Logger(**self.config.log.model_dump()).start(__name__) self.session_data = { "method": self.config.method.value, "version": WS_VERSION, "crawl_id": crawl_id, } - # Set a log file, prints to console by default - self.log = logger.Logger( - console=True if not self.config.log.fp else False, - console_level=self.config.log.level, - file_name=self.config.log.fp, - file_mode=self.config.log.mode, - file_level=self.config.log.level, - ).start(__name__) - + # Initialize searcher based on method if self.config.method == SearchMethod.SELENIUM: self.searcher = SeleniumDriver(config=self.config.selenium, logger=self.log) self.searcher.init_driver() @@ -70,7 +61,6 @@ def __init__(self, self.search_params = SearchParams.create() self.parsed = {'results': [], 'features': {}} - def search(self, qry: str, location: str = None, From 70e774f0b4ee3ffa16ffa5dca8bb130784005271 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 24 Apr 2025 18:59:52 +0000 Subject: [PATCH 056/101] build(deps): bump h11 from 0.14.0 to 0.16.0 Bumps [h11](https://github.com/python-hyper/h11) from 0.14.0 to 0.16.0. - [Commits](https://github.com/python-hyper/h11/compare/v0.14.0...v0.16.0) --- updated-dependencies: - dependency-name: h11 dependency-version: 0.16.0 dependency-type: indirect ... Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 34ccb6a..79638ee 100644 --- a/poetry.lock +++ b/poetry.lock @@ -558,14 +558,14 @@ typing = ["typing-extensions (>=4.12.2) ; python_version < \"3.11\""] [[package]] name = "h11" -version = "0.14.0" +version = "0.16.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" groups = ["main"] files = [ - {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, - {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, + {file = "h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86"}, + {file = "h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1"}, ] [[package]] From 592c69b27bb77a6954b300a6907d18bfda34ef76 Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 27 Apr 2025 15:58:24 -0700 Subject: [PATCH 057/101] update: ad component parsers --- WebSearcher/component_parsers/ads.py | 101 ++++++++++++++++++++++----- 1 file changed, 83 insertions(+), 18 deletions(-) diff --git a/WebSearcher/component_parsers/ads.py b/WebSearcher/component_parsers/ads.py index d38917c..53ab212 100644 --- a/WebSearcher/component_parsers/ads.py +++ b/WebSearcher/component_parsers/ads.py @@ -13,6 +13,16 @@ from .shopping_ads import parse_shopping_ads import bs4 +PARSED = { + 'type': 'ad', + 'sub_type': '', + 'sub_rank': 0, + 'title': '', + 'url': '', + 'cite': '', + 'text': '', +} + def parse_ads(cmpt: bs4.element.Tag) -> list: 
"""Parse ads from ad component""" @@ -33,6 +43,8 @@ def parse_ads(cmpt: bs4.element.Tag) -> list: parsed_list.extend(parse_shopping_ads(sub)) elif "uEierd" in sub_classes: parsed_list.append(parse_ad(sub)) + elif sub_type == 'carousel': + parsed_list = parse_ad_carousel(cmpt, sub_type) return parsed_list @@ -41,7 +53,8 @@ def classify_ad_type(cmpt: bs4.element.Tag) -> str: label_divs = { "legacy": webutils.find_all_divs(cmpt, 'div', {'class': 'ad_cclk'}), "secondary": webutils.find_all_divs(cmpt, 'div', {'class': 'd5oMvf'}), - "standard": webutils.find_all_divs(cmpt, 'div', {'class': ['uEierd', 'commercial-unit-desktop-top']}) + "standard": webutils.find_all_divs(cmpt, 'div', {'class': ['uEierd', 'commercial-unit-desktop-top']}), + "carousel": webutils.find_all_divs(cmpt, 'g-scrolling-carousel'), } for label, divs in label_divs.items(): if divs: @@ -49,12 +62,63 @@ def classify_ad_type(cmpt: bs4.element.Tag) -> str: return 'unknown' +def parse_ad_carousel(cmpt: bs4.element.Tag, sub_type: str, filter_visible: bool = True) -> list: + + def parse_ad_carousel_div(sub: bs4.element.Tag, sub_type: str, sub_rank: int) -> dict: + """Parse ad carousel div, seen 2025-02-06""" + parsed = PARSED.copy() + parsed['sub_type'] = sub_type + parsed['sub_rank'] = sub_rank + parsed['title'] = webutils.get_text(sub, 'div', {'class':'e7SMre'}) + parsed['url'] = webutils.get_link(sub) + parsed['text'] = webutils.get_text(sub, 'div', {"class":"vrAZpb"}) + parsed['cite'] = webutils.get_text(sub, 'div', {"class":"zpIwr"}) + parsed['visible'] = not (sub.has_attr('data-has-shown') and sub['data-has-shown'] == 'false') + return parsed + + def parse_ad_carousel_card(sub: bs4.element.Tag, sub_type: str, sub_rank: int) -> dict: + """Parse ad carousel card, seen 2024-09-21""" + parsed = PARSED.copy() + parsed['sub_type'] = sub_type + parsed['sub_rank'] = sub_rank + parsed['title'] = webutils.get_text(sub, 'div', {'class':'gCv54b'}) + parsed['url'] = webutils.get_link(sub, {"class": "KTsHxd"}) + parsed['text'] = webutils.get_text(sub, 'div', {"class":"VHpBje"}) + parsed['cite'] = webutils.get_text(sub, 'div', {"class":"j958Pd"}) + parsed['visible'] = not (sub.has_attr('data-viewurl') and sub['data-viewurl']) + return parsed + + ad_carousel_parsers = [ + {'find_kwargs': {'name': 'g-inner-card'}, + 'parser': parse_ad_carousel_card}, + {'find_kwargs': {'name': 'div', 'attrs': {'class': 'ZPze1e'}}, + 'parser': parse_ad_carousel_div} + ] + + output_list = [] + ad_carousel = cmpt.find('g-scrolling-carousel') + if ad_carousel: + for parser_details in ad_carousel_parsers: + parser_func = parser_details['parser'] + kwargs = parser_details['find_kwargs'] + sub_cmpts = webutils.find_all_divs(ad_carousel, **kwargs) + print(f"sub_cmpts: {len(sub_cmpts)}") + if sub_cmpts: + for sub_rank, sub in enumerate(sub_cmpts): + parsed = parser_func(sub, sub_type, sub_rank) + output_list.append(parsed) + + if filter_visible: + output_list = [{k:v for k,v in x.items() if k != 'visible'} for x in output_list if x['visible']] + return output_list + + def parse_ad(sub: bs4.element.Tag, sub_rank: int = 0) -> dict: """Parse details of a single ad subcomponent, similar to general""" - parsed = {"type": "ad", - "sub_type": "standard", - "sub_rank": sub_rank} - + parsed = PARSED.copy() + parsed["sub_type"] = "standard" + parsed["sub_rank"] = sub_rank + parsed['title'] = webutils.get_text(sub, 'div', {'role':'heading'}) parsed['url'] = webutils.get_link(sub, {"class":"sVXRqc"}) parsed['cite'] = webutils.get_text(sub, 'span', {"role":"text"}) @@ -96,13 
+160,14 @@ def parse_ad_menu(sub: bs4.element.Tag) -> list: def parse_ad_secondary(sub: bs4.element.Tag, sub_rank: int = 0) -> dict: """Parse details of a single ad subcomponent, similar to general""" + parsed = PARSED.copy() + parsed["sub_type"] = "secondary" + parsed["sub_rank"] = sub_rank - parsed = {"type": "ad", - "sub_type": "secondary", - "sub_rank": sub_rank} - parsed['title'] = sub.find('div', {'role':'heading'}).text - parsed['url'] = sub.find('div', {'class':'d5oMvf'}).find('a')['href'] - parsed['cite'] = sub.find('span', {'class':'gBIQub'}).text + parsed['title'] = webutils.get_text(sub, 'div', {'role':'heading'}) + link_div = sub.find('div', {'class':'d5oMvf'}) + parsed['url'] = webutils.get_link(link_div) if link_div else '' + parsed['cite'] = webutils.get_text(sub, 'span', {'class':'gBIQub'}) # Take the top div with this class, should be main result abstract text_divs = sub.find_all('div', {'class':'yDYNvb'}) @@ -123,14 +188,14 @@ def parse_ad_secondary(sub: bs4.element.Tag, sub_rank: int = 0) -> dict: def parse_ad_legacy(sub: bs4.element.Tag, sub_rank: int = 0) -> dict: """[legacy] Parse details of a single ad subcomponent, similar to general""" - - parsed = {"type": "ad", - "sub_type": "legacy", - "sub_rank": sub_rank} + parsed = PARSED.copy() + parsed["sub_type"] = "legacy" + parsed["sub_rank"] = sub_rank + header = sub.find('div', {'class':'ad_cclk'}) - parsed['title'] = header.find('h3').text - parsed['url'] = header.find('cite').text - parsed['text'] = sub.find('div', {'class':'ads-creative'}).text + parsed['title'] = webutils.get_text(header, 'h3') + parsed['url'] = webutils.get_text(header, 'cite') + parsed['text'] = webutils.get_text(sub, 'div', {'class':'ads-creative'}) bottom_text = sub.find('ul') if bottom_text: From 2ca408153a8c03be145d1f2ff800b803befc1390 Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 27 Apr 2025 15:58:37 -0700 Subject: [PATCH 058/101] version: 0.6.5.dev0 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index 9eb7a83..5201135 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.4" +__version__ = "0.6.5.dev0" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index 0e936df..8bad132 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.4" +version = "0.6.5.dev0" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. 
Robertson", email = ""}] keywords = ["web", "search", "parser"] From 0f9dc401640e2084430aff484736b241e42ee62f Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 27 Apr 2025 16:17:10 -0700 Subject: [PATCH 059/101] update: videos component parser --- WebSearcher/component_parsers/ads.py | 1 + WebSearcher/component_parsers/videos.py | 22 ++++++++++++---------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/WebSearcher/component_parsers/ads.py b/WebSearcher/component_parsers/ads.py index 53ab212..b2aac52 100644 --- a/WebSearcher/component_parsers/ads.py +++ b/WebSearcher/component_parsers/ads.py @@ -6,6 +6,7 @@ - added new div class for text field - added labels (e.g., "Provides abortions") from , appended to text field +2025-04-27: added carousel sub_type, global parsed output """ diff --git a/WebSearcher/component_parsers/videos.py b/WebSearcher/component_parsers/videos.py index 950d849..cd374d3 100644 --- a/WebSearcher/component_parsers/videos.py +++ b/WebSearcher/component_parsers/videos.py @@ -1,8 +1,9 @@ """ Parsers for video components Changelog -2021-05-08: added find_all for divs with class 'VibNM' -2021-05-08: added adjustment for new cite and timestamp +2024-05-08: added find_all for divs with class 'VibNM' +2024-05-08: added adjustment for new cite and timestamp +2025-04-27: added div subcomponent class and sub_type labels """ @@ -23,24 +24,25 @@ def parse_videos(cmpt) -> list: # Get known div structures divs = [] name_attrs = [ - {'name':'g-inner-card'}, - {'name':'div', 'attrs':{'class':'VibNM'}}, - {'name':'div', 'attrs':{'class':'mLmaBd'}}, - {'name':'div', 'attrs':{'class':'RzdJxc'}}, + ({'name':'g-inner-card'}, 'unspecified-0'), + ({'name':'div', 'attrs':{'class':'VibNM'}}, 'unspecified-1'), + ({'name':'div', 'attrs':{'class':'mLmaBd'}}, 'unspecified-2'), + ({'name':'div', 'attrs':{'class':'RzdJxc'}}, 'unspecified-3'), + ({'name':'div', 'attrs':{'class':'sHEJob'}}, 'vertical'), ] - for kwargs in name_attrs: + for kwargs, sub_type in name_attrs: divs = webutils.find_all_divs(cmpt, **kwargs) if divs: break divs = list(filter(None, divs)) if divs: - return [parse_video(div, i) for i, div in enumerate(divs)] + return [parse_video(div, sub_type, i) for i, div in enumerate(divs)] else: return [{'type': 'videos', 'sub_rank': 0, 'error': 'No subcomponents found'}] -def parse_video(sub, sub_rank=0) -> dict: +def parse_video(sub, sub_type: str, sub_rank=0) -> dict: """Parse a videos subcomponent Args: @@ -52,6 +54,7 @@ def parse_video(sub, sub_rank=0) -> dict: parsed = { 'type': 'videos', + 'sub_type': sub_type, 'sub_rank': sub_rank, 'url': get_url(sub), 'title': webutils.get_text(sub, 'div', {'role':'heading'}), @@ -82,7 +85,6 @@ def parse_video(sub, sub_rank=0) -> dict: return parsed - def get_url(sub): """Get video URL by filtering for non-hash links""" all_urls = sub.find_all('a') From 502a025acad207f89fd4adc1f749919c6f68cb07 Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 27 Apr 2025 16:17:25 -0700 Subject: [PATCH 060/101] version: 0.6.5.dev1 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index 5201135..c70f874 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.5.dev0" +__version__ = "0.6.5.dev1" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index 8bad132..dff6630 100644 
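The videos parser updated in PATCH 059 above pairs each candidate div selector with a `sub_type` label and keeps the first selector that matches anything. A rough standalone sketch of that fallback pattern, using plain BeautifulSoup calls in place of the package's `webutils.find_all_divs` helper; the HTML below is invented for the example:

```python
import bs4

html = """
<div id="cmpt">
  <div class="sHEJob">video one</div>
  <div class="sHEJob">video two</div>
</div>
"""
cmpt = bs4.BeautifulSoup(html, "html.parser")

# Ordered (find_all kwargs, sub_type) pairs: the first selector that matches wins
name_attrs = [
    ({"name": "g-inner-card"}, "unspecified-0"),
    ({"name": "div", "attrs": {"class": "VibNM"}}, "unspecified-1"),
    ({"name": "div", "attrs": {"class": "sHEJob"}}, "vertical"),
]

divs, sub_type = [], None
for kwargs, candidate in name_attrs:
    divs = cmpt.find_all(**kwargs)
    if divs:
        sub_type = candidate
        break

parsed = [
    {"type": "videos", "sub_type": sub_type, "sub_rank": i, "title": div.get_text()}
    for i, div in enumerate(divs)
]
print(parsed)  # two 'vertical' subcomponents
```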
--- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.5.dev0" +version = "0.6.5.dev1" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From cc2395c5847ee0e9135031979f74208f5e4fb683 Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 27 Apr 2025 17:03:07 -0700 Subject: [PATCH 061/101] update: discussions and forums classifier --- WebSearcher/classifiers/main.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/WebSearcher/classifiers/main.py b/WebSearcher/classifiers/main.py index 70cb570..8eba449 100644 --- a/WebSearcher/classifiers/main.py +++ b/WebSearcher/classifiers/main.py @@ -14,6 +14,7 @@ def classify(cmpt: bs4.element.Tag) -> str: # Ordered list of classifiers to try component_classifiers = [ ClassifyMain.top_stories, # Check top stories + ClassifyMain.discussions_and_forums, # Check discussions and forums ClassifyHeaderText.classify, # Check levels 2 & 3 header text ClassifyMain.news_quotes, # Check news quotes ClassifyMain.img_cards, # Check image cards @@ -40,6 +41,12 @@ def classify(cmpt: bs4.element.Tag) -> str: return cmpt_type + @staticmethod + def discussions_and_forums(cmpt: bs4.element.Tag) -> str: + conditions = [ + cmpt.find("div", {"class": "IFnjPb"}), + ] + return 'discussions_and_forums' if all(conditions) else "unknown" @staticmethod def available_on(cmpt: bs4.element.Tag) -> str: From cedc7d2ced5faea4c3a0c8d1150aef026ce830fe Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 27 Apr 2025 17:03:38 -0700 Subject: [PATCH 062/101] update: extract more divs for top_bar layout --- WebSearcher/extractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebSearcher/extractors.py b/WebSearcher/extractors.py index 5b1202e..5481d7a 100644 --- a/WebSearcher/extractors.py +++ b/WebSearcher/extractors.py @@ -203,7 +203,7 @@ def extract_from_top_bar(self, drop_tags: set = {}) -> list: top_bar_divs = Extractor.extract_from_top_bar_divs(self.layout_divs['top-bars']) column.extend(top_bar_divs) - rso_layout_divs = self.layout_divs['rso'].find_all('div', {'class':'sATSHe'}) + rso_layout_divs = self.layout_divs['rso'].find_all('div', {'class':['sATSHe','vtSz8d', 'cUnQKe','g']}) if rso_layout_divs: self.layout_label = 'top-bars-divs' layout_column = [div for div in rso_layout_divs if div.name not in drop_tags] From 1c68ea84f51300720a12b15f1948715bdc3c3800 Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 27 Apr 2025 17:04:22 -0700 Subject: [PATCH 063/101] version: 0.6.5.dev2 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index c70f874..cbc0ed6 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.5.dev1" +__version__ = "0.6.5.dev2" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index dff6630..5d2edd7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.5.dev1" +version = "0.6.5.dev2" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. 
Robertson", email = ""}] keywords = ["web", "search", "parser"] From 29d62c7df3c23e9d4939da0f6a08592d81a2e702 Mon Sep 17 00:00:00 2001 From: gitronald Date: Mon, 28 Apr 2025 11:02:21 -0700 Subject: [PATCH 064/101] fix: drop debug print and fix print var --- WebSearcher/component_parsers/ads.py | 1 - WebSearcher/component_parsers/images.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/WebSearcher/component_parsers/ads.py b/WebSearcher/component_parsers/ads.py index b2aac52..885f51c 100644 --- a/WebSearcher/component_parsers/ads.py +++ b/WebSearcher/component_parsers/ads.py @@ -103,7 +103,6 @@ def parse_ad_carousel_card(sub: bs4.element.Tag, sub_type: str, sub_rank: int) - parser_func = parser_details['parser'] kwargs = parser_details['find_kwargs'] sub_cmpts = webutils.find_all_divs(ad_carousel, **kwargs) - print(f"sub_cmpts: {len(sub_cmpts)}") if sub_cmpts: for sub_rank, sub in enumerate(sub_cmpts): parsed = parser_func(sub, sub_type, sub_rank) diff --git a/WebSearcher/component_parsers/images.py b/WebSearcher/component_parsers/images.py index 27f932d..74c8026 100644 --- a/WebSearcher/component_parsers/images.py +++ b/WebSearcher/component_parsers/images.py @@ -121,7 +121,7 @@ def get_image_url_from_attrs(sub): try: url = func(sub) if url.startswith('data:image'): - raise ValueError(f"Data URL: {img_src}") + raise ValueError(f"Data URL: {url}") else: return url except Exception as e: From fa411b86dee3e2920284ea4eeb299b2ce6a34d45 Mon Sep 17 00:00:00 2001 From: gitronald Date: Mon, 28 Apr 2025 11:02:41 -0700 Subject: [PATCH 065/101] update: expand general classifier classes --- WebSearcher/classifiers/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebSearcher/classifiers/main.py b/WebSearcher/classifiers/main.py index 8eba449..43a2803 100644 --- a/WebSearcher/classifiers/main.py +++ b/WebSearcher/classifiers/main.py @@ -75,7 +75,7 @@ def general(cmpt: bs4.element.Tag) -> str: "format-01": cmpt.attrs["class"] == ["g"], "format-02": ( ("g" in cmpt.attrs["class"]) & any(s in ["Ww4FFb"] for s in cmpt.attrs["class"]) ), - "format-03": any(s in ["hlcw0c", "MjjYud"] for s in cmpt.attrs["class"]), + "format-03": any(s in ["hlcw0c", "MjjYud", "PmEWq"] for s in cmpt.attrs["class"]), "format-04": cmpt.find('div', {'class': ['g', 'Ww4FFb']}), } else: From 2f9bb28fd6dfdd2395012d5d5de278d73a3a5912 Mon Sep 17 00:00:00 2001 From: gitronald Date: Mon, 28 Apr 2025 11:03:40 -0700 Subject: [PATCH 066/101] update: extract from top bar for 2025 serps --- WebSearcher/extractors.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/WebSearcher/extractors.py b/WebSearcher/extractors.py index 5481d7a..6900187 100644 --- a/WebSearcher/extractors.py +++ b/WebSearcher/extractors.py @@ -202,8 +202,19 @@ def extract_from_top_bar(self, drop_tags: set = {}) -> list: top_bar_divs = Extractor.extract_from_top_bar_divs(self.layout_divs['top-bars']) column.extend(top_bar_divs) - - rso_layout_divs = self.layout_divs['rso'].find_all('div', {'class':['sATSHe','vtSz8d', 'cUnQKe','g']}) + # No duplicates, but missing data + # rso_layout_divs = self.layout_divs['rso'].find_all('div', {'class':'sATSHe'}) + + div_classes = [ + 'cUnQKe', # people also ask + 'g', # general + 'Lv2Cle', # images-medium + 'oIk2Cb', # searches_related + 'Ww4FFb', # discussions_and_forums + 'vtSz8d', # videos + ] + rso_layout_divs = self.layout_divs['rso'].find_all('div', attrs={'class': div_classes}, recursive=True) + if rso_layout_divs: self.layout_label = 'top-bars-divs' 
layout_column = [div for div in rso_layout_divs if div.name not in drop_tags] From 5373a852eb11bac9b312c0dd17a411afd1050e06 Mon Sep 17 00:00:00 2001 From: gitronald Date: Mon, 28 Apr 2025 11:06:44 -0700 Subject: [PATCH 067/101] update: expand images sub cmpt class list and title/url parsing --- WebSearcher/component_parsers/images.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/WebSearcher/component_parsers/images.py b/WebSearcher/component_parsers/images.py index 74c8026..7e9bc51 100644 --- a/WebSearcher/component_parsers/images.py +++ b/WebSearcher/component_parsers/images.py @@ -1,3 +1,10 @@ +""" Parsers for image components + +Changelog +2025-04-28: added div subcomponent class and sub_type labels + +""" + from ..webutils import get_text, get_link, get_div def parse_images(cmpt) -> list: @@ -25,7 +32,7 @@ def parse_images(cmpt) -> list: parsed_list.extend(parsed_subs) else: # Medium images with titles and urls - subs = cmpt.find_all('div', {'class':'eA0Zlc'}) + subs = cmpt.find_all('div', {'class': ['eA0Zlc', 'vCUuC']}) parsed_subs = [parse_image_medium(sub, sub_rank + len(parsed_list)) for sub_rank, sub in enumerate(subs)] parsed_list.extend(parsed_subs) @@ -63,9 +70,14 @@ def parse_image_medium(sub, sub_rank=0) -> dict: """ title_div = get_div(sub, 'a', {'class':'EZAeBe'}) - title = get_text(title_div) if title_div else get_img_alt(sub) + title = get_text(title_div) if title_div else get_text(sub, 'span', {'class':'Yt787'}) url = get_link(sub) if title_div else get_img_url(sub) + if not title: + title = get_img_alt(sub) + if not url: + url = get_link(sub, attrs={'class':['EZAeBe', 'ddkIM']}) + return { "type": "images", "sub_type": "medium", From 036e67a6d108a5c9756ff2eed742a69e3ff56a65 Mon Sep 17 00:00:00 2001 From: gitronald Date: Mon, 28 Apr 2025 11:09:48 -0700 Subject: [PATCH 068/101] update: reduce doc strings --- WebSearcher/component_parsers/images.py | 37 ++++--------------------- 1 file changed, 5 insertions(+), 32 deletions(-) diff --git a/WebSearcher/component_parsers/images.py b/WebSearcher/component_parsers/images.py index 7e9bc51..ac4907c 100644 --- a/WebSearcher/component_parsers/images.py +++ b/WebSearcher/component_parsers/images.py @@ -8,14 +8,7 @@ from ..webutils import get_text, get_link, get_div def parse_images(cmpt) -> list: - """Parse an image component - - Args: - cmpt (bs4 object): an image component - - Returns: - list: list of parsed subcomponent dictionaries - """ + """Parse an images component""" parsed_list = [] @@ -42,14 +35,7 @@ def parse_images(cmpt) -> list: return parsed_list def parse_image_multimedia(sub, sub_rank=0) -> dict: - """Parse an image subcomponent - - Args: - sub (bs4 object): an image subcomponent - - Returns: - dict : parsed subresult - """ + """Parse an images multimedia subcomponent""" return { "type": "images", "sub_type": "multimedia", @@ -60,14 +46,7 @@ def parse_image_multimedia(sub, sub_rank=0) -> dict: } def parse_image_medium(sub, sub_rank=0) -> dict: - """Parse an image subcomponent - - Args: - sub (bs4 object): an image subcomponent - - Returns: - dict : parsed subresult - """ + """Parse an images medium subcomponent""" title_div = get_div(sub, 'a', {'class':'EZAeBe'}) title = get_text(title_div) if title_div else get_text(sub, 'span', {'class':'Yt787'}) @@ -89,14 +68,8 @@ def parse_image_medium(sub, sub_rank=0) -> dict: } def parse_image_small(sub, sub_rank=0) -> dict: - """Parse an image subcomponent - - Args: - sub (bs4 object): an image subcomponent - - Returns: - dict : parsed 
subresult - """ + """Parse an images small subcomponent""" + return { "type": "images", "sub_type": "small", From 205bba576da3b674ad5299bbf0df13a2c1302b77 Mon Sep 17 00:00:00 2001 From: gitronald Date: Mon, 28 Apr 2025 11:25:06 -0700 Subject: [PATCH 069/101] version: 0.6.5.dev3 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index cbc0ed6..9c3b696 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.5.dev2" +__version__ = "0.6.5.dev3" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index 5d2edd7..2f43c50 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.5.dev2" +version = "0.6.5.dev3" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From e23e70bc486daec7a064fe9e3286d85ed24745f5 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 29 Apr 2025 08:35:48 -0700 Subject: [PATCH 070/101] update: more restrictive discussions classifier --- WebSearcher/classifiers/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebSearcher/classifiers/main.py b/WebSearcher/classifiers/main.py index 43a2803..6131c39 100644 --- a/WebSearcher/classifiers/main.py +++ b/WebSearcher/classifiers/main.py @@ -44,7 +44,7 @@ def classify(cmpt: bs4.element.Tag) -> str: @staticmethod def discussions_and_forums(cmpt: bs4.element.Tag) -> str: conditions = [ - cmpt.find("div", {"class": "IFnjPb"}), + cmpt.find("div", {"class": "IFnjPb", "role": "heading"}), ] return 'discussions_and_forums' if all(conditions) else "unknown" From 41cfba2b9202157da852c09ad5565cc7bd1bd8d0 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 29 Apr 2025 08:36:17 -0700 Subject: [PATCH 071/101] update: expand classes for video cmpt extraction --- WebSearcher/extractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebSearcher/extractors.py b/WebSearcher/extractors.py index 6900187..58ba5f7 100644 --- a/WebSearcher/extractors.py +++ b/WebSearcher/extractors.py @@ -116,7 +116,6 @@ def extract_main(self): # if shopping_ads: # self.components.add_component(shopping_ads, section='main', type='shopping_ads') - def extract_main_ads_top(self): """Extract the main ads section of the SERP""" ads = self.soup.find('div', {'id':'tads'}) @@ -212,6 +211,7 @@ def extract_from_top_bar(self, drop_tags: set = {}) -> list: 'oIk2Cb', # searches_related 'Ww4FFb', # discussions_and_forums 'vtSz8d', # videos + 'uVMCKf', # videos ] rso_layout_divs = self.layout_divs['rso'].find_all('div', attrs={'class': div_classes}, recursive=True) From 2d017015cd03c7d6f0754d032ed4c197722d7cd3 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 11:02:02 -0700 Subject: [PATCH 072/101] fix: no empty whitespace in filter_empty_divs func --- WebSearcher/webutils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebSearcher/webutils.py b/WebSearcher/webutils.py index 4489468..a36c2b4 100644 --- a/WebSearcher/webutils.py +++ b/WebSearcher/webutils.py @@ -128,7 +128,7 @@ def find_all_divs(soup: BeautifulSoup, name: str, attrs: dict = {}, filter_empty def filter_empty_divs(divs): divs = [c for c in divs if c] - divs = [c for c in divs if c.text != ''] + divs = [c 
for c in divs if c.text.strip() != ''] return divs def find_children(soup, name: str, attrs: dict = {}, filter_empty: bool = False): From 9d66539c15fd14b7f075494983a6b0624fcbfad6 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 11:02:56 -0700 Subject: [PATCH 073/101] update: more knowledge panel identifiers --- WebSearcher/classifiers/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/WebSearcher/classifiers/main.py b/WebSearcher/classifiers/main.py index 6131c39..4130fd3 100644 --- a/WebSearcher/classifiers/main.py +++ b/WebSearcher/classifiers/main.py @@ -150,6 +150,7 @@ def knowledge_panel(cmpt: bs4.element.Tag) -> str: cmpt.find("h1", {"class": "VW3apb"}), cmpt.find("div", {"class": ["knowledge-panel", "knavi", "kp-blk", "kp-wholepage-osrp"]}), cmpt.find("div", {"aria-label": "Featured results", "role": "complementary"}), + cmpt.find("div", {"jscontroller": "qTdDb"}), webutils.check_dict_value(cmpt.attrs, "jscontroller", "qTdDb") ] return 'knowledge' if any(conditions) else "unknown" From 85f5766db982ad3565ae81ee06af757ef6b0e42f Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 11:03:09 -0700 Subject: [PATCH 074/101] fix: count sub ranks for standard ads --- WebSearcher/component_parsers/ads.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/component_parsers/ads.py b/WebSearcher/component_parsers/ads.py index 885f51c..26d7480 100644 --- a/WebSearcher/component_parsers/ads.py +++ b/WebSearcher/component_parsers/ads.py @@ -38,12 +38,12 @@ def parse_ads(cmpt: bs4.element.Tag) -> list: parsed_list = [parse_ad_secondary(sub, sub_rank) for sub_rank, sub in enumerate(subs)] elif sub_type == 'standard': subs = webutils.find_all_divs(cmpt, 'div', {'class': ['uEierd', 'commercial-unit-desktop-top']}) - for sub in subs: + for sub_rank, sub in enumerate(subs): sub_classes = sub.attrs.get("class", []) if "commercial-unit-desktop-top" in sub_classes: parsed_list.extend(parse_shopping_ads(sub)) elif "uEierd" in sub_classes: - parsed_list.append(parse_ad(sub)) + parsed_list.append(parse_ad(sub, sub_rank=sub_rank)) elif sub_type == 'carousel': parsed_list = parse_ad_carousel(cmpt, sub_type) return parsed_list From ac79df03ae57df31ac475b2ed9bd85df4fd0d9c6 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 11:05:23 -0700 Subject: [PATCH 075/101] update: result types dictionaries --- WebSearcher/models/cmpt_mappings.py | 185 ++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 WebSearcher/models/cmpt_mappings.py diff --git a/WebSearcher/models/cmpt_mappings.py b/WebSearcher/models/cmpt_mappings.py new file mode 100644 index 0000000..616bbad --- /dev/null +++ b/WebSearcher/models/cmpt_mappings.py @@ -0,0 +1,185 @@ +""" +Metadata about WebSearcher result types and subtypes. +This provides documentation and structure for the various result types parsed by WebSearcher. 
+""" + +# Header result types with descriptions and subtypes +HEADER_RESULT_TYPES = { + "notice": { + "description": "Special notices and suggestions shown at the top of search results", + "sub_types": [ + "query_edit", + "query_edit_no_results", + "query_suggestion", + "location_choose_area", + "location_use_precise_location", + "language_tip", + ], + }, + "top_image_carousel": { + "description": "Carousel of images displayed at the top of search results", + "sub_types": [], + }, +} + +# Main result types with descriptions and subtypes +MAIN_RESULT_TYPES = { + "ad": { + "description": "Advertisements displayed in search results", + "sub_types": ["standard", "legacy", "secondary", "submenu"], + }, + "available_on": { + "description": "Where entertainment content is available to stream or purchase", + "sub_types": [], + }, + "banner": { + "description": "Banner notifications shown at top of results", + "sub_types": [], + }, + "discussions_and_forums": { + "description": "Forum and discussion board results", + "sub_types": [], + }, + "general": { + "description": "Standard web search results", + "sub_types": [ + "video", + "submenu", + "submenu_mini", + "submenu_rating", + "submenu_scholarly", + "submenu_product", + "subresult", + ], + }, + "general_questions": { + "description": "General results with related questions", + "sub_types": [], + }, + "images": { + "description": "Image search results", + "sub_types": ["multimedia", "medium", "small"], + }, + "knowledge": { + "description": "Knowledge panels and featured snippets", + "sub_types": [ + "ai_overview", + "featured_results", + "featured_snippet", + "unit_converter", + "sports", + "weather", + "finance", + "dictionary", + "translate", + "calculator", + "election", + "panel", + ], + }, + "latest_from": { + "description": "Latest news results from specific sources", + "sub_types": [], + }, + "local_news": { + "description": "News results specific to a location", + "sub_types": [], + }, + "local_results": { + "description": "Map-based local business results", + "sub_types": ["places", "locations", "businesses"], # Dynamically generated + }, + "map_results": {"description": "Map-only results", "sub_types": []}, + "news_quotes": { + "description": "Quote snippets from news articles", + "sub_types": [], + }, + "notice": { + "description": "Special notices about searches", + "sub_types": [ + "query_edit", + "query_edit_no_results", + "query_suggestion", + "location_choose_area", + "location_use_precise_location", + "language_tip", + ], + }, + "people_also_ask": { + "description": "Related questions that people search for", + "sub_types": [], + }, + "perspectives": {"description": "Opinion and perspective results", "sub_types": []}, + "scholarly_articles": {"description": "Google Scholar results", "sub_types": []}, + "searches_related": { + "description": "Related search terms", + "sub_types": [ + "additional_searches", + "related_searches", + ], # Dynamically generated + }, + "shopping_ads": {"description": "Product shopping advertisements", "sub_types": []}, + "top_image_carousel": { + "description": "Carousel of images displayed at top of page", + "sub_types": [], + }, + "top_stories": {"description": "Featured news stories", "sub_types": []}, + "twitter_cards": { + "description": "Twitter content displayed in cards", + "sub_types": [], + }, + "twitter_result": {"description": "Individual Twitter result", "sub_types": []}, + "videos": {"description": "Video results", "sub_types": []}, + "view_more_news": {"description": "News result 
expansion links", "sub_types": []}, + "knowledge_rhs": { + "description": "Knowledge panels in right-hand sidebar", + "sub_types": [], + }, + "unknown": {"description": "Unclassified components", "sub_types": []}, +} + +# Footer result types with descriptions and subtypes +FOOTER_RESULT_TYPES = { + "img_cards": {"description": "Image cards displayed in footer", "sub_types": []}, + "searches_related": { + "description": "Related searches displayed in footer", + "sub_types": [ + "additional_searches", + "related_searches", + ], # Dynamically generated + }, + "discover_more": {"description": "'Discover more' suggestions", "sub_types": []}, + "general": { + "description": "General results in footer", + "sub_types": [ + "video", + "submenu", + "submenu_mini", + "submenu_rating", + "submenu_scholarly", + "submenu_product", + "subresult", + ], + }, + "people_also_ask": {"description": "Related questions in footer", "sub_types": []}, + "omitted_notice": { + "description": "Notices about filtered results", + "sub_types": [], + }, +} + +# Special types not directly linked to parsers +SPECIAL_RESULT_TYPES = { + "unclassified": { + "description": "Default type in the BaseResult model", + "sub_types": [], + }, +} + +# Combined dictionary of all result types +ALL_RESULT_TYPES = { + **HEADER_RESULT_TYPES, + **MAIN_RESULT_TYPES, + **FOOTER_RESULT_TYPES, + **SPECIAL_RESULT_TYPES, +} From a737aafa96c84c39c89b7ba92b8387140954689c Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 11:07:49 -0700 Subject: [PATCH 076/101] move: extractors to dir --- WebSearcher/{ => extractors}/extractors.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename WebSearcher/{ => extractors}/extractors.py (100%) diff --git a/WebSearcher/extractors.py b/WebSearcher/extractors/extractors.py similarity index 100% rename from WebSearcher/extractors.py rename to WebSearcher/extractors/extractors.py From b6be243a985201a7fbe3b1961cca233168d5da6a Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 11:08:29 -0700 Subject: [PATCH 077/101] rename: extractors code --- WebSearcher/extractors/{extractors.py => __init__.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename WebSearcher/extractors/{extractors.py => __init__.py} (100%) diff --git a/WebSearcher/extractors/extractors.py b/WebSearcher/extractors/__init__.py similarity index 100% rename from WebSearcher/extractors/extractors.py rename to WebSearcher/extractors/__init__.py From 52c79f616b5149b4b313377783d82f0936cca92d Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 11:09:28 -0700 Subject: [PATCH 078/101] add: breakout extractor functions into files by section --- WebSearcher/extractors/__init__.py | 379 +-------------------- WebSearcher/extractors/extractor_footer.py | 54 +++ WebSearcher/extractors/extractor_header.py | 33 ++ WebSearcher/extractors/extractor_main.py | 227 ++++++++++++ WebSearcher/extractors/extractor_rhs.py | 43 +++ 5 files changed, 375 insertions(+), 361 deletions(-) create mode 100644 WebSearcher/extractors/extractor_footer.py create mode 100644 WebSearcher/extractors/extractor_header.py create mode 100644 WebSearcher/extractors/extractor_main.py create mode 100644 WebSearcher/extractors/extractor_rhs.py diff --git a/WebSearcher/extractors/__init__.py b/WebSearcher/extractors/__init__.py index 58ba5f7..59252a6 100644 --- a/WebSearcher/extractors/__init__.py +++ b/WebSearcher/extractors/__init__.py @@ -1,370 +1,27 @@ -from .components import Component, ComponentList -from . import utils -from . 
import webutils -from . import logger -log = logger.Logger().start(__name__) import bs4 +from ..components import ComponentList +from .extractor_rhs import ExtractorRightHandSide +from .extractor_main import ExtractorMain +from .extractor_header import ExtractorHeader +from .extractor_footer import ExtractorFooter +from .. import logger +log = logger.Logger().start(__name__) class Extractor: def __init__(self, soup: bs4.BeautifulSoup): self.soup = soup self.components = ComponentList() - self.rhs = {} - self.layout_divs = { - "rso": None, - "top-bars": None, - "left-bar": None, - } - self.layouts = { - "rso": False, - "top-bars": False, - "left-bar": False, - "standard": False, - "no-rso": False, - } - self.layout_label = None - self.layout_extractors = { - "standard": self.extract_from_standard, - "top-bars": self.extract_from_top_bar, - "left-bar": self.extract_from_left_bar, - "no-rso": self.extract_from_no_rso - } + self.rhs_handler = ExtractorRightHandSide(self.soup, self.components) + self.header_handler = ExtractorHeader(self.soup, self.components) + self.main_handler = ExtractorMain(self.soup, self.components) + self.footer_handler = ExtractorFooter(self.soup, self.components) def extract_components(self): - log.debug("Extracting Components") - self.extract_rhs() - self.extract_header() - self.extract_main() - self.extract_footer() - self.append_rhs() - log.debug(f"Extracted {self.components.cmpt_rank_counter:,} components") - - # -------------------------------------------------------------------------- - # Right Hand Sidebar Components - # -------------------------------------------------------------------------- - - def extract_rhs(self): - """Extract the Right Hand Side (RHS) Knowledge Panel. Can appear in arbitrary order, must extract first.""" - rhs_kws = ('div', {'id': 'rhs'}) - rhs = self.soup.find(*rhs_kws).extract() if self.soup.find(*rhs_kws) else None - if rhs: - rhs_layouts = { - 'rhs_complementary': rhs if webutils.check_dict_value(rhs.attrs, "role", "complementary") else None, - 'rhs_knowledge': rhs.find('div', {'class': ['kp-wholepage', 'knowledge-panel', 'TzHB6b']}), - } - rhs_layout = next((layout for layout, component in rhs_layouts.items() if component), None) - if rhs_layout: - log.debug(f"rhs_layout: {rhs_layout}") - self.rhs = {"elem": rhs_layouts[rhs_layout], - "section": "rhs", - "type": "knowledge_rhs"} - else: - log.debug(f"no rhs_layout") - - - def append_rhs(self): - """Append the RHS Knowledge Panel to the components list at the end""" - if self.rhs: - log.debug(f"appending rhs") - self.components.add_component(**self.rhs) - self.rhs = None - - - # -------------------------------------------------------------------------- - # Header Components - # -------------------------------------------------------------------------- - - def extract_header(self): - """Extract the header section, often a carousel of images or other suggestions.""" - self.extract_top_bar() - self.extract_notices() - - - def extract_top_bar(self): - """Extract the top bar section, often a carousel of images or other suggestions.""" - top_bar = self.soup.find('div', {'id':'appbar'}) - if top_bar: - has_img = top_bar.find(lambda tag: tag.has_attr('src') and not tag.has_attr('data-src')) - if top_bar.find('g-scrolling-carousel') and has_img: - self.components.add_component(top_bar, section='header', type='top_image_carousel') - - - def extract_notices(self): - """Append notices to the components list at the end""" - notices = webutils.find_all_divs(self.soup, "div", {"id": "oFNiHe"}) - 
notices = webutils.filter_empty_divs(notices) - log.debug(f"notices: {len(notices)}") - for notice in notices: - self.components.add_component(notice, section="header", type="notice") - - # -------------------------------------------------------------------------- - # Main Components - # -------------------------------------------------------------------------- - - def extract_main(self): - """Extract the main results sections of the SERP""" - # self.extract_main_shopping_ads() - self.extract_main_ads_top() - self.extract_main_components() - self.extract_main_ads_bottom() - - - # def extract_main_shopping_ads(self): - # """Extract the main shopping ads section of the SERP""" - # shopping_ads = self.soup.find('div', {'class': 'commercial-unit-desktop-top'}) - # if shopping_ads: - # self.components.add_component(shopping_ads, section='main', type='shopping_ads') - - def extract_main_ads_top(self): - """Extract the main ads section of the SERP""" - ads = self.soup.find('div', {'id':'tads'}) - if ads and webutils.get_text(ads): - # Filter if already extracted as shopping ads - # if not ads.find('div', {'class': 'commercial-unit-desktop-top'}): - self.components.add_component(ads, section='main', type='ad') - - - def extract_main_components(self, drop_tags: set={'script', 'style', None}): - """Extract main components based on SERP layout""" - log.debug("Extracting main column components") - self.check_layout_main() - try: - layout_extractor = self.layout_extractors[self.layout_label] - column = layout_extractor(drop_tags) - for component in column: - if Extractor.is_valid_main_component(component): - self.components.add_component(component, section='main') - except KeyError: - raise ValueError(f"no extractor for layout_label: {self.layout_label}") - log.debug(f"Extracted main components: {self.components.cmpt_rank_counter:,}") - - - def extract_main_ads_bottom(self): - """Extract the main ads section of the SERP""" - ads = self.soup.find('div', {'id':'tadsb'}) - if ads and webutils.get_text(ads): - self.components.add_component(ads, section='main', type='ad') - - # -------------------------------------------------------------------------- - # Layout Specifics - # -------------------------------------------------------------------------- - - - def check_layout_main(self): - """Divide and label the page layout""" - log.debug(f"Checking SERP layout") - - # Layout soup subsets - self.layout_divs['rso'] = self.soup.find('div', {'id':'rso'}) - self.layout_divs['left-bar'] = self.soup.find('div', {'class': 'OeVqAd'}) - self.layout_divs['top-bars'] = self.soup.find_all('div', {'class': ['XqFnDf', 'M8OgIe']}) - - # Layout classifications - self.layouts['rso'] = bool(self.layout_divs['rso']) - self.layouts['top-bars'] = bool(self.layout_divs['top-bars']) - self.layouts['left-bar'] = bool(self.layout_divs['left-bar']) - self.layouts['standard'] = (self.layouts['rso'] & - (not self.layouts['top-bars']) & - (not self.layouts['left-bar'])) - self.layouts['no-rso'] = not self.layouts['rso'] - - # Get layout label - label_matches = [k for k,v in self.layouts.items() if k !='rso' and v] - first_match = label_matches[0] if label_matches else None - self.layout_label = first_match - log.debug(f"layout: {self.layout_label}") - - - def extract_from_standard(self, drop_tags: set = {}) -> list: - - if self.layout_divs['rso'].find('div', {'id':'kp-wp-tab-overview'}): - log.debug("layout update: standard-alt-1") - self.layout_label = 'standard-alt' - column = self.layout_divs['rso'].find_all('div', {'class':'TzHB6b'}) 
- return column - - column = Extractor.extract_children(self.layout_divs['rso'], drop_tags) - column = [c for c in column if Extractor.is_valid_main_component(c)] - - if len(column) == 0: - log.debug("layout update: standard-alt-0") - self.layout_label = 'standard-alt' - divs = self.layout_divs['rso'].find_all('div', {'id':'kp-wp-tab-overview'}) - column = sum([div.find_all('div', {'class':'TzHB6b'}) for div in divs], []) - return column - - - def extract_from_top_bar(self, drop_tags: set = {}) -> list: - """Extract components from top-bars layout""" - column = [] - - top_bar_divs = Extractor.extract_from_top_bar_divs(self.layout_divs['top-bars']) - column.extend(top_bar_divs) - # No duplicates, but missing data - # rso_layout_divs = self.layout_divs['rso'].find_all('div', {'class':'sATSHe'}) - - div_classes = [ - 'cUnQKe', # people also ask - 'g', # general - 'Lv2Cle', # images-medium - 'oIk2Cb', # searches_related - 'Ww4FFb', # discussions_and_forums - 'vtSz8d', # videos - 'uVMCKf', # videos - ] - rso_layout_divs = self.layout_divs['rso'].find_all('div', attrs={'class': div_classes}, recursive=True) - - if rso_layout_divs: - self.layout_label = 'top-bars-divs' - layout_column = [div for div in rso_layout_divs if div.name not in drop_tags] - else: - self.layout_label = 'top-bars-children' - layout_column = Extractor.extract_children(self.layout_divs['rso'], drop_tags) - log.debug(f"layout update: {self.layout_label}") - - column.extend(layout_column) - return column - - @staticmethod - def extract_from_top_bar_divs(soup, drop_tags: set = {}) -> list: - output_list = [] - for top_bar in soup: - if webutils.check_dict_value(top_bar.attrs, "class", ["M8OgIe"]): - knowledge_divs = webutils.find_all_divs(top_bar, "div", {"jscontroller": ["qTdDb", "OWrb3e"]}) - output_list.extend(knowledge_divs) - log.debug(f"layout: M8OgIe divs: {len(knowledge_divs)}") - else: - output_list.append(top_bar) - return output_list - - - def extract_from_left_bar(self, drop_tags: set = {}) -> list: - """Extract components from left-bar layout""" - column = self.soup.find_all('div', {'class':'TzHB6b'}) - return column - - - def extract_from_no_rso(self, drop_tags: set = {}) -> list: - """Extract components from no-rso layout""" - log.debug("layout: no-rso") - column = [] - section1 = self.soup.find_all('div', {'class':'UDZeY OTFaAf'}) - for div in section1: - - # Conditional handling for Twitter result - if div.find('h2') and div.find('h2').text == "Twitter Results": - column.append(div.find('div').parent) - - # Conditional handling for g-section with header - elif div.find('g-section-with-header'): - column.append(div.find('g-section-with-header').parent) - - # Include divs with a "View more" type of button - elif div.find('g-more-link'): - column.append(div) - - # Include footer components that appear in the main column - elif div.find('div', {'class':'oIk2Cb'}): - column.append(div) - - else: - # Handle general results - for child in div.find_all('div', {'class':'g'}): - column.append(child) - - # Find section 2 results and append to column list - section2 = self.soup.find('div', {'class':'WvKfwe a3spGf'}) - if section2: - for child in section2.children: - column.append(child) - column = [c for c in column if c.name not in drop_tags] - return column - - - @staticmethod - def extract_children(soup: bs4.BeautifulSoup, drop_tags: set = {}) -> list: - """Extract children from BeautifulSoup, drop specific tags, flatten list""" - log.debug("layout: extracting children") - children = [] - for child in soup.children: - 
if child.name in drop_tags: - continue - if not child.attrs: - children.extend(child.contents) - else: - children.append(child) - return children - - - @staticmethod - def is_valid_main_component(c) -> bool: - """Check if a given component is neither empty nor a hidden survey""" - if not c: - return False - else: - drop_text = { - "Main results", # Remove empty rso component; hidden
header - "Twitter Results", # Remove empty Twitter component - "", # Remove empty divs - } - return c.text not in drop_text and not Extractor.is_hidden_survey(c) - - @staticmethod - def is_hidden_survey(element): - """Check if a component is a hidden survey component; no visual presence so filter out""" - conditions = [ - element.find('promo-throttler'), - webutils.check_dict_value(element.attrs, "class", ["ULSxyf"]), - ] - return all(conditions) - - - # -------------------------------------------------------------------------- - # Footer Components - # -------------------------------------------------------------------------- - - - def extract_footer(self): - """Extract the footer section of the SERP""" - log.debug("extracting footer components") - - footer_div = self.soup.find('div', {'id':'botstuff'}) - footer_component_list = [] - - # Check if footer div exists - if footer_div: - footer_component_divs = webutils.find_all_divs(self.soup, 'div', {'id':['bres', 'brs']}) - if footer_component_divs: - log.debug(f"found footer components: {len(footer_component_divs):,}") - - # Expand components by checking for nested divs - for footer_component_div in footer_component_divs: - expanded_divs = webutils.find_all_divs(footer_component_div, "div", {"class":"MjjYud"}) - if expanded_divs and len(expanded_divs) > 1: - footer_component_list.extend(expanded_divs) - else: - footer_component_list.append(footer_component_div) - - # Check for omitted notice - omitted_notice = self.soup.find('div', {'class':'ClPXac'}) - if omitted_notice: - footer_component_list.append(omitted_notice) - - footer_component_list = [e for e in footer_component_list if not Extractor.is_hidden_footer(e)] - log.debug(f'footer_component_list len: {len(footer_component_list)}') - - for footer_component in footer_component_list: - self.components.add_component(footer_component, section='footer') - - - @staticmethod - def is_hidden_footer(element): - """Check if a component is a hidden footer component; no visual presence so filter out""" - conditions = [ - # element.find("b", {"class":"uDuvJd"}), - element.find("span", {"class":"oUAcPd"}), - element.find("div", {"class": "RTaUke"}), - element.find("div", {"class": "KJ7Tg"}), - ] - return any(conditions) + log.debug(f"Extracting Components {'-'*50}") + self.rhs_handler.extract() + self.header_handler.extract() + self.main_handler.extract() + self.footer_handler.extract() + self.rhs_handler.append() + log.debug(f"total components: {self.components.cmpt_rank_counter:,}") diff --git a/WebSearcher/extractors/extractor_footer.py b/WebSearcher/extractors/extractor_footer.py new file mode 100644 index 0000000..ccf3397 --- /dev/null +++ b/WebSearcher/extractors/extractor_footer.py @@ -0,0 +1,54 @@ +import bs4 +from .. import webutils +from .. 
import logger + +log = logger.Logger().start(__name__) + +class ExtractorFooter: + def __init__(self, soup: bs4.BeautifulSoup, components): + self.soup = soup + self.components = components + + def extract(self): + """Extract the footer section of the SERP""" + + footer_div = self.soup.find('div', {'id':'botstuff'}) + footer_component_list = [] + + if footer_div: + footer_component_divs = webutils.find_all_divs( + self.soup, 'div', {'id': ['bres', 'brs']} + ) + if footer_component_divs: + log.debug(f"footer_components: {len(footer_component_divs):,}") + for footer_component_div in footer_component_divs: + expanded_divs = webutils.find_all_divs( + footer_component_div, "div", {"class": "MjjYud"} + ) + if expanded_divs and len(expanded_divs) > 1: + footer_component_list.extend(expanded_divs) + else: + footer_component_list.append(footer_component_div) + + omitted_notice = self.soup.find('div', {'class':'ClPXac'}) + if omitted_notice: + footer_component_list.append(omitted_notice) + + footer_component_list = [ + e for e in footer_component_list + if not ExtractorFooter.is_hidden_footer(e) + ] + log.debug(f'footer_components: {len(footer_component_list)}') + + for footer_component in footer_component_list: + self.components.add_component(footer_component, section='footer') + + @staticmethod + def is_hidden_footer(element): + """Filter out hidden footer components (no visual presence).""" + conditions = [ + element.find("span", {"class":"oUAcPd"}), + element.find("div", {"class": "RTaUke"}), + element.find("div", {"class": "KJ7Tg"}), + ] + return any(conditions) \ No newline at end of file diff --git a/WebSearcher/extractors/extractor_header.py b/WebSearcher/extractors/extractor_header.py new file mode 100644 index 0000000..7955d04 --- /dev/null +++ b/WebSearcher/extractors/extractor_header.py @@ -0,0 +1,33 @@ +import bs4 +from .. import webutils +from .. import logger + +log = logger.Logger().start(__name__) + +class ExtractorHeader: + def __init__(self, soup: bs4.BeautifulSoup, components): + self.soup = soup + self.components = components + self.exists = False + + def extract(self): + """Extract the header section: appbar and notices.""" + self.extract_appbar() + self.extract_notices() + + def extract_appbar(self): + """Extract the top bar section, often a carousel of images or other suggestions.""" + appbar = self.soup.find('div', {'id':'appbar'}) + if appbar: + has_img = appbar.find(lambda tag: tag.has_attr('src') and not tag.has_attr('data-src')) + if appbar.find('g-scrolling-carousel') and has_img: + self.components.add_component(appbar, section='header', type='top_image_carousel') + self.exists = True + + def extract_notices(self): + """Append notices to the components list at the end.""" + notices = webutils.find_all_divs(self.soup, "div", {"id": "oFNiHe"}, filter_empty=True) + if notices: + self.exists = True + for notice in notices: + self.components.add_component(notice, section="header", type="notice") \ No newline at end of file diff --git a/WebSearcher/extractors/extractor_main.py b/WebSearcher/extractors/extractor_main.py new file mode 100644 index 0000000..9f772df --- /dev/null +++ b/WebSearcher/extractors/extractor_main.py @@ -0,0 +1,227 @@ +import bs4 +from .. 
import webutils +from ..logger import Logger + +log = Logger().start(__name__) + +class ExtractorMain: + def __init__(self, soup: bs4.BeautifulSoup, components): + self.soup = soup + self.components = components + + # copied from Extractor.__init__ + self.layout_divs = { + "rso": None, + "top-bars": None, + "left-bar": None, + } + self.layouts = { + "top-bars": False, + "left-bar": False, + "standard": False, + "no-rso": False, + } + self.layout_label = None + self.layout_extractors = { + "standard": self.extract_from_standard, + "top-bars": self.extract_from_top_bar, + "left-bar": self.extract_from_left_bar, + "no-rso": self.extract_from_no_rso + } + + def extract(self): + self.get_layout() + self._ads_top() + self._main_column() + self._ads_bottom() + log.debug(f"main_components: {self.components.cmpt_rank_counter:,}") + + def get_layout(self): + """Divide and label the page layout""" + + # Layout soup subsets + layout_divs = {} + layout_divs['rso'] = self.soup.find('div', {'id':'rso'}) + layout_divs['left-bar'] = self.soup.find('div', {'class': 'OeVqAd'}) + # layout_divs['top-bars'] = self.soup.find_all('div', {'class': ['XqFnDf', 'M8OgIe']}) + + rcnt = self.soup.find('div', {'id':'rcnt'}) + layout_divs['top-bars'] = rcnt.find_all('div', {'class': ['XqFnDf', 'M8OgIe']}, recursive=False) + + # Layout classifications + layouts = {} + layouts['top-bars'] = bool(layout_divs['top-bars']) + layouts['left-bar'] = bool(layout_divs['left-bar']) + layouts['standard'] = ( + bool(layout_divs['rso']) & + (not layouts['top-bars']) & + (not layouts['left-bar']) + ) + layouts['no-rso'] = not bool(layout_divs['rso']) + + if layouts['top-bars'] and bool(layout_divs['rso']) and not layouts['left-bar']: + layout_label = 'standard' + else: + # Get layout label + label_matches = [k for k,v in layouts.items() if v] + layout_label = label_matches[0] if label_matches else None + + # Set layout details + log.debug(f"main_layout: {layout_label}") + self.layout_label = layout_label + self.layouts.update(layouts) + self.layout_divs.update(layout_divs) + + def _ads_top(self): + ads = self.soup.find('div', {'id':'tads'}) + if ads and webutils.get_text(ads): + ads.extract() + self.components.add_component(ads, section='main', type='ad') + + def _main_column(self, drop_tags: set = {'script', 'style', None}): + try: + extractor = self.layout_extractors[self.layout_label] + except KeyError: + raise ValueError(f"no extractor for layout_label: {self.layout_label}") + + column = extractor(drop_tags) + column = webutils.filter_empty_divs(column) + for c in column: + if ExtractorMain.is_valid(c): + self.components.add_component(c, section='main') + + def _ads_bottom(self): + ads = self.soup.find('div', {'id':'tadsb'}) + if ads and webutils.get_text(ads): + ads.extract() + self.components.add_component(ads, section='main', type='ad') + + def extract_from_standard(self, drop_tags:set={}) -> list: + + rso_div = self.layout_divs['rso'] + standard_layouts = { + "standard-0": rso_div.find('div', {'id':'kp-wp-tab-overview'}), + "standard-1": rso_div.find('div', {'id':'kp-wp-tab-Songs'}), + } + for layout_name, layout_div in standard_layouts.items(): + if layout_div: + if layout_div.find_all("div"): + return self._extract_from_standard(layout_name) + + # self.layout_label = layout_name + # return self._extract_from_standard(layout_name) + + col = ExtractorMain.extract_children(rso_div, drop_tags) + col = [c for c in col if ExtractorMain.is_valid(c)] + if not col: + self.layout_label = 'standard-2' + log.debug(f"main_layout: 
{self.layout_label} (update)") + divs = rso_div.find_all('div', {'id':'kp-wp-tab-overview'}) + col = sum([d.find_all('div', {'class':'TzHB6b'}) for d in divs], []) + return col + + def _extract_from_standard(self, sub_type:str = "") -> list: + + self.layout_label = sub_type + rso_div = self.layout_divs['rso'] + log.debug(f"main_layout: {self.layout_label} (update)") + + if self.layout_label == "standard-0": + column = [] + top_divs = ExtractorMain.extract_top_divs(self.layout_divs['top-bars']) or [] + main_divs = rso_div.find_all('div', {'class':'TzHB6b'}) or [] + column.extend(top_divs) + column.extend(main_divs) + log.debug(f"main_components: {len(column):,}") + return column + + if self.layout_label == "standard-1": + column = [] + top_divs = ExtractorMain.extract_top_divs(self.layout_divs['top-bars']) or [] + main_divs = rso_div.find('div', {'id':'kp-wp-tab-Songs'}).children or [] + column.extend(top_divs) + column.extend(main_divs) + column = [div for div in column if div.name not in {'script', 'style'}] + column = webutils.filter_empty_divs(column) + return column + + + def extract_from_top_bar(self, drop_tags:set={}) -> list: + out = [] + tops = ExtractorMain.extract_top_divs(self.layout_divs['top-bars']) + out.extend(tops) + + div_classes = [ + 'cUnQKe', # people also ask + 'g', # general + 'Lv2Cle', # images-medium + 'oIk2Cb', # searches_related + 'Ww4FFb', # discussions_and_forums + 'vtSz8d', # videos + 'uVMCKf', # videos + ] + + rso_divs = self.layout_divs['rso'].find_all('div', attrs={'class':div_classes}) + if rso_divs: + self.layout_label = 'top-bars-divs' + col = [div for div in rso_divs if div.name not in drop_tags] + else: + self.layout_label = 'top-bars-children' + col = ExtractorMain.extract_children(self.layout_divs['rso'], drop_tags) + log.debug(f"main_layout: {self.layout_label} (update)") + out.extend(col) + return out + + @staticmethod + def extract_top_divs(soup, drop_tags:set={}) -> list: + out = [] + for tb in soup: + if webutils.check_dict_value(tb.attrs, "class", ["M8OgIe"]): + kd = webutils.find_all_divs(tb, "div", {"jscontroller":["qTdDb","OWrb3e"]}) + out.extend(kd) + else: + out.append(tb) + return out + + def extract_from_left_bar(self, drop_tags:set={}) -> list: + return self.soup.find_all('div', {'class':'TzHB6b'}) + + def extract_from_no_rso(self, drop_tags:set={}) -> list: + out=[]; sec1=self.soup.find_all('div', {'class':'UDZeY OTFaAf'}) + for div in sec1: + if div.find('h2') and div.find('h2').text=="Twitter Results": + out.append(div.find('div').parent) + elif div.find('g-section-with-header'): + out.append(div.find('g-section-with-header').parent) + elif div.find('g-more-link'): + out.append(div) + elif div.find('div',{'class':'oIk2Cb'}): + out.append(div) + else: + out.extend(div.find_all('div',{'class':'g'})) + sec2=self.soup.find('div',{'class':'WvKfwe a3spGf'}) + if sec2: + out.extend(sec2.children) + return [c for c in out if c.name not in drop_tags] + + @staticmethod + def extract_children(soup, drop_tags:set={}) -> list: + cts=[] + for ch in soup.children: + if ch.name in drop_tags: continue + if not ch.attrs: cts.extend(ch.contents) + else: cts.append(ch) + return cts + + @staticmethod + def is_valid(c) -> bool: + if not c: return False + bad = {"Main results","Twitter Results",""} + if c.text in bad: return False + # hidden survey + cond = [ + c.find('promo-throttler'), + webutils.check_dict_value(c.attrs,"class",["ULSxyf"]) if 'attrs' in c else False, + ] + if all(cond): return False + return True \ No newline at end of file diff --git 
a/WebSearcher/extractors/extractor_rhs.py b/WebSearcher/extractors/extractor_rhs.py new file mode 100644 index 0000000..4fc013d --- /dev/null +++ b/WebSearcher/extractors/extractor_rhs.py @@ -0,0 +1,43 @@ +import bs4 +from .. import webutils +from .. import logger + +log = logger.Logger().start(__name__) + +class ExtractorRightHandSide: + def __init__(self, soup: bs4.BeautifulSoup, components): + self.soup = soup + self.components = components + self.rhs = {} + + def extract(self): + """Extract the RHS Knowledge Panel, if present.""" + rhs_div = self.soup.find('div', {'id': 'rhs'}) + if not rhs_div: + return + rhs_div.extract() + layout, div = self._get_layout(rhs_div) + if layout: + log.debug(f"rhs_layout: {layout}") + self.rhs = { + "elem": div, + "section": "rhs", + "type": "knowledge_rhs" + } + else: + log.debug("no rhs_layout") + + def append(self): + """Append the RHS panel as a component at the end.""" + if self.rhs: + log.debug("appending rhs") + self.components.add_component(**self.rhs) + self.rhs = {} + + def _get_layout(self, rhs_div): + rhs_layouts = { + 'rhs_complementary': rhs_div if webutils.check_dict_value(rhs_div.attrs, "role", "complementary") else None, + 'rhs_knowledge': rhs_div.find('div', {'class': ['kp-wholepage', 'knowledge-panel', 'TzHB6b']}) + } + found = next((name for name, node in rhs_layouts.items() if node), None) + return (found, rhs_div) if found else (None, rhs_div) \ No newline at end of file From a3b7c006233415143c7fbc783e4c702e6c6d3073 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 11:12:53 -0700 Subject: [PATCH 079/101] version: 0.6.5.dev4 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index 9c3b696..d0c3d64 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.5.dev3" +__version__ = "0.6.5.dev4" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index 2f43c50..6dfbf8f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.5.dev3" +version = "0.6.5.dev4" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. 
Robertson", email = ""}] keywords = ["web", "search", "parser"] From 1a32ee051950a9a27487f4179ae79fdfb1d0c5e4 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 17:38:15 -0700 Subject: [PATCH 080/101] add: recent_posts variant of top_stories --- WebSearcher/classifiers/header_text.py | 1 + WebSearcher/component_parsers/__init__.py | 2 ++ WebSearcher/component_parsers/recent_posts.py | 14 ++++++++++++++ 3 files changed, 17 insertions(+) create mode 100644 WebSearcher/component_parsers/recent_posts.py diff --git a/WebSearcher/classifiers/header_text.py b/WebSearcher/classifiers/header_text.py index 8c0c3c7..713f5e5 100644 --- a/WebSearcher/classifiers/header_text.py +++ b/WebSearcher/classifiers/header_text.py @@ -116,6 +116,7 @@ def _get_header_level_mapping(level) -> dict: "News", "Noticias", "Market news"], + "recent_posts": ["Recent posts"], "twitter": ["Twitter Results"], "videos": ["Videos"] } diff --git a/WebSearcher/component_parsers/__init__.py b/WebSearcher/component_parsers/__init__.py index aaff223..afb67a2 100644 --- a/WebSearcher/component_parsers/__init__.py +++ b/WebSearcher/component_parsers/__init__.py @@ -15,6 +15,7 @@ from .latest_from import parse_latest_from from .local_news import parse_local_news from .perspectives import parse_perspectives +from .recent_posts import parse_recent_posts from .local_results import parse_local_results from .map_results import parse_map_results @@ -57,6 +58,7 @@ ('news_quotes', parse_news_quotes, 'News Quotes'), ('people_also_ask', parse_people_also_ask, 'People Also Ask'), ('perspectives', parse_perspectives, 'Perspectives & Opinions'), + ('recent_posts', parse_recent_posts, 'Recent Posts'), ('scholarly_articles', parse_scholarly_articles, 'Scholar Articles'), ('searches_related', parse_searches_related, 'Related Searches'), ('shopping_ads', parse_shopping_ads, 'Shopping Ad'), diff --git a/WebSearcher/component_parsers/recent_posts.py b/WebSearcher/component_parsers/recent_posts.py new file mode 100644 index 0000000..ee0a24d --- /dev/null +++ b/WebSearcher/component_parsers/recent_posts.py @@ -0,0 +1,14 @@ +from .top_stories import parse_top_stories + +def parse_recent_posts(cmpt): + """Parse a "Recent posts" component + + These components have a similar carousel as Top Stories and Perspectives. 
+ + Args: + cmpt (bs4 object): A html component + + Returns: + dict : parsed result + """ + return parse_top_stories(cmpt, ctype='recent_posts') From f775eac6a268bc6a407a0f21c2e4ac2c7e93a02f Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 17:38:33 -0700 Subject: [PATCH 081/101] update: remove duplicate log --- WebSearcher/extractors/extractor_footer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/WebSearcher/extractors/extractor_footer.py b/WebSearcher/extractors/extractor_footer.py index ccf3397..abe2530 100644 --- a/WebSearcher/extractors/extractor_footer.py +++ b/WebSearcher/extractors/extractor_footer.py @@ -20,7 +20,6 @@ def extract(self): self.soup, 'div', {'id': ['bres', 'brs']} ) if footer_component_divs: - log.debug(f"footer_components: {len(footer_component_divs):,}") for footer_component_div in footer_component_divs: expanded_divs = webutils.find_all_divs( footer_component_div, "div", {"class": "MjjYud"} From e0edd4e6b013cda8ede766d44f5efff6ffdf7c9b Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 17:38:46 -0700 Subject: [PATCH 082/101] version: 0.6.5.dev5 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index d0c3d64..61e16df 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.5.dev4" +__version__ = "0.6.5.dev5" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index 6dfbf8f..34de6e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.5.dev4" +version = "0.6.5.dev5" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. 
Robertson", email = ""}] keywords = ["web", "search", "parser"] From bd02fa8983464f2bb6e13df06874848a64265e27 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 20:56:19 -0700 Subject: [PATCH 083/101] fix: missing comma --- WebSearcher/classifiers/header_text.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/WebSearcher/classifiers/header_text.py b/WebSearcher/classifiers/header_text.py index 713f5e5..b247af8 100644 --- a/WebSearcher/classifiers/header_text.py +++ b/WebSearcher/classifiers/header_text.py @@ -90,7 +90,8 @@ def _get_header_level_mapping(level) -> dict: "local_results": [ "Local Results", "Locations", - "Places", "Sitios" + "Places", + "Sitios", "Businesses", "locations", ], From 71e1a552bbb0683dfa741d1861fd528d24b2e216 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 20:56:45 -0700 Subject: [PATCH 084/101] update: main column extractors --- WebSearcher/extractors/extractor_main.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/WebSearcher/extractors/extractor_main.py b/WebSearcher/extractors/extractor_main.py index 9f772df..af7b399 100644 --- a/WebSearcher/extractors/extractor_main.py +++ b/WebSearcher/extractors/extractor_main.py @@ -102,30 +102,30 @@ def extract_from_standard(self, drop_tags:set={}) -> list: standard_layouts = { "standard-0": rso_div.find('div', {'id':'kp-wp-tab-overview'}), "standard-1": rso_div.find('div', {'id':'kp-wp-tab-Songs'}), + "standard-2": rso_div.find('div', {'id':'kp-wp-tab-SportsStandings'}), } for layout_name, layout_div in standard_layouts.items(): if layout_div: if layout_div.find_all("div"): - return self._extract_from_standard(layout_name) + return self._extract_from_standard_sub_type(layout_name) - # self.layout_label = layout_name - # return self._extract_from_standard(layout_name) - + top_divs = ExtractorMain.extract_top_divs(self.layout_divs['top-bars']) or [] col = ExtractorMain.extract_children(rso_div, drop_tags) + col = top_divs + col col = [c for c in col if ExtractorMain.is_valid(c)] if not col: - self.layout_label = 'standard-2' + self.layout_label = 'standard-3' log.debug(f"main_layout: {self.layout_label} (update)") divs = rso_div.find_all('div', {'id':'kp-wp-tab-overview'}) col = sum([d.find_all('div', {'class':'TzHB6b'}) for d in divs], []) return col - def _extract_from_standard(self, sub_type:str = "") -> list: + def _extract_from_standard_sub_type(self, sub_type:str = "") -> list: self.layout_label = sub_type rso_div = self.layout_divs['rso'] log.debug(f"main_layout: {self.layout_label} (update)") - + if self.layout_label == "standard-0": column = [] top_divs = ExtractorMain.extract_top_divs(self.layout_divs['top-bars']) or [] @@ -144,6 +144,16 @@ def _extract_from_standard(self, sub_type:str = "") -> list: column = [div for div in column if div.name not in {'script', 'style'}] column = webutils.filter_empty_divs(column) return column + + if self.layout_label == "standard-2": + column = [] + top_divs = ExtractorMain.extract_top_divs(self.layout_divs['top-bars']) or [] + main_divs = rso_div.find('div', {'id':'kp-wp-tab-SportsStandings'}).children or [] + column.extend(top_divs) + column.extend(main_divs) + column = [div for div in column if div.name not in {'script', 'style'}] + column = webutils.filter_empty_divs(column) + return column def extract_from_top_bar(self, drop_tags:set={}) -> list: From 9955b7c591331277a62a34bf8c3477aec6bbcc70 Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 9 May 2025 09:10:32 -0700 Subject: 
[PATCH 085/101] update: bump h11 per dependabot --- poetry.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 34ccb6a..79638ee 100644 --- a/poetry.lock +++ b/poetry.lock @@ -558,14 +558,14 @@ typing = ["typing-extensions (>=4.12.2) ; python_version < \"3.11\""] [[package]] name = "h11" -version = "0.14.0" +version = "0.16.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" groups = ["main"] files = [ - {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, - {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, + {file = "h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86"}, + {file = "h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1"}, ] [[package]] From 951a5dac0c35d9f13b9733290e49fd7b69340cbd Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 14 May 2025 18:59:17 -0700 Subject: [PATCH 086/101] fix: handle serps with no rcnt div --- WebSearcher/extractors/extractor_main.py | 3 +-- WebSearcher/webutils.py | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/WebSearcher/extractors/extractor_main.py b/WebSearcher/extractors/extractor_main.py index af7b399..70c9550 100644 --- a/WebSearcher/extractors/extractor_main.py +++ b/WebSearcher/extractors/extractor_main.py @@ -43,10 +43,9 @@ def get_layout(self): layout_divs = {} layout_divs['rso'] = self.soup.find('div', {'id':'rso'}) layout_divs['left-bar'] = self.soup.find('div', {'class': 'OeVqAd'}) - # layout_divs['top-bars'] = self.soup.find_all('div', {'class': ['XqFnDf', 'M8OgIe']}) rcnt = self.soup.find('div', {'id':'rcnt'}) - layout_divs['top-bars'] = rcnt.find_all('div', {'class': ['XqFnDf', 'M8OgIe']}, recursive=False) + layout_divs['top-bars'] = webutils.find_all_divs(rcnt, 'div', {'class': ['XqFnDf', 'M8OgIe']}) # Layout classifications layouts = {} diff --git a/WebSearcher/webutils.py b/WebSearcher/webutils.py index a36c2b4..f4db20f 100644 --- a/WebSearcher/webutils.py +++ b/WebSearcher/webutils.py @@ -122,6 +122,8 @@ def get_link_list(soup: BeautifulSoup, attrs: dict = {}, key: str = 'href', filt return [link.attrs.get(key, None) for link in links] if links else None def find_all_divs(soup: BeautifulSoup, name: str, attrs: dict = {}, filter_empty: bool = True) -> list: + if not soup: + return [] divs = soup.find_all(name, attrs) if attrs else soup.find_all(name) divs = filter_empty_divs(divs) if filter_empty else divs return divs From 22f639b2fcebbda71e6bf83161b31a486bb6ed69 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 15 May 2025 14:40:04 -0700 Subject: [PATCH 087/101] update: stricter news_quotes classification, more knowledge classifier signals --- WebSearcher/classifiers/main.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/WebSearcher/classifiers/main.py b/WebSearcher/classifiers/main.py index 4130fd3..616a539 100644 --- a/WebSearcher/classifiers/main.py +++ b/WebSearcher/classifiers/main.py @@ -1,9 +1,9 @@ +import bs4 from .. import logger log = logger.Logger().start(__name__) from .header_text import ClassifyHeaderText from .. 
import webutils -import bs4 class ClassifyMain: """Classify a component from the main section based on its bs4.element.Tag """ @@ -151,7 +151,8 @@ def knowledge_panel(cmpt: bs4.element.Tag) -> str: cmpt.find("div", {"class": ["knowledge-panel", "knavi", "kp-blk", "kp-wholepage-osrp"]}), cmpt.find("div", {"aria-label": "Featured results", "role": "complementary"}), cmpt.find("div", {"jscontroller": "qTdDb"}), - webutils.check_dict_value(cmpt.attrs, "jscontroller", "qTdDb") + webutils.check_dict_value(cmpt.attrs, "jscontroller", "qTdDb"), + cmpt.find('div', {'class':'obcontainer'}) ] return 'knowledge' if any(conditions) else "unknown" @@ -187,10 +188,9 @@ def top_stories(cmpt: bs4.element.Tag) -> str: @staticmethod def news_quotes(cmpt: bs4.element.Tag) -> str: """Classify top stories components""" - conditions = [ - cmpt.find("g-tray-header", role="heading"), - ] - return 'news_quotes' if all(conditions) else "unknown" + header_div = cmpt.find("g-tray-header", role="heading") + condition = webutils.get_text(header_div, strip=True) == "News quotes" + return 'news_quotes' if condition else "unknown" @staticmethod def twitter(cmpt: bs4.element.Tag) -> str: From ebcca84085ff55348699e46551e330c884969bfb Mon Sep 17 00:00:00 2001 From: gitronald Date: Sat, 17 May 2025 15:17:25 -0700 Subject: [PATCH 088/101] fix: stricter parsing for songs id div --- WebSearcher/extractors/extractor_main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebSearcher/extractors/extractor_main.py b/WebSearcher/extractors/extractor_main.py index 70c9550..7be0c19 100644 --- a/WebSearcher/extractors/extractor_main.py +++ b/WebSearcher/extractors/extractor_main.py @@ -100,7 +100,7 @@ def extract_from_standard(self, drop_tags:set={}) -> list: rso_div = self.layout_divs['rso'] standard_layouts = { "standard-0": rso_div.find('div', {'id':'kp-wp-tab-overview'}), - "standard-1": rso_div.find('div', {'id':'kp-wp-tab-Songs'}), + "standard-1": rso_div.find('div', {'id':'kp-wp-tab-cont-Songs', 'role':'tabpanel'}), "standard-2": rso_div.find('div', {'id':'kp-wp-tab-SportsStandings'}), } for layout_name, layout_div in standard_layouts.items(): From cc9a93884d6f6238b8965096643d148767c0d2d2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 23 May 2025 19:35:41 +0000 Subject: [PATCH 089/101] build(deps-dev): bump tornado from 6.4.2 to 6.5.1 Bumps [tornado](https://github.com/tornadoweb/tornado) from 6.4.2 to 6.5.1. - [Changelog](https://github.com/tornadoweb/tornado/blob/master/docs/releases.rst) - [Commits](https://github.com/tornadoweb/tornado/compare/v6.4.2...v6.5.1) --- updated-dependencies: - dependency-name: tornado dependency-version: 6.5.1 dependency-type: indirect ... Signed-off-by: dependabot[bot] --- poetry.lock | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/poetry.lock b/poetry.lock index 34ccb6a..8d5c7a8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1979,23 +1979,24 @@ files = [ [[package]] name = "tornado" -version = "6.4.2" +version = "6.5.1" description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed." 
optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "tornado-6.4.2-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:e828cce1123e9e44ae2a50a9de3055497ab1d0aeb440c5ac23064d9e44880da1"}, - {file = "tornado-6.4.2-cp38-abi3-macosx_10_9_x86_64.whl", hash = "sha256:072ce12ada169c5b00b7d92a99ba089447ccc993ea2143c9ede887e0937aa803"}, - {file = "tornado-6.4.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a017d239bd1bb0919f72af256a970624241f070496635784d9bf0db640d3fec"}, - {file = "tornado-6.4.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c36e62ce8f63409301537222faffcef7dfc5284f27eec227389f2ad11b09d946"}, - {file = "tornado-6.4.2-cp38-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca9eb02196e789c9cb5c3c7c0f04fb447dc2adffd95265b2c7223a8a615ccbf"}, - {file = "tornado-6.4.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:304463bd0772442ff4d0f5149c6f1c2135a1fae045adf070821c6cdc76980634"}, - {file = "tornado-6.4.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:c82c46813ba483a385ab2a99caeaedf92585a1f90defb5693351fa7e4ea0bf73"}, - {file = "tornado-6.4.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:932d195ca9015956fa502c6b56af9eb06106140d844a335590c1ec7f5277d10c"}, - {file = "tornado-6.4.2-cp38-abi3-win32.whl", hash = "sha256:2876cef82e6c5978fde1e0d5b1f919d756968d5b4282418f3146b79b58556482"}, - {file = "tornado-6.4.2-cp38-abi3-win_amd64.whl", hash = "sha256:908b71bf3ff37d81073356a5fadcc660eb10c1476ee6e2725588626ce7e5ca38"}, - {file = "tornado-6.4.2.tar.gz", hash = "sha256:92bad5b4746e9879fd7bf1eb21dce4e3fc5128d71601f80005afa39237ad620b"}, + {file = "tornado-6.5.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:d50065ba7fd11d3bd41bcad0825227cc9a95154bad83239357094c36708001f7"}, + {file = "tornado-6.5.1-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:9e9ca370f717997cb85606d074b0e5b247282cf5e2e1611568b8821afe0342d6"}, + {file = "tornado-6.5.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b77e9dfa7ed69754a54c89d82ef746398be82f749df69c4d3abe75c4d1ff4888"}, + {file = "tornado-6.5.1-cp39-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:253b76040ee3bab8bcf7ba9feb136436a3787208717a1fb9f2c16b744fba7331"}, + {file = "tornado-6.5.1-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:308473f4cc5a76227157cdf904de33ac268af770b2c5f05ca6c1161d82fdd95e"}, + {file = "tornado-6.5.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:caec6314ce8a81cf69bd89909f4b633b9f523834dc1a352021775d45e51d9401"}, + {file = "tornado-6.5.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:13ce6e3396c24e2808774741331638ee6c2f50b114b97a55c5b442df65fd9692"}, + {file = "tornado-6.5.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5cae6145f4cdf5ab24744526cc0f55a17d76f02c98f4cff9daa08ae9a217448a"}, + {file = "tornado-6.5.1-cp39-abi3-win32.whl", hash = "sha256:e0a36e1bc684dca10b1aa75a31df8bdfed656831489bc1e6a6ebed05dc1ec365"}, + {file = "tornado-6.5.1-cp39-abi3-win_amd64.whl", hash = "sha256:908e7d64567cecd4c2b458075589a775063453aeb1d2a1853eedb806922f568b"}, + {file = "tornado-6.5.1-cp39-abi3-win_arm64.whl", hash = "sha256:02420a0eb7bf617257b9935e2b754d1b63897525d8a289c9d65690d580b4dcf7"}, + {file = "tornado-6.5.1.tar.gz", hash = 
"sha256:84ceece391e8eb9b2b95578db65e920d2a61070260594819589609ba9bc6308c"}, ] [[package]] From 8daaec7a943840f10b95d04b6f98f8c2d5fdb6dc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 10 Jun 2025 10:04:13 +0000 Subject: [PATCH 090/101] build(deps): bump requests from 2.32.3 to 2.32.4 Bumps [requests](https://github.com/psf/requests) from 2.32.3 to 2.32.4. - [Release notes](https://github.com/psf/requests/releases) - [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md) - [Commits](https://github.com/psf/requests/compare/v2.32.3...v2.32.4) --- updated-dependencies: - dependency-name: requests dependency-version: 2.32.4 dependency-type: indirect ... Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 34ccb6a..941d541 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1742,19 +1742,19 @@ cffi = {version = "*", markers = "implementation_name == \"pypy\""} [[package]] name = "requests" -version = "2.32.3" +version = "2.32.4" description = "Python HTTP for Humans." optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, - {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, + {file = "requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c"}, + {file = "requests-2.32.4.tar.gz", hash = "sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422"}, ] [package.dependencies] certifi = ">=2017.4.17" -charset-normalizer = ">=2,<4" +charset_normalizer = ">=2,<4" idna = ">=2.5,<4" urllib3 = ">=1.21.1,<3" From a4a4a3239e9bef32676aa4d9950ef8c287ab7513 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 17 Jun 2025 03:45:08 +0000 Subject: [PATCH 091/101] build(deps): bump protobuf from 6.30.0 to 6.31.1 Bumps [protobuf](https://github.com/protocolbuffers/protobuf) from 6.30.0 to 6.31.1. - [Release notes](https://github.com/protocolbuffers/protobuf/releases) - [Changelog](https://github.com/protocolbuffers/protobuf/blob/main/protobuf_release.bzl) - [Commits](https://github.com/protocolbuffers/protobuf/compare/v6.30.0...v6.31.1) --- updated-dependencies: - dependency-name: protobuf dependency-version: 6.31.1 dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/poetry.lock b/poetry.lock index 34ccb6a..55752a3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1288,21 +1288,21 @@ wcwidth = "*" [[package]] name = "protobuf" -version = "6.30.0" +version = "6.31.1" description = "" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "protobuf-6.30.0-cp310-abi3-win32.whl", hash = "sha256:7337d76d8efe65ee09ee566b47b5914c517190196f414e5418fa236dfd1aed3e"}, - {file = "protobuf-6.30.0-cp310-abi3-win_amd64.whl", hash = "sha256:9b33d51cc95a7ec4f407004c8b744330b6911a37a782e2629c67e1e8ac41318f"}, - {file = "protobuf-6.30.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:52d4bb6fe76005860e1d0b8bfa126f5c97c19cc82704961f60718f50be16942d"}, - {file = "protobuf-6.30.0-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:7940ab4dfd60d514b2e1d3161549ea7aed5be37d53bafde16001ac470a3e202b"}, - {file = "protobuf-6.30.0-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:d79bf6a202a536b192b7e8d295d7eece0c86fbd9b583d147faf8cfeff46bf598"}, - {file = "protobuf-6.30.0-cp39-cp39-win32.whl", hash = "sha256:bb35ad251d222f03d6c4652c072dfee156be0ef9578373929c1a7ead2bd5492c"}, - {file = "protobuf-6.30.0-cp39-cp39-win_amd64.whl", hash = "sha256:501810e0eba1d327e783fde47cc767a563b0f1c292f1a3546d4f2b8c3612d4d0"}, - {file = "protobuf-6.30.0-py3-none-any.whl", hash = "sha256:e5ef216ea061b262b8994cb6b7d6637a4fb27b3fb4d8e216a6040c0b93bd10d7"}, - {file = "protobuf-6.30.0.tar.gz", hash = "sha256:852b675d276a7d028f660da075af1841c768618f76b90af771a8e2c29e6f5965"}, + {file = "protobuf-6.31.1-cp310-abi3-win32.whl", hash = "sha256:7fa17d5a29c2e04b7d90e5e32388b8bfd0e7107cd8e616feef7ed3fa6bdab5c9"}, + {file = "protobuf-6.31.1-cp310-abi3-win_amd64.whl", hash = "sha256:426f59d2964864a1a366254fa703b8632dcec0790d8862d30034d8245e1cd447"}, + {file = "protobuf-6.31.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:6f1227473dc43d44ed644425268eb7c2e488ae245d51c6866d19fe158e207402"}, + {file = "protobuf-6.31.1-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:a40fc12b84c154884d7d4c4ebd675d5b3b5283e155f324049ae396b95ddebc39"}, + {file = "protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:4ee898bf66f7a8b0bd21bce523814e6fbd8c6add948045ce958b73af7e8878c6"}, + {file = "protobuf-6.31.1-cp39-cp39-win32.whl", hash = "sha256:0414e3aa5a5f3ff423828e1e6a6e907d6c65c1d5b7e6e975793d5590bdeecc16"}, + {file = "protobuf-6.31.1-cp39-cp39-win_amd64.whl", hash = "sha256:8764cf4587791e7564051b35524b72844f845ad0bb011704c3736cce762d8fe9"}, + {file = "protobuf-6.31.1-py3-none-any.whl", hash = "sha256:720a6c7e6b77288b85063569baae8536671b39f15cc22037ec7045658d80489e"}, + {file = "protobuf-6.31.1.tar.gz", hash = "sha256:d8cac4c982f0b957a4dc73a80e2ea24fab08e679c0de9deb835f4a12d69aca9a"}, ] [[package]] From a05270efcd5e3601394fb4be8d04c18f74b3ea52 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 19 Jun 2025 04:51:01 +0000 Subject: [PATCH 092/101] build(deps): bump urllib3 from 2.3.0 to 2.5.0 Bumps [urllib3](https://github.com/urllib3/urllib3) from 2.3.0 to 2.5.0. - [Release notes](https://github.com/urllib3/urllib3/releases) - [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst) - [Commits](https://github.com/urllib3/urllib3/compare/2.3.0...2.5.0) --- updated-dependencies: - dependency-name: urllib3 dependency-version: 2.5.0 dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 34ccb6a..7a88f9a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2113,14 +2113,14 @@ websockets = "*" [[package]] name = "urllib3" -version = "2.3.0" +version = "2.5.0" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df"}, - {file = "urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d"}, + {file = "urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc"}, + {file = "urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760"}, ] [package.dependencies] From 0209fdba55fd70c6004f974ea7618ac55b420af9 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 14 Oct 2025 08:16:39 -0700 Subject: [PATCH 093/101] version: 0.6.5a0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 34de6e7..019cd14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.5.dev5" +version = "0.6.5a0" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From b7cc70011c858c75e2e14468624c937c8dc1ecd1 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 14 Oct 2025 08:17:20 -0700 Subject: [PATCH 094/101] refactor: convert Footer methods to staticmethod --- WebSearcher/component_parsers/footer.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/WebSearcher/component_parsers/footer.py b/WebSearcher/component_parsers/footer.py index b60ff86..e45a044 100644 --- a/WebSearcher/component_parsers/footer.py +++ b/WebSearcher/component_parsers/footer.py @@ -2,13 +2,13 @@ class Footer: - @classmethod - def parse_image_cards(self, elem) -> list: + @staticmethod + def parse_image_cards(elem) -> list: subs = webutils.find_all_divs(elem, 'div', {'class':'g'}) - return [self.parse_image_card(sub, sub_rank) for sub_rank, sub in enumerate(subs)] + return [Footer.parse_image_card(sub, sub_rank) for sub_rank, sub in enumerate(subs)] - @classmethod - def parse_image_card(self, sub, sub_rank=0) -> dict: + @staticmethod + def parse_image_card(sub, sub_rank=0) -> dict: parsed = {'type':'img_cards', 'sub_rank':sub_rank} parsed['title'] = webutils.get_text(sub, "div", {'aria-level':"3", "role":"heading"}) images = sub.find_all('img') @@ -16,8 +16,8 @@ def parse_image_card(self, sub, sub_rank=0) -> dict: parsed['details'] = [{'text':i['alt'], 'url':i['src']} for i in images] return parsed - @classmethod - def parse_discover_more(self, elem) -> list: + @staticmethod + def parse_discover_more(elem) -> list: carousel = elem.find('g-scrolling-carousel') return [{ 'type':'discover_more', @@ -25,8 +25,8 @@ def parse_discover_more(self, elem) -> list: 'text': '|'.join(c.text for c in carousel.find_all('g-inner-card')) }] - @classmethod - def parse_omitted_notice(self, elem) -> list: + @staticmethod + def parse_omitted_notice(elem) -> list: return [{ 'type':'omitted_notice', 'sub_rank':0, From e66515c8ec4e9ecc4fb9c4e27727d99f6a14a996 Mon Sep 17 00:00:00 2001 From: 
gitronald Date: Tue, 14 Oct 2025 08:29:30 -0700 Subject: [PATCH 095/101] fix: update demo-search entry point to use typer app --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 019cd14..a814d57 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ homepage = "http://github.com/gitronald/WebSearcher" repository = "http://github.com/gitronald/WebSearcher" [project.scripts] -demo-search = 'scripts.demo_search:main' +demo-search = 'scripts.demo_search:app' [tool.poetry] packages = [{include = "WebSearcher"}] From bb938d5719d0aeb5db591795a5290ba1ea84bfa7 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 14 Oct 2025 08:29:41 -0700 Subject: [PATCH 096/101] update: version in __init__.py to match pyproject.toml --- WebSearcher/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index 61e16df..f8bdeb9 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.5.dev5" +__version__ = "0.6.5a0" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor From cc33dea9b326a29b648b6ce073f335de6b921afe Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 14 Oct 2025 08:47:42 -0700 Subject: [PATCH 097/101] update: default Chrome version to 141 --- README.md | 4 ++-- WebSearcher/models/configs.py | 2 +- WebSearcher/search_methods/selenium_searcher.py | 2 -- scripts/demo_search.py | 2 +- scripts/demo_searches.py | 2 +- 5 files changed, 5 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 677c408..ba79109 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,7 @@ drwxr-xr-x 2 user user 4.0K 2024-11-11 10:55 html/ ### Step by Step -Example search and parse pipeline: +Example search and parse pipeline (via requests): ```python import WebSearcher as ws @@ -143,7 +143,7 @@ se = ws.SearchEngine( "headless": False, "use_subprocess": False, "driver_executable_path": "", - "version_main": 133, + "version_main": 141, } ) ``` diff --git a/WebSearcher/models/configs.py b/WebSearcher/models/configs.py index 81e011d..5d6ea80 100644 --- a/WebSearcher/models/configs.py +++ b/WebSearcher/models/configs.py @@ -25,7 +25,7 @@ class LogConfig(BaseConfig): class SeleniumConfig(BaseConfig): headless: bool = False - version_main: int = 133 + version_main: int = 141 use_subprocess: bool = False driver_executable_path: str = "" diff --git a/WebSearcher/search_methods/selenium_searcher.py b/WebSearcher/search_methods/selenium_searcher.py index 1e67025..e4b1e16 100644 --- a/WebSearcher/search_methods/selenium_searcher.py +++ b/WebSearcher/search_methods/selenium_searcher.py @@ -136,14 +136,12 @@ def cleanup(self) -> bool: try: self.delete_cookies() self.close_all_windows() - # Finally quit the driver self.driver.quit() self.driver = None self.log.debug(f'Browser successfully closed') return True except Exception as e: self.log.warning(f'Failed to close browser: {e}') - # Force driver to be None so we create a fresh instance next time self.driver = None return False return True diff --git a/scripts/demo_search.py b/scripts/demo_search.py index 3debcaf..ebfdd1c 100644 --- a/scripts/demo_search.py +++ b/scripts/demo_search.py @@ -22,7 +22,7 @@ def main( data_dir: str = typer.Option(DEFAULT_DATA_DIR, help="Prefix for output files"), headless: bool = typer.Option(False, help="Run browser in headless mode"), use_subprocess: bool = typer.Option(False, help="Run 
browser in a separate subprocess"), - version_main: int = typer.Option(133, help="Main version of Chrome to use"), + version_main: int = typer.Option(141, help="Main version of Chrome to use"), ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), ) -> None: diff --git a/scripts/demo_searches.py b/scripts/demo_searches.py index 82eee67..f63f454 100644 --- a/scripts/demo_searches.py +++ b/scripts/demo_searches.py @@ -22,7 +22,7 @@ def main( data_dir: str = typer.Option(DEFAULT_DATA_DIR, help="Prefix for output files"), headless: bool = typer.Option(False, help="Run browser in headless mode"), use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), - version_main: int = typer.Option(133, help="Main version of Chrome to use"), + version_main: int = typer.Option(141, help="Main version of Chrome to use"), ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), ) -> None: From 1cb9ae5c274493d445b8c8069b6e1927103c9798 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 14 Oct 2025 08:54:55 -0700 Subject: [PATCH 098/101] update: bump requests to 2.32.4 and protobuf to 6.31.1 --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a814d57..16f8df9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ license = "GPL-3.0" readme = "README.md" requires-python = ">=3.10" dependencies = [ - "requests>=2.32.3", + "requests>=2.32.4", "lxml>=5.3.0", "beautifulsoup4>=4.12.3", "tldextract>=5.1.2", @@ -17,7 +17,7 @@ dependencies = [ "pandas>=2.2.3", "undetected-chromedriver>=3.5.5", "selenium>=4.9.0", - "protobuf (>=6.30.0,<7.0.0)", + "protobuf (>=6.31.1,<7.0.0)", "orjson (>=3.10.16,<4.0.0)", ] From 3cb10932d00a107d57c7534d8efbcd5f35ba6b37 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 14 Oct 2025 08:57:02 -0700 Subject: [PATCH 099/101] update: regenerate poetry.lock --- poetry.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 22c0970..369c990 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. 
[[package]] name = "annotated-types" @@ -515,7 +515,7 @@ description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" groups = ["main", "dev"] -markers = "python_version < \"3.11\"" +markers = "python_version == \"3.10\"" files = [ {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, @@ -1941,7 +1941,7 @@ description = "A lil' TOML parser" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version < \"3.11\"" +markers = "python_version == \"3.10\"" files = [ {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, {file = "tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6"}, @@ -2259,4 +2259,4 @@ h11 = ">=0.9.0,<1" [metadata] lock-version = "2.1" python-versions = ">=3.10" -content-hash = "684e3794b5ea4541fde5a46b9bf83f67cbeedcecf4cd969dce683ffc3210b382" +content-hash = "c571829b60451314f3df0749f1f8f8b553bdfe22d4e8a183c096335cfae000ae" From eb3cec9442463487e8d36cfea45b92f039a5ebf8 Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 5 Dec 2025 13:19:56 -0800 Subject: [PATCH 100/101] update: github actions readme section --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index ba79109..fa43bd2 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ Below are some details about recent updates. For a longer list, see the [Update - [Repair or Enhance a Parser](#repair-or-enhance-a-parser) - [Add a Parser](#add-a-parser) - [Testing](#testing) + - [GitHub Actions](#github-actions) - [Update Log](#update-log) - [Similar Packages](#similar-packages) - [License](#license) @@ -253,6 +254,22 @@ With the `-k` flag you can run a test for a specific html file: pytest -k "1684837514.html" ``` +--- +## GitHub Actions + +This repository uses GitHub Actions for automated publishing: + +**Release Workflow** (`.github/workflows/publish.yml`) +Automatically publishes to PyPI when a pull request is merged into `master`. The workflow: +- Triggers on merged PRs to `master` +- Builds the package using Poetry +- Publishes to PyPI using trusted publishing (no API tokens required) + +To release a new version: +1. Update the version in `pyproject.toml` +2. Create a PR to `master` +3. 
Once merged, the package is automatically published to PyPI + --- ## Update Log From a864e09ccf4318d0b05c154c3be24397ba18f52a Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 5 Dec 2025 13:23:51 -0800 Subject: [PATCH 101/101] version: 0.6.5 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index f8bdeb9..6380b2f 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.5a0" +__version__ = "0.6.5" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index 16f8df9..a63593b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.5a0" +version = "0.6.5" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"]