From 63c4cc993cff3a767c68156184791aff158bae26 Mon Sep 17 00:00:00 2001 From: EvanUp Date: Mon, 10 Feb 2025 15:24:54 -0500 Subject: [PATCH 001/101] switching from requests to selenium --- .python-version | 1 + README.md | 21 +++++--- WebSearcher/searchers.py | 103 +++++++++++++++++++++++++++++---------- pyproject.toml | 2 + scripts/demo_search.py | 1 + tests/selenium_test.py | 7 +++ 6 files changed, 102 insertions(+), 33 deletions(-) create mode 100644 .python-version create mode 100644 tests/selenium_test.py diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..c84ccce --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.10.5 diff --git a/README.md b/README.md index 31f1e97..a611527 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,10 @@ ## Tools for conducting and parsing web searches [![PyPI version](https://badge.fury.io/py/WebSearcher.svg)](https://badge.fury.io/py/WebSearcher) +NOTE: In 0.5.*, we moved scraping to selenium + This package provides tools for conducting algorithm audits of web search and -includes a scraper built on `requests` with tools for geolocating, conducting, +includes a scraper built on `selenium` with tools for geolocating, conducting, and saving searches. It also includes a modular parser built on `BeautifulSoup` for decomposing a SERP into list of components with categorical classifications and position-based specifications. @@ -104,13 +106,14 @@ drwxr-xr-x 2 user user 4.0K 2024-11-11 10:55 html/ -rw-r--r-- 1 user user 990K 2024-11-11 10:55 serps.json ``` -### Step by Step +### Step by Step Example search and parse pipeline: ```python import WebSearcher as ws se = ws.SearchEngine() # 1. Initialize collector +se.launch_chromedriver(headless=False) # 2. Launch undetected chromedriver window se.search('immigration news') # 2. Conduct a search se.parse_results() # 3. Parse search results se.save_serp(append_to='serps.json') # 4. Save HTML and metadata @@ -153,14 +156,20 @@ vars(se) 'log': } ``` -#### 2. Conduct a Search +#### 2. Launch undetected chromedriver window +We've switched to using [undetected chrome](https://github.com/ultrafunkamsterdam/undetected-chromedriver) to scrape search results. You'll need to ensure that your chromedriver is up-to-date. All cookies are deleted following each search.launch_chromedriver accepts 3 optional arguments. The defaults are: + +se.launch_chromedriver(headless = False, use_subprocess = False, chromedriver_path = '') + + +#### 3. Conduct a Search ```python se.search('immigration news') # 2024-08-19 14:09:18.502 | INFO | WebSearcher.searchers | 200 | immigration news ``` -#### 3. Parse Search Results +#### 4. Parse Search Results The example below is primarily for parsing search results as you collect HTML. See `ws.parse_serp(html)` for parsing existing HTML data. @@ -185,7 +194,7 @@ se.results[0] ``` -#### 4. Save HTML and Metadata +#### 5. Save HTML and Metadata Recommended: Append html and meta data as lines to a json file for larger or ongoing collections. @@ -200,7 +209,7 @@ Alternative: Save individual html files in a directory, named by a provided or ( se.save_serp(save_dir='./serps') ``` -#### 5. Save Parsed Results +#### 6. Save Parsed Results Save to a json lines file. diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 13e2444..4f45bd1 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -4,12 +4,19 @@ from . import utils from . 
import logger from .models import BaseSERP +# selenium updates +import undetected_chromedriver as uc +from selenium.webdriver.common.by import By +from selenium.webdriver.common.keys import Keys +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC import os import time import brotli import requests import subprocess + from datetime import datetime, timezone from typing import Any, Dict, Optional @@ -17,20 +24,30 @@ WS_VERSION = metadata.version('WebSearcher') # Default headers to send with requests (i.e. device fingerprint) -DEFAULT_HEADERS = { - 'Host': 'www.google.com', - 'Referer': 'https://www.google.com/', - 'Accept': '*/*', - 'Accept-Encoding': 'gzip,deflate,br', - 'Accept-Language': 'en-US,en;q=0.5', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0', -} + +#DEFAULT_HEADERS = { +# 'Host': 'www.google.com', +# 'Referer': 'https://www.google.com/', +# 'Accept': '*/*', +# 'Accept-Encoding': 'gzip,deflate,br', +# 'Accept-Language': 'en-US,en;q=0.5', +# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0', +#} + +#chromedriver_path = "/opt/homebrew/Caskroom/chromedriver/133.0.6943.53" +#driver = uc.Chrome(chromedriver_path = chromedriver_path) +#driver.get('https://www.google.com') +#search_box = driver.find_element(By.ID, "APjFqb") +#search_box.send_keys("how climate change works") +#search_box.send_keys(Keys.RETURN) +#html_content = driver.page_source + class SearchEngine: """Collect Search Engine Results Pages (SERPs)""" def __init__(self, - headers: Dict[str, str] = DEFAULT_HEADERS, + headers: Dict[str, str] = None, sesh: Optional[requests.Session] = None, ssh_tunnel: Optional[subprocess.Popen] = None, unzip: bool = True, @@ -54,8 +71,9 @@ def __init__(self, # Initialize data storage self.version: str = WS_VERSION self.base_url: str = 'https://www.google.com/search' - self.headers: Dict[str, str] = headers - self.sesh: requests.Session = sesh if sesh else wu.start_sesh(headers=self.headers) + self.headers: Dict[str, str] = None + #self.sesh: requests.Session = sesh if sesh else wu.start_sesh(headers=self.headers) + self.sesh = None self.ssh_tunnel: subprocess.Popen = ssh_tunnel self.unzip: bool = unzip self.params: Dict[str, Any] = {} @@ -83,6 +101,11 @@ def __init__(self, file_level=log_level, ).start(__name__) + def launch_chromedriver(self, headless = False, use_subprocess = False, chromedriver_path = ''): + self.headless = headless + self.use_subprocess = use_subprocess + self.chromedriver_path = chromedriver_path + self._init_chromedriver() def search(self, qry: str, location: str = None, num_results: int = None, serp_id: str = '', crawl_id: str = ''): """Conduct a search and save HTML @@ -95,9 +118,8 @@ def search(self, qry: str, location: str = None, num_results: int = None, serp_i crawl_id (str, optional): An identifier for this crawl """ self._prepare_search(qry=qry, location=location, num_results=num_results) - self._conduct_search(serp_id=serp_id, crawl_id=crawl_id) - self._handle_response() - + self._conduct_chromedriver_search(serp_id=serp_id, crawl_id=crawl_id) + #self._handle_response() def _prepare_search(self, qry: str, location: str = None, num_results: int = None): """Prepare a search URL and metadata for the given query and location""" @@ -111,23 +133,41 @@ def _prepare_search(self, qry: str, location: str = None, num_results: int = Non if self.loc and self.loc != 'None': self.params['uule'] = 
locations.get_location_id(canonical_name=self.loc) + def _init_chromedriver(self): + print('launching...') + if self.chromedriver_path == '': + #optionally: headless=True, use_subprocess=True + self.driver = uc.Chrome(headless = self.headless, subprocess = self.use_subprocess) + else: + self.driver = uc.Chrome(headless = self.headless, subprocess = self.use_subprocess, chromedriver_path = self.chromedriver_path) + #chromedriver_path = "/opt/homebrew/Caskroom/chromedriver/133.0.6943.53" + time.sleep(2) + self.driver.get('https://www.google.com') + time.sleep(2) - def _conduct_search(self, serp_id: str = '', crawl_id: str = ''): + def _conduct_chromedriver_search(self, serp_id: str = '', crawl_id: str = ''): """Send a search request and handle errors""" - self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() self.serp_id = serp_id if serp_id else utils.hash_id(self.qry + self.loc + self.timestamp) self.crawl_id = crawl_id try: - self._send_request() - except requests.exceptions.ConnectionError: - self.log.exception(f'SERP | Connection error | {self.serp_id}') - self._reset_ssh_tunnel() - except requests.exceptions.Timeout: - self.log.exception(f'SERP | Timeout error | {self.serp_id}') - except Exception: + self._send_chromedriver_request() + except: self.log.exception(f'SERP | Unknown error | {self.serp_id}') + self.driver.delete_all_cookies() + def _send_chromedriver_request(self): + search_box = self.driver.find_element(By.ID, "APjFqb") + search_box.send_keys(self.qry) + search_box.send_keys(Keys.RETURN) + + # wait for the page to load + WebDriverWait(self.driver, 10).until( + EC.presence_of_element_located((By.ID, "search")) + ) + time.sleep(2) #including a sleep to allow the page to fully load + self.html = self.driver.page_source + self.url = self.driver.current_url def _send_request(self): self.url = f"{self.base_url}?{wu.join_url_quote(self.params)}" @@ -144,7 +184,6 @@ def _reset_ssh_tunnel(self): self.log.info(f'SERP | Restarted SSH tunnel | {self.serp_id}') time.sleep(10) # Allow time to establish connection - def _handle_response(self): try: if self.unzip: @@ -155,7 +194,6 @@ def _handle_response(self): except Exception: self.log.exception(f'Response handling error') - def _unzip_html(self): """Unzip brotli zipped html @@ -199,8 +237,8 @@ def prepare_serp_save(self): loc=self.loc, url=self.url, html=self.html, - response_code=self.response.status_code, - user_agent=self.headers['User-Agent'], + response_code= 0,#self.response.status_code, + user_agent='',#self.headers['User-Agent'], timestamp=self.timestamp, serp_id=self.serp_id, crawl_id=self.crawl_id, @@ -264,3 +302,14 @@ def save_results(self, save_dir: str = "", append_to: str = ""): utils.write_lines(self.results, fp) else: self.log.info(f'No parsed results for serp_id: {self.serp_id}') + + + + +#chromedriver_path = "/opt/homebrew/Caskroom/chromedriver/133.0.6943.53" +#ws = SearchEngine(chromedriver_path=chromedriver_path) +#ws.launch_chromedriver() +#qry = 'how climate change works' +#ws.search(qry) +#ws.parse_results() +#ws.results \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 57b0870..8a710ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,8 @@ dependencies = [ "brotli>=1.1.0", "pydantic>=2.9.2", "pandas>=2.2.3", + "undetected-chromedriver>=3.5.5", + "selenium>=4.9.0", ] [project.urls] diff --git a/scripts/demo_search.py b/scripts/demo_search.py index 9de4dde..94bfb68 100644 --- a/scripts/demo_search.py +++ b/scripts/demo_search.py @@ -29,6 +29,7 @@ def 
main(): # Search, parse, and save se = ws.SearchEngine() # Initialize searcher + se.launch_chromedriver(headless =False) # Launch browser se.search(args.query) # Conduct Search se.parse_results() # Parse Results se.save_serp(append_to=fp_serps) # Save SERP to json (html + metadata) diff --git a/tests/selenium_test.py b/tests/selenium_test.py new file mode 100644 index 0000000..f5fe929 --- /dev/null +++ b/tests/selenium_test.py @@ -0,0 +1,7 @@ +import WebSearcher as ws +se = ws.SearchEngine() # 1. Initialize collector +se.launch_chromedriver(headless = False) # 2. Launch undetected chromedriver window +se.search('immigration news') # 2. Conduct a search +se.parse_results() # 3. Parse search results +se.save_serp(append_to='serps.json') # 4. Save HTML and metadata +se.save_results(append_to='results.json') # 5. Save parsed results From 1d5e2554f002423a1e67882f6a9248bb640ea181 Mon Sep 17 00:00:00 2001 From: EvanUp Date: Mon, 17 Feb 2025 15:35:39 -0500 Subject: [PATCH 002/101] added code to expand ai overview text and urls --- WebSearcher/searchers.py | 43 +++++++++++++++++++++++++++++++++++++--- tests/selenium_test.py | 4 ++++ 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 4f45bd1..9f23f42 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -10,6 +10,7 @@ from selenium.webdriver.common.keys import Keys from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC +from selenium.common.exceptions import NoSuchElementException import os import time @@ -107,7 +108,7 @@ def launch_chromedriver(self, headless = False, use_subprocess = False, chromedr self.chromedriver_path = chromedriver_path self._init_chromedriver() - def search(self, qry: str, location: str = None, num_results: int = None, serp_id: str = '', crawl_id: str = ''): + def search(self, qry: str, ai_expand = False, location: str = None, num_results: int = None, serp_id: str = '', crawl_id: str = ''): """Conduct a search and save HTML Args: @@ -118,9 +119,10 @@ def search(self, qry: str, location: str = None, num_results: int = None, serp_i crawl_id (str, optional): An identifier for this crawl """ self._prepare_search(qry=qry, location=location, num_results=num_results) - self._conduct_chromedriver_search(serp_id=serp_id, crawl_id=crawl_id) + self._conduct_chromedriver_search(serp_id=serp_id, crawl_id=crawl_id, ai_expand=ai_expand) #self._handle_response() + def _prepare_search(self, qry: str, location: str = None, num_results: int = None): """Prepare a search URL and metadata for the given query and location""" self.qry = str(qry) @@ -145,7 +147,14 @@ def _init_chromedriver(self): self.driver.get('https://www.google.com') time.sleep(2) - def _conduct_chromedriver_search(self, serp_id: str = '', crawl_id: str = ''): + def _check_ai_expand(self): + try: + self.driver.find_element(By.XPATH, "//div[@jsname='rPRdsc' and @role='button']") + return True + except NoSuchElementException: + return False + + def _conduct_chromedriver_search(self, serp_id: str = '', crawl_id: str = '', ai_expand = False): """Send a search request and handle errors""" self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() self.serp_id = serp_id if serp_id else utils.hash_id(self.qry + self.loc + self.timestamp) @@ -154,10 +163,38 @@ def _conduct_chromedriver_search(self, serp_id: str = '', crawl_id: str = ''): self._send_chromedriver_request() except: self.log.exception(f'SERP | Unknown 
error | {self.serp_id}') + + ## Look for AI overview box and click on it + if ai_expand: + ai_button = self._check_ai_expand() + if ai_button: + try: + show_more_button = WebDriverWait(self.driver, 1).until( + EC.element_to_be_clickable((By.XPATH, "//div[@jsname='rPRdsc' and @role='button']")) + ) + show_more_button.click() + if show_more_button is not None: + try: + # Wait for additional content to load + time.sleep(2) + + show_all_button = WebDriverWait(self.driver, 1).until( + EC.element_to_be_clickable((By.XPATH, '//div[contains(@class, "trEk7e") and @role="button"]')) + ) + show_all_button.click() + except: + pass + except: + pass + self.html = self.driver.page_source + else: + pass + self.driver.delete_all_cookies() def _send_chromedriver_request(self): search_box = self.driver.find_element(By.ID, "APjFqb") + search_box.clear() search_box.send_keys(self.qry) search_box.send_keys(Keys.RETURN) diff --git a/tests/selenium_test.py b/tests/selenium_test.py index f5fe929..8aa06c1 100644 --- a/tests/selenium_test.py +++ b/tests/selenium_test.py @@ -5,3 +5,7 @@ se.parse_results() # 3. Parse search results se.save_serp(append_to='serps.json') # 4. Save HTML and metadata se.save_results(append_to='results.json') # 5. Save parsed results + + +#import pandas as pd +#df = pd.DataFrame(se.results) # 6. Display results in a pandas dataframe \ No newline at end of file From 002386bd315decae3134a0a8f3b144a7209e9c33 Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Feb 2025 13:11:04 -0800 Subject: [PATCH 003/101] update: add lang arg to search using hl url param --- WebSearcher/searchers.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 13e2444..b8e1737 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -63,6 +63,7 @@ def __init__(self, # Initialize search details self.qry: str = None self.loc: str = None + self.lang: str = None self.num_results = None self.url: str = None self.timestamp: str = None @@ -84,7 +85,14 @@ def __init__(self, ).start(__name__) - def search(self, qry: str, location: str = None, num_results: int = None, serp_id: str = '', crawl_id: str = ''): + def search(self, + qry: str, + location: str = None, + lang: str = None, + num_results: int = None, + serp_id: str = '', + crawl_id: str = '' + ): """Conduct a search and save HTML Args: @@ -94,20 +102,23 @@ def search(self, qry: str, location: str = None, num_results: int = None, serp_i serp_id (str, optional): A unique identifier for this SERP crawl_id (str, optional): An identifier for this crawl """ - self._prepare_search(qry=qry, location=location, num_results=num_results) + self._prepare_search(qry=qry, location=location, lang=lang, num_results=num_results) self._conduct_search(serp_id=serp_id, crawl_id=crawl_id) self._handle_response() - def _prepare_search(self, qry: str, location: str = None, num_results: int = None): + def _prepare_search(self, qry: str, location: str = None, lang: str = None, num_results: int = None): """Prepare a search URL and metadata for the given query and location""" self.qry = str(qry) self.loc = str(location) if location else '' + self.lang = lang self.num_results = num_results self.params = {} self.params['q'] = wu.encode_param_value(self.qry) if self.num_results: self.params['num'] = self.num_results + if self.lang: + self.params['hl'] = self.lang if self.loc and self.loc != 'None': self.params['uule'] = locations.get_location_id(canonical_name=self.loc) From 
4bf035ba6e8f916f9c4dc087d1f5c3bbaa88d190 Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Feb 2025 13:11:19 -0800 Subject: [PATCH 004/101] version: 0.5.1.dev0 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index 61bd55b..dd7c0c6 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.5.0" +__version__ = "0.5.1.dev0" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index 57b0870..c121063 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.5.0" +version = "0.5.1.dev0" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From 834769bd1bdd95cec2601bc7e7da8d2b7dd2fde1 Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Feb 2025 14:51:18 -0800 Subject: [PATCH 005/101] update: add lang to output --- WebSearcher/searchers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index b8e1737..779e64a 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -208,6 +208,7 @@ def prepare_serp_save(self): self.serp = BaseSERP( qry=self.qry, loc=self.loc, + lang=self.lang, url=self.url, html=self.html, response_code=self.response.status_code, From e09030b00cdc3b8a5e7f6373c57aa753b601f17c Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Feb 2025 14:57:30 -0800 Subject: [PATCH 006/101] update: add language to serp model --- WebSearcher/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/WebSearcher/models.py b/WebSearcher/models.py index c3f617a..a85d7c1 100644 --- a/WebSearcher/models.py +++ b/WebSearcher/models.py @@ -17,6 +17,7 @@ class BaseResult(BaseModel): class BaseSERP(BaseModel): qry: str # Search query loc: Optional[str] = None # Location if set, "Canonical Name" + lang: Optional[str] = None # Language if set url: str # URL of SERP html: str # Raw HTML of SERP timestamp: str # Timestamp of crawl From 9377d753326f4324cfc83e19f20b1ba279eec85f Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Feb 2025 14:58:02 -0800 Subject: [PATCH 007/101] version: 0.5.1.dev1 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index dd7c0c6..c9ad3af 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.5.1.dev0" +__version__ = "0.5.1.dev1" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index c121063..ee93f21 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.5.1.dev0" +version = "0.5.1.dev1" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. 
Robertson", email = ""}] keywords = ["web", "search", "parser"] From 19a94f916f4c390dd74e7710394e8de850e59f34 Mon Sep 17 00:00:00 2001 From: mariaelissat <166256195+mariaelissat@users.noreply.github.com> Date: Sat, 1 Mar 2025 15:16:12 -0500 Subject: [PATCH 008/101] =?UTF-8?q?Update=20header=5Ftext=20en=20espa?= =?UTF-8?q?=C3=B1ol=20v2.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- WebSearcher/classifiers/header_text.py | 57 +++++++++++++++----------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/WebSearcher/classifiers/header_text.py b/WebSearcher/classifiers/header_text.py index dd0062f..5bc2cb9 100644 --- a/WebSearcher/classifiers/header_text.py +++ b/WebSearcher/classifiers/header_text.py @@ -51,51 +51,62 @@ def _get_header_level_mapping(level) -> dict: # WS type -> header level 2 text (e.g.,
<h2>title</h2>
) TYPE_TO_H2_MAPPING = { - "directions": ["Directions"], + "directions": ["Directions", "Ubicaciones"], "discussions_and_forums": ["Discussions and forums"], "general": ["Complementary Results", - "Resultados de la Web", "Web Result with Site Links", - "Web results"], - "images": ["Images"], - "jobs": ["Jobs"], + "Web results", "Resultados de la Web", + "AI-powered overview", "Visión general creada por IA", + "Things to know", "Cosas que debes saber"], + "images": ["Images", "Imágenes"], + "jobs": ["Jobs", "Empleos"], "knowledge": ["Calculator Result", - "Featured snippet from the web", - "Finance Results", + "Featured snippet from the web", "Fragmento destacado", + "Finance Results", "Resumen de Mercado", "From sources across the web", "Knowledge Result", "Resultado de traducci\u00f3n", "Sports Results", + "Table", "Posiciones", + "Stat Leaders", "Líderes de estadísticas", + "Teams", "Equipos", + "Players", "Jugadores", "Translation Result", "Unit Converter", - "Weather Result"], - "local_news": ["Local news"], + "Weather Result", "Clima" + "Artworks", "Obras de arte", + "Songs", "Canciones" + "Albums", "Álbumes", + "What people are saying", + "About", "Información", + "Profiles", "Perfiles"], + "local_news": ["Local news", "Noticias Locales"], "local_results": [ "Local Results", "Locations", - "Places", + "Places", "Sitios" "Businesses", "locations", ], "map_results": ["Map Results", - "Choice Hotels"], + "Choice Hotels", "Hoteles", "Hotel"], "omitted_notice": ["Notices about Filtered Results"], - "people_also_ask": ["People also ask"], + "people_also_ask": ["People also ask", "Más preguntas"], "perspectives": ["Perspectives & opinions", "Perspectives"], "searches_related": ["Additional searches", - "More searches", + "More searches", "Ver más", "Other searches", - "People also search for", + "People also search for", "También se buscó", "Related", "Related searches", "Related to this search", "Searches related to"], - "top_stories": ["Top stories", - "News", + "top_stories": ["Top stories", "Noticias Destacadas", "Noticias Principales", + "News", "Noticias", "Market news"], "twitter": ["Twitter Results"], - "videos": ["Videos"] + "videos": ["Videos", "Videos"] } # WS type -> header level 2 text (e.g.,
<h3>title</h3>
) @@ -104,13 +115,13 @@ def _get_header_level_mapping(level) -> dict: "latest_from": ["Latest from"], "products": ["Popular products"], "news_quotes": ["Quotes in the news"], - "recipes": ["Recipes"], + "recipes": ["Recipes", "Recetas"], "searches_related": ["Related searches"], - "scholarly_articles": ["Scholarly articles for"], - "top_stories": ["Top stories"], - "videos": ["Videos"], - "view_more_news": ["View more news"], - "view_more_videos": ["View more videos"] + "scholarly_articles": ["Scholarly articles for", "Artículos académicos para"], + "top_stories": ["Top stories", "Noticias destacadas", "Noticias Principales"], + "videos": ["Videos", "Videos"], + "view_more_news": ["View more news", "Más noticias", "Ver más"], + "view_more_videos": ["View more videos", "Más videos", "Ver más"] } # Invert from {label: [text, ...]} to [{text: label}, ...] From 3010788f8baf584dd8ca57cbd49d5c0890f7435c Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 6 Mar 2025 01:42:14 -0800 Subject: [PATCH 009/101] update: null arg handling --- WebSearcher/searchers.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 779e64a..3c4e3d6 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -10,6 +10,7 @@ import brotli import requests import subprocess +import pandas as pd from datetime import datetime, timezone from typing import Any, Dict, Optional @@ -110,16 +111,16 @@ def search(self, def _prepare_search(self, qry: str, location: str = None, lang: str = None, num_results: int = None): """Prepare a search URL and metadata for the given query and location""" self.qry = str(qry) - self.loc = str(location) if location else '' - self.lang = lang + self.loc = str(location) if not pd.isnull(location) else '' + self.lang = str(lang) if not pd.isnull(lang) else '' self.num_results = num_results self.params = {} self.params['q'] = wu.encode_param_value(self.qry) if self.num_results: self.params['num'] = self.num_results - if self.lang: + if self.lang and self.lang not in {'None', 'nan'}: self.params['hl'] = self.lang - if self.loc and self.loc != 'None': + if self.loc and self.loc not in {'None', 'nan'}: self.params['uule'] = locations.get_location_id(canonical_name=self.loc) From e547b888a8f344345c970c6fadb84171892c3c86 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 6 Mar 2025 01:42:27 -0800 Subject: [PATCH 010/101] version: 0.5.1.dev4 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index c9ad3af..9e83a10 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.5.1.dev1" +__version__ = "0.5.1.dev4" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index ee93f21..ac2f6d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.5.1.dev1" +version = "0.5.1.dev4" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. 
Robertson", email = ""}] keywords = ["web", "search", "parser"] From 1ae28138382dfa3bca360e9230d65d8b5aa802e2 Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 7 Mar 2025 09:23:43 -0800 Subject: [PATCH 011/101] fix: canonical name to uule converter with protobuf --- WebSearcher/locations.py | 83 ++++++++++++++++++++++++++++++---------- WebSearcher/searchers.py | 2 +- poetry.lock | 22 ++++++++++- pyproject.toml | 1 + 4 files changed, 85 insertions(+), 23 deletions(-) diff --git a/WebSearcher/locations.py b/WebSearcher/locations.py index 16b3ca2..c9fcbe1 100644 --- a/WebSearcher/locations.py +++ b/WebSearcher/locations.py @@ -2,39 +2,80 @@ import io import csv import base64 -import string import zipfile import requests -from bs4 import BeautifulSoup +from google.protobuf.internal import decoder, encoder # poetry add protobuf +from typing import Dict, Union, Any from . import logger from . import webutils as wu log = logger.Logger().start(__name__) -def get_location_id(canonical_name: str) -> str: - """Get location ID for URL parameter 'uule' - - Returns the url parameter for a given location's Canonical Name. - See download_locations to obtain a csv of locations and their canonical names. +def convert_canonical_name_to_uule(canon_name: str) -> str: + """ + Get UULE parameter based on a location's canonical name. + Args: canon_name: Canonical name of the location + Returns: UULE parameter for Google search + """ + fields = {1: 2, 2: 32, 4: canon_name} + encoded_string = encode_protobuf_string(fields) + return f'w+{encoded_string}' - Credit for figuring this out goes to the author of the PHP version: - https://github.com/512banque/uule-grabber/blob/master/uule.php - Args: - canonical_name (str): The "Canoncial Name" for a location. Use - download_locations to obtain file containing all options. Column name - is usually something like "Canonical Name" or "Canonical.Name". - - Returns: - str: The uule parameter key for a given location's Canonical Name. +def encode_protobuf_string(fields: Dict[int, Union[str, int]]) -> str: + """ + Encode a dictionary of field numbers and values into a base64-encoded protobuf string. + Args: fields: A dictionary where keys are protobuf field numbers and values are the data to encode + Returns: A base64-encoded protobuf message string + """ + encoded = bytearray() # Buffer to store encoded bytes + + for field_number, value in fields.items(): + wire_type = 2 if isinstance(value, str) else 0 # Determine wire type based on value type + tag = field_number << 3 | wire_type # Combine field number and wire type into tag + encoded.extend(encoder._VarintBytes(tag)) # Encode the tag into bytes + + # Encode the value based on wire type + if wire_type == 0: + encoded.extend(encoder._VarintBytes(value)) # Encode the integer as varint + if wire_type == 2: + value = value.encode('utf-8') # Convert string to bytes + encoded.extend(encoder._VarintBytes(len(value))) # Add length prefix + encoded.extend(value) # Add the actual bytes + return base64.b64encode(bytes(encoded)).decode('utf-8') # Convert to base64 and decode to string + + +def decode_protobuf_string(encoded_string: str) -> Dict[int, Any]: + """ + Decode a base64-encoded protobuf string into a dictionary of field numbers and values. 
+ Args: encoded_string: A base64-encoded protobuf message + Returns: dictionary where keys are protobuf field numbers and values are the decoded values """ - uule_key = string.ascii_uppercase+string.ascii_lowercase+string.digits - uule_key = uule_key + '-_' + uule_key + '-_' # Double length, repeating - key = uule_key[len(canonical_name)] - b64 = base64.b64encode(canonical_name.encode('utf-8')).decode('utf-8') - return f'w+CAIQICI{key}{b64}' + + pos = 0 # Position tracker for decoding + fields = {} # Dictionary to store decoded field numbers and values + + protobuf_bytes = base64.b64decode(encoded_string) # Convert to protobuf bytes + while pos < len(protobuf_bytes): + + # Get field number and wire type + tag, pos_new = decoder._DecodeVarint(protobuf_bytes, pos) # Each protobuf field starts with a varint tag + field_number, wire_type = tag >> 3, tag & 7 # Extract field number and wire type from tag + + # Decode value based on wire type (0: varint, 2: length-delimited; others not supported) + if wire_type == 0: + value, pos_new = decoder._DecodeVarint(protobuf_bytes, pos_new) # Get the varint value and new position + elif wire_type == 2: + length, pos_start = decoder._DecodeVarint(protobuf_bytes, pos_new) # Get length and starting position + value = protobuf_bytes[pos_start:pos_start + length] # Extract data based on the length + pos_new = pos_start + length # Update the new position + value = value.decode('utf-8') # Assume UTF-8 encoding for strings + + fields[field_number] = value # Store the field number and value in the dictionary + pos = pos_new # Move to the next field using the updated position + return fields def download_locations( diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 3c4e3d6..01ca73f 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -121,7 +121,7 @@ def _prepare_search(self, qry: str, location: str = None, lang: str = None, num_ if self.lang and self.lang not in {'None', 'nan'}: self.params['hl'] = self.lang if self.loc and self.loc not in {'None', 'nan'}: - self.params['uule'] = locations.get_location_id(canonical_name=self.loc) + self.params['uule'] = locations.convert_canonical_name_to_uule(self.loc) def _conduct_search(self, serp_id: str = '', crawl_id: str = ''): diff --git a/poetry.lock b/poetry.lock index 57ccb86..d657798 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1220,6 +1220,26 @@ files = [ [package.dependencies] wcwidth = "*" +[[package]] +name = "protobuf" +version = "6.30.0" +description = "" +optional = false +python-versions = ">=3.9" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "protobuf-6.30.0-cp310-abi3-win32.whl", hash = "sha256:7337d76d8efe65ee09ee566b47b5914c517190196f414e5418fa236dfd1aed3e"}, + {file = "protobuf-6.30.0-cp310-abi3-win_amd64.whl", hash = "sha256:9b33d51cc95a7ec4f407004c8b744330b6911a37a782e2629c67e1e8ac41318f"}, + {file = "protobuf-6.30.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:52d4bb6fe76005860e1d0b8bfa126f5c97c19cc82704961f60718f50be16942d"}, + {file = "protobuf-6.30.0-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:7940ab4dfd60d514b2e1d3161549ea7aed5be37d53bafde16001ac470a3e202b"}, + {file = "protobuf-6.30.0-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:d79bf6a202a536b192b7e8d295d7eece0c86fbd9b583d147faf8cfeff46bf598"}, + {file = "protobuf-6.30.0-cp39-cp39-win32.whl", hash = "sha256:bb35ad251d222f03d6c4652c072dfee156be0ef9578373929c1a7ead2bd5492c"}, + {file = 
"protobuf-6.30.0-cp39-cp39-win_amd64.whl", hash = "sha256:501810e0eba1d327e783fde47cc767a563b0f1c292f1a3546d4f2b8c3612d4d0"}, + {file = "protobuf-6.30.0-py3-none-any.whl", hash = "sha256:e5ef216ea061b262b8994cb6b7d6637a4fb27b3fb4d8e216a6040c0b93bd10d7"}, + {file = "protobuf-6.30.0.tar.gz", hash = "sha256:852b675d276a7d028f660da075af1841c768618f76b90af771a8e2c29e6f5965"}, +] + [[package]] name = "psutil" version = "6.1.1" @@ -1940,4 +1960,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">=3.9" -content-hash = "9928a0553f056ecc96916fb2d6c4adeca729ec9f5c69ef72322077610def4d88" +content-hash = "aae03414bd510dcc398d4b52bd96660021224dfbf78564b91a1235d3e851a582" diff --git a/pyproject.toml b/pyproject.toml index ac2f6d2..aedbc1c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "brotli>=1.1.0", "pydantic>=2.9.2", "pandas>=2.2.3", + "protobuf (>=6.30.0,<7.0.0)", ] [project.urls] From bddd4a1d927a58de88299a67bf073a41d720612c Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 7 Mar 2025 09:26:13 -0800 Subject: [PATCH 012/101] update: more specific dir name for geotargets csv download --- scripts/demo_locations.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/demo_locations.py b/scripts/demo_locations.py index e75870e..655f142 100644 --- a/scripts/demo_locations.py +++ b/scripts/demo_locations.py @@ -5,7 +5,7 @@ import WebSearcher as ws # Retrieve and save latest location data -data_dir = 'data/locations' +data_dir = 'data/google_locations' os.makedirs(data_dir, exist_ok=True) ws.download_locations(data_dir) @@ -116,4 +116,5 @@ dir_html = os.path.join("data", 'html') os.makedirs(dir_html, exist_ok=True) +se.save_search(append_to=os.path.join(dir_html, "searches.json")) se.save_serp(save_dir=dir_html) From 74a1487169b48dce73ed750625a2d2817bff861a Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 7 Mar 2025 09:29:54 -0800 Subject: [PATCH 013/101] version: 0.5.1.dev5 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index 9e83a10..69bccf6 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.5.1.dev4" +__version__ = "0.5.1.dev5" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index aedbc1c..dd08db5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.5.1.dev4" +version = "0.5.1.dev5" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. 
Robertson", email = ""}] keywords = ["web", "search", "parser"] From e18eed58db021be98defb34e10f11a445a3bce1e Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 7 Mar 2025 10:19:44 -0800 Subject: [PATCH 014/101] version: 0.5.1 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index 69bccf6..10b1c51 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.5.1.dev5" +__version__ = "0.5.1" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index dd08db5..b66d25d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.5.1.dev5" +version = "0.5.1" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From 8ffbf34a171245888740f2b7d1d0982049818ec7 Mon Sep 17 00:00:00 2001 From: "Ronald E. Robertson" Date: Fri, 7 Mar 2025 10:34:28 -0800 Subject: [PATCH 015/101] Update WebSearcher/classifiers/header_text.py fix missing commas Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- WebSearcher/classifiers/header_text.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/classifiers/header_text.py b/WebSearcher/classifiers/header_text.py index 5bc2cb9..8a573a2 100644 --- a/WebSearcher/classifiers/header_text.py +++ b/WebSearcher/classifiers/header_text.py @@ -73,9 +73,9 @@ def _get_header_level_mapping(level) -> dict: "Players", "Jugadores", "Translation Result", "Unit Converter", - "Weather Result", "Clima" + "Weather Result", "Clima", "Artworks", "Obras de arte", - "Songs", "Canciones" + "Songs", "Canciones", "Albums", "Álbumes", "What people are saying", "About", "Información", From a136095af2a9da7efb59d647486e97bb2d0013ac Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 7 Mar 2025 10:48:34 -0800 Subject: [PATCH 016/101] update: formatting, drop repeated Video labels --- WebSearcher/classifiers/header_text.py | 87 +++++++++++++++----------- 1 file changed, 49 insertions(+), 38 deletions(-) diff --git a/WebSearcher/classifiers/header_text.py b/WebSearcher/classifiers/header_text.py index 8a573a2..8c0c3c7 100644 --- a/WebSearcher/classifiers/header_text.py +++ b/WebSearcher/classifiers/header_text.py @@ -51,35 +51,41 @@ def _get_header_level_mapping(level) -> dict: # WS type -> header level 2 text (e.g.,
<h2>title</h2>
) TYPE_TO_H2_MAPPING = { - "directions": ["Directions", "Ubicaciones"], + "directions": ["Directions", + "Ubicaciones"], "discussions_and_forums": ["Discussions and forums"], "general": ["Complementary Results", "Web Result with Site Links", - "Web results", "Resultados de la Web", - "AI-powered overview", "Visión general creada por IA", - "Things to know", "Cosas que debes saber"], - "images": ["Images", "Imágenes"], - "jobs": ["Jobs", "Empleos"], + "Web results", + "Resultados de la Web", + "AI-powered overview", + "Visión general creada por IA", + "Things to know", + "Cosas que debes saber"], + "images": ["Images", + "Imágenes"], + "jobs": ["Jobs", + "Empleos"], "knowledge": ["Calculator Result", - "Featured snippet from the web", "Fragmento destacado", - "Finance Results", "Resumen de Mercado", - "From sources across the web", - "Knowledge Result", - "Resultado de traducci\u00f3n", - "Sports Results", - "Table", "Posiciones", - "Stat Leaders", "Líderes de estadísticas", - "Teams", "Equipos", - "Players", "Jugadores", - "Translation Result", - "Unit Converter", - "Weather Result", "Clima", - "Artworks", "Obras de arte", - "Songs", "Canciones", - "Albums", "Álbumes", - "What people are saying", - "About", "Información", - "Profiles", "Perfiles"], + "Featured snippet from the web", "Fragmento destacado", + "Finance Results", "Resumen de Mercado", + "From sources across the web", + "Knowledge Result", + "Resultado de traducci\u00f3n", + "Sports Results", + "Table", "Posiciones", + "Stat Leaders", "Líderes de estadísticas", + "Teams", "Equipos", + "Players", "Jugadores", + "Translation Result", + "Unit Converter", + "Weather Result", "Clima", + "Artworks", "Obras de arte", + "Songs", "Canciones", + "Albums", "Álbumes", + "What people are saying", + "About", "Información", + "Profiles", "Perfiles"], "local_news": ["Local news", "Noticias Locales"], "local_results": [ "Local Results", @@ -89,24 +95,29 @@ def _get_header_level_mapping(level) -> dict: "locations", ], "map_results": ["Map Results", - "Choice Hotels", "Hoteles", "Hotel"], + "Choice Hotels", + "Hoteles", + "Hotel"], "omitted_notice": ["Notices about Filtered Results"], "people_also_ask": ["People also ask", "Más preguntas"], "perspectives": ["Perspectives & opinions", - "Perspectives"], + "Perspectives"], "searches_related": ["Additional searches", - "More searches", "Ver más", - "Other searches", - "People also search for", "También se buscó", - "Related", - "Related searches", - "Related to this search", - "Searches related to"], - "top_stories": ["Top stories", "Noticias Destacadas", "Noticias Principales", - "News", "Noticias", + "More searches", "Ver más", + "Other searches", + "People also search for", "También se buscó", + "Related", + "Related searches", + "Related to this search", + "Searches related to"], + "top_stories": ["Top stories", + "Noticias Destacadas", + "Noticias Principales", + "News", + "Noticias", "Market news"], "twitter": ["Twitter Results"], - "videos": ["Videos", "Videos"] + "videos": ["Videos"] } # WS type -> header level 2 text (e.g.,
<h3>title</h3>
) @@ -119,7 +130,7 @@ def _get_header_level_mapping(level) -> dict: "searches_related": ["Related searches"], "scholarly_articles": ["Scholarly articles for", "Artículos académicos para"], "top_stories": ["Top stories", "Noticias destacadas", "Noticias Principales"], - "videos": ["Videos", "Videos"], + "videos": ["Videos"], "view_more_news": ["View more news", "Más noticias", "Ver más"], "view_more_videos": ["View more videos", "Más videos", "Ver más"] } From f701a9eb5dd7f6bd65c2eea0d3b8d9e8416e61db Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 7 Mar 2025 10:51:11 -0800 Subject: [PATCH 017/101] version: 0.5.2.dev0 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index 10b1c51..fa705da 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.5.1" +__version__ = "0.5.2.dev0" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index b66d25d..2bca1b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.5.1" +version = "0.5.2.dev0" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From 0730e6bbe37f91caa6dda6261d9c90e0b978dec0 Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 9 Mar 2025 11:11:05 -0700 Subject: [PATCH 018/101] version: 0.5.2 --- README.md | 19 +++++++++++++++---- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 31f1e97..12d23f0 100644 --- a/README.md +++ b/README.md @@ -8,17 +8,26 @@ and saving searches. It also includes a modular parser built on `BeautifulSoup` for decomposing a SERP into list of components with categorical classifications and position-based specifications. -## Recent Update +## Recent Updates -`0.5.0` - poetry v2 +Below are some details about recent updates. For a longer list, see the [Update Log](#update-log). -For a longer list of updates, see the [Update Log](#update-log). + +`0.5.2` +- Added support for Spanish component headers by text +- Pull request [#74](https://github.com/gitronald/WebSearcher/pull/74) + +`0.5.1` +- Fixed canonical name -> UULE converter using `protobuf`, see [this gist](https://gist.github.com/gitronald/66cac42194ea2d489ff3a1e32651e736) for details +- Added lang arg to specify language in se.search, uses hl URL param and does not change Accept-Language request header (which defaults to en-US), but works in tests. +- Fixed null location/language arg input handling (again) +- Pull Request [#76](https://github.com/gitronald/WebSearcher/pull/76) ## Table of Contents - [WebSearcher](#websearcher) - [Tools for conducting and parsing web searches](#tools-for-conducting-and-parsing-web-searches) - - [Recent Update](#recent-update) + - [Recent Updates](#recent-updates) - [Table of Contents](#table-of-contents) - [Getting Started](#getting-started) - [Usage](#usage) @@ -261,6 +270,8 @@ pytest -k "1684837514.html" --- ## Update Log +`0.5.0` +- configuration now using poetry v2 `0.4.9` - last version with poetry v1, future versions (`>=0.5.0`) will use [poetry v2](https://python-poetry.org/blog/announcing-poetry-2.0.1/) configs. 
diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index fa705da..8cec324 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.5.2.dev0" +__version__ = "0.5.2" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index 2bca1b4..eb43408 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.5.2.dev0" +version = "0.5.2" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From 7b4b95c818b36cc7f5d55fce2c9cbdfa7d49bfe6 Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 9 Mar 2025 11:20:39 -0700 Subject: [PATCH 019/101] version: 0.6.0.dev0 --- README.md | 15 +++++++++------ WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index b4f01cb..214e1bc 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,6 @@ ## Tools for conducting and parsing web searches [![PyPI version](https://badge.fury.io/py/WebSearcher.svg)](https://badge.fury.io/py/WebSearcher) -NOTE: In 0.5.*, we moved scraping to selenium - This package provides tools for conducting algorithm audits of web search and includes a scraper built on `selenium` with tools for geolocating, conducting, and saving searches. It also includes a modular parser built on `BeautifulSoup` @@ -15,6 +13,10 @@ and position-based specifications. Below are some details about recent updates. For a longer list, see the [Update Log](#update-log). +`0.6.0` +- method for collecting data with selenium; requests no longer works without a redirect +- Pull request [#72](https://github.com/gitronald/WebSearcher/pull/72) + `0.5.2` - Added support for Spanish component headers by text - Pull request [#74](https://github.com/gitronald/WebSearcher/pull/74) @@ -36,10 +38,11 @@ Below are some details about recent updates. For a longer list, see the [Update - [Example Search Script](#example-search-script) - [Step by Step](#step-by-step) - [1. Initialize Collector](#1-initialize-collector) - - [2. Conduct a Search](#2-conduct-a-search) - - [3. Parse Search Results](#3-parse-search-results) - - [4. Save HTML and Metadata](#4-save-html-and-metadata) - - [5. Save Parsed Results](#5-save-parsed-results) + - [2. Launch undetected chromedriver window](#2-launch-undetected-chromedriver-window) + - [3. Conduct a Search](#3-conduct-a-search) + - [4. Parse Search Results](#4-parse-search-results) + - [5. Save HTML and Metadata](#5-save-html-and-metadata) + - [6. Save Parsed Results](#6-save-parsed-results) - [Localization](#localization) - [Contributing](#contributing) - [Repair or Enhance a Parser](#repair-or-enhance-a-parser) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index 8cec324..e03cd22 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.5.2" +__version__ = "0.6.0.dev0" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index 68001ee..e521987 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.5.2" +version = "0.6.0.dev0" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. 
Robertson", email = ""}] keywords = ["web", "search", "parser"] From acea0222933ae3987a6a80c79d02e0505f04050f Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 9 Mar 2025 13:52:25 -0700 Subject: [PATCH 020/101] update: dedupe args, add version_main for chromedriver launch --- WebSearcher/searchers.py | 18 ++++++++++++------ tests/selenium_test.py | 3 +-- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 0d46686..b29f8f3 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -62,9 +62,8 @@ def __init__(self, # Initialize data storage self.version: str = WS_VERSION self.base_url: str = 'https://www.google.com/search' - self.headers: Dict[str, str] = None - self.sesh: requests.Session = sesh if sesh else wu.start_sesh(headers=self.headers) - self.sesh = None + self.headers: Dict[str, str] = headers or DEFAULT_HEADERS + self.sesh: requests.Session = sesh or wu.start_sesh(headers=self.headers) self.ssh_tunnel: subprocess.Popen = ssh_tunnel self.unzip: bool = unzip self.params: Dict[str, Any] = {} @@ -93,10 +92,17 @@ def __init__(self, file_level=log_level, ).start(__name__) - def launch_chromedriver(self, headless = False, use_subprocess = False, chromedriver_path = ''): + def launch_chromedriver( + self, + headless: bool = False, + version_main: int = 133, + use_subprocess: bool = False, + chromedriver_path: str = '' + ) -> None: self.headless = headless self.use_subprocess = use_subprocess self.chromedriver_path = chromedriver_path + self.version_main = version_main self._init_chromedriver() def search(self, @@ -147,9 +153,9 @@ def _init_chromedriver(self): print('launching...') if self.chromedriver_path == '': #optionally: headless=True, use_subprocess=True - self.driver = uc.Chrome(headless = self.headless, subprocess = self.use_subprocess) + self.driver = uc.Chrome(headless = self.headless, subprocess = self.use_subprocess, version_main = self.version_main) else: - self.driver = uc.Chrome(headless = self.headless, subprocess = self.use_subprocess, chromedriver_path = self.chromedriver_path) + self.driver = uc.Chrome(headless = self.headless, subprocess = self.use_subprocess, chromedriver_path = self.chromedriver_path, version_main = self.version_main) #chromedriver_path = "/opt/homebrew/Caskroom/chromedriver/133.0.6943.53" time.sleep(2) self.driver.get('https://www.google.com') diff --git a/tests/selenium_test.py b/tests/selenium_test.py index 8aa06c1..855f573 100644 --- a/tests/selenium_test.py +++ b/tests/selenium_test.py @@ -1,11 +1,10 @@ import WebSearcher as ws se = ws.SearchEngine() # 1. Initialize collector -se.launch_chromedriver(headless = False) # 2. Launch undetected chromedriver window +se.launch_chromedriver(headless=False, version_main=133) # 2. Launch undetected chromedriver window se.search('immigration news') # 2. Conduct a search se.parse_results() # 3. Parse search results se.save_serp(append_to='serps.json') # 4. Save HTML and metadata se.save_results(append_to='results.json') # 5. Save parsed results - #import pandas as pd #df = pd.DataFrame(se.results) # 6. 
Display results in a pandas dataframe \ No newline at end of file From 80403f2f137dd6532cdf0f272e4a3f7cc080ce73 Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 9 Mar 2025 14:20:01 -0700 Subject: [PATCH 021/101] update: poetry lock file --- poetry.lock | 298 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 292 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index d657798..225ef6d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -43,6 +43,27 @@ files = [ astroid = ["astroid (>=2,<4)"] test = ["astroid (>=2,<4)", "pytest", "pytest-cov", "pytest-xdist"] +[[package]] +name = "attrs" +version = "25.1.0" +description = "Classes Without Boilerplate" +optional = false +python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "attrs-25.1.0-py3-none-any.whl", hash = "sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a"}, + {file = "attrs-25.1.0.tar.gz", hash = "sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e"}, +] + +[package.extras] +benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"] +tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] + [[package]] name = "beautifulsoup4" version = "4.13.1" @@ -222,8 +243,7 @@ version = "1.17.1" description = "Foreign Function Interface for Python calling C code." 
optional = false python-versions = ">=3.8" -groups = ["dev"] -markers = "python_version <= \"3.11\" and implementation_name == \"pypy\" or python_version >= \"3.12\" and implementation_name == \"pypy\"" +groups = ["main", "dev"] files = [ {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"}, {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"}, @@ -293,6 +313,7 @@ files = [ {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"}, {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"}, ] +markers = {main = "python_version <= \"3.11\" and os_name == \"nt\" and implementation_name != \"pypy\" or python_version >= \"3.12\" and os_name == \"nt\" and implementation_name != \"pypy\"", dev = "python_version <= \"3.11\" and implementation_name == \"pypy\" or python_version >= \"3.12\" and implementation_name == \"pypy\""} [package.dependencies] pycparser = "*" @@ -488,7 +509,7 @@ version = "1.2.2" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" -groups = ["dev"] +groups = ["main", "dev"] markers = "python_version < \"3.11\"" files = [ {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, @@ -532,6 +553,19 @@ docs = ["furo (>=2024.8.6)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3) testing = ["covdefaults (>=2.3)", "coverage (>=7.6.10)", "diff-cover (>=9.2.1)", "pytest (>=8.3.4)", "pytest-asyncio (>=0.25.2)", "pytest-cov (>=6)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.28.1)"] typing = ["typing-extensions (>=4.12.2)"] +[[package]] +name = "h11" +version = "0.14.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +optional = false +python-versions = ">=3.7" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, + {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, +] + [[package]] name = "idna" version = "3.10" @@ -1035,6 +1069,22 @@ files = [ {file = "numpy-2.2.2.tar.gz", hash = "sha256:ed6906f61834d687738d25988ae117683705636936cc605be0bb208b23df4d8f"}, ] +[[package]] +name = "outcome" +version = "1.3.0.post0" +description = "Capture the outcome of Python function calls." 
+optional = false +python-versions = ">=3.7" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "outcome-1.3.0.post0-py2.py3-none-any.whl", hash = "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b"}, + {file = "outcome-1.3.0.post0.tar.gz", hash = "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8"}, +] + +[package.dependencies] +attrs = ">=19.2.0" + [[package]] name = "packaging" version = "24.2" @@ -1307,12 +1357,12 @@ version = "2.22" description = "C parser in Python" optional = false python-versions = ">=3.8" -groups = ["dev"] -markers = "python_version <= \"3.11\" and implementation_name == \"pypy\" or python_version >= \"3.12\" and implementation_name == \"pypy\"" +groups = ["main", "dev"] files = [ {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, ] +markers = {main = "python_version <= \"3.11\" and os_name == \"nt\" and implementation_name != \"pypy\" or python_version >= \"3.12\" and os_name == \"nt\" and implementation_name != \"pypy\"", dev = "python_version <= \"3.11\" and implementation_name == \"pypy\" or python_version >= \"3.12\" and implementation_name == \"pypy\""} [[package]] name = "pydantic" @@ -1466,6 +1516,20 @@ files = [ [package.extras] windows-terminal = ["colorama (>=0.4.6)"] +[[package]] +name = "pysocks" +version = "1.7.1" +description = "A Python SOCKS client module. See https://github.com/Anorov/PySocks for more information." +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "PySocks-1.7.1-py27-none-any.whl", hash = "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299"}, + {file = "PySocks-1.7.1-py3-none-any.whl", hash = "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5"}, + {file = "PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"}, +] + [[package]] name = "pytest" version = "8.3.4" @@ -1710,6 +1774,27 @@ files = [ [package.dependencies] requests = ">=1.0.0" +[[package]] +name = "selenium" +version = "4.29.0" +description = "Official Python bindings for Selenium WebDriver" +optional = false +python-versions = ">=3.9" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "selenium-4.29.0-py3-none-any.whl", hash = "sha256:ce5d26f1ddc1111641113653af33694c13947dd36c2df09cdd33f554351d372e"}, + {file = "selenium-4.29.0.tar.gz", hash = "sha256:3a62f7ec33e669364a6c0562a701deb69745b569c50d55f1a912bf8eb33358ba"}, +] + +[package.dependencies] +certifi = ">=2021.10.8" +trio = ">=0.17,<1.0" +trio-websocket = ">=0.9,<1.0" +typing_extensions = ">=4.9,<5.0" +urllib3 = {version = ">=1.26,<3", extras = ["socks"]} +websocket-client = ">=1.8,<2.0" + [[package]] name = "six" version = "1.17.0" @@ -1723,6 +1808,32 @@ files = [ {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, ] +[[package]] +name = "sniffio" +version = "1.3.1" +description = "Sniff out which async library your code is running under" +optional = false +python-versions = ">=3.7" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = 
[ + {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, + {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, +] + +[[package]] +name = "sortedcontainers" +version = "2.4.0" +description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" +optional = false +python-versions = "*" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, + {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, +] + [[package]] name = "soupsieve" version = "2.6" @@ -1878,6 +1989,47 @@ files = [ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,<8.2)", "pytest-mock", "pytest-mypy-testing"] +[[package]] +name = "trio" +version = "0.29.0" +description = "A friendly Python library for async concurrency and I/O" +optional = false +python-versions = ">=3.9" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "trio-0.29.0-py3-none-any.whl", hash = "sha256:d8c463f1a9cc776ff63e331aba44c125f423a5a13c684307e828d930e625ba66"}, + {file = "trio-0.29.0.tar.gz", hash = "sha256:ea0d3967159fc130acb6939a0be0e558e364fee26b5deeecc893a6b08c361bdf"}, +] + +[package.dependencies] +attrs = ">=23.2.0" +cffi = {version = ">=1.14", markers = "os_name == \"nt\" and implementation_name != \"pypy\""} +exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} +idna = "*" +outcome = "*" +sniffio = ">=1.3.0" +sortedcontainers = "*" + +[[package]] +name = "trio-websocket" +version = "0.12.2" +description = "WebSocket library for Trio" +optional = false +python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "trio_websocket-0.12.2-py3-none-any.whl", hash = "sha256:df605665f1db533f4a386c94525870851096a223adcb97f72a07e8b4beba45b6"}, + {file = "trio_websocket-0.12.2.tar.gz", hash = "sha256:22c72c436f3d1e264d0910a3951934798dcc5b00ae56fc4ee079d46c7cf20fae"}, +] + +[package.dependencies] +exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} +outcome = ">=1.2.0" +trio = ">=0.11" +wsproto = ">=0.14" + [[package]] name = "typing-extensions" version = "4.12.2" @@ -1904,6 +2056,23 @@ files = [ {file = "tzdata-2025.1.tar.gz", hash = "sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694"}, ] +[[package]] +name = "undetected-chromedriver" +version = "3.5.5" +description = "('Selenium.webdriver.Chrome replacement with compatiblity for Brave, and other Chromium based browsers.', 'Not triggered by CloudFlare/Imperva/hCaptcha and such.', 'NOTE: results may vary due to many factors. 
No guarantees are given, except for ongoing efforts in understanding detection algorithms.')" +optional = false +python-versions = "*" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "undetected-chromedriver-3.5.5.tar.gz", hash = "sha256:9f945e1435005247abe17de316bcfda85b284a4177fd5f25167c78ced33b65ec"}, +] + +[package.dependencies] +requests = "*" +selenium = ">=4.9.0" +websockets = "*" + [[package]] name = "urllib3" version = "2.3.0" @@ -1917,6 +2086,9 @@ files = [ {file = "urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d"}, ] +[package.dependencies] +pysocks = {version = ">=1.5.6,<1.5.7 || >1.5.7,<2.0", optional = true, markers = "extra == \"socks\""} + [package.extras] brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] h2 = ["h2 (>=4,<5)"] @@ -1936,6 +2108,120 @@ files = [ {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, ] +[[package]] +name = "websocket-client" +version = "1.8.0" +description = "WebSocket client for Python with low level API options" +optional = false +python-versions = ">=3.8" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "websocket_client-1.8.0-py3-none-any.whl", hash = "sha256:17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526"}, + {file = "websocket_client-1.8.0.tar.gz", hash = "sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da"}, +] + +[package.extras] +docs = ["Sphinx (>=6.0)", "myst-parser (>=2.0.0)", "sphinx-rtd-theme (>=1.1.0)"] +optional = ["python-socks", "wsaccel"] +test = ["websockets"] + +[[package]] +name = "websockets" +version = "15.0.1" +description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" +optional = false +python-versions = ">=3.9" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "websockets-15.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d63efaa0cd96cf0c5fe4d581521d9fa87744540d4bc999ae6e08595a1014b45b"}, + {file = "websockets-15.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ac60e3b188ec7574cb761b08d50fcedf9d77f1530352db4eef1707fe9dee7205"}, + {file = "websockets-15.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5756779642579d902eed757b21b0164cd6fe338506a8083eb58af5c372e39d9a"}, + {file = "websockets-15.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fdfe3e2a29e4db3659dbd5bbf04560cea53dd9610273917799f1cde46aa725e"}, + {file = "websockets-15.0.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c2529b320eb9e35af0fa3016c187dffb84a3ecc572bcee7c3ce302bfeba52bf"}, + {file = "websockets-15.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac1e5c9054fe23226fb11e05a6e630837f074174c4c2f0fe442996112a6de4fb"}, + {file = "websockets-15.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5df592cd503496351d6dc14f7cdad49f268d8e618f80dce0cd5a36b93c3fc08d"}, + {file = "websockets-15.0.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0a34631031a8f05657e8e90903e656959234f3a04552259458aac0b0f9ae6fd9"}, + {file = "websockets-15.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3d00075aa65772e7ce9e990cab3ff1de702aa09be3940d1dc88d5abf1ab8a09c"}, + {file = 
"websockets-15.0.1-cp310-cp310-win32.whl", hash = "sha256:1234d4ef35db82f5446dca8e35a7da7964d02c127b095e172e54397fb6a6c256"}, + {file = "websockets-15.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:39c1fec2c11dc8d89bba6b2bf1556af381611a173ac2b511cf7231622058af41"}, + {file = "websockets-15.0.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:823c248b690b2fd9303ba00c4f66cd5e2d8c3ba4aa968b2779be9532a4dad431"}, + {file = "websockets-15.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678999709e68425ae2593acf2e3ebcbcf2e69885a5ee78f9eb80e6e371f1bf57"}, + {file = "websockets-15.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d50fd1ee42388dcfb2b3676132c78116490976f1300da28eb629272d5d93e905"}, + {file = "websockets-15.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d99e5546bf73dbad5bf3547174cd6cb8ba7273062a23808ffea025ecb1cf8562"}, + {file = "websockets-15.0.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66dd88c918e3287efc22409d426c8f729688d89a0c587c88971a0faa2c2f3792"}, + {file = "websockets-15.0.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8dd8327c795b3e3f219760fa603dcae1dcc148172290a8ab15158cf85a953413"}, + {file = "websockets-15.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8fdc51055e6ff4adeb88d58a11042ec9a5eae317a0a53d12c062c8a8865909e8"}, + {file = "websockets-15.0.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:693f0192126df6c2327cce3baa7c06f2a117575e32ab2308f7f8216c29d9e2e3"}, + {file = "websockets-15.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:54479983bd5fb469c38f2f5c7e3a24f9a4e70594cd68cd1fa6b9340dadaff7cf"}, + {file = "websockets-15.0.1-cp311-cp311-win32.whl", hash = "sha256:16b6c1b3e57799b9d38427dda63edcbe4926352c47cf88588c0be4ace18dac85"}, + {file = "websockets-15.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:27ccee0071a0e75d22cb35849b1db43f2ecd3e161041ac1ee9d2352ddf72f065"}, + {file = "websockets-15.0.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:3e90baa811a5d73f3ca0bcbf32064d663ed81318ab225ee4f427ad4e26e5aff3"}, + {file = "websockets-15.0.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:592f1a9fe869c778694f0aa806ba0374e97648ab57936f092fd9d87f8bc03665"}, + {file = "websockets-15.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0701bc3cfcb9164d04a14b149fd74be7347a530ad3bbf15ab2c678a2cd3dd9a2"}, + {file = "websockets-15.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8b56bdcdb4505c8078cb6c7157d9811a85790f2f2b3632c7d1462ab5783d215"}, + {file = "websockets-15.0.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0af68c55afbd5f07986df82831c7bff04846928ea8d1fd7f30052638788bc9b5"}, + {file = "websockets-15.0.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dee438fed052b52e4f98f76c5790513235efaa1ef7f3f2192c392cd7c91b65"}, + {file = "websockets-15.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d5f6b181bb38171a8ad1d6aa58a67a6aa9d4b38d0f8c5f496b9e42561dfc62fe"}, + {file = "websockets-15.0.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5d54b09eba2bada6011aea5375542a157637b91029687eb4fdb2dab11059c1b4"}, + {file = "websockets-15.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3be571a8b5afed347da347bfcf27ba12b069d9d7f42cb8c7028b5e98bbb12597"}, + {file = 
"websockets-15.0.1-cp312-cp312-win32.whl", hash = "sha256:c338ffa0520bdb12fbc527265235639fb76e7bc7faafbb93f6ba80d9c06578a9"}, + {file = "websockets-15.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:fcd5cf9e305d7b8338754470cf69cf81f420459dbae8a3b40cee57417f4614a7"}, + {file = "websockets-15.0.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ee443ef070bb3b6ed74514f5efaa37a252af57c90eb33b956d35c8e9c10a1931"}, + {file = "websockets-15.0.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5a939de6b7b4e18ca683218320fc67ea886038265fd1ed30173f5ce3f8e85675"}, + {file = "websockets-15.0.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:746ee8dba912cd6fc889a8147168991d50ed70447bf18bcda7039f7d2e3d9151"}, + {file = "websockets-15.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:595b6c3969023ecf9041b2936ac3827e4623bfa3ccf007575f04c5a6aa318c22"}, + {file = "websockets-15.0.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c714d2fc58b5ca3e285461a4cc0c9a66bd0e24c5da9911e30158286c9b5be7f"}, + {file = "websockets-15.0.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f3c1e2ab208db911594ae5b4f79addeb3501604a165019dd221c0bdcabe4db8"}, + {file = "websockets-15.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:229cf1d3ca6c1804400b0a9790dc66528e08a6a1feec0d5040e8b9eb14422375"}, + {file = "websockets-15.0.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:756c56e867a90fb00177d530dca4b097dd753cde348448a1012ed6c5131f8b7d"}, + {file = "websockets-15.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:558d023b3df0bffe50a04e710bc87742de35060580a293c2a984299ed83bc4e4"}, + {file = "websockets-15.0.1-cp313-cp313-win32.whl", hash = "sha256:ba9e56e8ceeeedb2e080147ba85ffcd5cd0711b89576b83784d8605a7df455fa"}, + {file = "websockets-15.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:e09473f095a819042ecb2ab9465aee615bd9c2028e4ef7d933600a8401c79561"}, + {file = "websockets-15.0.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5f4c04ead5aed67c8a1a20491d54cdfba5884507a48dd798ecaf13c74c4489f5"}, + {file = "websockets-15.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:abdc0c6c8c648b4805c5eacd131910d2a7f6455dfd3becab248ef108e89ab16a"}, + {file = "websockets-15.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a625e06551975f4b7ea7102bc43895b90742746797e2e14b70ed61c43a90f09b"}, + {file = "websockets-15.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d591f8de75824cbb7acad4e05d2d710484f15f29d4a915092675ad3456f11770"}, + {file = "websockets-15.0.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:47819cea040f31d670cc8d324bb6435c6f133b8c7a19ec3d61634e62f8d8f9eb"}, + {file = "websockets-15.0.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac017dd64572e5c3bd01939121e4d16cf30e5d7e110a119399cf3133b63ad054"}, + {file = "websockets-15.0.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4a9fac8e469d04ce6c25bb2610dc535235bd4aa14996b4e6dbebf5e007eba5ee"}, + {file = "websockets-15.0.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:363c6f671b761efcb30608d24925a382497c12c506b51661883c3e22337265ed"}, + {file = "websockets-15.0.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2034693ad3097d5355bfdacfffcbd3ef5694f9718ab7f29c29689a9eae841880"}, + {file = 
"websockets-15.0.1-cp39-cp39-win32.whl", hash = "sha256:3b1ac0d3e594bf121308112697cf4b32be538fb1444468fb0a6ae4feebc83411"}, + {file = "websockets-15.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:b7643a03db5c95c799b89b31c036d5f27eeb4d259c798e878d6937d71832b1e4"}, + {file = "websockets-15.0.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0c9e74d766f2818bb95f84c25be4dea09841ac0f734d1966f415e4edfc4ef1c3"}, + {file = "websockets-15.0.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:1009ee0c7739c08a0cd59de430d6de452a55e42d6b522de7aa15e6f67db0b8e1"}, + {file = "websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76d1f20b1c7a2fa82367e04982e708723ba0e7b8d43aa643d3dcd404d74f1475"}, + {file = "websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f29d80eb9a9263b8d109135351caf568cc3f80b9928bccde535c235de55c22d9"}, + {file = "websockets-15.0.1-pp310-pypy310_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b359ed09954d7c18bbc1680f380c7301f92c60bf924171629c5db97febb12f04"}, + {file = "websockets-15.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:cad21560da69f4ce7658ca2cb83138fb4cf695a2ba3e475e0559e05991aa8122"}, + {file = "websockets-15.0.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7f493881579c90fc262d9cdbaa05a6b54b3811c2f300766748db79f098db9940"}, + {file = "websockets-15.0.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:47b099e1f4fbc95b701b6e85768e1fcdaf1630f3cbe4765fa216596f12310e2e"}, + {file = "websockets-15.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:67f2b6de947f8c757db2db9c71527933ad0019737ec374a8a6be9a956786aaf9"}, + {file = "websockets-15.0.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d08eb4c2b7d6c41da6ca0600c077e93f5adcfd979cd777d747e9ee624556da4b"}, + {file = "websockets-15.0.1-pp39-pypy39_pp73-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b826973a4a2ae47ba357e4e82fa44a463b8f168e1ca775ac64521442b19e87f"}, + {file = "websockets-15.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:21c1fa28a6a7e3cbdc171c694398b6df4744613ce9b36b1a498e816787e28123"}, + {file = "websockets-15.0.1-py3-none-any.whl", hash = "sha256:f7a866fbc1e97b5c617ee4116daaa09b722101d4a3c170c787450ba409f9736f"}, + {file = "websockets-15.0.1.tar.gz", hash = "sha256:82544de02076bafba038ce055ee6412d68da13ab47f0c60cab827346de828dee"}, +] + +[[package]] +name = "wsproto" +version = "1.2.0" +description = "WebSockets state-machine based protocol implementation" +optional = false +python-versions = ">=3.7.0" +groups = ["main"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "wsproto-1.2.0-py3-none-any.whl", hash = "sha256:b9acddd652b585d75b20477888c56642fdade28bdfd3579aa24a4d2c037dd736"}, + {file = "wsproto-1.2.0.tar.gz", hash = "sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065"}, +] + +[package.dependencies] +h11 = ">=0.9.0,<1" + [[package]] name = "zipp" version = "3.21.0" @@ -1960,4 +2246,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">=3.9" -content-hash = "aae03414bd510dcc398d4b52bd96660021224dfbf78564b91a1235d3e851a582" +content-hash = "d71e1b8f0d0886b2f716c19310371fb54f9216a14c38d50327a4f42283c08523" From 
465c5538f8adc61147f8e5f25d8508befd940df1 Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 9 Mar 2025 14:20:14 -0700 Subject: [PATCH 022/101] update: reorg selenium code --- WebSearcher/searchers.py | 123 +++++++++++++++++++-------------------- 1 file changed, 61 insertions(+), 62 deletions(-) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index b29f8f3..7c50dc2 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -92,19 +92,6 @@ def __init__(self, file_level=log_level, ).start(__name__) - def launch_chromedriver( - self, - headless: bool = False, - version_main: int = 133, - use_subprocess: bool = False, - chromedriver_path: str = '' - ) -> None: - self.headless = headless - self.use_subprocess = use_subprocess - self.chromedriver_path = chromedriver_path - self.version_main = version_main - self._init_chromedriver() - def search(self, qry: str, location: str = None, @@ -127,9 +114,9 @@ def search(self, """ self._prepare_search(qry=qry, location=location, lang=lang, num_results=num_results) + if method == 'selenium': self._conduct_chromedriver_search(serp_id=serp_id, crawl_id=crawl_id, ai_expand=ai_expand) - elif method == 'requests': self._conduct_search(serp_id=serp_id, crawl_id=crawl_id) self._handle_response() @@ -149,63 +136,29 @@ def _prepare_search(self, qry: str, location: str = None, lang: str = None, num_ if self.loc and self.loc not in {'None', 'nan'}: self.params['uule'] = locations.convert_canonical_name_to_uule(self.loc) + def launch_chromedriver( + self, + headless: bool = False, + version_main: int = 133, + use_subprocess: bool = False, + chromedriver_path: str = '' + ) -> None: + self.headless = headless + self.use_subprocess = use_subprocess + self.chromedriver_path = chromedriver_path + self.version_main = version_main + self._init_chromedriver() + def _init_chromedriver(self): - print('launching...') + self.log.info(f'SERP | Launching ChromeDriver | headless: {self.headless} | subprocess: {self.use_subprocess} | version: {self.version_main}') if self.chromedriver_path == '': - #optionally: headless=True, use_subprocess=True self.driver = uc.Chrome(headless = self.headless, subprocess = self.use_subprocess, version_main = self.version_main) else: self.driver = uc.Chrome(headless = self.headless, subprocess = self.use_subprocess, chromedriver_path = self.chromedriver_path, version_main = self.version_main) - #chromedriver_path = "/opt/homebrew/Caskroom/chromedriver/133.0.6943.53" time.sleep(2) self.driver.get('https://www.google.com') time.sleep(2) - def _check_ai_expand(self): - try: - self.driver.find_element(By.XPATH, "//div[@jsname='rPRdsc' and @role='button']") - return True - except NoSuchElementException: - return False - - def _conduct_chromedriver_search(self, serp_id: str = '', crawl_id: str = '', ai_expand = False): - """Send a search request and handle errors""" - self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() - self.serp_id = serp_id if serp_id else utils.hash_id(self.qry + self.loc + self.timestamp) - self.crawl_id = crawl_id - try: - self._send_chromedriver_request() - except: - self.log.exception(f'SERP | Unknown error | {self.serp_id}') - - ## Look for AI overview box and click on it - if ai_expand: - ai_button = self._check_ai_expand() - if ai_button: - try: - show_more_button = WebDriverWait(self.driver, 1).until( - EC.element_to_be_clickable((By.XPATH, "//div[@jsname='rPRdsc' and @role='button']")) - ) - show_more_button.click() - if show_more_button is not None: - try: - # Wait for 
additional content to load - time.sleep(2) - - show_all_button = WebDriverWait(self.driver, 1).until( - EC.element_to_be_clickable((By.XPATH, '//div[contains(@class, "trEk7e") and @role="button"]')) - ) - show_all_button.click() - except: - pass - except: - pass - self.html = self.driver.page_source - else: - pass - - self.driver.delete_all_cookies() - def _send_chromedriver_request(self): search_box = self.driver.find_element(By.ID, "APjFqb") search_box.clear() @@ -227,6 +180,52 @@ def _send_request(self): log_msg = f"{log_msg} | {self.loc}" if self.loc else log_msg self.log.info(log_msg) + def _conduct_chromedriver_search(self, serp_id: str = '', crawl_id: str = '', ai_expand = False): + """Send a search request and handle errors""" + self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + self.serp_id = serp_id if serp_id else utils.hash_id(self.qry + self.loc + self.timestamp) + self.crawl_id = crawl_id + try: + self._send_chromedriver_request() + self.html = self.driver.page_source + except: + self.log.exception(f'SERP | Chromedriver error | {self.serp_id}') + + if ai_expand: + self._expand_ai_overview() # Expand AI overview box by clicking it + self.driver.delete_all_cookies() + + def _expand_ai_overview(self): + show_more_button_xpath = "//div[@jsname='rPRdsc' and @role='button']" + show_all_button_xpath = '//div[contains(@class, "trEk7e") and @role="button"]' + + try: + self.driver.find_element(By.XPATH, show_more_button_xpath) + show_more_button_exists = True + except NoSuchElementException: + show_more_button_exists = False + + if show_more_button_exists: + try: + show_more_button = WebDriverWait(self.driver, 1).until( + EC.element_to_be_clickable((By.XPATH, show_more_button_xpath)) + ) + if show_more_button is not None: + show_more_button.click() + try: + time.sleep(2) # Wait for additional content to load + show_all_button = WebDriverWait(self.driver, 1).until( + EC.element_to_be_clickable((By.XPATH, show_all_button_xpath)) + ) + show_all_button.click() + except Exception: + pass + + # Overwrite html with expanded content + self.html = self.driver.page_source + + except Exception: + pass def _reset_ssh_tunnel(self): if self.ssh_tunnel: From ea6eebc9850319447cff8cfd516f90e6887e0b54 Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 9 Mar 2025 14:21:27 -0700 Subject: [PATCH 023/101] update: specify args, headless not working locally --- tests/selenium_test.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/selenium_test.py b/tests/selenium_test.py index 855f573..d5e644e 100644 --- a/tests/selenium_test.py +++ b/tests/selenium_test.py @@ -1,6 +1,11 @@ import WebSearcher as ws + +#chromedriver_path = "/opt/homebrew/Caskroom/chromedriver/133.0.6943.53" + se = ws.SearchEngine() # 1. Initialize collector -se.launch_chromedriver(headless=False, version_main=133) # 2. Launch undetected chromedriver window +se.launch_chromedriver(headless=False, # 2. Launch undetected_chromedriver window + use_subprocess=False, + version_main=133) se.search('immigration news') # 2. Conduct a search se.parse_results() # 3. Parse search results se.save_serp(append_to='serps.json') # 4. 
Save HTML and metadata From 82bf5078235bc613a1d947fbe2ed8598c654df48 Mon Sep 17 00:00:00 2001 From: gitronald Date: Mon, 10 Mar 2025 17:23:25 -0700 Subject: [PATCH 024/101] update: collection code and selenium test --- README.md | 64 +++------- WebSearcher/logger.py | 4 +- WebSearcher/searchers.py | 264 ++++++++++++++++++++++++++++----------- poetry.lock | 114 ++++++++++++++++- pyproject.toml | 1 + tests/selenium_test.py | 52 ++++++-- 6 files changed, 368 insertions(+), 131 deletions(-) diff --git a/README.md b/README.md index 214e1bc..677c408 100644 --- a/README.md +++ b/README.md @@ -38,11 +38,10 @@ Below are some details about recent updates. For a longer list, see the [Update - [Example Search Script](#example-search-script) - [Step by Step](#step-by-step) - [1. Initialize Collector](#1-initialize-collector) - - [2. Launch undetected chromedriver window](#2-launch-undetected-chromedriver-window) - - [3. Conduct a Search](#3-conduct-a-search) - - [4. Parse Search Results](#4-parse-search-results) - - [5. Save HTML and Metadata](#5-save-html-and-metadata) - - [6. Save Parsed Results](#6-save-parsed-results) + - [2. Conduct a Search](#2-conduct-a-search) + - [3. Parse Search Results](#3-parse-search-results) + - [4. Save HTML and Metadata](#4-save-html-and-metadata) + - [5. Save Parsed Results](#5-save-parsed-results) - [Localization](#localization) - [Contributing](#contributing) - [Repair or Enhance a Parser](#repair-or-enhance-a-parser) @@ -125,7 +124,6 @@ Example search and parse pipeline: ```python import WebSearcher as ws se = ws.SearchEngine() # 1. Initialize collector -se.launch_chromedriver(headless=False) # 2. Launch undetected chromedriver window se.search('immigration news') # 2. Conduct a search se.parse_results() # 3. Parse search results se.save_serp(append_to='serps.json') # 4. Save HTML and metadata @@ -138,50 +136,26 @@ se.save_results(append_to='results.json') # 5. Save parsed results ```python import WebSearcher as ws -# Initialize collector with optional defaults (headers, logs, ssh tunnels) -se = ws.SearchEngine() - -# Show collector settings -vars(se) -{'version': '0.4.1', - 'base_url': 'https://www.google.com/search', - 'headers': {'Host': 'www.google.com', - 'Referer': 'https://www.google.com/', - 'Accept': '*/*', - 'Accept-Encoding': 'gzip,deflate,br', - 'Accept-Language': 'en-US,en;q=0.5', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0'}, - 'sesh': , - 'ssh_tunnel': None, - 'unzip': True, - 'params': {}, - 'qry': None, - 'loc': None, - 'num_results': None, - 'url': None, - 'timestamp': None, - 'serp_id': None, - 'crawl_id': None, - 'response': None, - 'html': None, - 'results': [], - 'log': } -``` - -#### 2. Launch undetected chromedriver window -We've switched to using [undetected chrome](https://github.com/ultrafunkamsterdam/undetected-chromedriver) to scrape search results. You'll need to ensure that your chromedriver is up-to-date. All cookies are deleted following each search.launch_chromedriver accepts 3 optional arguments. The defaults are: - -se.launch_chromedriver(headless = False, use_subprocess = False, chromedriver_path = '') - +# Initialize collector with method and other settings +se = ws.SearchEngine( + method="selenium", + selenium_config = { + "headless": False, + "use_subprocess": False, + "driver_executable_path": "", + "version_main": 133, + } +) +``` -#### 3. Conduct a Search +#### 2. 
Conduct a Search ```python se.search('immigration news') # 2024-08-19 14:09:18.502 | INFO | WebSearcher.searchers | 200 | immigration news ``` -#### 4. Parse Search Results +#### 3. Parse Search Results The example below is primarily for parsing search results as you collect HTML. See `ws.parse_serp(html)` for parsing existing HTML data. @@ -206,7 +180,7 @@ se.results[0] ``` -#### 5. Save HTML and Metadata +#### 4. Save HTML and Metadata Recommended: Append html and meta data as lines to a json file for larger or ongoing collections. @@ -221,7 +195,7 @@ Alternative: Save individual html files in a directory, named by a provided or ( se.save_serp(save_dir='./serps') ``` -#### 6. Save Parsed Results +#### 5. Save Parsed Results Save to a json lines file. diff --git a/WebSearcher/logger.py b/WebSearcher/logger.py index 48ff44f..147000d 100644 --- a/WebSearcher/logger.py +++ b/WebSearcher/logger.py @@ -85,7 +85,9 @@ def __init__(self, 'urllib3': {'level': 'WARNING'}, 'asyncio': {'level': 'INFO'}, 'chardet.charsetprober': {'level': 'INFO'}, - 'parso': {'level': 'INFO'} # Fix for ipython autocomplete bug + 'parso': {'level': 'INFO'}, # Fix for ipython autocomplete bug + 'undetected_chromedriver': {'level': 'WARNING'}, + 'uc': {'level': 'WARNING'}, } self.log_config = { diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 7c50dc2..e2330fb 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -7,12 +7,15 @@ import os import time +import json import brotli import requests import subprocess import pandas as pd +from enum import Enum +from typing import Any, Dict, Optional, Union from datetime import datetime, timezone -from typing import Any, Dict, Optional +from dataclasses import dataclass, field # selenium updates import undetected_chromedriver as uc @@ -35,39 +38,118 @@ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0', } +class SearchMethod(Enum): + REQUESTS = "requests" + SELENIUM = "selenium" + +@dataclass +class BaseConfig: + """Common search configuration + + Attributes: + log_fp (str, optional): A file to log function process output to + log_mode (str, optional): Write over the log file or append to it + log_level (str, optional): The file logging level + + """ + log_fp: str = '' + log_mode: str = 'a+' + log_level: str = 'INFO' + +@dataclass +class SeleniumConfig: + """Selenium-specific configuration + + Attributes: + headless (bool): Whether to run the browser in headless mode + version_main (int): The main version of the ChromeDriver to use + use_subprocess (bool): Whether to use subprocess for ChromeDriver + driver_executable_path (str): Path to the ChromeDriver executable + + """ + headless: bool = False + version_main: int = 133 + use_subprocess: bool = False + driver_executable_path: str = '' + +@dataclass +class RequestsConfig: + """Requests-specific configuration + + Attributes: + headers (Dict[str, str]): Headers to send with requests + sesh (Optional[requests.Session]): A `requests.Session` object + ssh_tunnel (Optional[subprocess.Popen]): An SSH tunnel subprocess from `webutils` + unzip (bool): Unzip brotli zipped html responses + + """ + headers: Dict[str, str] = field(default_factory=lambda: DEFAULT_HEADERS) + sesh: Optional[requests.Session] = None + ssh_tunnel: Optional[subprocess.Popen] = None + unzip: bool = True + +@dataclass +class SearchConfig: + """Combined search engine configuration + + Attributes: + method (Union[str, SearchMethod]): The method to use for searching, either 'requests' 
or 'selenium' + base (BaseConfig): Common search configuration + selenium (SeleniumConfig): Selenium-specific configuration + requests (RequestsConfig): Requests-specific configuration + + """ + method: Union[str, SearchMethod] = SearchMethod.SELENIUM + base: BaseConfig = field(default_factory=BaseConfig) + selenium: SeleniumConfig = field(default_factory=SeleniumConfig) + requests: RequestsConfig = field(default_factory=RequestsConfig) + + class SearchEngine: """Collect Search Engine Results Pages (SERPs)""" def __init__(self, - headers: Dict[str, str] = None, - sesh: Optional[requests.Session] = None, - ssh_tunnel: Optional[subprocess.Popen] = None, - unzip: bool = True, - log_fp: str = '', - log_mode: str = 'a+', - log_level: str ='INFO', + method: Union[str, SearchMethod] = SearchMethod.SELENIUM, + base_config: Union[dict, BaseConfig] = None, + selenium_config: Union[dict, SeleniumConfig] = None, + requests_config: Union[dict, RequestsConfig] = None ) -> None: - """Initialize a `requests.Session` to conduct searches through or - pass an existing one with an optional SSH tunnel. - - Args: - headers (dict, optional): Headers to send with requests. - unzip (bool, optional): Unzip brotli zipped html responses. - sesh (None, optional): A `requests.Session` object. - ssh_tunnel (None, optional): An SSH tunnel subprocess from `webutils`. - log_fp (str, optional): A file to log function process output to. - log_mode (str, optional): Write over the log file or append to it. - log_level (str, optional): The file logging level. + """Initialize the search engine + + Args: + method (Union[str, SearchMethod], optional): The method to use for searching, either 'requests' or 'selenium'. Defaults to SearchMethod.SELENIUM. + base_config (Union[dict, BaseConfig], optional): Common search configuration. Defaults to None. + selenium_config (Union[dict, SeleniumConfig], optional): Selenium-specific configuration. Defaults to None. + requests_config (Union[dict, RequestsConfig], optional): Requests-specific configuration. Defaults to None. 
""" - # Initialize data storage + # Convert string method to enum if needed + if isinstance(method, str): + method = SearchMethod(method.lower()) + + # Handle config objects/dicts + def isdict(config): + return isinstance(config, dict) + base = BaseConfig(**base_config) if isdict(base_config) else base_config or BaseConfig() + selenium = SeleniumConfig(**selenium_config) if isdict(selenium_config) else selenium_config or SeleniumConfig() + requests = RequestsConfig(**requests_config) if isdict(requests_config) else requests_config or RequestsConfig() + self.config = SearchConfig( + method=method, + base=base, + selenium=selenium, + requests=requests + ) + + # Initialize common attributes self.version: str = WS_VERSION self.base_url: str = 'https://www.google.com/search' - self.headers: Dict[str, str] = headers or DEFAULT_HEADERS - self.sesh: requests.Session = sesh or wu.start_sesh(headers=self.headers) - self.ssh_tunnel: subprocess.Popen = ssh_tunnel - self.unzip: bool = unzip self.params: Dict[str, Any] = {} + # Initialize method-specific attributes + if self.config.method == SearchMethod.SELENIUM: + self.driver = None + else: + self.config.requests.sesh = self.config.requests.sesh or wu.start_sesh(headers=self.config.requests.headers) + # Initialize search details self.qry: str = None self.loc: str = None @@ -77,6 +159,8 @@ def __init__(self, self.timestamp: str = None self.serp_id: str = None self.crawl_id: str = None + + # Initialize search outputs self.response: requests.Response = None self.html: str = None self.results: list = [] @@ -85,11 +169,11 @@ def __init__(self, # Set a log file, prints to console by default self.log = logger.Logger( - console=True if not log_fp else False, - console_level=log_level, - file_name=log_fp, - file_mode=log_mode, - file_level=log_level, + console=True if not self.config.base.log_fp else False, + console_level=self.config.base.log_level, + file_name=self.config.base.log_fp, + file_mode=self.config.base.log_mode, + file_level=self.config.base.log_level, ).start(__name__) def search(self, @@ -97,7 +181,6 @@ def search(self, location: str = None, lang: str = None, num_results: int = None, - method: str = 'selenium', ai_expand: bool = False, serp_id: str = '', crawl_id: str = '' @@ -114,10 +197,10 @@ def search(self, """ self._prepare_search(qry=qry, location=location, lang=lang, num_results=num_results) - - if method == 'selenium': + if self.config.method == SearchMethod.SELENIUM: + self._init_chromedriver() self._conduct_chromedriver_search(serp_id=serp_id, crawl_id=crawl_id, ai_expand=ai_expand) - elif method == 'requests': + elif self.config.method == SearchMethod.REQUESTS: self._conduct_search(serp_id=serp_id, crawl_id=crawl_id) self._handle_response() @@ -135,48 +218,55 @@ def _prepare_search(self, qry: str, location: str = None, lang: str = None, num_ self.params['hl'] = self.lang if self.loc and self.loc not in {'None', 'nan'}: self.params['uule'] = locations.convert_canonical_name_to_uule(self.loc) + self.url = f"{self.base_url}?{wu.join_url_quote(self.params)}" - def launch_chromedriver( - self, - headless: bool = False, - version_main: int = 133, - use_subprocess: bool = False, - chromedriver_path: str = '' - ) -> None: - self.headless = headless - self.use_subprocess = use_subprocess - self.chromedriver_path = chromedriver_path - self.version_main = version_main - self._init_chromedriver() - - def _init_chromedriver(self): - self.log.info(f'SERP | Launching ChromeDriver | headless: {self.headless} | subprocess: {self.use_subprocess} | 
version: {self.version_main}') - if self.chromedriver_path == '': - self.driver = uc.Chrome(headless = self.headless, subprocess = self.use_subprocess, version_main = self.version_main) - else: - self.driver = uc.Chrome(headless = self.headless, subprocess = self.use_subprocess, chromedriver_path = self.chromedriver_path, version_main = self.version_main) + # ========================================================================== + # Selenium method + + def _init_chromedriver(self) -> None: + """Initialize Chrome driver with selenium-specific config""" + self.log.debug(f'SERP | init uc chromedriver | kwargs: {self.config.selenium.__dict__}') + self.driver = uc.Chrome(**self.config.selenium.__dict__) + self.user_agent = self.driver.execute_script('return navigator.userAgent') + self.response_code = None + + # Log version information + self.browser_info = { + 'browser_id': "", + 'browser_name': self.driver.capabilities['browserName'], + 'browser_version': self.driver.capabilities['browserVersion'], + 'driver_version': self.driver.capabilities['chrome']['chromedriverVersion'].split(' ')[0], + } + self.browser_info['browser_id'] = utils.hash_id(json.dumps(self.browser_info)) + self.log.debug(json.dumps(self.browser_info, indent=4)) + + def _send_chromedriver_typed_query(self): + """Send a typed query to the search box""" time.sleep(2) self.driver.get('https://www.google.com') time.sleep(2) - - def _send_chromedriver_request(self): search_box = self.driver.find_element(By.ID, "APjFqb") search_box.clear() search_box.send_keys(self.qry) search_box.send_keys(Keys.RETURN) + + def _send_chromedriver_request(self): + """Use a prepared URL to conduct a search""" + + time.sleep(2) + self.driver.get(self.url) + time.sleep(2) # wait for the page to load WebDriverWait(self.driver, 10).until( EC.presence_of_element_located((By.ID, "search")) ) time.sleep(2) #including a sleep to allow the page to fully load + self.html = self.driver.page_source self.url = self.driver.current_url - - def _send_request(self): - self.url = f"{self.base_url}?{wu.join_url_quote(self.params)}" - self.response = self.sesh.get(self.url, timeout=10) - log_msg = f"{self.response.status_code} | {self.qry}" + self.response_code = 0 + log_msg = f"{self.response_code} | {self.qry}" log_msg = f"{log_msg} | {self.loc}" if self.loc else log_msg self.log.info(log_msg) @@ -222,21 +312,51 @@ def _expand_ai_overview(self): pass # Overwrite html with expanded content - self.html = self.driver.page_source + new_html = self.driver.page_source + self.log.debug(f'SERP | overwriting expanded content | len diff: {len(new_html) - len(self.html)}') + self.html = new_html except Exception: pass + # ========================================================================== + # Requests method + + def _conduct_search(self, serp_id: str = '', crawl_id: str = ''): + """Send a search request and handle errors""" + + self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + self.serp_id = serp_id if serp_id else utils.hash_id(self.qry + self.loc + self.timestamp) + self.crawl_id = crawl_id + self.user_agent = self.config.requests.headers['User-Agent'] + + try: + self._send_request() + except requests.exceptions.ConnectionError: + self.log.exception(f'SERP | Connection error | {self.serp_id}') + self._reset_ssh_tunnel() + except requests.exceptions.Timeout: + self.log.exception(f'SERP | Timeout error | {self.serp_id}') + except Exception: + self.log.exception(f'SERP | Unknown error | {self.serp_id}') + + def _send_request(self): + 
self.response = self.config.requests.sesh.get(self.url, timeout=10) + self.response_code = self.response.status_code + log_msg = f"{self.response_code} | {self.qry}" + log_msg = f"{log_msg} | {self.loc}" if self.loc else log_msg + self.log.info(log_msg) + def _reset_ssh_tunnel(self): - if self.ssh_tunnel: - self.ssh_tunnel.tunnel.kill() - self.ssh_tunnel.open_tunnel() + if self.config.requests.ssh_tunnel: + self.config.requests.ssh_tunnel.tunnel.kill() + self.config.requests.ssh_tunnel.open_tunnel() self.log.info(f'SERP | Restarted SSH tunnel | {self.serp_id}') time.sleep(10) # Allow time to establish connection def _handle_response(self): try: - if self.unzip: + if self.config.requests.unzip: self._unzip_html() else: self.html = self.response.content @@ -244,14 +364,11 @@ def _handle_response(self): except Exception: self.log.exception(f'Response handling error') - def _unzip_html(self): + def _unzip_html(self) -> None: """Unzip brotli zipped html Can allow zipped responses by setting the header `"Accept-Encoding"`. Zipped reponses are the default because it is more efficient. - - Returns: - str: Decompressed html """ rcontent = self.response.content @@ -263,6 +380,9 @@ def _unzip_html(self): self.log.exception(f'unzip error | serp_id : {self.serp_id}') self.html = rcontent + # ========================================================================== + # Parsing + def parse_results(self): """Parse a SERP - see parsers.py""" @@ -281,6 +401,9 @@ def parse_serp_features(self): except Exception: self.log.exception(f'Feature extraction error | serp_id : {self.serp_id}') + # ========================================================================== + # Saving + def prepare_serp_save(self): self.serp = BaseSERP( qry=self.qry, @@ -288,8 +411,8 @@ def prepare_serp_save(self): lang=self.lang, url=self.url, html=self.html, - response_code=0 if not self.response else self.response.status_code, - user_agent='' if not self.response else self.headers['User-Agent'], + response_code=self.response_code, + user_agent=self.user_agent, timestamp=self.timestamp, serp_id=self.serp_id, crawl_id=self.crawl_id, @@ -354,4 +477,3 @@ def save_results(self, save_dir: str = "", append_to: str = ""): else: self.log.info(f'No parsed results for serp_id: {self.serp_id}') - diff --git a/poetry.lock b/poetry.lock index 225ef6d..9ca06df 100644 --- a/poetry.lock +++ b/poetry.lock @@ -421,6 +421,22 @@ files = [ {file = "charset_normalizer-3.4.1.tar.gz", hash = "sha256:44251f18cd68a75b56585dd00dae26183e102cd5e0f9f1466e6df5da2ed64ea3"}, ] +[[package]] +name = "click" +version = "8.1.8" +description = "Composable command line interface toolkit" +optional = false +python-versions = ">=3.7" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"}, + {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + [[package]] name = "colorama" version = "0.4.6" @@ -428,7 +444,7 @@ description = "Cross-platform colored terminal text." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" groups = ["dev"] -markers = "python_version <= \"3.11\" and sys_platform == \"win32\" or python_version >= \"3.12\" and sys_platform == \"win32\"" +markers = "python_version <= \"3.11\" and sys_platform == \"win32\" or python_version <= \"3.11\" and platform_system == \"Windows\" or python_version >= \"3.12\" and sys_platform == \"win32\" or python_version >= \"3.12\" and platform_system == \"Windows\"" files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, @@ -918,6 +934,32 @@ html5 = ["html5lib"] htmlsoup = ["BeautifulSoup4"] source = ["Cython (>=3.0.11)"] +[[package]] +name = "markdown-it-py" +version = "3.0.0" +description = "Python port of markdown-it. Markdown parsing, done right!" +optional = false +python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, + {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, +] + +[package.dependencies] +mdurl = ">=0.1,<1.0" + +[package.extras] +benchmarking = ["psutil", "pytest", "pytest-benchmark"] +code-style = ["pre-commit (>=3.0,<4.0)"] +compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] +linkify = ["linkify-it-py (>=1,<3)"] +plugins = ["mdit-py-plugins"] +profiling = ["gprof2dot"] +rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] + [[package]] name = "matplotlib-inline" version = "0.1.7" @@ -934,6 +976,19 @@ files = [ [package.dependencies] traitlets = "*" +[[package]] +name = "mdurl" +version = "0.1.2" +description = "Markdown URL utilities" +optional = false +python-versions = ">=3.7" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, + {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, +] + [[package]] name = "nest-asyncio" version = "1.6.0" @@ -1774,6 +1829,27 @@ files = [ [package.dependencies] requests = ">=1.0.0" +[[package]] +name = "rich" +version = "13.9.4" +description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" +optional = false +python-versions = ">=3.8.0" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"}, + {file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"}, +] + +[package.dependencies] +markdown-it-py = ">=2.2.0" +pygments = ">=2.13.0,<3.0.0" +typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.11\""} + +[package.extras] +jupyter = ["ipywidgets (>=7.5.1,<9)"] + [[package]] name = 
"selenium" version = "4.29.0" @@ -1795,6 +1871,19 @@ typing_extensions = ">=4.9,<5.0" urllib3 = {version = ">=1.26,<3", extras = ["socks"]} websocket-client = ">=1.8,<2.0" +[[package]] +name = "shellingham" +version = "1.5.4" +description = "Tool to Detect Surrounding Shell" +optional = false +python-versions = ">=3.7" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686"}, + {file = "shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de"}, +] + [[package]] name = "six" version = "1.17.0" @@ -2030,6 +2119,25 @@ outcome = ">=1.2.0" trio = ">=0.11" wsproto = ">=0.14" +[[package]] +name = "typer" +version = "0.15.2" +description = "Typer, build great CLIs. Easy to code. Based on Python type hints." +optional = false +python-versions = ">=3.7" +groups = ["dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" +files = [ + {file = "typer-0.15.2-py3-none-any.whl", hash = "sha256:46a499c6107d645a9c13f7ee46c5d5096cae6f5fc57dd11eccbbb9ae3e44ddfc"}, + {file = "typer-0.15.2.tar.gz", hash = "sha256:ab2fab47533a813c49fe1f16b1a370fd5819099c00b119e0633df65f22144ba5"}, +] + +[package.dependencies] +click = ">=8.0.0" +rich = ">=10.11.0" +shellingham = ">=1.3.0" +typing-extensions = ">=3.7.4.3" + [[package]] name = "typing-extensions" version = "4.12.2" @@ -2037,11 +2145,11 @@ description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" groups = ["main", "dev"] +markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, ] -markers = {main = "python_version <= \"3.11\" or python_version >= \"3.12\"", dev = "python_version < \"3.10\""} [[package]] name = "tzdata" @@ -2246,4 +2354,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">=3.9" -content-hash = "d71e1b8f0d0886b2f716c19310371fb54f9216a14c38d50327a4f42283c08523" +content-hash = "1afa3bf7c3d9ce06c3cf91b77da72e8f7bf4d543351120cdfe00bedb1286df6b" diff --git a/pyproject.toml b/pyproject.toml index e521987..ab2c164 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ optional = true pytest = "^8.3.4" syrupy = "^4.8.1" ipykernel = "^6.29.5" +typer = "^0.15.2" [build-system] requires = ["poetry-core>=2.0.0,<3.0.0"] diff --git a/tests/selenium_test.py b/tests/selenium_test.py index d5e644e..f6924b7 100644 --- a/tests/selenium_test.py +++ b/tests/selenium_test.py @@ -1,15 +1,45 @@ +import typer import WebSearcher as ws -#chromedriver_path = "/opt/homebrew/Caskroom/chromedriver/133.0.6943.53" +# driver_executable_path locations: +# /opt/homebrew/Caskroom/chromedriver/133.0.6943.53 # Mac +# /opt/google/chrome/google-chrome # Google Chrome 134.0.6998.88 | permissions error +# ~/.local/share/undetected_chromedriver/undetected_chromedriver # ChromeDriver 133.0.6943.141 -se = ws.SearchEngine() # 1. Initialize collector -se.launch_chromedriver(headless=False, # 2. Launch undetected_chromedriver window - use_subprocess=False, - version_main=133) -se.search('immigration news') # 2. Conduct a search -se.parse_results() # 3. 
Parse search results -se.save_serp(append_to='serps.json') # 4. Save HTML and metadata -se.save_results(append_to='results.json') # 5. Save parsed results +app = typer.Typer() -#import pandas as pd -#df = pd.DataFrame(se.results) # 6. Display results in a pandas dataframe \ No newline at end of file +@app.command() +def main( + query: str = typer.Argument(..., help="Search query to use"), + method: str = typer.Argument("selenium", help="Search method to use: 'selenium' or 'requests'"), + headless: bool = typer.Option(False, help="Run browser in headless mode"), + use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), + version_main: int = typer.Option(133, help="Main version of Chrome to use"), + ai_expand: bool = typer.Option(False, help="Expand AI overviews if present"), + driver_executable_path: str = typer.Option(None, help="Path to ChromeDriver executable"), + output_prefix: str = typer.Option("output", help="Prefix for output files") +) -> None: + + typer.echo(f"query: {query}\nmethod: {method}") + selenium_config = { + "headless": headless, + "use_subprocess": use_subprocess, + "driver_executable_path": driver_executable_path, + "version_main": version_main, + } + + se = ws.SearchEngine( + method=method, + selenium_config=selenium_config + ) + + se.search(qry=query, ai_expand=ai_expand) + se.parse_results() + + # Save results with the specified prefix + se.save_serp(append_to=f'{output_prefix}_serps.json') + se.save_search(append_to=f'{output_prefix}_searches.json') + se.save_results(append_to=f'{output_prefix}_results.json') + +if __name__ == "__main__": + app() \ No newline at end of file From 5ca2da7a774b07291197047c06e6f1cf8f03b5ab Mon Sep 17 00:00:00 2001 From: gitronald Date: Mon, 10 Mar 2025 23:16:39 -0700 Subject: [PATCH 025/101] update: save method variable along with metadata --- WebSearcher/models.py | 1 + WebSearcher/searchers.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/WebSearcher/models.py b/WebSearcher/models.py index a85d7c1..2a564e6 100644 --- a/WebSearcher/models.py +++ b/WebSearcher/models.py @@ -26,4 +26,5 @@ class BaseSERP(BaseModel): serp_id: str # Search Engine Results Page (SERP) ID crawl_id: str # Crawl ID for grouping SERPs version: str # WebSearcher version + method: str # Search method used diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index e2330fb..2ea604c 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -161,7 +161,7 @@ def isdict(config): self.crawl_id: str = None # Initialize search outputs - self.response: requests.Response = None + self.response = None # type: Optional[requests.Response] self.html: str = None self.results: list = [] self.serp_features: dict = {} @@ -417,6 +417,7 @@ def prepare_serp_save(self): serp_id=self.serp_id, crawl_id=self.crawl_id, version=self.version, + method=self.config.method.value ).model_dump() def save_serp(self, save_dir: str = "", append_to: str = ""): From b950989631d138f936299f32c75837529cab1593 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 11 Mar 2025 09:30:36 -0700 Subject: [PATCH 026/101] update: handle null links in tw result --- WebSearcher/component_parsers/twitter_result.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/WebSearcher/component_parsers/twitter_result.py b/WebSearcher/component_parsers/twitter_result.py index f9742a4..e2814c7 100644 --- a/WebSearcher/component_parsers/twitter_result.py +++ b/WebSearcher/component_parsers/twitter_result.py @@ -1,3 +1,5 @@ 
+from ..webutils import get_text, get_link + def parse_twitter_result(cmpt, sub_rank=0) -> list: """Parse a Twitter single result component @@ -28,7 +30,7 @@ def parse_twitter_result(cmpt, sub_rank=0) -> list: # Get snippet text, timestamp, and tweet url body, timestamp_url = cmpt.find('div', {'class':'tw-res'}).children - parsed['text'] = body.text - parsed['timestamp'] = timestamp_url.find('span').text - parsed['details'] = timestamp_url.find('a')['href'] + parsed['text'] = get_text(body) + parsed['timestamp'] = get_text(timestamp_url, 'span') + parsed['details'] = get_link(timestamp_url) return [parsed] \ No newline at end of file From dc00990cdc8505e760a499d58684f095431de73a Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 11 Mar 2025 09:37:40 -0700 Subject: [PATCH 027/101] version: 0.6.0.dev1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ab2c164..2e7b240 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.0.dev0" +version = "0.6.0.dev1" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From a7cfd5adb8459267117fff6372e0db02ac8ccb62 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 11 Mar 2025 10:37:41 -0700 Subject: [PATCH 028/101] update: move driver init to search, add driver cleanup --- WebSearcher/searchers.py | 24 ++++++++++++++++++++++-- tests/selenium_test.py | 1 + 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 2ea604c..4cd86d5 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -198,7 +198,6 @@ def search(self, self._prepare_search(qry=qry, location=location, lang=lang, num_results=num_results) if self.config.method == SearchMethod.SELENIUM: - self._init_chromedriver() self._conduct_chromedriver_search(serp_id=serp_id, crawl_id=crawl_id, ai_expand=ai_expand) elif self.config.method == SearchMethod.REQUESTS: self._conduct_search(serp_id=serp_id, crawl_id=crawl_id) @@ -272,6 +271,9 @@ def _send_chromedriver_request(self): def _conduct_chromedriver_search(self, serp_id: str = '', crawl_id: str = '', ai_expand = False): """Send a search request and handle errors""" + if not self.driver: + self._init_chromedriver() + self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() self.serp_id = serp_id if serp_id else utils.hash_id(self.qry + self.loc + self.timestamp) self.crawl_id = crawl_id @@ -282,10 +284,11 @@ def _conduct_chromedriver_search(self, serp_id: str = '', crawl_id: str = '', ai self.log.exception(f'SERP | Chromedriver error | {self.serp_id}') if ai_expand: - self._expand_ai_overview() # Expand AI overview box by clicking it + self._expand_ai_overview() self.driver.delete_all_cookies() def _expand_ai_overview(self): + """Expand AI overview box by clicking it""" show_more_button_xpath = "//div[@jsname='rPRdsc' and @role='button']" show_all_button_xpath = '//div[contains(@class, "trEk7e") and @role="button"]' @@ -319,6 +322,23 @@ def _expand_ai_overview(self): except Exception: pass + def cleanup(self): + """Clean up resources, particularly Selenium's browser instance + + Returns: + bool: True if cleanup was successful or not needed, False if cleanup failed + """ + if self.config.method == SearchMethod.SELENIUM and hasattr(self, 'driver') and self.driver: + try: + self.driver.quit() + self.driver = None + 
self.log.debug(f'Browser successfully closed') + return True + except Exception as e: + self.log.warning(f'Failed to close browser: {e}') + return False + return True + # ========================================================================== # Requests method diff --git a/tests/selenium_test.py b/tests/selenium_test.py index f6924b7..1476a32 100644 --- a/tests/selenium_test.py +++ b/tests/selenium_test.py @@ -40,6 +40,7 @@ def main( se.save_serp(append_to=f'{output_prefix}_serps.json') se.save_search(append_to=f'{output_prefix}_searches.json') se.save_results(append_to=f'{output_prefix}_results.json') + se.cleanup() if __name__ == "__main__": app() \ No newline at end of file From 2a86a5abb2550ee705baafed0027478ee1a2ea98 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 11 Mar 2025 10:41:32 -0700 Subject: [PATCH 029/101] version: 0.6.0.dev2 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index e03cd22..e2af3af 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.0.dev0" +__version__ = "0.6.0.dev2" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index 2e7b240..b6c9c65 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.0.dev1" +version = "0.6.0.dev2" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From 76fa069afb7646977f03aa780df0507fc146653e Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 19 Mar 2025 16:04:32 -0700 Subject: [PATCH 030/101] update: add parse both features and results options --- WebSearcher/parsers.py | 71 ++++++++++++++++++++++++++++------------ WebSearcher/searchers.py | 11 +++++-- 2 files changed, 59 insertions(+), 23 deletions(-) diff --git a/WebSearcher/parsers.py b/WebSearcher/parsers.py index 1e4f283..692171b 100644 --- a/WebSearcher/parsers.py +++ b/WebSearcher/parsers.py @@ -5,12 +5,23 @@ import re from bs4 import BeautifulSoup -from typing import Union, List, Dict +from typing import Union, List, Dict, Tuple -def parse_serp(serp: Union[str, BeautifulSoup]) -> List[Dict]: - """Parse a Search Engine Result Page (SERP)""" - +def parse_serp( + serp: Union[str, BeautifulSoup], + extract_features: bool = False + ) -> Union[List[Dict], Tuple[List[Dict], Dict]]: + """Parse a Search Engine Result Page (SERP) + + Args: + serp (Union[str, BeautifulSoup]): The HTML content of the SERP or a BeautifulSoup object + extract_features (bool, optional): Whether to also extract SERP features. Defaults to False. + + Returns: + Union[List[Dict], Tuple[List[Dict], Dict]]: If extract_features is False, returns a list of result components. + If extract_features is True, returns a tuple of (results, features). 
+    """
     # Extract components
     soup = webutils.make_soup(serp)
     extractor = Extractor(soup)
@@ -22,18 +33,38 @@
         cmpt.classify_component()
         cmpt.parse_component()
 
-    return component_list.export_component_results()
+    results = component_list.export_component_results()
+
+    if extract_features:
+        # Extract features from the same soup object to avoid parsing twice
+        features = FeatureExtractor.extract_features(soup)
+        return results, features
+
+    return results
 
 
 class FeatureExtractor:
     @staticmethod
-    def extract_features(html: str) -> dict:
-        rx_estimate = re.compile(r'<div id="result-stats">.*?</div>')
-        rx_language = re.compile(r'<html[^>]*\slang="([^"]+)"')
-        rx_no_results = re.compile(r"Your search - .*? - did not match any documents\.")
+    def extract_features(html_or_soup: Union[str, BeautifulSoup]) -> dict:
+        """Extract SERP features from HTML or a BeautifulSoup object
+        
+        Args:
+            html_or_soup (Union[str, BeautifulSoup]): The HTML content or a BeautifulSoup object
+        
+        Returns:
+            dict: The extracted features
+        """
+        output = {}
+        if isinstance(html_or_soup, BeautifulSoup):
+            soup = html_or_soup
+            html = str(soup)
+        else:
+            html = html_or_soup
+            soup = webutils.make_soup(html)
 
         # Extract result estimate count and time
+        rx_estimate = re.compile(r'<div id="result-stats">.*?</div>')
         match = rx_estimate.search(html)
         result_estimate_div = match.group(0) if match else None
         if result_estimate_div is None:
@@ -46,23 +77,21 @@ def extract_features(html: str) -> dict:
             output["result_estimate_time"] = float(time_match.group(1)) if time_match else None
 
         # Extract language
+        rx_language = re.compile(r'<html[^>]*\slang="([^"]+)"')
         match = rx_language.search(html)
         output['language'] = match.group(1) if match else None
 
         # No results notice
+        rx_no_results = re.compile(r"Your search - .*? - did not match any documents\.")
         match = rx_no_results.search(html)
         output['notice_no_results'] = bool(match)
 
-        # Shortened query notice
-        pattern = "(and any subsequent words) was ignored because we limit queries to 32 words."
-        output['notice_shortened_query'] = (pattern in html)
-
-        # Server error notice
-        pattern = "We're sorry but it appears that there has been an internal server error while processing your request."
-        output['notice_server_error'] = (pattern in html)
-
-        # Infinity scroll button
-        pattern = 'More results'
-        output['infinity_scroll'] = (pattern in html)
-
+        string_match_dict = {
+            'notice_shortened_query': "(and any subsequent words) was ignored because we limit queries to 32 words.",
+            'notice_server_error': "We're sorry but it appears that there has been an internal server error while processing your request.",
+            'infinity_scroll': 'More results'
+        }
+        for key, pattern in string_match_dict.items():
+            output[key] = (pattern in html)
+
         return output
 
diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py
index 4cd86d5..525c968 100644
--- a/WebSearcher/searchers.py
+++ b/WebSearcher/searchers.py
@@ -403,9 +403,17 @@ def _unzip_html(self) -> None:
     # ==========================================================================
     # Parsing
 
+    def parse_all(self):
+        """Parse results and extract SERP features in a single pass"""
+        assert self.html, "No HTML found"
+        try:
+            # Use the enhanced parse_serp function to get both results and features in one pass
+            self.results, self.serp_features = parsers.parse_serp(self.html, extract_features=True)
+        except Exception:
+            self.log.exception(f'Combined parsing error | serp_id : {self.serp_id}')
+
     def parse_results(self):
         """Parse a SERP - see parsers.py"""
-        
         assert self.html, "No HTML found"
         try:
             self.results = parsers.parse_serp(self.html)
@@ -414,7 +422,6 @@
 
     def parse_serp_features(self):
         """Extract SERP features - see parsers.py"""
-        
         assert self.html, "No HTML found"
         try:
             self.serp_features = parsers.FeatureExtractor.extract_features(self.html)

From 6270f5d8054731f024d7d9821170f2f0176daaba Mon Sep 17 00:00:00 2001
From: gitronald
Date: Wed, 19 Mar 2025 16:05:10 -0700
Subject: [PATCH 031/101] version: 0.6.0.dev3

---
 WebSearcher/__init__.py | 2 +-
 pyproject.toml          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py
index e2af3af..e3e41b6 100644
--- a/WebSearcher/__init__.py
+++ b/WebSearcher/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.6.0.dev2"
+__version__ = "0.6.0.dev3"
 from .searchers import SearchEngine
 from .parsers import parse_serp, FeatureExtractor
 from .extractors import Extractor
diff --git a/pyproject.toml b/pyproject.toml
index b6c9c65..e8525f5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "WebSearcher"
-version = "0.6.0.dev2"
+version = "0.6.0.dev3"
 description = "Tools for conducting, collecting, and parsing web search"
 authors = [{name = "Ronald E. 
Robertson", email = ""}] keywords = ["web", "search", "parser"] From 6752498ad4077b4286ef41dcb2cb265c23b8fc72 Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Mar 2025 08:57:39 -0700 Subject: [PATCH 032/101] version: 0.6.0.dev4 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index e3e41b6..d33ee46 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.0.dev3" +__version__ = "0.6.0.dev4" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index e8525f5..2b85ba6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.0.dev3" +version = "0.6.0.dev4" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From 165f9e36a4770b081787d71e10c571fd8c73b16b Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Mar 2025 10:25:47 -0700 Subject: [PATCH 033/101] update: condense args, use currently reliable default query --- tests/selenium_test.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/tests/selenium_test.py b/tests/selenium_test.py index 1476a32..46afb15 100644 --- a/tests/selenium_test.py +++ b/tests/selenium_test.py @@ -10,29 +10,26 @@ @app.command() def main( - query: str = typer.Argument(..., help="Search query to use"), + query: str = typer.Argument("why is the sky blue?", help="Search query to use"), method: str = typer.Argument("selenium", help="Search method to use: 'selenium' or 'requests'"), headless: bool = typer.Option(False, help="Run browser in headless mode"), use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), version_main: int = typer.Option(133, help="Main version of Chrome to use"), - ai_expand: bool = typer.Option(False, help="Expand AI overviews if present"), - driver_executable_path: str = typer.Option(None, help="Path to ChromeDriver executable"), + ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), + driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), output_prefix: str = typer.Option("output", help="Prefix for output files") ) -> None: typer.echo(f"query: {query}\nmethod: {method}") - selenium_config = { - "headless": headless, - "use_subprocess": use_subprocess, - "driver_executable_path": driver_executable_path, - "version_main": version_main, - } - se = ws.SearchEngine( method=method, - selenium_config=selenium_config + selenium_config={ + "headless": headless, + "use_subprocess": use_subprocess, + "driver_executable_path": driver_executable_path, + "version_main": version_main, + } ) - se.search(qry=query, ai_expand=ai_expand) se.parse_results() From b77c413bb9abfbc5426436a7032fa6a23e3e1cd3 Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Mar 2025 10:28:13 -0700 Subject: [PATCH 034/101] update: use pydantic models for configs and defaults --- WebSearcher/models.py | 45 +++++++++++- WebSearcher/searchers.py | 144 ++++++++++++++------------------------- 2 files changed, 93 insertions(+), 96 deletions(-) diff --git a/WebSearcher/models.py b/WebSearcher/models.py index 2a564e6..efaadbd 100644 --- a/WebSearcher/models.py +++ b/WebSearcher/models.py @@ -1,5 +1,8 @@ -from pydantic 
import BaseModel -from typing import Any, Optional +from pydantic import BaseModel, Field +from typing import Any, Optional, Dict, Union +import subprocess +import requests +from enum import Enum class BaseResult(BaseModel): @@ -28,3 +31,41 @@ class BaseSERP(BaseModel): version: str # WebSearcher version method: str # Search method used + +class LogConfig(BaseModel): + log_fp: str = '' + log_mode: str = 'a+' + log_level: str = 'INFO' + + +class SeleniumConfig(BaseModel): + headless: bool = False + version_main: int = 133 + use_subprocess: bool = False + driver_executable_path: str = "" + + +class RequestsConfig(BaseModel): + model_config = {"arbitrary_types_allowed": True} + headers: Dict[str, str] = Field(default_factory=lambda: { + 'Host': 'www.google.com', + 'Referer': 'https://www.google.com/', + 'Accept': '*/*', + 'Accept-Encoding': 'gzip,deflate,br', + 'Accept-Language': 'en-US,en;q=0.5', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0', + }) + sesh: Optional[requests.Session] = None + ssh_tunnel: Optional[subprocess.Popen] = None + unzip: bool = True + + +class SearchMethod(Enum): + REQUESTS = "requests" + SELENIUM = "selenium" + +class SearchConfig(BaseModel): + method: Union[str, SearchMethod] = SearchMethod.SELENIUM + base: LogConfig = Field(default_factory=LogConfig) + selenium: SeleniumConfig = Field(default_factory=SeleniumConfig) + requests: RequestsConfig = Field(default_factory=RequestsConfig) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 525c968..f693b62 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -3,19 +3,16 @@ from . import webutils as wu from . import utils from . import logger -from .models import BaseSERP +from .models import LogConfig, SeleniumConfig, RequestsConfig, SearchConfig, SearchMethod, BaseSERP import os import time import json import brotli import requests -import subprocess import pandas as pd -from enum import Enum from typing import Any, Dict, Optional, Union from datetime import datetime, timezone -from dataclasses import dataclass, field # selenium updates import undetected_chromedriver as uc @@ -28,88 +25,11 @@ from importlib import metadata WS_VERSION = metadata.version('WebSearcher') -# Default headers to send with requests (i.e. 
device fingerprint) -DEFAULT_HEADERS = { - 'Host': 'www.google.com', - 'Referer': 'https://www.google.com/', - 'Accept': '*/*', - 'Accept-Encoding': 'gzip,deflate,br', - 'Accept-Language': 'en-US,en;q=0.5', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0', -} - -class SearchMethod(Enum): - REQUESTS = "requests" - SELENIUM = "selenium" - -@dataclass -class BaseConfig: - """Common search configuration - - Attributes: - log_fp (str, optional): A file to log function process output to - log_mode (str, optional): Write over the log file or append to it - log_level (str, optional): The file logging level - - """ - log_fp: str = '' - log_mode: str = 'a+' - log_level: str = 'INFO' - -@dataclass -class SeleniumConfig: - """Selenium-specific configuration - - Attributes: - headless (bool): Whether to run the browser in headless mode - version_main (int): The main version of the ChromeDriver to use - use_subprocess (bool): Whether to use subprocess for ChromeDriver - driver_executable_path (str): Path to the ChromeDriver executable - - """ - headless: bool = False - version_main: int = 133 - use_subprocess: bool = False - driver_executable_path: str = '' - -@dataclass -class RequestsConfig: - """Requests-specific configuration - - Attributes: - headers (Dict[str, str]): Headers to send with requests - sesh (Optional[requests.Session]): A `requests.Session` object - ssh_tunnel (Optional[subprocess.Popen]): An SSH tunnel subprocess from `webutils` - unzip (bool): Unzip brotli zipped html responses - - """ - headers: Dict[str, str] = field(default_factory=lambda: DEFAULT_HEADERS) - sesh: Optional[requests.Session] = None - ssh_tunnel: Optional[subprocess.Popen] = None - unzip: bool = True - -@dataclass -class SearchConfig: - """Combined search engine configuration - - Attributes: - method (Union[str, SearchMethod]): The method to use for searching, either 'requests' or 'selenium' - base (BaseConfig): Common search configuration - selenium (SeleniumConfig): Selenium-specific configuration - requests (RequestsConfig): Requests-specific configuration - - """ - method: Union[str, SearchMethod] = SearchMethod.SELENIUM - base: BaseConfig = field(default_factory=BaseConfig) - selenium: SeleniumConfig = field(default_factory=SeleniumConfig) - requests: RequestsConfig = field(default_factory=RequestsConfig) - - class SearchEngine: """Collect Search Engine Results Pages (SERPs)""" def __init__(self, method: Union[str, SearchMethod] = SearchMethod.SELENIUM, - base_config: Union[dict, BaseConfig] = None, + base_config: Union[dict, LogConfig] = None, selenium_config: Union[dict, SeleniumConfig] = None, requests_config: Union[dict, RequestsConfig] = None ) -> None: @@ -117,7 +37,7 @@ def __init__(self, Args: method (Union[str, SearchMethod], optional): The method to use for searching, either 'requests' or 'selenium'. Defaults to SearchMethod.SELENIUM. - base_config (Union[dict, BaseConfig], optional): Common search configuration. Defaults to None. + base_config (Union[dict, LogConfig], optional): Common search configuration. Defaults to None. selenium_config (Union[dict, SeleniumConfig], optional): Selenium-specific configuration. Defaults to None. requests_config (Union[dict, RequestsConfig], optional): Requests-specific configuration. Defaults to None. 
""" @@ -129,7 +49,7 @@ def __init__(self, # Handle config objects/dicts def isdict(config): return isinstance(config, dict) - base = BaseConfig(**base_config) if isdict(base_config) else base_config or BaseConfig() + base = LogConfig(**base_config) if isdict(base_config) else base_config or LogConfig() selenium = SeleniumConfig(**selenium_config) if isdict(selenium_config) else selenium_config or SeleniumConfig() requests = RequestsConfig(**requests_config) if isdict(requests_config) else requests_config or RequestsConfig() self.config = SearchConfig( @@ -197,11 +117,7 @@ def search(self, """ self._prepare_search(qry=qry, location=location, lang=lang, num_results=num_results) - if self.config.method == SearchMethod.SELENIUM: - self._conduct_chromedriver_search(serp_id=serp_id, crawl_id=crawl_id, ai_expand=ai_expand) - elif self.config.method == SearchMethod.REQUESTS: - self._conduct_search(serp_id=serp_id, crawl_id=crawl_id) - self._handle_response() + self._conduct_search(serp_id=serp_id, crawl_id=crawl_id, ai_expand=ai_expand) def _prepare_search(self, qry: str, location: str = None, lang: str = None, num_results: int = None): """Prepare a search URL and metadata for the given query and location""" @@ -219,6 +135,12 @@ def _prepare_search(self, qry: str, location: str = None, lang: str = None, num_ self.params['uule'] = locations.convert_canonical_name_to_uule(self.loc) self.url = f"{self.base_url}?{wu.join_url_quote(self.params)}" + def _conduct_search(self, serp_id:str = '', crawl_id:str = '', ai_expand:bool = False): + if self.config.method == SearchMethod.SELENIUM: + self._conduct_search_chromedriver(serp_id=serp_id, crawl_id=crawl_id, ai_expand=ai_expand) + elif self.config.method == SearchMethod.REQUESTS: + self._conduct_search_requests(serp_id=serp_id, crawl_id=crawl_id) + # ========================================================================== # Selenium method @@ -269,7 +191,7 @@ def _send_chromedriver_request(self): log_msg = f"{log_msg} | {self.loc}" if self.loc else log_msg self.log.info(log_msg) - def _conduct_chromedriver_search(self, serp_id: str = '', crawl_id: str = '', ai_expand = False): + def _conduct_search_chromedriver(self, serp_id: str = '', crawl_id: str = '', ai_expand = False): """Send a search request and handle errors""" if not self.driver: self._init_chromedriver() @@ -280,12 +202,19 @@ def _conduct_chromedriver_search(self, serp_id: str = '', crawl_id: str = '', ai try: self._send_chromedriver_request() self.html = self.driver.page_source - except: - self.log.exception(f'SERP | Chromedriver error | {self.serp_id}') + except Exception as e: + self.log.exception(f'SERP | Chromedriver error | {self.serp_id}: {str(e)}') if ai_expand: self._expand_ai_overview() - self.driver.delete_all_cookies() + + # Only delete cookies, don't close the driver here + # The driver will be closed when cleanup() is called + if self.driver: + try: + self.driver.delete_all_cookies() + except Exception as e: + self.log.warning(f"Failed to delete cookies: {str(e)}") def _expand_ai_overview(self): """Expand AI overview box by clicking it""" @@ -330,19 +259,44 @@ def cleanup(self): """ if self.config.method == SearchMethod.SELENIUM and hasattr(self, 'driver') and self.driver: try: + # Try a more thorough cleanup + try: + self.driver.delete_all_cookies() + except Exception: + pass + + try: + # Close all tabs/windows + original_handle = self.driver.current_window_handle + for handle in self.driver.window_handles: + self.driver.switch_to.window(handle) + self.driver.close() + except 
Exception: + pass + + # Finally quit the driver self.driver.quit() self.driver = None self.log.debug(f'Browser successfully closed') return True except Exception as e: self.log.warning(f'Failed to close browser: {e}') + # Force driver to be None so we create a fresh instance next time + self.driver = None return False return True + + def __del__(self): + """Destructor to ensure browser is closed when object is garbage collected""" + try: + self.cleanup() + except Exception: + pass # ========================================================================== # Requests method - def _conduct_search(self, serp_id: str = '', crawl_id: str = ''): + def _conduct_search_requests(self, serp_id: str = '', crawl_id: str = ''): """Send a search request and handle errors""" self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() @@ -359,6 +313,8 @@ def _conduct_search(self, serp_id: str = '', crawl_id: str = ''): self.log.exception(f'SERP | Timeout error | {self.serp_id}') except Exception: self.log.exception(f'SERP | Unknown error | {self.serp_id}') + finally: + self._handle_response() def _send_request(self): self.response = self.config.requests.sesh.get(self.url, timeout=10) From 9b925ae2356c64762bbcd8f8fe27ba87d361f6f1 Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Mar 2025 11:10:07 -0700 Subject: [PATCH 035/101] update: model directory with multiple files, new BaseConfig model --- WebSearcher/components.py | 2 +- WebSearcher/models/__init__.py | 0 WebSearcher/{models.py => models/configs.py} | 46 ++++++-------------- WebSearcher/models/data.py | 28 ++++++++++++ WebSearcher/searchers.py | 18 +++----- tests/selenium_test.py | 19 ++++---- 6 files changed, 58 insertions(+), 55 deletions(-) create mode 100644 WebSearcher/models/__init__.py rename WebSearcher/{models.py => models/configs.py} (51%) create mode 100644 WebSearcher/models/data.py diff --git a/WebSearcher/components.py b/WebSearcher/components.py index 13449c9..d757a65 100644 --- a/WebSearcher/components.py +++ b/WebSearcher/components.py @@ -1,4 +1,4 @@ -from .models import BaseResult +from .models.data import BaseResult from .classifiers import ClassifyMain, ClassifyFooter, ClassifyHeaderComponent from .component_parsers import main_parser_dict, footer_parser_dict, header_parser_dict from .component_parsers import parse_unknown, parse_not_implemented diff --git a/WebSearcher/models/__init__.py b/WebSearcher/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/WebSearcher/models.py b/WebSearcher/models/configs.py similarity index 51% rename from WebSearcher/models.py rename to WebSearcher/models/configs.py index efaadbd..becf6c3 100644 --- a/WebSearcher/models.py +++ b/WebSearcher/models/configs.py @@ -1,51 +1,33 @@ from pydantic import BaseModel, Field -from typing import Any, Optional, Dict, Union +from typing import Dict, Optional, Union import subprocess import requests from enum import Enum - -class BaseResult(BaseModel): - sub_rank: int = 0 - type: str = 'unclassified' - sub_type: Optional[str] = None - title: Optional[str] = None - url: Optional[str] = None - text: Optional[str] = None - cite: Optional[str] = None - details: Optional[Any] = None - error: Optional[str] = None - - -class BaseSERP(BaseModel): - qry: str # Search query - loc: Optional[str] = None # Location if set, "Canonical Name" - lang: Optional[str] = None # Language if set - url: str # URL of SERP - html: str # Raw HTML of SERP - timestamp: str # Timestamp of crawl - response_code: int # HTTP response code - 
user_agent: str # User agent used for the crawl - serp_id: str # Search Engine Results Page (SERP) ID - crawl_id: str # Crawl ID for grouping SERPs - version: str # WebSearcher version - method: str # Search method used - - -class LogConfig(BaseModel): +class BaseConfig(BaseModel): + """Base class for all configuration classes""" + + @classmethod + def create(cls, config=None): + """Create a config instance from a dictionary or existing instance""" + if isinstance(config, dict): + return cls(**config) + return config or cls() + +class LogConfig(BaseConfig): log_fp: str = '' log_mode: str = 'a+' log_level: str = 'INFO' -class SeleniumConfig(BaseModel): +class SeleniumConfig(BaseConfig): headless: bool = False version_main: int = 133 use_subprocess: bool = False driver_executable_path: str = "" -class RequestsConfig(BaseModel): +class RequestsConfig(BaseConfig): model_config = {"arbitrary_types_allowed": True} headers: Dict[str, str] = Field(default_factory=lambda: { 'Host': 'www.google.com', diff --git a/WebSearcher/models/data.py b/WebSearcher/models/data.py new file mode 100644 index 0000000..8c7571b --- /dev/null +++ b/WebSearcher/models/data.py @@ -0,0 +1,28 @@ +from pydantic import BaseModel +from typing import Any, Optional + +class BaseResult(BaseModel): + sub_rank: int = 0 + type: str = 'unclassified' + sub_type: Optional[str] = None + title: Optional[str] = None + url: Optional[str] = None + text: Optional[str] = None + cite: Optional[str] = None + details: Optional[Any] = None + error: Optional[str] = None + + +class BaseSERP(BaseModel): + qry: str # Search query + loc: Optional[str] = None # Location if set, "Canonical Name" + lang: Optional[str] = None # Language if set + url: str # URL of SERP + html: str # Raw HTML of SERP + timestamp: str # Timestamp of crawl + response_code: int # HTTP response code + user_agent: str # User agent used for the crawl + serp_id: str # Search Engine Results Page (SERP) ID + crawl_id: str # Crawl ID for grouping SERPs + version: str # WebSearcher version + method: str # Search method used diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index f693b62..06a3f69 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -3,7 +3,8 @@ from . import webutils as wu from . import utils from . 
import logger -from .models import LogConfig, SeleniumConfig, RequestsConfig, SearchConfig, SearchMethod, BaseSERP +from .models.configs import LogConfig, SeleniumConfig, RequestsConfig, SearchConfig, SearchMethod +from .models.data import BaseSERP import os import time @@ -46,17 +47,10 @@ def __init__(self, if isinstance(method, str): method = SearchMethod(method.lower()) - # Handle config objects/dicts - def isdict(config): - return isinstance(config, dict) - base = LogConfig(**base_config) if isdict(base_config) else base_config or LogConfig() - selenium = SeleniumConfig(**selenium_config) if isdict(selenium_config) else selenium_config or SeleniumConfig() - requests = RequestsConfig(**requests_config) if isdict(requests_config) else requests_config or RequestsConfig() - self.config = SearchConfig( - method=method, - base=base, - selenium=selenium, - requests=requests + self.config = SearchConfig(method=method, + base=LogConfig(base_config), + selenium=SeleniumConfig(selenium_config), + requests=RequestsConfig(requests_config) ) # Initialize common attributes diff --git a/tests/selenium_test.py b/tests/selenium_test.py index 46afb15..0318fd2 100644 --- a/tests/selenium_test.py +++ b/tests/selenium_test.py @@ -10,16 +10,15 @@ @app.command() def main( - query: str = typer.Argument("why is the sky blue?", help="Search query to use"), - method: str = typer.Argument("selenium", help="Search method to use: 'selenium' or 'requests'"), - headless: bool = typer.Option(False, help="Run browser in headless mode"), - use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), - version_main: int = typer.Option(133, help="Main version of Chrome to use"), - ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), - driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), - output_prefix: str = typer.Option("output", help="Prefix for output files") -) -> None: - + query: str = typer.Argument("why is the sky blue?", help="Search query to use"), + method: str = typer.Argument("selenium", help="Search method to use: 'selenium' or 'requests'"), + headless: bool = typer.Option(False, help="Run browser in headless mode"), + use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), + version_main: int = typer.Option(133, help="Main version of Chrome to use"), + ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), + driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), + output_prefix: str = typer.Option("output", help="Prefix for output files") + ) -> None: typer.echo(f"query: {query}\nmethod: {method}") se = ws.SearchEngine( method=method, From 4e87302bf403572c1cc48430dc430d89a73d4dae Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Mar 2025 11:40:48 -0700 Subject: [PATCH 036/101] update: use baseconfig in searchconfig --- WebSearcher/models/configs.py | 20 +++++++++++++++++++- WebSearcher/models/data.py | 1 + WebSearcher/searchers.py | 23 +++++++++++------------ 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/WebSearcher/models/configs.py b/WebSearcher/models/configs.py index becf6c3..d202881 100644 --- a/WebSearcher/models/configs.py +++ b/WebSearcher/models/configs.py @@ -4,6 +4,7 @@ import requests from enum import Enum + class BaseConfig(BaseModel): """Base class for all configuration classes""" @@ -14,6 +15,7 @@ def create(cls, config=None): return cls(**config) return config or cls() + class 
LogConfig(BaseConfig): log_fp: str = '' log_mode: str = 'a+' @@ -46,7 +48,23 @@ class SearchMethod(Enum): REQUESTS = "requests" SELENIUM = "selenium" -class SearchConfig(BaseModel): + @classmethod + def create(cls, method=None): + """Convert string to SearchMethod enum or return existing enum instance""" + if method is None: + return cls.SELENIUM + if isinstance(method, cls): + return method + if isinstance(method, str): + try: + return cls(method.lower()) + except ValueError: + valid_values = [e.value for e in cls] + raise ValueError(f"Invalid search method: {method}. Valid values are: {valid_values}") + raise TypeError(f"Expected string or SearchMethod, got {type(method)}") + + +class SearchConfig(BaseConfig): method: Union[str, SearchMethod] = SearchMethod.SELENIUM base: LogConfig = Field(default_factory=LogConfig) selenium: SeleniumConfig = Field(default_factory=SeleniumConfig) diff --git a/WebSearcher/models/data.py b/WebSearcher/models/data.py index 8c7571b..4143f8d 100644 --- a/WebSearcher/models/data.py +++ b/WebSearcher/models/data.py @@ -1,6 +1,7 @@ from pydantic import BaseModel from typing import Any, Optional + class BaseResult(BaseModel): sub_rank: int = 0 type: str = 'unclassified' diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 06a3f69..3fd4c04 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -30,9 +30,9 @@ class SearchEngine: """Collect Search Engine Results Pages (SERPs)""" def __init__(self, method: Union[str, SearchMethod] = SearchMethod.SELENIUM, - base_config: Union[dict, LogConfig] = None, - selenium_config: Union[dict, SeleniumConfig] = None, - requests_config: Union[dict, RequestsConfig] = None + base_config: Union[dict, LogConfig] = {}, + selenium_config: Union[dict, SeleniumConfig] = {}, + requests_config: Union[dict, RequestsConfig] = {} ) -> None: """Initialize the search engine @@ -43,15 +43,14 @@ def __init__(self, requests_config (Union[dict, RequestsConfig], optional): Requests-specific configuration. Defaults to None. 
""" - # Convert string method to enum if needed - if isinstance(method, str): - method = SearchMethod(method.lower()) - - self.config = SearchConfig(method=method, - base=LogConfig(base_config), - selenium=SeleniumConfig(selenium_config), - requests=RequestsConfig(requests_config) - ) + # Initialize configuration + self.version = WS_VERSION + self.config = SearchConfig.create({ + "method": SearchMethod.create(method), + "base": LogConfig.create(base_config), + "selenium": SeleniumConfig.create(selenium_config), + "requests": RequestsConfig.create(requests_config), + }) # Initialize common attributes self.version: str = WS_VERSION From d0a7aa9f93d02f18d43407e9826d78299a1ce202 Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Mar 2025 12:11:27 -0700 Subject: [PATCH 037/101] update: clean log config, header as arg --- WebSearcher/models/configs.py | 8 ++++---- WebSearcher/searchers.py | 33 +++++++++++++++++---------------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/WebSearcher/models/configs.py b/WebSearcher/models/configs.py index d202881..9cacc50 100644 --- a/WebSearcher/models/configs.py +++ b/WebSearcher/models/configs.py @@ -17,9 +17,9 @@ def create(cls, config=None): class LogConfig(BaseConfig): - log_fp: str = '' - log_mode: str = 'a+' - log_level: str = 'INFO' + fp: str = '' + mode: str = 'a' + level: str = 'INFO' class SeleniumConfig(BaseConfig): @@ -66,6 +66,6 @@ def create(cls, method=None): class SearchConfig(BaseConfig): method: Union[str, SearchMethod] = SearchMethod.SELENIUM - base: LogConfig = Field(default_factory=LogConfig) + log: LogConfig = Field(default_factory=LogConfig) selenium: SeleniumConfig = Field(default_factory=SeleniumConfig) requests: RequestsConfig = Field(default_factory=RequestsConfig) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 3fd4c04..71b2b8c 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -30,15 +30,16 @@ class SearchEngine: """Collect Search Engine Results Pages (SERPs)""" def __init__(self, method: Union[str, SearchMethod] = SearchMethod.SELENIUM, - base_config: Union[dict, LogConfig] = {}, + log_config: Union[dict, LogConfig] = {}, selenium_config: Union[dict, SeleniumConfig] = {}, - requests_config: Union[dict, RequestsConfig] = {} + requests_config: Union[dict, RequestsConfig] = {}, + headers: Dict[str, str] = None ) -> None: """Initialize the search engine Args: method (Union[str, SearchMethod], optional): The method to use for searching, either 'requests' or 'selenium'. Defaults to SearchMethod.SELENIUM. - base_config (Union[dict, LogConfig], optional): Common search configuration. Defaults to None. + log_config (Union[dict, LogConfig], optional): Common search configuration. Defaults to None. selenium_config (Union[dict, SeleniumConfig], optional): Selenium-specific configuration. Defaults to None. requests_config (Union[dict, RequestsConfig], optional): Requests-specific configuration. Defaults to None. 
""" @@ -47,18 +48,18 @@ def __init__(self, self.version = WS_VERSION self.config = SearchConfig.create({ "method": SearchMethod.create(method), - "base": LogConfig.create(base_config), + "log": LogConfig.create(log_config), "selenium": SeleniumConfig.create(selenium_config), "requests": RequestsConfig.create(requests_config), }) - # Initialize common attributes - self.version: str = WS_VERSION + # Initialize searcher self.base_url: str = 'https://www.google.com/search' self.params: Dict[str, Any] = {} - - # Initialize method-specific attributes - if self.config.method == SearchMethod.SELENIUM: + if self.config.method == SearchMethod.REQUESTS: + self.headers = headers or self.config.requests.headers + self.sesh = self.config.requests.sesh or wu.start_sesh(headers=self.headers) + elif self.config.method == SearchMethod.SELENIUM: self.driver = None else: self.config.requests.sesh = self.config.requests.sesh or wu.start_sesh(headers=self.config.requests.headers) @@ -82,11 +83,11 @@ def __init__(self, # Set a log file, prints to console by default self.log = logger.Logger( - console=True if not self.config.base.log_fp else False, - console_level=self.config.base.log_level, - file_name=self.config.base.log_fp, - file_mode=self.config.base.log_mode, - file_level=self.config.base.log_level, + console=True if not self.config.log.fp else False, + console_level=self.config.log.level, + file_name=self.config.log.fp, + file_mode=self.config.log.mode, + file_level=self.config.log.level, ).start(__name__) def search(self, @@ -295,7 +296,7 @@ def _conduct_search_requests(self, serp_id: str = '', crawl_id: str = ''): self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() self.serp_id = serp_id if serp_id else utils.hash_id(self.qry + self.loc + self.timestamp) self.crawl_id = crawl_id - self.user_agent = self.config.requests.headers['User-Agent'] + self.user_agent = self.headers['User-Agent'] try: self._send_request() @@ -310,7 +311,7 @@ def _conduct_search_requests(self, serp_id: str = '', crawl_id: str = ''): self._handle_response() def _send_request(self): - self.response = self.config.requests.sesh.get(self.url, timeout=10) + self.response = self.sesh.get(self.url, timeout=10) self.response_code = self.response.status_code log_msg = f"{self.response_code} | {self.qry}" log_msg = f"{log_msg} | {self.loc}" if self.loc else log_msg From 2fc420dfa769e118921704081a94bcff41a0e6d0 Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 26 Mar 2025 15:05:08 -0700 Subject: [PATCH 038/101] update: use search params pydantic model --- WebSearcher/models/configs.py | 30 ++++++++++++++- WebSearcher/searchers.py | 70 ++++++++++++++--------------------- 2 files changed, 56 insertions(+), 44 deletions(-) diff --git a/WebSearcher/models/configs.py b/WebSearcher/models/configs.py index 9cacc50..c784141 100644 --- a/WebSearcher/models/configs.py +++ b/WebSearcher/models/configs.py @@ -1,9 +1,12 @@ -from pydantic import BaseModel, Field -from typing import Dict, Optional, Union +from pydantic import BaseModel, Field, computed_field +from typing import Dict, Optional, Union, Any import subprocess import requests from enum import Enum +from .. import webutils as wu +from .. 
import locations + class BaseConfig(BaseModel): """Base class for all configuration classes""" @@ -15,6 +18,29 @@ def create(cls, config=None): return cls(**config) return config or cls() +class SearchParams(BaseConfig): + qry: str = '' + num_results: Optional[int] = None + lang: Optional[str] = None + loc: Optional[str] = None + base_url: str = "https://www.google.com/search" + + @computed_field + def url_params(self) -> Dict[str, Any]: + params = {'q': wu.encode_param_value(self.qry)} + opt_params = { + 'num': self.num_results, + 'hl': self.lang, + 'uule': locations.convert_canonical_name_to_uule(self.loc) if self.loc else None, + } + opt_params = {k: v for k, v in opt_params.items() if v and v not in {'None', 'nan'}} + params.update(opt_params) + return params + + @computed_field + def url(self) -> str: + """Return the fully formed URL with parameters.""" + return f"{self.base_url}?{wu.join_url_quote(self.url_params)}" class LogConfig(BaseConfig): fp: str = '' diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 71b2b8c..f16331f 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -1,9 +1,8 @@ from . import parsers -from . import locations from . import webutils as wu from . import utils from . import logger -from .models.configs import LogConfig, SeleniumConfig, RequestsConfig, SearchConfig, SearchMethod +from .models.configs import LogConfig, SeleniumConfig, RequestsConfig, SearchConfig, SearchMethod, SearchParams from .models.data import BaseSERP import os @@ -12,7 +11,7 @@ import brotli import requests import pandas as pd -from typing import Any, Dict, Optional, Union +from typing import Dict, Optional, Union from datetime import datetime, timezone # selenium updates @@ -54,22 +53,16 @@ def __init__(self, }) # Initialize searcher - self.base_url: str = 'https://www.google.com/search' - self.params: Dict[str, Any] = {} if self.config.method == SearchMethod.REQUESTS: self.headers = headers or self.config.requests.headers self.sesh = self.config.requests.sesh or wu.start_sesh(headers=self.headers) elif self.config.method == SearchMethod.SELENIUM: self.driver = None - else: - self.config.requests.sesh = self.config.requests.sesh or wu.start_sesh(headers=self.config.requests.headers) + + self.search_params = SearchParams.create() + # Initialize search details - self.qry: str = None - self.loc: str = None - self.lang: str = None - self.num_results = None - self.url: str = None self.timestamp: str = None self.serp_id: str = None self.crawl_id: str = None @@ -113,21 +106,13 @@ def search(self, self._prepare_search(qry=qry, location=location, lang=lang, num_results=num_results) self._conduct_search(serp_id=serp_id, crawl_id=crawl_id, ai_expand=ai_expand) - def _prepare_search(self, qry: str, location: str = None, lang: str = None, num_results: int = None): - """Prepare a search URL and metadata for the given query and location""" - self.qry = str(qry) - self.loc = str(location) if not pd.isnull(location) else '' - self.lang = str(lang) if not pd.isnull(lang) else '' - self.num_results = num_results - self.params = {} - self.params['q'] = wu.encode_param_value(self.qry) - if self.num_results: - self.params['num'] = self.num_results - if self.lang and self.lang not in {'None', 'nan'}: - self.params['hl'] = self.lang - if self.loc and self.loc not in {'None', 'nan'}: - self.params['uule'] = locations.convert_canonical_name_to_uule(self.loc) - self.url = f"{self.base_url}?{wu.join_url_quote(self.params)}" + def _prepare_search(self, qry: str, location: str, 
lang: str, num_results: int): + self.search_params = SearchParams.create({ + 'qry': str(qry), + 'loc': str(location) if not pd.isnull(location) else '', + 'lang': str(lang) if not pd.isnull(lang) else '', + 'num_results': num_results, + }) def _conduct_search(self, serp_id:str = '', crawl_id:str = '', ai_expand:bool = False): if self.config.method == SearchMethod.SELENIUM: @@ -162,14 +147,14 @@ def _send_chromedriver_typed_query(self): time.sleep(2) search_box = self.driver.find_element(By.ID, "APjFqb") search_box.clear() - search_box.send_keys(self.qry) + search_box.send_keys(self.search_params.qry) search_box.send_keys(Keys.RETURN) def _send_chromedriver_request(self): """Use a prepared URL to conduct a search""" time.sleep(2) - self.driver.get(self.url) + self.driver.get(self.search_params.url) time.sleep(2) # wait for the page to load @@ -179,19 +164,19 @@ def _send_chromedriver_request(self): time.sleep(2) #including a sleep to allow the page to fully load self.html = self.driver.page_source - self.url = self.driver.current_url + self.selenium_url = self.driver.current_url self.response_code = 0 - log_msg = f"{self.response_code} | {self.qry}" - log_msg = f"{log_msg} | {self.loc}" if self.loc else log_msg + log_msg = f"{self.response_code} | {self.search_params.qry}" + log_msg = f"{log_msg} | {self.search_params.loc}" if self.search_params.loc else log_msg self.log.info(log_msg) def _conduct_search_chromedriver(self, serp_id: str = '', crawl_id: str = '', ai_expand = False): """Send a search request and handle errors""" if not self.driver: self._init_chromedriver() - self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() - self.serp_id = serp_id if serp_id else utils.hash_id(self.qry + self.loc + self.timestamp) + str_to_hash = self.search_params.qry + self.search_params.loc + self.timestamp + self.serp_id = serp_id if serp_id else utils.hash_id(str_to_hash) self.crawl_id = crawl_id try: self._send_chromedriver_request() @@ -294,7 +279,8 @@ def _conduct_search_requests(self, serp_id: str = '', crawl_id: str = ''): """Send a search request and handle errors""" self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() - self.serp_id = serp_id if serp_id else utils.hash_id(self.qry + self.loc + self.timestamp) + str_to_hash = self.search_params.qry + self.search_params.loc + self.timestamp + self.serp_id = serp_id if serp_id else utils.hash_id(str_to_hash) self.crawl_id = crawl_id self.user_agent = self.headers['User-Agent'] @@ -311,10 +297,10 @@ def _conduct_search_requests(self, serp_id: str = '', crawl_id: str = ''): self._handle_response() def _send_request(self): - self.response = self.sesh.get(self.url, timeout=10) + self.response = self.sesh.get(self.search_params.url, timeout=10) self.response_code = self.response.status_code - log_msg = f"{self.response_code} | {self.qry}" - log_msg = f"{log_msg} | {self.loc}" if self.loc else log_msg + log_msg = f"{self.response_code} | {self.search_params.qry}" + log_msg = f"{log_msg} | {self.search_params.loc}" if self.search_params.loc else log_msg self.log.info(log_msg) def _reset_ssh_tunnel(self): @@ -383,10 +369,10 @@ def parse_serp_features(self): def prepare_serp_save(self): self.serp = BaseSERP( - qry=self.qry, - loc=self.loc, - lang=self.lang, - url=self.url, + qry=self.search_params.qry, + loc=self.search_params.loc, + lang=self.search_params.lang, + url=self.search_params.url, html=self.html, response_code=self.response_code, user_agent=self.user_agent, From 
083282fb71c386fd1942be19c1f8e6d571cc43b4 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 27 Mar 2025 08:49:26 -0700 Subject: [PATCH 039/101] update: move selenium to new searchers dir --- WebSearcher/searchers.py | 224 +++++++-------------- WebSearcher/searchers/__init__.py | 0 WebSearcher/searchers/selenium_searcher.py | 176 ++++++++++++++++ 3 files changed, 247 insertions(+), 153 deletions(-) create mode 100644 WebSearcher/searchers/__init__.py create mode 100644 WebSearcher/searchers/selenium_searcher.py diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index f16331f..70e954f 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -2,6 +2,7 @@ from . import webutils as wu from . import utils from . import logger +from .searchers.selenium_searcher import SeleniumDriver from .models.configs import LogConfig, SeleniumConfig, RequestsConfig, SearchConfig, SearchMethod, SearchParams from .models.data import BaseSERP @@ -14,13 +15,7 @@ from typing import Dict, Optional, Union from datetime import datetime, timezone -# selenium updates -import undetected_chromedriver as uc -from selenium.webdriver.common.by import By -from selenium.webdriver.common.keys import Keys -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.common.exceptions import NoSuchElementException +# selenium imports no longer needed here as they're in selenium_utils.py from importlib import metadata WS_VERSION = metadata.version('WebSearcher') @@ -52,15 +47,43 @@ def __init__(self, "requests": RequestsConfig.create(requests_config), }) + # Set a log file, prints to console by default + self.log = logger.Logger( + console=True if not self.config.log.fp else False, + console_level=self.config.log.level, + file_name=self.config.log.fp, + file_mode=self.config.log.mode, + file_level=self.config.log.level, + ).start(__name__) + # Initialize searcher if self.config.method == SearchMethod.REQUESTS: self.headers = headers or self.config.requests.headers self.sesh = self.config.requests.sesh or wu.start_sesh(headers=self.headers) elif self.config.method == SearchMethod.SELENIUM: - self.driver = None + self.selenium_driver = SeleniumDriver(config=self.config.selenium, logger=self.log) + self.selenium_driver.driver = None self.search_params = SearchParams.create() + # Initialize search details + self.serp = { + 'version': self.version, + 'method': self.config.method.value, + 'crawl_id': None, + 'serp_id': None, + 'qry': None, + 'loc': None, + 'lang': None, + 'url': None, + 'response_code': None, + 'user_agent': None, + 'timestamp': None, + 'serp_id': None, + 'html': None, + 'results': [], + 'features': {}, + } # Initialize search details self.timestamp: str = None @@ -74,15 +97,6 @@ def __init__(self, self.serp_features: dict = {} self.serp: dict = {} - # Set a log file, prints to console by default - self.log = logger.Logger( - console=True if not self.config.log.fp else False, - console_level=self.config.log.level, - file_name=self.config.log.fp, - file_mode=self.config.log.mode, - file_level=self.config.log.level, - ).start(__name__) - def search(self, qry: str, location: str = None, @@ -123,154 +137,38 @@ def _conduct_search(self, serp_id:str = '', crawl_id:str = '', ai_expand:bool = # ========================================================================== # Selenium method - def _init_chromedriver(self) -> None: - """Initialize Chrome driver with selenium-specific config""" - self.log.debug(f'SERP | init uc 
chromedriver | kwargs: {self.config.selenium.__dict__}') - self.driver = uc.Chrome(**self.config.selenium.__dict__) - self.user_agent = self.driver.execute_script('return navigator.userAgent') - self.response_code = None - - # Log version information - self.browser_info = { - 'browser_id': "", - 'browser_name': self.driver.capabilities['browserName'], - 'browser_version': self.driver.capabilities['browserVersion'], - 'driver_version': self.driver.capabilities['chrome']['chromedriverVersion'].split(' ')[0], - } - self.browser_info['browser_id'] = utils.hash_id(json.dumps(self.browser_info)) - self.log.debug(json.dumps(self.browser_info, indent=4)) - - def _send_chromedriver_typed_query(self): - """Send a typed query to the search box""" - time.sleep(2) - self.driver.get('https://www.google.com') - time.sleep(2) - search_box = self.driver.find_element(By.ID, "APjFqb") - search_box.clear() - search_box.send_keys(self.search_params.qry) - search_box.send_keys(Keys.RETURN) - - def _send_chromedriver_request(self): - """Use a prepared URL to conduct a search""" - - time.sleep(2) - self.driver.get(self.search_params.url) - time.sleep(2) - - # wait for the page to load - WebDriverWait(self.driver, 10).until( - EC.presence_of_element_located((By.ID, "search")) - ) - time.sleep(2) #including a sleep to allow the page to fully load - - self.html = self.driver.page_source - self.selenium_url = self.driver.current_url - self.response_code = 0 - log_msg = f"{self.response_code} | {self.search_params.qry}" - log_msg = f"{log_msg} | {self.search_params.loc}" if self.search_params.loc else log_msg - self.log.info(log_msg) - def _conduct_search_chromedriver(self, serp_id: str = '', crawl_id: str = '', ai_expand = False): """Send a search request and handle errors""" - if not self.driver: - self._init_chromedriver() + if not self.selenium_driver.driver: + self.selenium_driver.init_driver() + + self.crawl_id = crawl_id self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() str_to_hash = self.search_params.qry + self.search_params.loc + self.timestamp self.serp_id = serp_id if serp_id else utils.hash_id(str_to_hash) - self.crawl_id = crawl_id + try: - self._send_chromedriver_request() - self.html = self.driver.page_source + response_data = self.selenium_driver.send_request(self.search_params.url) + self.html = response_data['html'] + self.selenium_url = response_data['url'] + self.response_code = response_data['response_code'] + self.user_agent = self.selenium_driver.user_agent + + log_msg = f"{self.response_code} | {self.search_params.qry}" + log_msg = f"{log_msg} | {self.search_params.loc}" if self.search_params.loc else log_msg + self.log.info(log_msg) + except Exception as e: self.log.exception(f'SERP | Chromedriver error | {self.serp_id}: {str(e)}') if ai_expand: - self._expand_ai_overview() + expanded_html = self.selenium_driver.expand_ai_overview() + if expanded_html: + self.log.debug(f'SERP | overwriting expanded content | len diff: {len(expanded_html) - len(self.html)}') + self.html = expanded_html # Only delete cookies, don't close the driver here - # The driver will be closed when cleanup() is called - if self.driver: - try: - self.driver.delete_all_cookies() - except Exception as e: - self.log.warning(f"Failed to delete cookies: {str(e)}") - - def _expand_ai_overview(self): - """Expand AI overview box by clicking it""" - show_more_button_xpath = "//div[@jsname='rPRdsc' and @role='button']" - show_all_button_xpath = '//div[contains(@class, "trEk7e") and @role="button"]' - - 
try: - self.driver.find_element(By.XPATH, show_more_button_xpath) - show_more_button_exists = True - except NoSuchElementException: - show_more_button_exists = False - - if show_more_button_exists: - try: - show_more_button = WebDriverWait(self.driver, 1).until( - EC.element_to_be_clickable((By.XPATH, show_more_button_xpath)) - ) - if show_more_button is not None: - show_more_button.click() - try: - time.sleep(2) # Wait for additional content to load - show_all_button = WebDriverWait(self.driver, 1).until( - EC.element_to_be_clickable((By.XPATH, show_all_button_xpath)) - ) - show_all_button.click() - except Exception: - pass - - # Overwrite html with expanded content - new_html = self.driver.page_source - self.log.debug(f'SERP | overwriting expanded content | len diff: {len(new_html) - len(self.html)}') - self.html = new_html - - except Exception: - pass - - def cleanup(self): - """Clean up resources, particularly Selenium's browser instance - - Returns: - bool: True if cleanup was successful or not needed, False if cleanup failed - """ - if self.config.method == SearchMethod.SELENIUM and hasattr(self, 'driver') and self.driver: - try: - # Try a more thorough cleanup - try: - self.driver.delete_all_cookies() - except Exception: - pass - - try: - # Close all tabs/windows - original_handle = self.driver.current_window_handle - for handle in self.driver.window_handles: - self.driver.switch_to.window(handle) - self.driver.close() - except Exception: - pass - - # Finally quit the driver - self.driver.quit() - self.driver = None - self.log.debug(f'Browser successfully closed') - return True - except Exception as e: - self.log.warning(f'Failed to close browser: {e}') - # Force driver to be None so we create a fresh instance next time - self.driver = None - return False - return True - - def __del__(self): - """Destructor to ensure browser is closed when object is garbage collected""" - try: - self.cleanup() - except Exception: - pass + self.selenium_driver.delete_cookies() # ========================================================================== # Requests method @@ -441,3 +339,23 @@ def save_results(self, save_dir: str = "", append_to: str = ""): else: self.log.info(f'No parsed results for serp_id: {self.serp_id}') + def cleanup(self): + """Clean up resources, particularly Selenium's browser instance + + Returns: + bool: True if cleanup was successful or not needed, False if cleanup failed + """ + if self.config.method == SearchMethod.SELENIUM and hasattr(self, 'selenium_driver'): + result = self.selenium_driver.cleanup() + if result: + self.selenium_driver.driver = None # Update the reference + return result + return True + + def __del__(self): + """Destructor to ensure browser is closed when object is garbage collected""" + try: + self.cleanup() + except Exception: + pass + diff --git a/WebSearcher/searchers/__init__.py b/WebSearcher/searchers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/WebSearcher/searchers/selenium_searcher.py b/WebSearcher/searchers/selenium_searcher.py new file mode 100644 index 0000000..3d5f5d6 --- /dev/null +++ b/WebSearcher/searchers/selenium_searcher.py @@ -0,0 +1,176 @@ +import time +import json +from typing import Dict, Optional, Any + +import undetected_chromedriver as uc +from selenium.webdriver.common.by import By +from selenium.webdriver.common.keys import Keys +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.common.exceptions import 
NoSuchElementException + +from .. import utils +from ..models.configs import SeleniumConfig + + +class SeleniumDriver: + """Handle Selenium-based web interactions for search engines""" + + def __init__(self, config: SeleniumConfig, logger): + """Initialize a Selenium driver with the given configuration + + Args: + config (SeleniumConfig): Configuration for Selenium + logger: Logger instance + """ + self.config = config + self.log = logger + self.driver = None + self.user_agent = None + self.response_code = None + self.browser_info = {} + + def init_driver(self) -> None: + """Initialize Chrome driver with selenium-specific config""" + self.log.debug(f'SERP | init uc chromedriver | kwargs: {self.config.__dict__}') + self.driver = uc.Chrome(**self.config.__dict__) + self.user_agent = self.driver.execute_script('return navigator.userAgent') + self.response_code = None + + # Log version information + self.browser_info = { + 'browser_id': "", + 'browser_name': self.driver.capabilities['browserName'], + 'browser_version': self.driver.capabilities['browserVersion'], + 'driver_version': self.driver.capabilities['chrome']['chromedriverVersion'].split(' ')[0], + } + self.browser_info['browser_id'] = utils.hash_id(json.dumps(self.browser_info)) + self.log.debug(json.dumps(self.browser_info, indent=4)) + + def send_typed_query(self, query: str): + """Send a typed query to the search box""" + time.sleep(2) + self.driver.get('https://www.google.com') + time.sleep(2) + search_box = self.driver.find_element(By.ID, "APjFqb") + search_box.clear() + search_box.send_keys(query) + search_box.send_keys(Keys.RETURN) + + def send_request(self, url: str) -> Dict[str, Any]: + """Use a prepared URL to conduct a search + + Args: + url (str): The URL to request + + Returns: + Dict[str, Any]: Dictionary containing response data + """ + time.sleep(2) + self.driver.get(url) + time.sleep(2) + + # wait for the page to load + WebDriverWait(self.driver, 10).until( + EC.presence_of_element_located((By.ID, "search")) + ) + time.sleep(2) #including a sleep to allow the page to fully load + + html = self.driver.page_source + selenium_url = self.driver.current_url + self.response_code = 0 + + return { + 'html': html, + 'url': selenium_url, + 'response_code': self.response_code, + } + + def expand_ai_overview(self): + """Expand AI overview box by clicking it + + Returns: + str: Updated HTML if expansion occurred, None otherwise + """ + show_more_button_xpath = "//div[@jsname='rPRdsc' and @role='button']" + show_all_button_xpath = '//div[contains(@class, "trEk7e") and @role="button"]' + + try: + self.driver.find_element(By.XPATH, show_more_button_xpath) + show_more_button_exists = True + except NoSuchElementException: + show_more_button_exists = False + + if show_more_button_exists: + try: + show_more_button = WebDriverWait(self.driver, 1).until( + EC.element_to_be_clickable((By.XPATH, show_more_button_xpath)) + ) + if show_more_button is not None: + show_more_button.click() + try: + time.sleep(2) # Wait for additional content to load + show_all_button = WebDriverWait(self.driver, 1).until( + EC.element_to_be_clickable((By.XPATH, show_all_button_xpath)) + ) + show_all_button.click() + except Exception: + pass + + # Return expanded content + return self.driver.page_source + + except Exception: + pass + + return None + + def cleanup(self) -> bool: + """Clean up resources, particularly Selenium's browser instance + + Returns: + bool: True if cleanup was successful or not needed, False if cleanup failed + """ + if self.driver: + try: + 
# Try a more thorough cleanup + try: + self.driver.delete_all_cookies() + except Exception: + pass + + try: + # Close all tabs/windows + original_handle = self.driver.current_window_handle + for handle in self.driver.window_handles: + self.driver.switch_to.window(handle) + self.driver.close() + except Exception: + pass + + # Finally quit the driver + self.driver.quit() + self.driver = None + self.log.debug(f'Browser successfully closed') + return True + except Exception as e: + self.log.warning(f'Failed to close browser: {e}') + # Force driver to be None so we create a fresh instance next time + self.driver = None + return False + return True + + def delete_cookies(self): + """Delete all cookies from the browser""" + if self.driver: + try: + self.driver.delete_all_cookies() + except Exception as e: + self.log.warning(f"Failed to delete cookies: {str(e)}") + + def __del__(self): + """Destructor to ensure browser is closed when object is garbage collected""" + try: + self.cleanup() + except Exception: + pass From 7ae02c6bfe71e54a9c1ab6d75c96a6e9d4b04665 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 27 Mar 2025 08:49:33 -0700 Subject: [PATCH 040/101] update: model docs --- WebSearcher/models/configs.py | 29 ++++++++++++++---- WebSearcher/models/data.py | 56 +++++++++++++++++++++-------------- 2 files changed, 57 insertions(+), 28 deletions(-) diff --git a/WebSearcher/models/configs.py b/WebSearcher/models/configs.py index c784141..0f48176 100644 --- a/WebSearcher/models/configs.py +++ b/WebSearcher/models/configs.py @@ -19,12 +19,24 @@ def create(cls, config=None): return config or cls() class SearchParams(BaseConfig): - qry: str = '' - num_results: Optional[int] = None - lang: Optional[str] = None - loc: Optional[str] = None - base_url: str = "https://www.google.com/search" + """ + Contains parameters for a search request and utility methods for URL generation. + This class stores search query parameters and provides methods to convert + them into properly formatted URL parameters and complete search URLs. + """ + qry: str = Field('', description="The search query text") + num_results: Optional[int] = Field(None, description="Number of results to return") + lang: Optional[str] = Field(None, description="Language code (e.g., 'en')") + loc: Optional[str] = Field(None, description="Location in Canonical Name format") + base_url: str = Field("https://www.google.com/search", description="Base search engine URL") + + """ + Generates a dictionary of URL parameters based on the search parameters. + + Converts the search parameters to a dictionary format suitable for URL encoding, + removing any None values and handling special parameters like location. + """ @computed_field def url_params(self) -> Dict[str, Any]: params = {'q': wu.encode_param_value(self.qry)} @@ -37,9 +49,14 @@ def url_params(self) -> Dict[str, Any]: params.update(opt_params) return params + """ + Returns the fully formed search URL with all parameters. + + Combines the base URL with the encoded parameters to create a complete, + properly escaped search URL. 
+ """ @computed_field def url(self) -> str: - """Return the fully formed URL with parameters.""" return f"{self.base_url}?{wu.join_url_quote(self.url_params)}" class LogConfig(BaseConfig): diff --git a/WebSearcher/models/data.py b/WebSearcher/models/data.py index 4143f8d..b141668 100644 --- a/WebSearcher/models/data.py +++ b/WebSearcher/models/data.py @@ -1,29 +1,41 @@ -from pydantic import BaseModel +from pydantic import BaseModel, Field from typing import Any, Optional class BaseResult(BaseModel): - sub_rank: int = 0 - type: str = 'unclassified' - sub_type: Optional[str] = None - title: Optional[str] = None - url: Optional[str] = None - text: Optional[str] = None - cite: Optional[str] = None - details: Optional[Any] = None - error: Optional[str] = None + """ + Represents a single search result item extracted from a SERP. + + Contains the structured data of one search result including its rank, + type, title, URL, and other metadata. + """ + sub_rank: int = Field(0, description="Position within a results component") + type: str = Field('unclassified', description="Result type (general, ad, etc.)") + sub_type: Optional[str] = Field(None, description="Result sub-type (e.g., header, item)") + title: Optional[str] = Field(None, description="Title of the search result") + url: Optional[str] = Field(None, description="URL of the search result") + text: Optional[str] = Field(None, description="Snippet text from the search result") + cite: Optional[str] = Field(None, description="Citation or source information") + details: Optional[Any] = Field(None, description="Additional structured details specific to result type") + error: Optional[str] = Field(None, description="Error message if result parsing failed") class BaseSERP(BaseModel): - qry: str # Search query - loc: Optional[str] = None # Location if set, "Canonical Name" - lang: Optional[str] = None # Language if set - url: str # URL of SERP - html: str # Raw HTML of SERP - timestamp: str # Timestamp of crawl - response_code: int # HTTP response code - user_agent: str # User agent used for the crawl - serp_id: str # Search Engine Results Page (SERP) ID - crawl_id: str # Crawl ID for grouping SERPs - version: str # WebSearcher version - method: str # Search method used + """ + Represents a complete Search Engine Results Page (SERP). + + Contains all data related to a single search query including the query itself, + raw HTML response, metadata about the request, and identifiers for tracking. 
+ """ + qry: str = Field(..., description="Search query") + loc: Optional[str] = Field(None, description="Location if set, in Canonical Name format") + lang: Optional[str] = Field(None, description="Language code if set") + url: str = Field(..., description="URL of the SERP") + html: str = Field(..., description="Raw HTML of the SERP") + timestamp: str = Field(..., description="ISO format timestamp of the crawl") + response_code: int = Field(..., description="HTTP response code") + user_agent: str = Field(..., description="User agent used for the request") + serp_id: str = Field(..., description="Unique identifier for this SERP") + crawl_id: str = Field(..., description="Identifier for grouping related SERPs") + version: str = Field(..., description="WebSearcher version used") + method: str = Field(..., description="Search method used (selenium/requests)") From b725ace767dde113b8dae32cc52346c23f56b6ee Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 27 Mar 2025 10:27:04 -0700 Subject: [PATCH 041/101] add: searches directory for diff methods --- WebSearcher/models/configs.py | 55 +------- WebSearcher/models/searches.py | 76 +++++++++++ .../{searchers => search_methods}/__init__.py | 0 .../selenium_searcher.py | 64 ++++----- WebSearcher/searchers.py | 127 +++++++----------- 5 files changed, 153 insertions(+), 169 deletions(-) create mode 100644 WebSearcher/models/searches.py rename WebSearcher/{searchers => search_methods}/__init__.py (100%) rename WebSearcher/{searchers => search_methods}/selenium_searcher.py (82%) diff --git a/WebSearcher/models/configs.py b/WebSearcher/models/configs.py index 0f48176..c64ee30 100644 --- a/WebSearcher/models/configs.py +++ b/WebSearcher/models/configs.py @@ -1,12 +1,8 @@ -from pydantic import BaseModel, Field, computed_field -from typing import Dict, Optional, Union, Any -import subprocess import requests +import subprocess from enum import Enum - -from .. import webutils as wu -from .. import locations - +from typing import Dict, Optional, Union +from pydantic import BaseModel, Field class BaseConfig(BaseModel): """Base class for all configuration classes""" @@ -18,60 +14,17 @@ def create(cls, config=None): return cls(**config) return config or cls() -class SearchParams(BaseConfig): - """ - Contains parameters for a search request and utility methods for URL generation. - - This class stores search query parameters and provides methods to convert - them into properly formatted URL parameters and complete search URLs. - """ - qry: str = Field('', description="The search query text") - num_results: Optional[int] = Field(None, description="Number of results to return") - lang: Optional[str] = Field(None, description="Language code (e.g., 'en')") - loc: Optional[str] = Field(None, description="Location in Canonical Name format") - base_url: str = Field("https://www.google.com/search", description="Base search engine URL") - - """ - Generates a dictionary of URL parameters based on the search parameters. - - Converts the search parameters to a dictionary format suitable for URL encoding, - removing any None values and handling special parameters like location. 
- """ - @computed_field - def url_params(self) -> Dict[str, Any]: - params = {'q': wu.encode_param_value(self.qry)} - opt_params = { - 'num': self.num_results, - 'hl': self.lang, - 'uule': locations.convert_canonical_name_to_uule(self.loc) if self.loc else None, - } - opt_params = {k: v for k, v in opt_params.items() if v and v not in {'None', 'nan'}} - params.update(opt_params) - return params - - """ - Returns the fully formed search URL with all parameters. - - Combines the base URL with the encoded parameters to create a complete, - properly escaped search URL. - """ - @computed_field - def url(self) -> str: - return f"{self.base_url}?{wu.join_url_quote(self.url_params)}" - class LogConfig(BaseConfig): fp: str = '' mode: str = 'a' level: str = 'INFO' - class SeleniumConfig(BaseConfig): headless: bool = False version_main: int = 133 use_subprocess: bool = False driver_executable_path: str = "" - class RequestsConfig(BaseConfig): model_config = {"arbitrary_types_allowed": True} headers: Dict[str, str] = Field(default_factory=lambda: { @@ -86,7 +39,6 @@ class RequestsConfig(BaseConfig): ssh_tunnel: Optional[subprocess.Popen] = None unzip: bool = True - class SearchMethod(Enum): REQUESTS = "requests" SELENIUM = "selenium" @@ -106,7 +58,6 @@ def create(cls, method=None): raise ValueError(f"Invalid search method: {method}. Valid values are: {valid_values}") raise TypeError(f"Expected string or SearchMethod, got {type(method)}") - class SearchConfig(BaseConfig): method: Union[str, SearchMethod] = SearchMethod.SELENIUM log: LogConfig = Field(default_factory=LogConfig) diff --git a/WebSearcher/models/searches.py b/WebSearcher/models/searches.py new file mode 100644 index 0000000..3570823 --- /dev/null +++ b/WebSearcher/models/searches.py @@ -0,0 +1,76 @@ +from pydantic import Field, computed_field +from typing import Dict, Optional, Any, List +from datetime import datetime, timezone + +from ..utils import hash_id +from ..import webutils as wu +from ..import locations +from .configs import BaseConfig + + +class SearchParams(BaseConfig): + """Contains parameters for a search request and utility methods for URL generation""" + qry: str = Field('', description="The search query text") + num_results: Optional[int] = Field(None, description="Number of results to return") + lang: Optional[str] = Field(None, description="Language code (e.g., 'en')") + loc: Optional[str] = Field(None, description="Location in Canonical Name format") + base_url: str = Field("https://www.google.com/search", description="Base search engine URL") + + @computed_field + def url_params(self) -> Dict[str, Any]: + """Generates a dictionary of URL parameters based on the search parameters""" + params = {'q': wu.encode_param_value(self.qry)} + opt_params = { + 'num': self.num_results, + 'hl': self.lang, + 'uule': locations.convert_canonical_name_to_uule(self.loc) if self.loc else None, + } + opt_params = {k: v for k, v in opt_params.items() if v and v not in {'None', 'nan'}} + params.update(opt_params) + return params + + @computed_field + def url(self) -> str: + """Returns the fully formed search URL with all parameters""" + return f"{self.base_url}?{wu.join_url_quote(self.url_params)}" + + @computed_field + def serp_id(self) -> str: + """Computes a unique SERP ID based on query, location, and timestamp""" + timestamp = datetime.now().isoformat() + return hash_id(f"{self.qry}{self.loc}{timestamp}") + + def to_dict_output(self) -> Dict[str, Any]: + """Outputs the variables needed for SERPDetails as a dictionary""" + timestamp = 
datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + return { + "qry": self.qry, + "loc": self.loc, + "lang": self.lang, + "url": self.url, + "serp_id": hash_id(f"{self.qry}{self.loc}{timestamp}"), + "timestamp": timestamp, + } + + +class SERPDetails(BaseConfig): + """ + Contains details about a Search Engine Results Page (SERP). + + This class stores all the information related to a SERP, including + search parameters, response data, parsed results and features. + """ + version: str = Field(None, description="WebSearcher version") + method: str = Field(None, description="Search method used (requests or selenium)") + crawl_id: Optional[str] = Field(None, description="ID for the crawl session") + serp_id: Optional[str] = Field(None, description="Unique ID for this SERP") + qry: Optional[str] = Field(None, description="Search query") + loc: Optional[str] = Field(None, description="Location used for search") + lang: Optional[str] = Field(None, description="Language used for search") + url: Optional[str] = Field(None, description="Full search URL") + response_code: Optional[int] = Field(None, description="HTTP response code") + user_agent: Optional[str] = Field(None, description="User agent used for request") + timestamp: Optional[str] = Field(None, description="ISO timestamp of search") + html: Optional[str] = Field(None, description="Raw HTML response") + results: List[Dict[str, Any]] = Field(default_factory=list, description="Parsed search results") + features: Dict[str, Any] = Field(default_factory=dict, description="Extracted SERP features") diff --git a/WebSearcher/searchers/__init__.py b/WebSearcher/search_methods/__init__.py similarity index 100% rename from WebSearcher/searchers/__init__.py rename to WebSearcher/search_methods/__init__.py diff --git a/WebSearcher/searchers/selenium_searcher.py b/WebSearcher/search_methods/selenium_searcher.py similarity index 82% rename from WebSearcher/searchers/selenium_searcher.py rename to WebSearcher/search_methods/selenium_searcher.py index 3d5f5d6..3529c6f 100644 --- a/WebSearcher/searchers/selenium_searcher.py +++ b/WebSearcher/search_methods/selenium_searcher.py @@ -12,7 +12,6 @@ from .. 
import utils from ..models.configs import SeleniumConfig - class SeleniumDriver: """Handle Selenium-based web interactions for search engines""" @@ -26,16 +25,12 @@ def __init__(self, config: SeleniumConfig, logger): self.config = config self.log = logger self.driver = None - self.user_agent = None - self.response_code = None self.browser_info = {} def init_driver(self) -> None: """Initialize Chrome driver with selenium-specific config""" self.log.debug(f'SERP | init uc chromedriver | kwargs: {self.config.__dict__}') self.driver = uc.Chrome(**self.config.__dict__) - self.user_agent = self.driver.execute_script('return navigator.userAgent') - self.response_code = None # Log version information self.browser_info = { @@ -43,6 +38,7 @@ def init_driver(self) -> None: 'browser_name': self.driver.capabilities['browserName'], 'browser_version': self.driver.capabilities['browserVersion'], 'driver_version': self.driver.capabilities['chrome']['chromedriverVersion'].split(' ')[0], + 'user_agent': self.driver.execute_script('return navigator.userAgent'), } self.browser_info['browser_id'] = utils.hash_id(json.dumps(self.browser_info)) self.log.debug(json.dumps(self.browser_info, indent=4)) @@ -58,40 +54,34 @@ def send_typed_query(self, query: str): search_box.send_keys(Keys.RETURN) def send_request(self, url: str) -> Dict[str, Any]: - """Use a prepared URL to conduct a search - - Args: - url (str): The URL to request - - Returns: - Dict[str, Any]: Dictionary containing response data - """ - time.sleep(2) - self.driver.get(url) - time.sleep(2) - - # wait for the page to load - WebDriverWait(self.driver, 10).until( - EC.presence_of_element_located((By.ID, "search")) - ) - time.sleep(2) #including a sleep to allow the page to fully load + """Visit a URL with selenium and save HTML response""" + + try: + self.driver.get(url) + time.sleep(2) + WebDriverWait(self.driver, 10).until( + EC.presence_of_element_located((By.ID, "search")) + ) + time.sleep(2) + response_output = { + 'html': self.driver.page_source, + 'url': self.driver.current_url, + 'user_agent': self.browser_info['user_agent'], + 'response_code': 200, + } + except Exception as e: + self.log.exception(f'SERP | Chromedriver error | {str(e)}') + response_output = { + 'html': '', + 'url': '', + 'user_agent': self.browser_info['user_agent'], + 'response_code': 0, + } + finally: + return response_output - html = self.driver.page_source - selenium_url = self.driver.current_url - self.response_code = 0 - - return { - 'html': html, - 'url': selenium_url, - 'response_code': self.response_code, - } - def expand_ai_overview(self): - """Expand AI overview box by clicking it - - Returns: - str: Updated HTML if expansion occurred, None otherwise - """ + """Expand AI overview box by clicking it""" show_more_button_xpath = "//div[@jsname='rPRdsc' and @role='button']" show_all_button_xpath = '//div[contains(@class, "trEk7e") and @role="button"]' diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 70e954f..bce7a45 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -2,21 +2,19 @@ from . import webutils as wu from . import utils from . 
import logger -from .searchers.selenium_searcher import SeleniumDriver -from .models.configs import LogConfig, SeleniumConfig, RequestsConfig, SearchConfig, SearchMethod, SearchParams +from .search_methods.selenium_searcher import SeleniumDriver +from .models.configs import LogConfig, SeleniumConfig, RequestsConfig, SearchConfig, SearchMethod +from .models.searches import SearchParams, SERPDetails from .models.data import BaseSERP import os import time -import json import brotli import requests import pandas as pd from typing import Dict, Optional, Union from datetime import datetime, timezone -# selenium imports no longer needed here as they're in selenium_utils.py - from importlib import metadata WS_VERSION = metadata.version('WebSearcher') @@ -40,6 +38,7 @@ def __init__(self, # Initialize configuration self.version = WS_VERSION + self.method = method.value if isinstance(method, SearchMethod) else method self.config = SearchConfig.create({ "method": SearchMethod.create(method), "log": LogConfig.create(log_config), @@ -64,38 +63,18 @@ def __init__(self, self.selenium_driver = SeleniumDriver(config=self.config.selenium, logger=self.log) self.selenium_driver.driver = None + # Initialize search params and output self.search_params = SearchParams.create() - - # Initialize search details - self.serp = { - 'version': self.version, - 'method': self.config.method.value, - 'crawl_id': None, - 'serp_id': None, - 'qry': None, - 'loc': None, - 'lang': None, - 'url': None, - 'response_code': None, - 'user_agent': None, - 'timestamp': None, - 'serp_id': None, - 'html': None, - 'results': [], - 'features': {}, - } - - # Initialize search details - self.timestamp: str = None - self.serp_id: str = None - self.crawl_id: str = None + self.serp_template = SERPDetails.create({'version': self.version, 'method': self.config.method.value}) # Initialize search outputs - self.response = None # type: Optional[requests.Response] - self.html: str = None + self._response = { + "url": None, + "response_code": None, + "html": None, + } self.results: list = [] self.serp_features: dict = {} - self.serp: dict = {} def search(self, qry: str, @@ -103,7 +82,6 @@ def search(self, lang: str = None, num_results: int = None, ai_expand: bool = False, - serp_id: str = '', crawl_id: str = '' ): """Conduct a search and save HTML @@ -118,7 +96,7 @@ def search(self, """ self._prepare_search(qry=qry, location=location, lang=lang, num_results=num_results) - self._conduct_search(serp_id=serp_id, crawl_id=crawl_id, ai_expand=ai_expand) + self._conduct_search(crawl_id=crawl_id, ai_expand=ai_expand) def _prepare_search(self, qry: str, location: str, lang: str, num_results: int): self.search_params = SearchParams.create({ @@ -128,44 +106,29 @@ def _prepare_search(self, qry: str, location: str, lang: str, num_results: int): 'num_results': num_results, }) - def _conduct_search(self, serp_id:str = '', crawl_id:str = '', ai_expand:bool = False): + def _conduct_search(self, crawl_id: str = '', ai_expand: bool = False): if self.config.method == SearchMethod.SELENIUM: - self._conduct_search_chromedriver(serp_id=serp_id, crawl_id=crawl_id, ai_expand=ai_expand) + self._conduct_search_chromedriver(crawl_id=crawl_id, ai_expand=ai_expand) elif self.config.method == SearchMethod.REQUESTS: - self._conduct_search_requests(serp_id=serp_id, crawl_id=crawl_id) + self._conduct_search_requests(crawl_id=crawl_id) # ========================================================================== # Selenium method - def _conduct_search_chromedriver(self, serp_id: str = 
'', crawl_id: str = '', ai_expand = False): + def _conduct_search_chromedriver(self, crawl_id: str = '', ai_expand = False): """Send a search request and handle errors""" if not self.selenium_driver.driver: self.selenium_driver.init_driver() - - self.crawl_id = crawl_id - self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() - str_to_hash = self.search_params.qry + self.search_params.loc + self.timestamp - self.serp_id = serp_id if serp_id else utils.hash_id(str_to_hash) - - try: - response_data = self.selenium_driver.send_request(self.search_params.url) - self.html = response_data['html'] - self.selenium_url = response_data['url'] - self.response_code = response_data['response_code'] - self.user_agent = self.selenium_driver.user_agent - - log_msg = f"{self.response_code} | {self.search_params.qry}" - log_msg = f"{log_msg} | {self.search_params.loc}" if self.search_params.loc else log_msg - self.log.info(log_msg) - - except Exception as e: - self.log.exception(f'SERP | Chromedriver error | {self.serp_id}: {str(e)}') + response_output = self.selenium_driver.send_request(self.search_params.url) + serp = self.search_params.to_dict_output() | response_output + self.serp = BaseSERP(version=self.version, method=self.method, crawl_id=crawl_id, **serp).model_dump() + self.log.info(" | ".join([f"{k}: {self.serp[k]}" for k in {'response_code','qry','loc'} if self.serp[k]])) if ai_expand: expanded_html = self.selenium_driver.expand_ai_overview() if expanded_html: - self.log.debug(f'SERP | overwriting expanded content | len diff: {len(expanded_html) - len(self.html)}') - self.html = expanded_html + self.log.debug(f"SERP | expanded html | len diff: {len(expanded_html) - len(self.serp['html'])}") + self.serp['html'] = expanded_html # Only delete cookies, don't close the driver here self.selenium_driver.delete_cookies() @@ -175,7 +138,7 @@ def _conduct_search_chromedriver(self, serp_id: str = '', crawl_id: str = '', ai def _conduct_search_requests(self, serp_id: str = '', crawl_id: str = ''): """Send a search request and handle errors""" - + self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() str_to_hash = self.search_params.qry + self.search_params.loc + self.timestamp self.serp_id = serp_id if serp_id else utils.hash_id(str_to_hash) @@ -239,26 +202,26 @@ def _unzip_html(self) -> None: def parse_all(self): """Parse results and extract SERP features in a single pass""" - assert self.html, "No HTML found" + assert self.serp['html'], "No HTML found" try: # Use the enhanced parse_serp function to get both results and features in one pass - self.results, self.serp_features = parsers.parse_serp(self.html, extract_features=True) + self.results, self.serp_features = parsers.parse_serp(self.serp['html'], extract_features=True) except Exception: self.log.exception(f'Combined parsing error | serp_id : {self.serp_id}') def parse_results(self): """Parse a SERP - see parsers.py""" - assert self.html, "No HTML found" + assert self.serp['html'], "No HTML found" try: - self.results = parsers.parse_serp(self.html) + self.results = parsers.parse_serp(self.serp['html']) except Exception: self.log.exception(f'Parsing error | serp_id : {self.serp_id}') def parse_serp_features(self): """Extract SERP features - see parsers.py""" - assert self.html, "No HTML found" + assert self.serp['html'], "No HTML found" try: - self.serp_features = parsers.FeatureExtractor.extract_features(self.html) + self.serp_features = parsers.FeatureExtractor.extract_features(self.serp['html']) except 
Exception: self.log.exception(f'Feature extraction error | serp_id : {self.serp_id}') @@ -267,16 +230,16 @@ def parse_serp_features(self): def prepare_serp_save(self): self.serp = BaseSERP( - qry=self.search_params.qry, - loc=self.search_params.loc, - lang=self.search_params.lang, - url=self.search_params.url, - html=self.html, - response_code=self.response_code, - user_agent=self.user_agent, - timestamp=self.timestamp, - serp_id=self.serp_id, - crawl_id=self.crawl_id, + qry=self.serp['qry'], + loc=self.serp['loc'], + lang=self.serp['lang'], + url=self.serp['url'], + html=self.serp['html'], + response_code=self.serp['response_code'], + user_agent=self.serp['user_agent'], + timestamp=self.serp['timestamp'], + serp_id=self.serp['serp_id'], + crawl_id=self.serp['crawl_id'], version=self.version, method=self.config.method.value ).model_dump() @@ -288,7 +251,7 @@ def save_serp(self, save_dir: str = "", append_to: str = ""): save_dir (str, optional): Save results as `save_dir/{serp_id}.html` append_to (str, optional): Append results to this file path """ - assert self.html, "No HTML found" + assert self.serp['html'], "No HTML found" assert save_dir or append_to, "Must provide a save_dir or append_to file path" if append_to: @@ -298,7 +261,7 @@ def save_serp(self, save_dir: str = "", append_to: str = ""): else: fp = os.path.join(save_dir, f'{self.serp_id}.html') with open(fp, 'w') as outfile: - outfile.write(self.html) + outfile.write(self.serp['html']) def save_search(self, append_to: str = ""): """Save search metadata (excludes HTML) to file @@ -306,7 +269,7 @@ def save_search(self, append_to: str = ""): Args: append_to (str, optional): Append results to this file path """ - assert self.html, "No HTML found" + assert self.serp['html'], "No HTML found" assert append_to, "Must provide an append_to file path" if not self.serp: @@ -330,7 +293,11 @@ def save_results(self, save_dir: str = "", append_to: str = ""): if self.results: if append_to: - result_metadata = {'crawl_id': self.crawl_id, 'serp_id': self.serp_id, 'version': self.version} + result_metadata = { + 'crawl_id': self.serp["crawl_id"], + 'serp_id': self.serp["serp_id"], + 'version': self.version + } results_output = [{**result, **result_metadata} for result in self.results] utils.write_lines(results_output, append_to) else: From 6d4642f187d6519334527814faf45895310248f4 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 27 Mar 2025 14:16:31 -0700 Subject: [PATCH 042/101] add: file for requests code, update outputs --- WebSearcher/models/data.py | 2 +- WebSearcher/models/searches.py | 25 +- WebSearcher/parsers.py | 10 +- .../search_methods/requests_searcher.py | 91 +++++++ WebSearcher/searchers.py | 230 ++++++------------ tests/selenium_test.py | 23 +- 6 files changed, 182 insertions(+), 199 deletions(-) create mode 100644 WebSearcher/search_methods/requests_searcher.py diff --git a/WebSearcher/models/data.py b/WebSearcher/models/data.py index b141668..45c4bef 100644 --- a/WebSearcher/models/data.py +++ b/WebSearcher/models/data.py @@ -1,5 +1,5 @@ from pydantic import BaseModel, Field -from typing import Any, Optional +from typing import Any, Optional, List, Dict class BaseResult(BaseModel): diff --git a/WebSearcher/models/searches.py b/WebSearcher/models/searches.py index 3570823..6884ec2 100644 --- a/WebSearcher/models/searches.py +++ b/WebSearcher/models/searches.py @@ -40,7 +40,7 @@ def serp_id(self) -> str: timestamp = datetime.now().isoformat() return hash_id(f"{self.qry}{self.loc}{timestamp}") - def to_dict_output(self) -> 
Dict[str, Any]: + def to_serp_output(self) -> Dict[str, Any]: """Outputs the variables needed for SERPDetails as a dictionary""" timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() return { @@ -51,26 +51,3 @@ def to_dict_output(self) -> Dict[str, Any]: "serp_id": hash_id(f"{self.qry}{self.loc}{timestamp}"), "timestamp": timestamp, } - - -class SERPDetails(BaseConfig): - """ - Contains details about a Search Engine Results Page (SERP). - - This class stores all the information related to a SERP, including - search parameters, response data, parsed results and features. - """ - version: str = Field(None, description="WebSearcher version") - method: str = Field(None, description="Search method used (requests or selenium)") - crawl_id: Optional[str] = Field(None, description="ID for the crawl session") - serp_id: Optional[str] = Field(None, description="Unique ID for this SERP") - qry: Optional[str] = Field(None, description="Search query") - loc: Optional[str] = Field(None, description="Location used for search") - lang: Optional[str] = Field(None, description="Language used for search") - url: Optional[str] = Field(None, description="Full search URL") - response_code: Optional[int] = Field(None, description="HTTP response code") - user_agent: Optional[str] = Field(None, description="User agent used for request") - timestamp: Optional[str] = Field(None, description="ISO timestamp of search") - html: Optional[str] = Field(None, description="Raw HTML response") - results: List[Dict[str, Any]] = Field(default_factory=list, description="Parsed search results") - features: Dict[str, Any] = Field(default_factory=dict, description="Extracted SERP features") diff --git a/WebSearcher/parsers.py b/WebSearcher/parsers.py index 692171b..7f5fb0f 100644 --- a/WebSearcher/parsers.py +++ b/WebSearcher/parsers.py @@ -32,14 +32,14 @@ def parse_serp( for cmpt in component_list: cmpt.classify_component() cmpt.parse_component() - results = component_list.export_component_results() if extract_features: - # Extract features from the same soup object to avoid parsing twice - features = FeatureExtractor.extract_features(soup) - return results, features - + return { + "features": FeatureExtractor.extract_features(soup), + "results": results + } + return results diff --git a/WebSearcher/search_methods/requests_searcher.py b/WebSearcher/search_methods/requests_searcher.py new file mode 100644 index 0000000..7b0ad62 --- /dev/null +++ b/WebSearcher/search_methods/requests_searcher.py @@ -0,0 +1,91 @@ +import time +import brotli +import requests +from typing import Dict, Optional, Any +from datetime import datetime, timezone + +from .. 
import utils + +class RequestsSearcher: + """Handle Requests-based web interactions for search engines""" + + def __init__(self, config, headers, logger): + """Initialize a Requests searcher with the given configuration + + Args: + config: RequestsConfig instance + headers: Dictionary of HTTP headers + logger: Logger instance + """ + self.config = config + self.headers = headers + self.log = logger + self.sesh = self.config.sesh or self._start_session() + + def _start_session(self): + """Start a new requests session with the configured headers""" + session = requests.Session() + session.headers.update(self.headers) + return session + + def send_request(self, search_params) -> Dict[str, Any]: + """Send a request and handle the response + + Args: + search_params: SearchParams instance + serp_id: Optional SERP ID + crawl_id: Optional crawl ID + + Returns: + Dictionary with response data + """ + + response_data = { + 'html': '', + 'url': search_params.url, + 'user_agent': self.headers.get('User-Agent'), + 'response_code': 0, + } + + try: + response = self.sesh.get(search_params.url, timeout=10) + response_data['html'] = self._handle_response_content(response) + response_data['response_code'] = response.status_code + except requests.exceptions.ConnectionError: + self.log.exception(f'Requests | Connection error') + self._reset_ssh_tunnel() + except requests.exceptions.Timeout: + self.log.exception(f'Requests | Timeout error') + except Exception: + self.log.exception(f'Requests | Unknown error') + finally: + return response_data + + def _handle_response_content(self, response): + try: + if self.config.unzip: + html = self._unzip_html(response.content) + else: + html = response.content + return html.decode('utf-8', 'ignore') + except Exception: + self.log.exception(f'Response handling error') + return response.content + + def _unzip_html(self, content) -> bytes: + """Unzip brotli zipped html""" + try: + return brotli.decompress(content) + except brotli.error: + return content + except Exception: + self.log.exception(f'unzip error') + return content + + def _reset_ssh_tunnel(self): + """Reset the SSH tunnel if configured""" + if self.config.ssh_tunnel: + self.config.ssh_tunnel.tunnel.kill() + self.config.ssh_tunnel.open_tunnel() + self.log.info(f'SERP | Restarted SSH tunnel') + time.sleep(10) # Allow time to establish connection diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index bce7a45..30ac74d 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -1,19 +1,15 @@ from . import parsers -from . import webutils as wu from . import utils from . 
import logger from .search_methods.selenium_searcher import SeleniumDriver +from .search_methods.requests_searcher import RequestsSearcher from .models.configs import LogConfig, SeleniumConfig, RequestsConfig, SearchConfig, SearchMethod -from .models.searches import SearchParams, SERPDetails +from .models.searches import SearchParams from .models.data import BaseSERP import os -import time -import brotli -import requests import pandas as pd -from typing import Dict, Optional, Union -from datetime import datetime, timezone +from typing import Dict, Union from importlib import metadata WS_VERSION = metadata.version('WebSearcher') @@ -58,23 +54,15 @@ def __init__(self, # Initialize searcher if self.config.method == SearchMethod.REQUESTS: self.headers = headers or self.config.requests.headers - self.sesh = self.config.requests.sesh or wu.start_sesh(headers=self.headers) + self.requests_searcher = RequestsSearcher(config=self.config.requests, headers=self.headers, logger=self.log) elif self.config.method == SearchMethod.SELENIUM: self.selenium_driver = SeleniumDriver(config=self.config.selenium, logger=self.log) self.selenium_driver.driver = None # Initialize search params and output self.search_params = SearchParams.create() - self.serp_template = SERPDetails.create({'version': self.version, 'method': self.config.method.value}) + self.parsed = {'results': [], 'features': {}} - # Initialize search outputs - self._response = { - "url": None, - "response_code": None, - "html": None, - } - self.results: list = [] - self.serp_features: dict = {} def search(self, qry: str, @@ -91,7 +79,6 @@ def search(self, location (str, optional): A location's Canonical Name num_results (int, optional): The number of results to return ai_expand: (bool, optional): Whether to use selenium to expand AI overviews - serp_id (str, optional): A unique identifier for this SERP crawl_id (str, optional): An identifier for this crawl """ @@ -119,131 +106,66 @@ def _conduct_search_chromedriver(self, crawl_id: str = '', ai_expand = False): """Send a search request and handle errors""" if not self.selenium_driver.driver: self.selenium_driver.init_driver() + + # Conduct search + serp_output = self.search_params.to_serp_output() response_output = self.selenium_driver.send_request(self.search_params.url) - serp = self.search_params.to_dict_output() | response_output - self.serp = BaseSERP(version=self.version, method=self.method, crawl_id=crawl_id, **serp).model_dump() + serp_output.update(response_output) + + # Store output + self.serp = BaseSERP( + version=self.version, + method=self.method, + crawl_id=crawl_id, + **serp_output + ).model_dump() self.log.info(" | ".join([f"{k}: {self.serp[k]}" for k in {'response_code','qry','loc'} if self.serp[k]])) + # Expand AI overview if ai_expand: expanded_html = self.selenium_driver.expand_ai_overview() if expanded_html: self.log.debug(f"SERP | expanded html | len diff: {len(expanded_html) - len(self.serp['html'])}") self.serp['html'] = expanded_html - - # Only delete cookies, don't close the driver here + + # Delete cookies self.selenium_driver.delete_cookies() # ========================================================================== # Requests method - def _conduct_search_requests(self, serp_id: str = '', crawl_id: str = ''): - """Send a search request and handle errors""" - - self.timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() - str_to_hash = self.search_params.qry + self.search_params.loc + self.timestamp - self.serp_id = serp_id if serp_id else 
utils.hash_id(str_to_hash) - self.crawl_id = crawl_id - self.user_agent = self.headers['User-Agent'] - - try: - self._send_request() - except requests.exceptions.ConnectionError: - self.log.exception(f'SERP | Connection error | {self.serp_id}') - self._reset_ssh_tunnel() - except requests.exceptions.Timeout: - self.log.exception(f'SERP | Timeout error | {self.serp_id}') - except Exception: - self.log.exception(f'SERP | Unknown error | {self.serp_id}') - finally: - self._handle_response() - - def _send_request(self): - self.response = self.sesh.get(self.search_params.url, timeout=10) - self.response_code = self.response.status_code - log_msg = f"{self.response_code} | {self.search_params.qry}" - log_msg = f"{log_msg} | {self.search_params.loc}" if self.search_params.loc else log_msg - self.log.info(log_msg) - - def _reset_ssh_tunnel(self): - if self.config.requests.ssh_tunnel: - self.config.requests.ssh_tunnel.tunnel.kill() - self.config.requests.ssh_tunnel.open_tunnel() - self.log.info(f'SERP | Restarted SSH tunnel | {self.serp_id}') - time.sleep(10) # Allow time to establish connection - - def _handle_response(self): - try: - if self.config.requests.unzip: - self._unzip_html() - else: - self.html = self.response.content - self.html = self.html.decode('utf-8', 'ignore') - except Exception: - self.log.exception(f'Response handling error') - - def _unzip_html(self) -> None: - """Unzip brotli zipped html - - Can allow zipped responses by setting the header `"Accept-Encoding"`. - Zipped reponses are the default because it is more efficient. - """ - - rcontent = self.response.content - try: - self.html = brotli.decompress(rcontent) - except brotli.error: - self.html = rcontent - except Exception: - self.log.exception(f'unzip error | serp_id : {self.serp_id}') - self.html = rcontent + def _conduct_search_requests(self, crawl_id: str = ''): + """Send a search request using the requests library""" + + # Conduct search + serp_output = self.search_params.to_serp_output() + serp_output['version'] = self.version + serp_output['method'] = self.method + serp_output['crawl_id'] = crawl_id + response_output = self.requests_searcher.send_request(self.search_params) + serp_output.update(response_output) + self.serp = BaseSERP(**serp_output).model_dump() + self.log.info(" | ".join([f"{k}: {self.serp[k]}" for k in {'response_code','qry','loc'} if self.serp[k]])) # ========================================================================== # Parsing - def parse_all(self): - """Parse results and extract SERP features in a single pass""" - assert self.serp['html'], "No HTML found" + def parse_serp(self, extract_features=True): try: - # Use the enhanced parse_serp function to get both results and features in one pass - self.results, self.serp_features = parsers.parse_serp(self.serp['html'], extract_features=True) + metadata = {k:v for k,v in self.serp.items() if k not in ['html']} + parsed = parsers.parse_serp(self.serp['html'], extract_features=extract_features) + self.parsed = metadata | parsed except Exception: - self.log.exception(f'Combined parsing error | serp_id : {self.serp_id}') + self.log.exception(f'Parsing error | serp_id : {self.serp["serp_id"]}') def parse_results(self): - """Parse a SERP - see parsers.py""" - assert self.serp['html'], "No HTML found" - try: - self.results = parsers.parse_serp(self.serp['html']) - except Exception: - self.log.exception(f'Parsing error | serp_id : {self.serp_id}') - - def parse_serp_features(self): - """Extract SERP features - see parsers.py""" - assert 
self.serp['html'], "No HTML found" - try: - self.serp_features = parsers.FeatureExtractor.extract_features(self.serp['html']) - except Exception: - self.log.exception(f'Feature extraction error | serp_id : {self.serp_id}') + """Backwards compatibility for parsing results""" + self.parse_serp() + self.results = self.parsed['results'] # ========================================================================== # Saving - def prepare_serp_save(self): - self.serp = BaseSERP( - qry=self.serp['qry'], - loc=self.serp['loc'], - lang=self.serp['lang'], - url=self.serp['url'], - html=self.serp['html'], - response_code=self.serp['response_code'], - user_agent=self.serp['user_agent'], - timestamp=self.serp['timestamp'], - serp_id=self.serp['serp_id'], - crawl_id=self.serp['crawl_id'], - version=self.version, - method=self.config.method.value - ).model_dump() - def save_serp(self, save_dir: str = "", append_to: str = ""): """Save SERP to file @@ -251,35 +173,35 @@ def save_serp(self, save_dir: str = "", append_to: str = ""): save_dir (str, optional): Save results as `save_dir/{serp_id}.html` append_to (str, optional): Append results to this file path """ - assert self.serp['html'], "No HTML found" - assert save_dir or append_to, "Must provide a save_dir or append_to file path" - - if append_to: - self.prepare_serp_save() + if not save_dir and not append_to: + self.log.warning("Must provide a save_dir or append_to file path to save a SERP") + return + elif append_to: utils.write_lines([self.serp], append_to) - - else: - fp = os.path.join(save_dir, f'{self.serp_id}.html') + elif save_dir: + fp = os.path.join(save_dir, f'{self.serp["serp_id"]}.html') with open(fp, 'w') as outfile: outfile.write(self.serp['html']) - def save_search(self, append_to: str = ""): - """Save search metadata (excludes HTML) to file - - Args: - append_to (str, optional): Append results to this file path - """ - assert self.serp['html'], "No HTML found" - assert append_to, "Must provide an append_to file path" - - if not self.serp: - self.prepare_serp_save() + def save_parsed(self, save_dir: str = "", append_to: str = ""): + """Save parsed SERP to file""" + if not save_dir and not append_to: + self.log.warning("Must provide a save_dir or append_to file path to save parsed SERP") + return + if not self.parsed: + self.log.warning("No parsed SERP available to save") + return - if not self.serp_features: - self.parse_serp_features() + fp = append_to if append_to else os.path.join(save_dir, 'parsed.json') + utils.write_lines([self.parsed], fp) + + def save_search(self, append_to: str = ""): + """Save SERP metadata (excludes HTML) to file""" + if not append_to: + self.log.warning("Must provide an append_to file path to save SERP metadata") + return self.serp_metadata = {k: v for k, v in self.serp.items() if k != 'html'} - self.serp_metadata.update(self.serp_features) utils.write_lines([self.serp_metadata], append_to) def save_results(self, save_dir: str = "", append_to: str = ""): @@ -289,22 +211,18 @@ def save_results(self, save_dir: str = "", append_to: str = ""): save_dir (str, optional): Save results as `save_dir/results/{serp_id}.json` append_to (bool, optional): Append results to this file path """ - assert save_dir or append_to, "Must provide a save_dir or append_to file path" - - if self.results: - if append_to: - result_metadata = { - 'crawl_id': self.serp["crawl_id"], - 'serp_id': self.serp["serp_id"], - 'version': self.version - } - results_output = [{**result, **result_metadata} for result in self.results] - 
utils.write_lines(results_output, append_to) - else: - fp = os.path.join(save_dir, 'results', f'{self.serp_id}.json') - utils.write_lines(self.results, fp) - else: - self.log.info(f'No parsed results for serp_id: {self.serp_id}') + if not save_dir and not append_to: + self.log.warning("Must provide a save_dir or append_to file path to save results") + return + if not self.parsed["results"]: + self.log.warning(f'No parsed results to save') + return + + # Add metadta to results + result_metadata = {k: self.serp[k] for k in ['crawl_id', 'serp_id', 'version']} + results_output = [{**result, **result_metadata} for result in self.parsed["results"]] + fp = append_to if append_to else os.path.join(save_dir, 'results.json') + utils.write_lines(results_output, fp) def cleanup(self): """Clean up resources, particularly Selenium's browser instance diff --git a/tests/selenium_test.py b/tests/selenium_test.py index 0318fd2..7540fb2 100644 --- a/tests/selenium_test.py +++ b/tests/selenium_test.py @@ -10,15 +10,15 @@ @app.command() def main( - query: str = typer.Argument("why is the sky blue?", help="Search query to use"), - method: str = typer.Argument("selenium", help="Search method to use: 'selenium' or 'requests'"), - headless: bool = typer.Option(False, help="Run browser in headless mode"), - use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), - version_main: int = typer.Option(133, help="Main version of Chrome to use"), - ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), - driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), - output_prefix: str = typer.Option("output", help="Prefix for output files") - ) -> None: + query: str = typer.Argument("why is the sky blue?", help="Search query to use"), + method: str = typer.Argument("selenium", help="Search method to use: 'selenium' or 'requests'"), + headless: bool = typer.Option(False, help="Run browser in headless mode"), + use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), + version_main: int = typer.Option(133, help="Main version of Chrome to use"), + ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), + driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), + output_prefix: str = typer.Option("output", help="Prefix for output files") +) -> None: typer.echo(f"query: {query}\nmethod: {method}") se = ws.SearchEngine( method=method, @@ -31,11 +31,8 @@ def main( ) se.search(qry=query, ai_expand=ai_expand) se.parse_results() - - # Save results with the specified prefix se.save_serp(append_to=f'{output_prefix}_serps.json') - se.save_search(append_to=f'{output_prefix}_searches.json') - se.save_results(append_to=f'{output_prefix}_results.json') + se.save_parsed(append_to=f'{output_prefix}_parsed.json') se.cleanup() if __name__ == "__main__": From bd2b76eca850ab302d84c2f7049d20c20fee775c Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 28 Mar 2025 09:21:34 -0700 Subject: [PATCH 043/101] version: 0.6.0.dev5 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index d33ee46..0aad1e5 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.0.dev4" +__version__ = "0.6.0.dev5" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git 
a/pyproject.toml b/pyproject.toml index 2b85ba6..9309f37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.0.dev4" +version = "0.6.0.dev5" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From 0c959b945254c358834b5e5ec0c95e4de49b95e0 Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 28 Mar 2025 09:22:33 -0700 Subject: [PATCH 044/101] update: cleaner selenium cleanup --- .../search_methods/selenium_searcher.py | 29 +++++++++---------- WebSearcher/searchers.py | 23 +-------------- tests/selenium_test.py | 2 +- 3 files changed, 16 insertions(+), 38 deletions(-) diff --git a/WebSearcher/search_methods/selenium_searcher.py b/WebSearcher/search_methods/selenium_searcher.py index 3529c6f..3b5c8dc 100644 --- a/WebSearcher/search_methods/selenium_searcher.py +++ b/WebSearcher/search_methods/selenium_searcher.py @@ -123,21 +123,8 @@ def cleanup(self) -> bool: """ if self.driver: try: - # Try a more thorough cleanup - try: - self.driver.delete_all_cookies() - except Exception: - pass - - try: - # Close all tabs/windows - original_handle = self.driver.current_window_handle - for handle in self.driver.window_handles: - self.driver.switch_to.window(handle) - self.driver.close() - except Exception: - pass - + self.delete_cookies() + self.close_all_windows() # Finally quit the driver self.driver.quit() self.driver = None @@ -150,6 +137,18 @@ def cleanup(self) -> bool: return False return True + def close_all_windows(self): + try: + # Close all tabs/windows + original_handle = self.driver.current_window_handle + for handle in self.driver.window_handles: + self.driver.switch_to.window(handle) + self.driver.close() + self.driver.switch_to.window(original_handle) + self.driver.close() + except Exception: + pass + def delete_cookies(self): """Delete all cookies from the browser""" if self.driver: diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 30ac74d..3604426 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -218,29 +218,8 @@ def save_results(self, save_dir: str = "", append_to: str = ""): self.log.warning(f'No parsed results to save') return - # Add metadta to results + # Add metadata to results result_metadata = {k: self.serp[k] for k in ['crawl_id', 'serp_id', 'version']} results_output = [{**result, **result_metadata} for result in self.parsed["results"]] fp = append_to if append_to else os.path.join(save_dir, 'results.json') utils.write_lines(results_output, fp) - - def cleanup(self): - """Clean up resources, particularly Selenium's browser instance - - Returns: - bool: True if cleanup was successful or not needed, False if cleanup failed - """ - if self.config.method == SearchMethod.SELENIUM and hasattr(self, 'selenium_driver'): - result = self.selenium_driver.cleanup() - if result: - self.selenium_driver.driver = None # Update the reference - return result - return True - - def __del__(self): - """Destructor to ensure browser is closed when object is garbage collected""" - try: - self.cleanup() - except Exception: - pass - diff --git a/tests/selenium_test.py b/tests/selenium_test.py index 7540fb2..7711c6c 100644 --- a/tests/selenium_test.py +++ b/tests/selenium_test.py @@ -32,8 +32,8 @@ def main( se.search(qry=query, ai_expand=ai_expand) se.parse_results() se.save_serp(append_to=f'{output_prefix}_serps.json') + se.save_search(append_to=f'{output_prefix}_searches.json') 
se.save_parsed(append_to=f'{output_prefix}_parsed.json') - se.cleanup() if __name__ == "__main__": app() \ No newline at end of file From 82ef0dbe11324aac72badece2a3e9d9847fbff65 Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 28 Mar 2025 09:23:40 -0700 Subject: [PATCH 045/101] update: consistent logging and serp handling --- WebSearcher/searchers.py | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 3604426..36367ac 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -82,10 +82,6 @@ def search(self, crawl_id (str, optional): An identifier for this crawl """ - self._prepare_search(qry=qry, location=location, lang=lang, num_results=num_results) - self._conduct_search(crawl_id=crawl_id, ai_expand=ai_expand) - - def _prepare_search(self, qry: str, location: str, lang: str, num_results: int): self.search_params = SearchParams.create({ 'qry': str(qry), 'loc': str(location) if not pd.isnull(location) else '', @@ -93,7 +89,6 @@ def _prepare_search(self, qry: str, location: str, lang: str, num_results: int): 'num_results': num_results, }) - def _conduct_search(self, crawl_id: str = '', ai_expand: bool = False): if self.config.method == SearchMethod.SELENIUM: self._conduct_search_chromedriver(crawl_id=crawl_id, ai_expand=ai_expand) elif self.config.method == SearchMethod.REQUESTS: @@ -109,17 +104,13 @@ def _conduct_search_chromedriver(self, crawl_id: str = '', ai_expand = False): # Conduct search serp_output = self.search_params.to_serp_output() + serp_output['version'] = self.version + serp_output['method'] = self.method + serp_output['crawl_id'] = crawl_id response_output = self.selenium_driver.send_request(self.search_params.url) serp_output.update(response_output) - - # Store output - self.serp = BaseSERP( - version=self.version, - method=self.method, - crawl_id=crawl_id, - **serp_output - ).model_dump() - self.log.info(" | ".join([f"{k}: {self.serp[k]}" for k in {'response_code','qry','loc'} if self.serp[k]])) + self.serp = BaseSERP(**serp_output).model_dump() + self.log.info(" | ".join([f"{self.serp[k]}" for k in {'response_code','qry','loc'} if self.serp[k]])) # Expand AI overview if ai_expand: @@ -145,16 +136,16 @@ def _conduct_search_requests(self, crawl_id: str = ''): response_output = self.requests_searcher.send_request(self.search_params) serp_output.update(response_output) self.serp = BaseSERP(**serp_output).model_dump() - self.log.info(" | ".join([f"{k}: {self.serp[k]}" for k in {'response_code','qry','loc'} if self.serp[k]])) + self.log.info(" | ".join([f"{self.serp[k]}" for k in {'qry','response_code','loc'} if self.serp[k]])) # ========================================================================== # Parsing def parse_serp(self, extract_features=True): try: - metadata = {k:v for k,v in self.serp.items() if k not in ['html']} + parsed_metadata = {k:v for k,v in self.serp.items() if k in ['crawl_id', 'serp_id', 'version', 'method']} parsed = parsers.parse_serp(self.serp['html'], extract_features=extract_features) - self.parsed = metadata | parsed + self.parsed = parsed_metadata | parsed except Exception: self.log.exception(f'Parsing error | serp_id : {self.serp["serp_id"]}') From 3fd8e19cc0914d34d3fa00bdc765e733b8f4088c Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 28 Mar 2025 10:05:26 -0700 Subject: [PATCH 046/101] update: simplify search logic, use SearchParams, ai expand logic in selenium file --- WebSearcher/models/configs.py | 6 ++ 
.../search_methods/requests_searcher.py | 23 +++--- .../search_methods/selenium_searcher.py | 38 ++++++---- WebSearcher/searchers.py | 73 +++++-------------- 4 files changed, 58 insertions(+), 82 deletions(-) diff --git a/WebSearcher/models/configs.py b/WebSearcher/models/configs.py index c64ee30..429d872 100644 --- a/WebSearcher/models/configs.py +++ b/WebSearcher/models/configs.py @@ -39,6 +39,12 @@ class RequestsConfig(BaseConfig): ssh_tunnel: Optional[subprocess.Popen] = None unzip: bool = True + def update_headers(self, new_headers: Dict[str, str]) -> None: + """Update the headers dictionary with new values.""" + self.headers.update(new_headers) + + + class SearchMethod(Enum): REQUESTS = "requests" SELENIUM = "selenium" diff --git a/WebSearcher/search_methods/requests_searcher.py b/WebSearcher/search_methods/requests_searcher.py index 7b0ad62..1b12450 100644 --- a/WebSearcher/search_methods/requests_searcher.py +++ b/WebSearcher/search_methods/requests_searcher.py @@ -1,15 +1,15 @@ import time import brotli import requests -from typing import Dict, Optional, Any -from datetime import datetime, timezone +from typing import Dict, Any -from .. import utils +from ..models.configs import RequestsConfig +from ..models.searches import SearchParams class RequestsSearcher: """Handle Requests-based web interactions for search engines""" - def __init__(self, config, headers, logger): + def __init__(self, config: RequestsConfig, logger): """Initialize a Requests searcher with the given configuration Args: @@ -18,17 +18,16 @@ def __init__(self, config, headers, logger): logger: Logger instance """ self.config = config - self.headers = headers self.log = logger self.sesh = self.config.sesh or self._start_session() def _start_session(self): """Start a new requests session with the configured headers""" session = requests.Session() - session.headers.update(self.headers) + session.headers.update(self.config.headers) return session - def send_request(self, search_params) -> Dict[str, Any]: + def send_request(self, search_params: SearchParams) -> Dict[str, Any]: """Send a request and handle the response Args: @@ -40,17 +39,17 @@ def send_request(self, search_params) -> Dict[str, Any]: Dictionary with response data """ - response_data = { + response_output = { 'html': '', 'url': search_params.url, - 'user_agent': self.headers.get('User-Agent'), + 'user_agent': self.config.headers.get('User-Agent'), 'response_code': 0, } try: response = self.sesh.get(search_params.url, timeout=10) - response_data['html'] = self._handle_response_content(response) - response_data['response_code'] = response.status_code + response_output['html'] = self._handle_response_content(response) + response_output['response_code'] = response.status_code except requests.exceptions.ConnectionError: self.log.exception(f'Requests | Connection error') self._reset_ssh_tunnel() @@ -59,7 +58,7 @@ def send_request(self, search_params) -> Dict[str, Any]: except Exception: self.log.exception(f'Requests | Unknown error') finally: - return response_data + return response_output def _handle_response_content(self, response): try: diff --git a/WebSearcher/search_methods/selenium_searcher.py b/WebSearcher/search_methods/selenium_searcher.py index 3b5c8dc..d78fe44 100644 --- a/WebSearcher/search_methods/selenium_searcher.py +++ b/WebSearcher/search_methods/selenium_searcher.py @@ -1,6 +1,6 @@ import time import json -from typing import Dict, Optional, Any +from typing import Dict, Any import undetected_chromedriver as uc from 
selenium.webdriver.common.by import By @@ -11,6 +11,7 @@ from .. import utils from ..models.configs import SeleniumConfig +from ..models.searches import SearchParams class SeleniumDriver: """Handle Selenium-based web interactions for search engines""" @@ -53,31 +54,38 @@ def send_typed_query(self, query: str): search_box.send_keys(query) search_box.send_keys(Keys.RETURN) - def send_request(self, url: str) -> Dict[str, Any]: + def send_request(self, search_params: SearchParams, ai_expand: bool = False) -> Dict[str, Any]: """Visit a URL with selenium and save HTML response""" + response_output = { + 'html': '', + 'url': search_params.url, + 'user_agent': self.browser_info['user_agent'], + 'response_code': 0, + } + try: - self.driver.get(url) + self.driver.get(search_params.url) time.sleep(2) WebDriverWait(self.driver, 10).until( EC.presence_of_element_located((By.ID, "search")) ) time.sleep(2) - response_output = { - 'html': self.driver.page_source, - 'url': self.driver.current_url, - 'user_agent': self.browser_info['user_agent'], - 'response_code': 200, - } + response_output['html'] = self.driver.page_source + response_output['url'] = self.driver.current_url + response_output['response_code'] = 200 + + # Expand AI overview if requested + if ai_expand: + expanded_html = self.expand_ai_overview() + if expanded_html: + self.log.debug(f"SERP | expanded html | len diff: {len(expanded_html) - len(self.serp['html'])}") + response_output['html'] = expanded_html + except Exception as e: self.log.exception(f'SERP | Chromedriver error | {str(e)}') - response_output = { - 'html': '', - 'url': '', - 'user_agent': self.browser_info['user_agent'], - 'response_code': 0, - } finally: + self.delete_cookies() return response_output def expand_ai_overview(self): diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 36367ac..8566d17 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -21,7 +21,7 @@ def __init__(self, log_config: Union[dict, LogConfig] = {}, selenium_config: Union[dict, SeleniumConfig] = {}, requests_config: Union[dict, RequestsConfig] = {}, - headers: Dict[str, str] = None + crawl_id: str = '', ) -> None: """Initialize the search engine @@ -33,7 +33,6 @@ def __init__(self, """ # Initialize configuration - self.version = WS_VERSION self.method = method.value if isinstance(method, SearchMethod) else method self.config = SearchConfig.create({ "method": SearchMethod.create(method), @@ -41,7 +40,12 @@ def __init__(self, "selenium": SeleniumConfig.create(selenium_config), "requests": RequestsConfig.create(requests_config), }) - + self.session_data = { + "method": self.config.method.value, + "version": WS_VERSION, + "crawl_id": crawl_id, + } + # Set a log file, prints to console by default self.log = logger.Logger( console=True if not self.config.log.fp else False, @@ -51,14 +55,6 @@ def __init__(self, file_level=self.config.log.level, ).start(__name__) - # Initialize searcher - if self.config.method == SearchMethod.REQUESTS: - self.headers = headers or self.config.requests.headers - self.requests_searcher = RequestsSearcher(config=self.config.requests, headers=self.headers, logger=self.log) - elif self.config.method == SearchMethod.SELENIUM: - self.selenium_driver = SeleniumDriver(config=self.config.selenium, logger=self.log) - self.selenium_driver.driver = None - # Initialize search params and output self.search_params = SearchParams.create() self.parsed = {'results': [], 'features': {}} @@ -70,7 +66,7 @@ def search(self, lang: str = None, num_results: int = 
None, ai_expand: bool = False, - crawl_id: str = '' + headers: Dict[str, str] = {}, ): """Conduct a search and save HTML @@ -90,58 +86,25 @@ def search(self, }) if self.config.method == SearchMethod.SELENIUM: - self._conduct_search_chromedriver(crawl_id=crawl_id, ai_expand=ai_expand) - elif self.config.method == SearchMethod.REQUESTS: - self._conduct_search_requests(crawl_id=crawl_id) - - # ========================================================================== - # Selenium method - - def _conduct_search_chromedriver(self, crawl_id: str = '', ai_expand = False): - """Send a search request and handle errors""" - if not self.selenium_driver.driver: + self.selenium_driver = SeleniumDriver(config=self.config.selenium, logger=self.log) self.selenium_driver.init_driver() + self.response_output = self.selenium_driver.send_request(self.search_params, ai_expand=ai_expand) + + elif self.config.method == SearchMethod.REQUESTS: + self.config.requests.update_headers(headers) + self.requests_searcher = RequestsSearcher(config=self.config.requests, logger=self.log) + self.response_output = self.requests_searcher.send_request(self.search_params) - # Conduct search serp_output = self.search_params.to_serp_output() - serp_output['version'] = self.version - serp_output['method'] = self.method - serp_output['crawl_id'] = crawl_id - response_output = self.selenium_driver.send_request(self.search_params.url) - serp_output.update(response_output) + serp_output.update(self.session_data) + serp_output.update(self.response_output) self.serp = BaseSERP(**serp_output).model_dump() self.log.info(" | ".join([f"{self.serp[k]}" for k in {'response_code','qry','loc'} if self.serp[k]])) - # Expand AI overview - if ai_expand: - expanded_html = self.selenium_driver.expand_ai_overview() - if expanded_html: - self.log.debug(f"SERP | expanded html | len diff: {len(expanded_html) - len(self.serp['html'])}") - self.serp['html'] = expanded_html - - # Delete cookies - self.selenium_driver.delete_cookies() - - # ========================================================================== - # Requests method - - def _conduct_search_requests(self, crawl_id: str = ''): - """Send a search request using the requests library""" - - # Conduct search - serp_output = self.search_params.to_serp_output() - serp_output['version'] = self.version - serp_output['method'] = self.method - serp_output['crawl_id'] = crawl_id - response_output = self.requests_searcher.send_request(self.search_params) - serp_output.update(response_output) - self.serp = BaseSERP(**serp_output).model_dump() - self.log.info(" | ".join([f"{self.serp[k]}" for k in {'qry','response_code','loc'} if self.serp[k]])) - # ========================================================================== # Parsing - def parse_serp(self, extract_features=True): + def parse_serp(self, extract_features: bool = True): try: parsed_metadata = {k:v for k,v in self.serp.items() if k in ['crawl_id', 'serp_id', 'version', 'method']} parsed = parsers.parse_serp(self.serp['html'], extract_features=extract_features) From bdb5975a0f3460940d8681338cb1d81d7114dd79 Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 28 Mar 2025 10:13:08 -0700 Subject: [PATCH 047/101] update: drop python version file, use python>=3.10 in pyproject --- .python-version | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .python-version diff --git a/.python-version b/.python-version deleted file mode 100644 index c84ccce..0000000 --- a/.python-version +++ /dev/null @@ -1 +0,0 @@ -3.10.5 From 
d5c753925b4c5e1a88f0debb16757366a2426fa1 Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 28 Mar 2025 10:24:44 -0700 Subject: [PATCH 048/101] fix: selenium output reference --- WebSearcher/search_methods/selenium_searcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/WebSearcher/search_methods/selenium_searcher.py b/WebSearcher/search_methods/selenium_searcher.py index d78fe44..d8315ca 100644 --- a/WebSearcher/search_methods/selenium_searcher.py +++ b/WebSearcher/search_methods/selenium_searcher.py @@ -79,7 +79,8 @@ def send_request(self, search_params: SearchParams, ai_expand: bool = False) -> if ai_expand: expanded_html = self.expand_ai_overview() if expanded_html: - self.log.debug(f"SERP | expanded html | len diff: {len(expanded_html) - len(self.serp['html'])}") + len_diff = len(expanded_html) - len(response_output['html']) + self.log.debug(f"SERP | expanded html | len diff: {len_diff}") response_output['html'] = expanded_html except Exception as e: From 64ee056fb3843345de1c755efa4dc224edd64f04 Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 28 Mar 2025 11:35:23 -0700 Subject: [PATCH 049/101] update: demo scripts --- scripts/demo_search.py | 71 +++++++++++++++++++-------------- scripts/demo_searches.py | 85 ++++++++++++++++++++++++++-------------- tests/selenium_test.py | 39 ------------------ 3 files changed, 97 insertions(+), 98 deletions(-) delete mode 100644 tests/selenium_test.py diff --git a/scripts/demo_search.py b/scripts/demo_search.py index 94bfb68..3debcaf 100644 --- a/scripts/demo_search.py +++ b/scripts/demo_search.py @@ -2,45 +2,58 @@ """ import os -import argparse +import typer import pandas as pd import WebSearcher as ws -pd.set_option('display.width', 120, +pd.set_option('display.width', 160, 'display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', 40) -def main(): - # Settings - parser = argparse.ArgumentParser() - parser.add_argument("-q", "--query", type=str, help="A search query", required=True) - parser.add_argument("-d", "--data_dir", type=str, help="Directory to save data", - default=os.path.join("data", f"demo-ws-v{ws.__version__}")) - args = parser.parse_args() - print(f'WebSearcher v{ws.__version__}\nSearch Query: {args.query}\nOutput Dir: {args.data_dir}\n') +DEFAULT_DATA_DIR = os.path.join("data", f"demo-ws-v{ws.__version__}") +app = typer.Typer() + +@app.command() +def main( + query: str = typer.Argument("why is the sky blue?", help="Search query to use"), + method: str = typer.Argument("selenium", help="Search method to use: 'selenium' or 'requests'"), + data_dir: str = typer.Option(DEFAULT_DATA_DIR, help="Prefix for output files"), + headless: bool = typer.Option(False, help="Run browser in headless mode"), + use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), + version_main: int = typer.Option(133, help="Main version of Chrome to use"), + ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), + driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), +) -> None: + # Filepaths - fp_serps = os.path.join(args.data_dir, 'serps.json') - fp_results = os.path.join(args.data_dir, 'results.json') - fp_searches = os.path.join(args.data_dir, 'searches.json') - dir_html = os.path.join(args.data_dir, 'html') - os.makedirs(dir_html, exist_ok=True) - - # Search, parse, and save - se = ws.SearchEngine() # Initialize searcher - se.launch_chromedriver(headless =False) # Launch browser - se.search(args.query) # Conduct Search - 
se.parse_results() # Parse Results - se.save_serp(append_to=fp_serps) # Save SERP to json (html + metadata) - se.save_results(append_to=fp_results) # Save results to json - se.save_serp(save_dir=dir_html) # Save SERP html to dir (no metadata) - se.save_search(append_to=fp_searches) # Save search metadata + extracted features + fps = {k: os.path.join(data_dir, f"{k}.json") for k in ["serps", "parsed", "searches"]} + os.makedirs(data_dir, exist_ok=True) + print(f'WebSearcher v{ws.__version__}\nSearch Query: {query}\nOutput Dir: {data_dir}\n') + + # Setup search engine + se = ws.SearchEngine( + method=method, + selenium_config={ + "headless": headless, + "use_subprocess": use_subprocess, + "driver_executable_path": driver_executable_path, + "version_main": version_main, + } + ) + + # Search and parse + se.search(query, ai_expand=ai_expand) # Conduct Search + se.parse_results() # Parse Results + se.save_serp(append_to=fps['serps']) # Save SERP to json (html + metadata) + se.save_search(append_to=fps['searches']) # Save search metadata to json + se.save_parsed(append_to=fps['parsed']) # Save results/features to json # Convert results to dataframe and print select columns - if se.results: - results = pd.DataFrame(se.results) - print(results[['type', 'title', 'url']]) + if se.parsed["results"]: + results = pd.DataFrame(se.parsed["results"]) + print(results[['type', 'sub_type', 'title', 'url']]) if __name__ == "__main__": - main() \ No newline at end of file + app() \ No newline at end of file diff --git a/scripts/demo_searches.py b/scripts/demo_searches.py index 6c96341..82eee67 100644 --- a/scripts/demo_searches.py +++ b/scripts/demo_searches.py @@ -3,37 +3,62 @@ import os import time +import typer import pandas as pd import WebSearcher as ws -pd.set_option('display.width', 120, - 'display.max_colwidth', 40, +pd.set_option('display.width', 160, 'display.max_rows', None, - 'display.max_columns', None) - -# Filepaths -data_dir = os.path.join("data", f"demo-ws-v{ws.__version__}") -fp_serps = os.path.join(data_dir, 'serps.json') -fp_results = os.path.join(data_dir, 'results.json') -dir_html = os.path.join(data_dir, 'html') -os.makedirs(dir_html, exist_ok=True) - -# Load query list from file, from: https://ahrefs.com/blog/top-google-searches/ -fp_queries = 'data/tests/top_searches_google_2020-04.tsv' -top_list = pd.read_csv(fp_queries, sep='\t') -queries = top_list['keyword'] - -# Search, parse, and save -for qry in queries: - se = ws.SearchEngine() # Initialize searcher - se.search(qry) # Conduct Search - se.parse_results() # Parse Results - se.save_serp(append_to=fp_serps) # Save SERP to json (html + metadata) - se.save_results(append_to=fp_results) # Save results to json - se.save_serp(save_dir=dir_html) # Save SERP html to dir (no metadata) - - # Convert results to dataframe and print select columns - if se.results: - results = pd.DataFrame(se.results) - print(results[['type', 'title', 'url']]) - time.sleep(30) + 'display.max_columns', None, + 'display.max_colwidth', 40) + +DEFAULT_DATA_DIR = os.path.join("data", f"demo-ws-v{ws.__version__}") + +app = typer.Typer() + +@app.command() +def main( + method: str = typer.Argument("selenium", help="Search method to use: 'selenium' or 'requests'"), + data_dir: str = typer.Option(DEFAULT_DATA_DIR, help="Prefix for output files"), + headless: bool = typer.Option(False, help="Run browser in headless mode"), + use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), + version_main: int = typer.Option(133, help="Main version of 
Chrome to use"), + ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), + driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), +) -> None: + + # Filepaths + fps = {k: os.path.join(data_dir, f"{k}.json") for k in ["serps", "parsed", "searches"]} + os.makedirs(data_dir, exist_ok=True) + + # Load query list from file, from: https://ahrefs.com/blog/top-google-searches/ + fp_queries = 'data/tests/top_searches_google_2020-04.tsv' + top_list = pd.read_csv(fp_queries, sep='\t') + queries = top_list['keyword'] + + for qry in queries: + + # Setup search engine + se = ws.SearchEngine( + method=method, + selenium_config={ + "headless": headless, + "use_subprocess": use_subprocess, + "driver_executable_path": driver_executable_path, + "version_main": version_main, + } + ) + + # Search, parse, and save + se.search(qry, ai_expand=ai_expand) # Conduct Search + se.parse_results() # Parse Results + se.save_serp(append_to=fps['serps']) # Save SERP to json (html + metadata) + se.save_search(append_to=fps['searches']) # Save search to json (metadata only) + se.save_parsed(append_to=fps['parsed']) # Save parsed results and SERP features to json + + # Convert results to dataframe and print select columns + if se.parsed["results"]: + results = pd.DataFrame(se.parsed["results"]) + print(results[['type', 'sub_type', 'title', 'url']]) + + time.sleep(30) diff --git a/tests/selenium_test.py b/tests/selenium_test.py deleted file mode 100644 index 7711c6c..0000000 --- a/tests/selenium_test.py +++ /dev/null @@ -1,39 +0,0 @@ -import typer -import WebSearcher as ws - -# driver_executable_path locations: -# /opt/homebrew/Caskroom/chromedriver/133.0.6943.53 # Mac -# /opt/google/chrome/google-chrome # Google Chrome 134.0.6998.88 | permissions error -# ~/.local/share/undetected_chromedriver/undetected_chromedriver # ChromeDriver 133.0.6943.141 - -app = typer.Typer() - -@app.command() -def main( - query: str = typer.Argument("why is the sky blue?", help="Search query to use"), - method: str = typer.Argument("selenium", help="Search method to use: 'selenium' or 'requests'"), - headless: bool = typer.Option(False, help="Run browser in headless mode"), - use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), - version_main: int = typer.Option(133, help="Main version of Chrome to use"), - ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), - driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), - output_prefix: str = typer.Option("output", help="Prefix for output files") -) -> None: - typer.echo(f"query: {query}\nmethod: {method}") - se = ws.SearchEngine( - method=method, - selenium_config={ - "headless": headless, - "use_subprocess": use_subprocess, - "driver_executable_path": driver_executable_path, - "version_main": version_main, - } - ) - se.search(qry=query, ai_expand=ai_expand) - se.parse_results() - se.save_serp(append_to=f'{output_prefix}_serps.json') - se.save_search(append_to=f'{output_prefix}_searches.json') - se.save_parsed(append_to=f'{output_prefix}_parsed.json') - -if __name__ == "__main__": - app() \ No newline at end of file From 16ba005e9a861accb14ce27aed39c4b98b0d073c Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 1 Apr 2025 07:48:47 -0700 Subject: [PATCH 050/101] update: timestamp before request, ai expand as search param, load searcher method on se init --- WebSearcher/models/configs.py | 13 ++++---- WebSearcher/models/searches.py | 13 +++----- 
.../search_methods/requests_searcher.py | 5 +++ .../search_methods/selenium_searcher.py | 6 ++-- WebSearcher/searchers.py | 32 +++++++++++-------- 5 files changed, 40 insertions(+), 29 deletions(-) diff --git a/WebSearcher/models/configs.py b/WebSearcher/models/configs.py index 429d872..99021cf 100644 --- a/WebSearcher/models/configs.py +++ b/WebSearcher/models/configs.py @@ -2,7 +2,7 @@ import subprocess from enum import Enum from typing import Dict, Optional, Union -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, computed_field class BaseConfig(BaseModel): """Base class for all configuration classes""" @@ -35,14 +35,15 @@ class RequestsConfig(BaseConfig): 'Accept-Language': 'en-US,en;q=0.5', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0', }) - sesh: Optional[requests.Session] = None ssh_tunnel: Optional[subprocess.Popen] = None unzip: bool = True - def update_headers(self, new_headers: Dict[str, str]) -> None: - """Update the headers dictionary with new values.""" - self.headers.update(new_headers) - + @computed_field + def sesh(self) -> requests.Session: + """Create and configure a requests session with the current headers.""" + sesh = requests.Session() + sesh.headers.update(self.headers) + return sesh class SearchMethod(Enum): diff --git a/WebSearcher/models/searches.py b/WebSearcher/models/searches.py index 6884ec2..b213e3d 100644 --- a/WebSearcher/models/searches.py +++ b/WebSearcher/models/searches.py @@ -1,6 +1,6 @@ from pydantic import Field, computed_field from typing import Dict, Optional, Any, List -from datetime import datetime, timezone +from datetime import datetime from ..utils import hash_id from ..import webutils as wu @@ -15,6 +15,8 @@ class SearchParams(BaseConfig): lang: Optional[str] = Field(None, description="Language code (e.g., 'en')") loc: Optional[str] = Field(None, description="Location in Canonical Name format") base_url: str = Field("https://www.google.com/search", description="Base search engine URL") + ai_expand: bool = Field(False, description="Expand AI overviews if present") + headers: Dict[str, str] = Field(default_factory=dict, description="Custom headers") @computed_field def url_params(self) -> Dict[str, Any]: @@ -36,18 +38,13 @@ def url(self) -> str: @computed_field def serp_id(self) -> str: - """Computes a unique SERP ID based on query, location, and timestamp""" - timestamp = datetime.now().isoformat() - return hash_id(f"{self.qry}{self.loc}{timestamp}") + return hash_id(f"{self.qry}{self.loc}{datetime.now().isoformat()}") def to_serp_output(self) -> Dict[str, Any]: - """Outputs the variables needed for SERPDetails as a dictionary""" - timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() return { "qry": self.qry, "loc": self.loc, "lang": self.lang, "url": self.url, - "serp_id": hash_id(f"{self.qry}{self.loc}{timestamp}"), - "timestamp": timestamp, + "serp_id": self.serp_id, } diff --git a/WebSearcher/search_methods/requests_searcher.py b/WebSearcher/search_methods/requests_searcher.py index 1b12450..666afcb 100644 --- a/WebSearcher/search_methods/requests_searcher.py +++ b/WebSearcher/search_methods/requests_searcher.py @@ -1,6 +1,7 @@ import time import brotli import requests +from datetime import datetime, timezone from typing import Dict, Any from ..models.configs import RequestsConfig @@ -38,12 +39,16 @@ def send_request(self, search_params: SearchParams) -> Dict[str, Any]: Returns: Dictionary with response data """ + + if search_params.headers: 
+ self.sesh.headers.update(search_params.headers) response_output = { 'html': '', 'url': search_params.url, 'user_agent': self.config.headers.get('User-Agent'), 'response_code': 0, + 'timestamp': datetime.now(timezone.utc).replace(tzinfo=None).isoformat() } try: diff --git a/WebSearcher/search_methods/selenium_searcher.py b/WebSearcher/search_methods/selenium_searcher.py index d8315ca..00eb829 100644 --- a/WebSearcher/search_methods/selenium_searcher.py +++ b/WebSearcher/search_methods/selenium_searcher.py @@ -1,5 +1,6 @@ import time import json +from datetime import datetime, timezone from typing import Dict, Any import undetected_chromedriver as uc @@ -54,7 +55,7 @@ def send_typed_query(self, query: str): search_box.send_keys(query) search_box.send_keys(Keys.RETURN) - def send_request(self, search_params: SearchParams, ai_expand: bool = False) -> Dict[str, Any]: + def send_request(self, search_params: SearchParams) -> Dict[str, Any]: """Visit a URL with selenium and save HTML response""" response_output = { @@ -62,6 +63,7 @@ def send_request(self, search_params: SearchParams, ai_expand: bool = False) -> 'url': search_params.url, 'user_agent': self.browser_info['user_agent'], 'response_code': 0, + 'timestamp': datetime.now(timezone.utc).replace(tzinfo=None).isoformat() } try: @@ -76,7 +78,7 @@ def send_request(self, search_params: SearchParams, ai_expand: bool = False) -> response_output['response_code'] = 200 # Expand AI overview if requested - if ai_expand: + if search_params.ai_expand: expanded_html = self.expand_ai_overview() if expanded_html: len_diff = len(expanded_html) - len(response_output['html']) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 8566d17..72c791f 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -1,8 +1,10 @@ from . import parsers from . import utils from . import logger + from .search_methods.selenium_searcher import SeleniumDriver from .search_methods.requests_searcher import RequestsSearcher + from .models.configs import LogConfig, SeleniumConfig, RequestsConfig, SearchConfig, SearchMethod from .models.searches import SearchParams from .models.data import BaseSERP @@ -30,8 +32,9 @@ def __init__(self, log_config (Union[dict, LogConfig], optional): Common search configuration. Defaults to None. selenium_config (Union[dict, SeleniumConfig], optional): Selenium-specific configuration. Defaults to None. requests_config (Union[dict, RequestsConfig], optional): Requests-specific configuration. Defaults to None. + crawl_id (str, optional): A unique identifier for the crawl. Defaults to ''. 
""" - + # Initialize configuration self.method = method.value if isinstance(method, SearchMethod) else method self.config = SearchConfig.create({ @@ -40,12 +43,14 @@ def __init__(self, "selenium": SeleniumConfig.create(selenium_config), "requests": RequestsConfig.create(requests_config), }) + + # Initialize session data self.session_data = { "method": self.config.method.value, "version": WS_VERSION, "crawl_id": crawl_id, } - + # Set a log file, prints to console by default self.log = logger.Logger( console=True if not self.config.log.fp else False, @@ -55,6 +60,12 @@ def __init__(self, file_level=self.config.log.level, ).start(__name__) + if self.config.method == SearchMethod.SELENIUM: + self.searcher = SeleniumDriver(config=self.config.selenium, logger=self.log) + self.searcher.init_driver() + elif self.config.method == SearchMethod.REQUESTS: + self.searcher = RequestsSearcher(config=self.config.requests, logger=self.log) + # Initialize search params and output self.search_params = SearchParams.create() self.parsed = {'results': [], 'features': {}} @@ -73,28 +84,23 @@ def search(self, Args: qry (str): The search query location (str, optional): A location's Canonical Name + lang (str, optional): A language code (e.g., 'en') num_results (int, optional): The number of results to return ai_expand: (bool, optional): Whether to use selenium to expand AI overviews - crawl_id (str, optional): An identifier for this crawl + headers (Dict[str, str], optional): Custom headers to include in the request """ + self.log.warning('starting search config') self.search_params = SearchParams.create({ 'qry': str(qry), 'loc': str(location) if not pd.isnull(location) else '', 'lang': str(lang) if not pd.isnull(lang) else '', 'num_results': num_results, + 'ai_expand': ai_expand, + 'headers': headers, }) - if self.config.method == SearchMethod.SELENIUM: - self.selenium_driver = SeleniumDriver(config=self.config.selenium, logger=self.log) - self.selenium_driver.init_driver() - self.response_output = self.selenium_driver.send_request(self.search_params, ai_expand=ai_expand) - - elif self.config.method == SearchMethod.REQUESTS: - self.config.requests.update_headers(headers) - self.requests_searcher = RequestsSearcher(config=self.config.requests, logger=self.log) - self.response_output = self.requests_searcher.send_request(self.search_params) - + self.response_output = self.searcher.send_request(self.search_params) serp_output = self.search_params.to_serp_output() serp_output.update(self.session_data) serp_output.update(self.response_output) From 51ee3b2e7809c2275d1df5e587b302d0968c8e6f Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 1 Apr 2025 07:49:56 -0700 Subject: [PATCH 051/101] update: poetry lock --- poetry.lock | 214 +++++----------------------------------------------- 1 file changed, 20 insertions(+), 194 deletions(-) diff --git a/poetry.lock b/poetry.lock index 9ca06df..db4733a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. 
[[package]] name = "annotated-types" @@ -7,7 +7,6 @@ description = "Reusable constraint types to use with typing.Annotated" optional = false python-versions = ">=3.8" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53"}, {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, @@ -20,7 +19,7 @@ description = "Disable App Nap on macOS >= 10.9" optional = false python-versions = ">=3.6" groups = ["dev"] -markers = "python_version <= \"3.11\" and platform_system == \"Darwin\" or python_version >= \"3.12\" and platform_system == \"Darwin\"" +markers = "platform_system == \"Darwin\"" files = [ {file = "appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c"}, {file = "appnope-0.1.4.tar.gz", hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee"}, @@ -33,7 +32,6 @@ description = "Annotate AST trees with source code positions" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2"}, {file = "asttokens-3.0.0.tar.gz", hash = "sha256:0dcd8baa8d62b0c1d118b399b2ddba3c4aff271d0d7a9e0d4c1681c79035bbc7"}, @@ -50,19 +48,18 @@ description = "Classes Without Boilerplate" optional = false python-versions = ">=3.8" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "attrs-25.1.0-py3-none-any.whl", hash = "sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a"}, {file = "attrs-25.1.0.tar.gz", hash = "sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e"}, ] [package.extras] -benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +benchmark = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"] +cov = ["cloudpickle ; platform_python_implementation == \"CPython\"", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"] +dev = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pre-commit-uv", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", 
"pytest-xdist[psutil]"] docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"] -tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] +tests = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-xdist[psutil]"] +tests-mypy = ["mypy (>=1.11.1) ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\"", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version >= \"3.10\""] [[package]] name = "beautifulsoup4" @@ -71,7 +68,6 @@ description = "Screen-scraping library" optional = false python-versions = ">=3.7.0" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "beautifulsoup4-4.13.1-py3-none-any.whl", hash = "sha256:72465267014897bb10ca749bb632bde6c2d20f3254afd5458544bd74e6c2e6d8"}, {file = "beautifulsoup4-4.13.1.tar.gz", hash = "sha256:741c8b6903a1e4ae8ba32b9c9ae7510dab7a197fdbadcf9fcdeb0891ef5ec66a"}, @@ -95,7 +91,6 @@ description = "Python bindings for the Brotli compression library" optional = false python-versions = "*" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "Brotli-1.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e1140c64812cb9b06c922e77f1c26a75ec5e3f0fb2bf92cc8c58720dec276752"}, {file = "Brotli-1.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c8fd5270e906eef71d4a8d19b7c6a43760c6abcfcc10c9101d14eb2357418de9"}, @@ -231,7 +226,6 @@ description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "certifi-2025.1.31-py3-none-any.whl", hash = "sha256:ca78db4565a652026a4db2bcdf68f2fb589ea80d0be70e03929ed730746b84fe"}, {file = "certifi-2025.1.31.tar.gz", hash = "sha256:3d5da6925056f6f18f119200434a4780a94263f10d1c21d032a6f6b2baa20651"}, @@ -313,7 +307,7 @@ files = [ {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"}, {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"}, ] -markers = {main = "python_version <= \"3.11\" and os_name == \"nt\" and implementation_name != \"pypy\" or python_version >= \"3.12\" and os_name == \"nt\" and implementation_name != \"pypy\"", dev = "python_version <= \"3.11\" and implementation_name == \"pypy\" or python_version >= \"3.12\" and implementation_name == \"pypy\""} +markers = {main = "os_name == \"nt\" and implementation_name != \"pypy\"", dev = "implementation_name == \"pypy\""} [package.dependencies] pycparser = "*" @@ -325,7 +319,6 @@ description = "The Real First Universal Charset Detector. 
Open, modern and activ optional = false python-versions = ">=3.7" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "charset_normalizer-3.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:91b36a978b5ae0ee86c394f5a54d6ef44db1de0815eb43de826d41d21e4af3de"}, {file = "charset_normalizer-3.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7461baadb4dc00fd9e0acbe254e3d7d2112e7f92ced2adc96e54ef6501c5f176"}, @@ -428,7 +421,6 @@ description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"}, {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"}, @@ -444,7 +436,7 @@ description = "Cross-platform colored terminal text." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" groups = ["dev"] -markers = "python_version <= \"3.11\" and sys_platform == \"win32\" or python_version <= \"3.11\" and platform_system == \"Windows\" or python_version >= \"3.12\" and sys_platform == \"win32\" or python_version >= \"3.12\" and platform_system == \"Windows\"" +markers = "sys_platform == \"win32\" or platform_system == \"Windows\"" files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, @@ -457,7 +449,6 @@ description = "Jupyter Python Comm implementation, for usage in ipykernel, xeus- optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "comm-0.2.2-py3-none-any.whl", hash = "sha256:e6fb86cb70ff661ee8c9c14e7d36d6de3b4066f1441be4063df9c5009f0a64d3"}, {file = "comm-0.2.2.tar.gz", hash = "sha256:3fd7a84065306e07bea1773df6eb8282de51ba82f77c72f9c85716ab11fe980e"}, @@ -476,7 +467,6 @@ description = "An implementation of the Debug Adapter Protocol for Python" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "debugpy-1.8.12-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:a2ba7ffe58efeae5b8fad1165357edfe01464f9aef25e814e891ec690e7dd82a"}, {file = "debugpy-1.8.12-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cbbd4149c4fc5e7d508ece083e78c17442ee13b0e69bfa6bd63003e486770f45"}, @@ -513,7 +503,6 @@ description = "Decorators for Humans" optional = false python-versions = ">=3.5" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, @@ -542,14 +531,13 @@ description = "Get the currently executing AST node of a frame, and other inform optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "executing-2.2.0-py2.py3-none-any.whl", hash = 
"sha256:11387150cad388d62750327a53d3339fad4888b39a6fe233c3afbb54ecffd3aa"}, {file = "executing-2.2.0.tar.gz", hash = "sha256:5d108c028108fe2551d1a7b2e8b713341e2cb4fc0aa7dcf966fa4327a5226755"}, ] [package.extras] -tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"] +tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich ; python_version >= \"3.11\""] [[package]] name = "filelock" @@ -558,7 +546,6 @@ description = "A platform independent file lock." optional = false python-versions = ">=3.9" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "filelock-3.17.0-py3-none-any.whl", hash = "sha256:533dc2f7ba78dc2f0f531fc6c4940addf7b70a481e269a5a3b93be94ffbe8338"}, {file = "filelock-3.17.0.tar.gz", hash = "sha256:ee4e77401ef576ebb38cd7f13b9b28893194acc20a8e68e18730ba9c0e54660e"}, @@ -567,7 +554,7 @@ files = [ [package.extras] docs = ["furo (>=2024.8.6)", "sphinx (>=8.1.3)", "sphinx-autodoc-typehints (>=3)"] testing = ["covdefaults (>=2.3)", "coverage (>=7.6.10)", "diff-cover (>=9.2.1)", "pytest (>=8.3.4)", "pytest-asyncio (>=0.25.2)", "pytest-cov (>=6)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.28.1)"] -typing = ["typing-extensions (>=4.12.2)"] +typing = ["typing-extensions (>=4.12.2) ; python_version < \"3.11\""] [[package]] name = "h11" @@ -576,7 +563,6 @@ description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" optional = false python-versions = ">=3.7" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, @@ -589,7 +575,6 @@ description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.6" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, @@ -598,31 +583,6 @@ files = [ [package.extras] all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] -[[package]] -name = "importlib-metadata" -version = "8.6.1" -description = "Read metadata from Python packages" -optional = false -python-versions = ">=3.9" -groups = ["dev"] -markers = "python_version < \"3.10\"" -files = [ - {file = "importlib_metadata-8.6.1-py3-none-any.whl", hash = "sha256:02a89390c1e15fdfdc0d7c6b25cb3e62650d0494005c97d6f148bf5b9787525e"}, - {file = "importlib_metadata-8.6.1.tar.gz", hash = "sha256:310b41d755445d74569f993ccfc22838295d9fe005425094fad953d7f15c8580"}, -] - -[package.dependencies] -zipp = ">=3.20" - -[package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] -cover = ["pytest-cov"] -doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -enabler = ["pytest-enabler (>=2.2)"] -perf = ["ipython"] -test = ["flufl.flake8", "importlib_resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] -type = ["pytest-mypy"] - [[package]] name = 
"iniconfig" version = "2.0.0" @@ -630,7 +590,6 @@ description = "brain-dead simple config-ini parsing" optional = false python-versions = ">=3.7" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, @@ -643,7 +602,6 @@ description = "IPython Kernel for Jupyter" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "ipykernel-6.29.5-py3-none-any.whl", hash = "sha256:afdb66ba5aa354b09b91379bac28ae4afebbb30e8b39510c9690afb7a10421b5"}, {file = "ipykernel-6.29.5.tar.gz", hash = "sha256:f093a22c4a40f8828f8e330a9c297cb93dcab13bd9678ded6de8e5cf81c56215"}, @@ -678,7 +636,6 @@ description = "IPython: Productive Interactive Computing" optional = false python-versions = ">=3.9" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "ipython-8.18.1-py3-none-any.whl", hash = "sha256:e8267419d72d81955ec1177f8a29aaa90ac80ad647499201119e2f05e99aa397"}, {file = "ipython-8.18.1.tar.gz", hash = "sha256:ca6f079bb33457c66e233e4580ebfc4128855b4cf6370dddd73842a9563e8a27"}, @@ -695,7 +652,6 @@ prompt-toolkit = ">=3.0.41,<3.1.0" pygments = ">=2.4.0" stack-data = "*" traitlets = ">=5" -typing-extensions = {version = "*", markers = "python_version < \"3.10\""} [package.extras] all = ["black", "curio", "docrepr", "exceptiongroup", "ipykernel", "ipyparallel", "ipywidgets", "matplotlib", "matplotlib (!=3.2.0)", "nbconvert", "nbformat", "notebook", "numpy (>=1.22)", "pandas", "pickleshare", "pytest (<7)", "pytest (<7.1)", "pytest-asyncio (<0.22)", "qtconsole", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "stack-data", "testpath", "trio", "typing-extensions"] @@ -717,7 +673,6 @@ description = "An autocompletion tool for Python that can be used for text edito optional = false python-versions = ">=3.6" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9"}, {file = "jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0"}, @@ -738,14 +693,12 @@ description = "Jupyter protocol implementation and client libraries" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "jupyter_client-8.6.3-py3-none-any.whl", hash = "sha256:e8a19cc986cc45905ac3362915f410f3af85424b4c0905e94fa5f2cb08e8f23f"}, {file = "jupyter_client-8.6.3.tar.gz", hash = "sha256:35b3a0947c4a6e9d589eb97d7d4cd5e90f910ee73101611f01283732bd6d9419"}, ] [package.dependencies] -importlib-metadata = {version = ">=4.8.3", markers = "python_version < \"3.10\""} jupyter-core = ">=4.12,<5.0.dev0 || >=5.1.dev0" python-dateutil = ">=2.8.2" pyzmq = ">=23.0" @@ -754,7 +707,7 @@ traitlets = ">=5.3" [package.extras] docs = ["ipykernel", "myst-parser", "pydata-sphinx-theme", "sphinx (>=4)", "sphinx-autodoc-typehints", "sphinxcontrib-github-alt", "sphinxcontrib-spelling"] -test = ["coverage", "ipykernel (>=6.14)", "mypy", "paramiko", "pre-commit", "pytest (<8.2.0)", "pytest-cov", "pytest-jupyter[client] (>=0.4.1)", "pytest-timeout"] +test = 
["coverage", "ipykernel (>=6.14)", "mypy", "paramiko ; sys_platform == \"win32\"", "pre-commit", "pytest (<8.2.0)", "pytest-cov", "pytest-jupyter[client] (>=0.4.1)", "pytest-timeout"] [[package]] name = "jupyter-core" @@ -763,7 +716,6 @@ description = "Jupyter core package. A base package on which Jupyter projects re optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "jupyter_core-5.7.2-py3-none-any.whl", hash = "sha256:4f7315d2f6b4bcf2e3e7cb6e46772eba760ae459cd1f59d29eb57b0a01bd7409"}, {file = "jupyter_core-5.7.2.tar.gz", hash = "sha256:aa5f8d32bbf6b431ac830496da7392035d6f61b4f54872f15c4bd2a9c3f536d9"}, @@ -785,7 +737,6 @@ description = "Powerful and Pythonic XML processing library combining libxml2/li optional = false python-versions = ">=3.6" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "lxml-5.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:dd36439be765e2dde7660212b5275641edbc813e7b24668831a5c8ac91180656"}, {file = "lxml-5.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ae5fe5c4b525aa82b8076c1a59d642c17b6e8739ecf852522c6321852178119d"}, @@ -941,7 +892,6 @@ description = "Python port of markdown-it. Markdown parsing, done right!" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, @@ -967,7 +917,6 @@ description = "Inline Matplotlib backend for Jupyter" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca"}, {file = "matplotlib_inline-0.1.7.tar.gz", hash = "sha256:8423b23ec666be3d16e16b60bdd8ac4e86e840ebd1dd11a30b9f117f2fa0ab90"}, @@ -983,7 +932,6 @@ description = "Markdown URL utilities" optional = false python-versions = ">=3.7" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, @@ -996,68 +944,11 @@ description = "Patch asyncio to allow nested event loops" optional = false python-versions = ">=3.5" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"}, {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"}, ] -[[package]] -name = "numpy" -version = "2.0.2" -description = "Fundamental package for array computing in Python" -optional = false -python-versions = ">=3.9" -groups = ["main"] -markers = "python_version < \"3.11\"" -files = [ - {file = "numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece"}, - {file = "numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04"}, - {file = "numpy-2.0.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66"}, - {file = "numpy-2.0.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b"}, - {file = "numpy-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd"}, - {file = "numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318"}, - {file = "numpy-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8"}, - {file = "numpy-2.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326"}, - {file = "numpy-2.0.2-cp310-cp310-win32.whl", hash = "sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97"}, - {file = "numpy-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131"}, - {file = "numpy-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448"}, - {file = "numpy-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195"}, - {file = "numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57"}, - {file = "numpy-2.0.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a"}, - {file = "numpy-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669"}, - {file = "numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951"}, - {file = "numpy-2.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9"}, - {file = "numpy-2.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15"}, - {file = "numpy-2.0.2-cp311-cp311-win32.whl", hash = "sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4"}, - {file = "numpy-2.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc"}, - {file = "numpy-2.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b"}, - {file = "numpy-2.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e"}, - {file = "numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c"}, - {file = "numpy-2.0.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:a842d573724391493a97a62ebbb8e731f8a5dcc5d285dfc99141ca15a3302d0c"}, - {file = "numpy-2.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05e238064fc0610c840d1cf6a13bf63d7e391717d247f1bf0318172e759e692"}, - {file = "numpy-2.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a"}, - {file = "numpy-2.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:96a55f64139912d61de9137f11bf39a55ec8faec288c75a54f93dfd39f7eb40c"}, - {file = "numpy-2.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ec9852fb39354b5a45a80bdab5ac02dd02b15f44b3804e9f00c556bf24b4bded"}, - {file = "numpy-2.0.2-cp312-cp312-win32.whl", hash = "sha256:671bec6496f83202ed2d3c8fdc486a8fc86942f2e69ff0e986140339a63bcbe5"}, - {file = "numpy-2.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:cfd41e13fdc257aa5778496b8caa5e856dc4896d4ccf01841daee1d96465467a"}, - {file = "numpy-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9059e10581ce4093f735ed23f3b9d283b9d517ff46009ddd485f1747eb22653c"}, - {file = "numpy-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:423e89b23490805d2a5a96fe40ec507407b8ee786d66f7328be214f9679df6dd"}, - {file = "numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:2b2955fa6f11907cf7a70dab0d0755159bca87755e831e47932367fc8f2f2d0b"}, - {file = "numpy-2.0.2-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:97032a27bd9d8988b9a97a8c4d2c9f2c15a81f61e2f21404d7e8ef00cb5be729"}, - {file = "numpy-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e795a8be3ddbac43274f18588329c72939870a16cae810c2b73461c40718ab1"}, - {file = "numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b258c385842546006213344c50655ff1555a9338e2e5e02a0756dc3e803dd"}, - {file = "numpy-2.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5fec9451a7789926bcf7c2b8d187292c9f93ea30284802a0ab3f5be8ab36865d"}, - {file = "numpy-2.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9189427407d88ff25ecf8f12469d4d39d35bee1db5d39fc5c168c6f088a6956d"}, - {file = "numpy-2.0.2-cp39-cp39-win32.whl", hash = "sha256:905d16e0c60200656500c95b6b8dca5d109e23cb24abc701d41c02d74c6b3afa"}, - {file = "numpy-2.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:a3f4ab0caa7f053f6797fcd4e1e25caee367db3112ef2b6ef82d749530768c73"}, - {file = "numpy-2.0.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:7f0a0c6f12e07fa94133c8a67404322845220c06a9e80e85999afe727f7438b8"}, - {file = "numpy-2.0.2-pp39-pypy39_pp73-macosx_14_0_x86_64.whl", hash = "sha256:312950fdd060354350ed123c0e25a71327d3711584beaef30cdaa93320c392d4"}, - {file = "numpy-2.0.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:26df23238872200f63518dd2aa984cfca675d82469535dc7162dc2ee52d9dd5c"}, - {file = "numpy-2.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a46288ec55ebbd58947d31d72be2c63cbf839f0a63b49cb755022310792a3385"}, - {file = "numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78"}, -] - [[package]] name = "numpy" version = "2.2.2" @@ -1065,7 +956,6 @@ description = "Fundamental package for array computing in Python" optional = false python-versions = ">=3.10" groups = ["main"] -markers = "python_version == \"3.11\" or python_version >= \"3.12\"" files = [ {file = "numpy-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7079129b64cb78bdc8d611d1fd7e8002c0a2565da6a47c4df8062349fee90e3e"}, {file = "numpy-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2ec6c689c61df613b783aeb21f945c4cbe6c51c28cb70aae8430577ab39f163e"}, @@ -1131,7 +1021,6 @@ description = "Capture the outcome of Python function calls." 
optional = false python-versions = ">=3.7" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "outcome-1.3.0.post0-py2.py3-none-any.whl", hash = "sha256:e771c5ce06d1415e356078d3bdd68523f284b4ce5419828922b6871e65eda82b"}, {file = "outcome-1.3.0.post0.tar.gz", hash = "sha256:9dcf02e65f2971b80047b377468e72a268e15c0af3cf1238e6ff14f7f91143b8"}, @@ -1147,7 +1036,6 @@ description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, @@ -1160,7 +1048,6 @@ description = "Powerful data structures for data analysis, time series, and stat optional = false python-versions = ">=3.9" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pandas-2.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5"}, {file = "pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348"}, @@ -1248,7 +1135,6 @@ description = "A Python Parser" optional = false python-versions = ">=3.6" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18"}, {file = "parso-0.8.4.tar.gz", hash = "sha256:eb3a7b58240fb99099a345571deecc0f9540ea5f4dd2fe14c2a99d6b281ab92d"}, @@ -1265,7 +1151,7 @@ description = "Pexpect allows easy control of interactive console applications." 
optional = false python-versions = "*" groups = ["dev"] -markers = "python_version <= \"3.11\" and sys_platform != \"win32\" or python_version >= \"3.12\" and sys_platform != \"win32\"" +markers = "sys_platform != \"win32\"" files = [ {file = "pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523"}, {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"}, @@ -1281,7 +1167,6 @@ description = "A small Python package for determining appropriate platform-speci optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"}, {file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"}, @@ -1299,7 +1184,6 @@ description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, @@ -1316,7 +1200,6 @@ description = "Library for building powerful interactive command lines in Python optional = false python-versions = ">=3.8.0" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "prompt_toolkit-3.0.50-py3-none-any.whl", hash = "sha256:9b6427eb19e479d98acff65196a307c555eb567989e6d88ebbb1b509d9779198"}, {file = "prompt_toolkit-3.0.50.tar.gz", hash = "sha256:544748f3860a2623ca5cd6d2795e7a14f3d0e1c3c9728359013f79877fc89bab"}, @@ -1332,7 +1215,6 @@ description = "" optional = false python-versions = ">=3.9" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "protobuf-6.30.0-cp310-abi3-win32.whl", hash = "sha256:7337d76d8efe65ee09ee566b47b5914c517190196f414e5418fa236dfd1aed3e"}, {file = "protobuf-6.30.0-cp310-abi3-win_amd64.whl", hash = "sha256:9b33d51cc95a7ec4f407004c8b744330b6911a37a782e2629c67e1e8ac41318f"}, @@ -1352,7 +1234,6 @@ description = "Cross-platform lib for process and system monitoring in Python." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "psutil-6.1.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:9ccc4316f24409159897799b83004cb1e24f9819b0dcf9c0b68bdcb6cefee6a8"}, {file = "psutil-6.1.1-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:ca9609c77ea3b8481ab005da74ed894035936223422dc591d6772b147421f777"}, @@ -1384,7 +1265,7 @@ description = "Run a subprocess in a pseudo terminal" optional = false python-versions = "*" groups = ["dev"] -markers = "python_version <= \"3.11\" and sys_platform != \"win32\" or python_version >= \"3.12\" and sys_platform != \"win32\"" +markers = "sys_platform != \"win32\"" files = [ {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, @@ -1397,7 +1278,6 @@ description = "Safely evaluate AST nodes without side effects" optional = false python-versions = "*" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0"}, {file = "pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42"}, @@ -1417,7 +1297,7 @@ files = [ {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, ] -markers = {main = "python_version <= \"3.11\" and os_name == \"nt\" and implementation_name != \"pypy\" or python_version >= \"3.12\" and os_name == \"nt\" and implementation_name != \"pypy\"", dev = "python_version <= \"3.11\" and implementation_name == \"pypy\" or python_version >= \"3.12\" and implementation_name == \"pypy\""} +markers = {main = "os_name == \"nt\" and implementation_name != \"pypy\"", dev = "implementation_name == \"pypy\""} [[package]] name = "pydantic" @@ -1426,7 +1306,6 @@ description = "Data validation using Python type hints" optional = false python-versions = ">=3.8" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pydantic-2.10.6-py3-none-any.whl", hash = "sha256:427d664bf0b8a2b34ff5dd0f5a18df00591adcee7198fbd71981054cef37b584"}, {file = "pydantic-2.10.6.tar.gz", hash = "sha256:ca5daa827cce33de7a42be142548b0096bf05a7e7b365aebfa5f8eeec7128236"}, @@ -1439,7 +1318,7 @@ typing-extensions = ">=4.12.2" [package.extras] email = ["email-validator (>=2.0.0)"] -timezone = ["tzdata"] +timezone = ["tzdata ; python_version >= \"3.9\" and platform_system == \"Windows\""] [[package]] name = "pydantic-core" @@ -1448,7 +1327,6 @@ description = "Core functionality for Pydantic validation and serialization" optional = false python-versions = ">=3.8" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pydantic_core-2.27.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2d367ca20b2f14095a8f4fa1210f5a7b78b8a20009ecced6b12818f455b1e9fa"}, {file = "pydantic_core-2.27.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:491a2b73db93fab69731eaee494f320faa4e093dbed776be1a829c2eb222c34c"}, @@ -1562,7 +1440,6 @@ description = "Pygments 
is a syntax highlighting package written in Python." optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c"}, {file = "pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f"}, @@ -1578,7 +1455,6 @@ description = "A Python SOCKS client module. See https://github.com/Anorov/PySoc optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "PySocks-1.7.1-py27-none-any.whl", hash = "sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299"}, {file = "PySocks-1.7.1-py3-none-any.whl", hash = "sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5"}, @@ -1592,7 +1468,6 @@ description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6"}, {file = "pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761"}, @@ -1616,7 +1491,6 @@ description = "Extensions to the standard Python datetime module" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" groups = ["main", "dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, @@ -1632,7 +1506,6 @@ description = "World timezone definitions, modern and historical" optional = false python-versions = "*" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pytz-2025.1-py2.py3-none-any.whl", hash = "sha256:89dd22dca55b46eac6eda23b2d72721bf1bdfef212645d81513ef5d03038de57"}, {file = "pytz-2025.1.tar.gz", hash = "sha256:c2db42be2a2518b28e65f9207c4d05e6ff547d1efa4086469ef855e4ab70178e"}, @@ -1645,7 +1518,7 @@ description = "Python for Window Extensions" optional = false python-versions = "*" groups = ["dev"] -markers = "python_version <= \"3.11\" and sys_platform == \"win32\" and platform_python_implementation != \"PyPy\" or python_version >= \"3.12\" and sys_platform == \"win32\" and platform_python_implementation != \"PyPy\"" +markers = "sys_platform == \"win32\" and platform_python_implementation != \"PyPy\"" files = [ {file = "pywin32-308-cp310-cp310-win32.whl", hash = "sha256:796ff4426437896550d2981b9c2ac0ffd75238ad9ea2d3bfa67a1abd546d262e"}, {file = "pywin32-308-cp310-cp310-win_amd64.whl", hash = "sha256:4fc888c59b3c0bef905ce7eb7e2106a07712015ea1c8234b703a088d46110e8e"}, @@ -1674,7 +1547,6 @@ description = "Python bindings for 0MQ" optional = false python-versions = ">=3.7" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "pyzmq-26.2.1-cp310-cp310-macosx_10_15_universal2.whl", hash = "sha256:f39d1227e8256d19899d953e6e19ed2ccb689102e6d85e024da5acf410f301eb"}, {file = "pyzmq-26.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:a23948554c692df95daed595fdd3b76b420a4939d7a8a28d6d7dea9711878641"}, @@ -1797,7 +1669,6 @@ description = "Python HTTP for Humans." optional = false python-versions = ">=3.8" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, @@ -1820,7 +1691,6 @@ description = "File transport adapter for Requests" optional = false python-versions = "*" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "requests_file-2.1.0-py2.py3-none-any.whl", hash = "sha256:cf270de5a4c5874e84599fc5778303d496c10ae5e870bfa378818f35d21bda5c"}, {file = "requests_file-2.1.0.tar.gz", hash = "sha256:0f549a3f3b0699415ac04d167e9cb39bccfb730cb832b4d20be3d9867356e658"}, @@ -1836,7 +1706,6 @@ description = "Render rich text, tables, progress bars, syntax highlighting, mar optional = false python-versions = ">=3.8.0" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "rich-13.9.4-py3-none-any.whl", hash = "sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90"}, {file = "rich-13.9.4.tar.gz", hash = "sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098"}, @@ -1857,7 +1726,6 @@ description = "Official Python bindings for Selenium WebDriver" optional = false python-versions = ">=3.9" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "selenium-4.29.0-py3-none-any.whl", hash = "sha256:ce5d26f1ddc1111641113653af33694c13947dd36c2df09cdd33f554351d372e"}, {file = "selenium-4.29.0.tar.gz", hash = "sha256:3a62f7ec33e669364a6c0562a701deb69745b569c50d55f1a912bf8eb33358ba"}, @@ -1878,7 +1746,6 @@ description = "Tool to Detect Surrounding Shell" optional = false python-versions = ">=3.7" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686"}, {file = "shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de"}, @@ -1891,7 +1758,6 @@ description = "Python 2 and 3 compatibility utilities" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" groups = ["main", "dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"}, {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, @@ -1904,7 +1770,6 @@ description = "Sniff out which async library your code is running under" optional = false python-versions = ">=3.7" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, @@ -1917,7 +1782,6 @@ description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" optional = false python-versions = "*" groups = ["main"] -markers = "python_version <= 
\"3.11\" or python_version >= \"3.12\"" files = [ {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, @@ -1930,7 +1794,6 @@ description = "A modern CSS selector implementation for Beautiful Soup." optional = false python-versions = ">=3.8" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9"}, {file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"}, @@ -1943,7 +1806,6 @@ description = "Extract data from python stack frames and tracebacks for informat optional = false python-versions = "*" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695"}, {file = "stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9"}, @@ -1964,7 +1826,6 @@ description = "Pytest Snapshot Test Utility" optional = false python-versions = ">=3.8.1" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "syrupy-4.8.1-py3-none-any.whl", hash = "sha256:274f97cbaf44175f5e478a2f3a53559d31f41c66c6bf28131695f94ac893ea00"}, {file = "syrupy-4.8.1.tar.gz", hash = "sha256:8da8c0311e6d92de0b15767768c6ab98982b7b4a4c67083c08fbac3fbad4d44c"}, @@ -1980,7 +1841,6 @@ description = "Accurately separates a URL's subdomain, domain, and public suffix optional = false python-versions = ">=3.9" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "tldextract-5.1.3-py3-none-any.whl", hash = "sha256:78de310cc2ca018692de5ddf320f9d6bd7c5cf857d0fd4f2175f0cdf4440ea75"}, {file = "tldextract-5.1.3.tar.gz", hash = "sha256:d43c7284c23f5dc8a42fd0fee2abede2ff74cc622674e4cb07f514ab3330c338"}, @@ -2046,7 +1906,6 @@ description = "Tornado is a Python web framework and asynchronous networking lib optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "tornado-6.4.2-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:e828cce1123e9e44ae2a50a9de3055497ab1d0aeb440c5ac23064d9e44880da1"}, {file = "tornado-6.4.2-cp38-abi3-macosx_10_9_x86_64.whl", hash = "sha256:072ce12ada169c5b00b7d92a99ba089447ccc993ea2143c9ede887e0937aa803"}, @@ -2068,7 +1927,6 @@ description = "Traitlets Python configuration system" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f"}, {file = "traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7"}, @@ -2085,7 +1943,6 @@ description = "A friendly Python library for async concurrency and I/O" optional = false python-versions = ">=3.9" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "trio-0.29.0-py3-none-any.whl", hash = 
"sha256:d8c463f1a9cc776ff63e331aba44c125f423a5a13c684307e828d930e625ba66"}, {file = "trio-0.29.0.tar.gz", hash = "sha256:ea0d3967159fc130acb6939a0be0e558e364fee26b5deeecc893a6b08c361bdf"}, @@ -2107,7 +1964,6 @@ description = "WebSocket library for Trio" optional = false python-versions = ">=3.8" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "trio_websocket-0.12.2-py3-none-any.whl", hash = "sha256:df605665f1db533f4a386c94525870851096a223adcb97f72a07e8b4beba45b6"}, {file = "trio_websocket-0.12.2.tar.gz", hash = "sha256:22c72c436f3d1e264d0910a3951934798dcc5b00ae56fc4ee079d46c7cf20fae"}, @@ -2126,7 +1982,6 @@ description = "Typer, build great CLIs. Easy to code. Based on Python type hints optional = false python-versions = ">=3.7" groups = ["dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "typer-0.15.2-py3-none-any.whl", hash = "sha256:46a499c6107d645a9c13f7ee46c5d5096cae6f5fc57dd11eccbbb9ae3e44ddfc"}, {file = "typer-0.15.2.tar.gz", hash = "sha256:ab2fab47533a813c49fe1f16b1a370fd5819099c00b119e0633df65f22144ba5"}, @@ -2145,7 +2000,6 @@ description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" groups = ["main", "dev"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, @@ -2158,7 +2012,6 @@ description = "Provider of IANA time zone data" optional = false python-versions = ">=2" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "tzdata-2025.1-py2.py3-none-any.whl", hash = "sha256:7e127113816800496f027041c570f50bcd464a020098a3b6b199517772303639"}, {file = "tzdata-2025.1.tar.gz", hash = "sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694"}, @@ -2171,7 +2024,6 @@ description = "('Selenium.webdriver.Chrome replacement with compatiblity for Bra optional = false python-versions = "*" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "undetected-chromedriver-3.5.5.tar.gz", hash = "sha256:9f945e1435005247abe17de316bcfda85b284a4177fd5f25167c78ced33b65ec"}, ] @@ -2188,7 +2040,6 @@ description = "HTTP library with thread-safe connection pooling, file post, and optional = false python-versions = ">=3.9" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df"}, {file = "urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d"}, @@ -2198,7 +2049,7 @@ files = [ pysocks = {version = ">=1.5.6,<1.5.7 || >1.5.7,<2.0", optional = true, markers = "extra == \"socks\""} [package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +brotli = ["brotli (>=1.0.9) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\""] h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] @@ -2210,7 +2061,6 @@ description = "Measures the displayed width of unicode strings in a terminal" optional = false python-versions = "*" groups = ["dev"] 
-markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859"}, {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, @@ -2223,7 +2073,6 @@ description = "WebSocket client for Python with low level API options" optional = false python-versions = ">=3.8" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "websocket_client-1.8.0-py3-none-any.whl", hash = "sha256:17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526"}, {file = "websocket_client-1.8.0.tar.gz", hash = "sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da"}, @@ -2241,7 +2090,6 @@ description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" optional = false python-versions = ">=3.9" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "websockets-15.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d63efaa0cd96cf0c5fe4d581521d9fa87744540d4bc999ae6e08595a1014b45b"}, {file = "websockets-15.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ac60e3b188ec7574cb761b08d50fcedf9d77f1530352db4eef1707fe9dee7205"}, @@ -2321,7 +2169,6 @@ description = "WebSockets state-machine based protocol implementation" optional = false python-versions = ">=3.7.0" groups = ["main"] -markers = "python_version <= \"3.11\" or python_version >= \"3.12\"" files = [ {file = "wsproto-1.2.0-py3-none-any.whl", hash = "sha256:b9acddd652b585d75b20477888c56642fdade28bdfd3579aa24a4d2c037dd736"}, {file = "wsproto-1.2.0.tar.gz", hash = "sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065"}, @@ -2330,28 +2177,7 @@ files = [ [package.dependencies] h11 = ">=0.9.0,<1" -[[package]] -name = "zipp" -version = "3.21.0" -description = "Backport of pathlib-compatible object wrapper for zip files" -optional = false -python-versions = ">=3.9" -groups = ["dev"] -markers = "python_version < \"3.10\"" -files = [ - {file = "zipp-3.21.0-py3-none-any.whl", hash = "sha256:ac1bbe05fd2991f160ebce24ffbac5f6d11d83dc90891255885223d42b3cd931"}, - {file = "zipp-3.21.0.tar.gz", hash = "sha256:2c9958f6430a2040341a52eb608ed6dd93ef4392e02ffe219417c1b28b5dd1f4"}, -] - -[package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] -cover = ["pytest-cov"] -doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -enabler = ["pytest-enabler (>=2.2)"] -test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] -type = ["pytest-mypy"] - [metadata] lock-version = "2.1" -python-versions = ">=3.9" -content-hash = "1afa3bf7c3d9ce06c3cf91b77da72e8f7bf4d543351120cdfe00bedb1286df6b" +python-versions = ">=3.10" +content-hash = "19e460b385e6e3fb8901153196b5cbdcf0a318c743113b0916abc353115c9a4f" From 6ee5209eab936f071632d042207a7d725376480c Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 1 Apr 2025 10:19:42 -0700 Subject: [PATCH 052/101] update: using orjson for speed, must decode dumps to string --- .../search_methods/selenium_searcher.py | 6 +- WebSearcher/utils.py | 10 ++- poetry.lock | 80 ++++++++++++++++++- pyproject.toml | 1 + 4 files changed, 89 insertions(+), 8 deletions(-) diff --git 
a/WebSearcher/search_methods/selenium_searcher.py b/WebSearcher/search_methods/selenium_searcher.py index 00eb829..1e67025 100644 --- a/WebSearcher/search_methods/selenium_searcher.py +++ b/WebSearcher/search_methods/selenium_searcher.py @@ -1,5 +1,5 @@ import time -import json +import orjson from datetime import datetime, timezone from typing import Dict, Any @@ -42,8 +42,8 @@ def init_driver(self) -> None: 'driver_version': self.driver.capabilities['chrome']['chromedriverVersion'].split(' ')[0], 'user_agent': self.driver.execute_script('return navigator.userAgent'), } - self.browser_info['browser_id'] = utils.hash_id(json.dumps(self.browser_info)) - self.log.debug(json.dumps(self.browser_info, indent=4)) + self.browser_info['browser_id'] = utils.hash_id(orjson.dumps(self.browser_info).decode('utf-8')) + self.log.debug(orjson.dumps(self.browser_info, option=orjson.OPT_INDENT_2)) def send_typed_query(self, query: str): """Send a typed query to the search box""" diff --git a/WebSearcher/utils.py b/WebSearcher/utils.py index a12a270..dec20e9 100644 --- a/WebSearcher/utils.py +++ b/WebSearcher/utils.py @@ -1,10 +1,9 @@ import re import os -import json +import orjson import random import hashlib import itertools -from timeit import default_timer from string import ascii_letters, digits # Files ------------------------------------------------------------------------ @@ -24,7 +23,7 @@ def read_lines(fp): with open(fp, 'r') as infile: if is_json: - return [json.loads(line) for line in infile] + return [orjson.loads(line) for line in infile] else: return [line.strip() for line in infile] @@ -38,7 +37,10 @@ def write_lines(iter_data, fp, overwrite=False): with open(fp, mode) as outfile: for data in iter_data: - line_output = json.dumps(data) if is_json else data + if is_json: + line_output = orjson.dumps(data).decode('utf-8') + else: + line_output = data outfile.write(f"{line_output}\n") diff --git a/poetry.lock b/poetry.lock index db4733a..34ccb6a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1014,6 +1014,84 @@ files = [ {file = "numpy-2.2.2.tar.gz", hash = "sha256:ed6906f61834d687738d25988ae117683705636936cc605be0bb208b23df4d8f"}, ] +[[package]] +name = "orjson" +version = "3.10.16" +description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "orjson-3.10.16-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:4cb473b8e79154fa778fb56d2d73763d977be3dcc140587e07dbc545bbfc38f8"}, + {file = "orjson-3.10.16-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:622a8e85eeec1948690409a19ca1c7d9fd8ff116f4861d261e6ae2094fe59a00"}, + {file = "orjson-3.10.16-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c682d852d0ce77613993dc967e90e151899fe2d8e71c20e9be164080f468e370"}, + {file = "orjson-3.10.16-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8c520ae736acd2e32df193bcff73491e64c936f3e44a2916b548da048a48b46b"}, + {file = "orjson-3.10.16-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:134f87c76bfae00f2094d85cfab261b289b76d78c6da8a7a3b3c09d362fd1e06"}, + {file = "orjson-3.10.16-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b59afde79563e2cf37cfe62ee3b71c063fd5546c8e662d7fcfc2a3d5031a5c4c"}, + {file = "orjson-3.10.16-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:113602f8241daaff05d6fad25bd481d54c42d8d72ef4c831bb3ab682a54d9e15"}, + {file = "orjson-3.10.16-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4fc0077d101f8fab4031e6554fc17b4c2ad8fdbc56ee64a727f3c95b379e31da"}, + {file = "orjson-3.10.16-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:9c6bf6ff180cd69e93f3f50380224218cfab79953a868ea3908430bcfaf9cb5e"}, + {file = "orjson-3.10.16-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5673eadfa952f95a7cd76418ff189df11b0a9c34b1995dff43a6fdbce5d63bf4"}, + {file = "orjson-3.10.16-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5fe638a423d852b0ae1e1a79895851696cb0d9fa0946fdbfd5da5072d9bb9551"}, + {file = "orjson-3.10.16-cp310-cp310-win32.whl", hash = "sha256:33af58f479b3c6435ab8f8b57999874b4b40c804c7a36b5cc6b54d8f28e1d3dd"}, + {file = "orjson-3.10.16-cp310-cp310-win_amd64.whl", hash = "sha256:0338356b3f56d71293c583350af26f053017071836b07e064e92819ecf1aa055"}, + {file = "orjson-3.10.16-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:44fcbe1a1884f8bc9e2e863168b0f84230c3d634afe41c678637d2728ea8e739"}, + {file = "orjson-3.10.16-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78177bf0a9d0192e0b34c3d78bcff7fe21d1b5d84aeb5ebdfe0dbe637b885225"}, + {file = "orjson-3.10.16-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:12824073a010a754bb27330cad21d6e9b98374f497f391b8707752b96f72e741"}, + {file = "orjson-3.10.16-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ddd41007e56284e9867864aa2f29f3136bb1dd19a49ca43c0b4eda22a579cf53"}, + {file = "orjson-3.10.16-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0877c4d35de639645de83666458ca1f12560d9fa7aa9b25d8bb8f52f61627d14"}, + {file = "orjson-3.10.16-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9a09a539e9cc3beead3e7107093b4ac176d015bec64f811afb5965fce077a03c"}, + {file = "orjson-3.10.16-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31b98bc9b40610fec971d9a4d67bb2ed02eec0a8ae35f8ccd2086320c28526ca"}, + {file = "orjson-3.10.16-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0ce243f5a8739f3a18830bc62dc2e05b69a7545bafd3e3249f86668b2bcd8e50"}, + {file = "orjson-3.10.16-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:64792c0025bae049b3074c6abe0cf06f23c8e9f5a445f4bab31dc5ca23dbf9e1"}, + {file = "orjson-3.10.16-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ea53f7e68eec718b8e17e942f7ca56c6bd43562eb19db3f22d90d75e13f0431d"}, + {file = "orjson-3.10.16-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a741ba1a9488c92227711bde8c8c2b63d7d3816883268c808fbeada00400c164"}, + {file = "orjson-3.10.16-cp311-cp311-win32.whl", hash = "sha256:c7ed2c61bb8226384c3fdf1fb01c51b47b03e3f4536c985078cccc2fd19f1619"}, + {file = "orjson-3.10.16-cp311-cp311-win_amd64.whl", hash = "sha256:cd67d8b3e0e56222a2e7b7f7da9031e30ecd1fe251c023340b9f12caca85ab60"}, + {file = "orjson-3.10.16-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:6d3444abbfa71ba21bb042caa4b062535b122248259fdb9deea567969140abca"}, + {file = "orjson-3.10.16-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:30245c08d818fdcaa48b7d5b81499b8cae09acabb216fe61ca619876b128e184"}, + {file = "orjson-3.10.16-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0ba1d0baa71bf7579a4ccdcf503e6f3098ef9542106a0eca82395898c8a500a"}, + {file = 
"orjson-3.10.16-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eb0beefa5ef3af8845f3a69ff2a4aa62529b5acec1cfe5f8a6b4141033fd46ef"}, + {file = "orjson-3.10.16-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6daa0e1c9bf2e030e93c98394de94506f2a4d12e1e9dadd7c53d5e44d0f9628e"}, + {file = "orjson-3.10.16-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9da9019afb21e02410ef600e56666652b73eb3e4d213a0ec919ff391a7dd52aa"}, + {file = "orjson-3.10.16-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:daeb3a1ee17b69981d3aae30c3b4e786b0f8c9e6c71f2b48f1aef934f63f38f4"}, + {file = "orjson-3.10.16-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80fed80eaf0e20a31942ae5d0728849862446512769692474be5e6b73123a23b"}, + {file = "orjson-3.10.16-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:73390ed838f03764540a7bdc4071fe0123914c2cc02fb6abf35182d5fd1b7a42"}, + {file = "orjson-3.10.16-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:a22bba012a0c94ec02a7768953020ab0d3e2b884760f859176343a36c01adf87"}, + {file = "orjson-3.10.16-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5385bbfdbc90ff5b2635b7e6bebf259652db00a92b5e3c45b616df75b9058e88"}, + {file = "orjson-3.10.16-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:02c6279016346e774dd92625d46c6c40db687b8a0d685aadb91e26e46cc33e1e"}, + {file = "orjson-3.10.16-cp312-cp312-win32.whl", hash = "sha256:7ca55097a11426db80f79378e873a8c51f4dde9ffc22de44850f9696b7eb0e8c"}, + {file = "orjson-3.10.16-cp312-cp312-win_amd64.whl", hash = "sha256:86d127efdd3f9bf5f04809b70faca1e6836556ea3cc46e662b44dab3fe71f3d6"}, + {file = "orjson-3.10.16-cp313-cp313-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:148a97f7de811ba14bc6dbc4a433e0341ffd2cc285065199fb5f6a98013744bd"}, + {file = "orjson-3.10.16-cp313-cp313-macosx_15_0_arm64.whl", hash = "sha256:1d960c1bf0e734ea36d0adc880076de3846aaec45ffad29b78c7f1b7962516b8"}, + {file = "orjson-3.10.16-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a318cd184d1269f68634464b12871386808dc8b7c27de8565234d25975a7a137"}, + {file = "orjson-3.10.16-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:df23f8df3ef9223d1d6748bea63fca55aae7da30a875700809c500a05975522b"}, + {file = "orjson-3.10.16-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b94dda8dd6d1378f1037d7f3f6b21db769ef911c4567cbaa962bb6dc5021cf90"}, + {file = "orjson-3.10.16-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f12970a26666a8775346003fd94347d03ccb98ab8aa063036818381acf5f523e"}, + {file = "orjson-3.10.16-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:15a1431a245d856bd56e4d29ea0023eb4d2c8f71efe914beb3dee8ab3f0cd7fb"}, + {file = "orjson-3.10.16-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c83655cfc247f399a222567d146524674a7b217af7ef8289c0ff53cfe8db09f0"}, + {file = "orjson-3.10.16-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:fa59ae64cb6ddde8f09bdbf7baf933c4cd05734ad84dcf4e43b887eb24e37652"}, + {file = "orjson-3.10.16-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:ca5426e5aacc2e9507d341bc169d8af9c3cbe88f4cd4c1cf2f87e8564730eb56"}, + {file = "orjson-3.10.16-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:6fd5da4edf98a400946cd3a195680de56f1e7575109b9acb9493331047157430"}, + {file = 
"orjson-3.10.16-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:980ecc7a53e567169282a5e0ff078393bac78320d44238da4e246d71a4e0e8f5"}, + {file = "orjson-3.10.16-cp313-cp313-win32.whl", hash = "sha256:28f79944dd006ac540a6465ebd5f8f45dfdf0948ff998eac7a908275b4c1add6"}, + {file = "orjson-3.10.16-cp313-cp313-win_amd64.whl", hash = "sha256:fe0a145e96d51971407cb8ba947e63ead2aa915db59d6631a355f5f2150b56b7"}, + {file = "orjson-3.10.16-cp39-cp39-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:c35b5c1fb5a5d6d2fea825dec5d3d16bea3c06ac744708a8e1ff41d4ba10cdf1"}, + {file = "orjson-3.10.16-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9aac7ecc86218b4b3048c768f227a9452287001d7548500150bb75ee21bf55d"}, + {file = "orjson-3.10.16-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6e19f5102fff36f923b6dfdb3236ec710b649da975ed57c29833cb910c5a73ab"}, + {file = "orjson-3.10.16-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:17210490408eb62755a334a6f20ed17c39f27b4f45d89a38cd144cd458eba80b"}, + {file = "orjson-3.10.16-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fbbe04451db85916e52a9f720bd89bf41f803cf63b038595674691680cbebd1b"}, + {file = "orjson-3.10.16-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a966eba501a3a1f309f5a6af32ed9eb8f316fa19d9947bac3e6350dc63a6f0a"}, + {file = "orjson-3.10.16-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01e0d22f06c81e6c435723343e1eefc710e0510a35d897856766d475f2a15687"}, + {file = "orjson-3.10.16-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:7c1e602d028ee285dbd300fb9820b342b937df64d5a3336e1618b354e95a2569"}, + {file = "orjson-3.10.16-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:d230e5020666a6725629df81e210dc11c3eae7d52fe909a7157b3875238484f3"}, + {file = "orjson-3.10.16-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:0f8baac07d4555f57d44746a7d80fbe6b2c4fe2ed68136b4abb51cfec512a5e9"}, + {file = "orjson-3.10.16-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:524e48420b90fc66953e91b660b3d05faaf921277d6707e328fde1c218b31250"}, + {file = "orjson-3.10.16-cp39-cp39-win32.whl", hash = "sha256:a9f614e31423d7292dbca966a53b2d775c64528c7d91424ab2747d8ab8ce5c72"}, + {file = "orjson-3.10.16-cp39-cp39-win_amd64.whl", hash = "sha256:c338dc2296d1ed0d5c5c27dfb22d00b330555cb706c2e0be1e1c3940a0895905"}, + {file = "orjson-3.10.16.tar.gz", hash = "sha256:d2aaa5c495e11d17b9b93205f5fa196737ee3202f000aaebf028dc9a73750f10"}, +] + [[package]] name = "outcome" version = "1.3.0.post0" @@ -2180,4 +2258,4 @@ h11 = ">=0.9.0,<1" [metadata] lock-version = "2.1" python-versions = ">=3.10" -content-hash = "19e460b385e6e3fb8901153196b5cbdcf0a318c743113b0916abc353115c9a4f" +content-hash = "684e3794b5ea4541fde5a46b9bf83f67cbeedcecf4cd969dce683ffc3210b382" diff --git a/pyproject.toml b/pyproject.toml index ba8b769..488993d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ dependencies = [ "undetected-chromedriver>=3.5.5", "selenium>=4.9.0", "protobuf (>=6.30.0,<7.0.0)", + "orjson (>=3.10.16,<4.0.0)", ] [project.urls] From 48ae902d10c46d5ee5ffcbdac9b1095431073df7 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 1 Apr 2025 10:23:18 -0700 Subject: [PATCH 053/101] update: archive result collector, ignore archive --- .gitignore | 3 +- WebSearcher/result_collector.py | 99 --------------------------------- 2 files changed, 2 insertions(+), 100 deletions(-) delete mode 100644 
WebSearcher/result_collector.py diff --git a/.gitignore b/.gitignore index 10689ea..c705978 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ -.pytest_cache .venv +.archive build data @@ -9,4 +9,5 @@ notebooks *__pycache__ # Ignore test data +.pytest_cache tests/__snapshots__/* diff --git a/WebSearcher/result_collector.py b/WebSearcher/result_collector.py deleted file mode 100644 index 77041cc..0000000 --- a/WebSearcher/result_collector.py +++ /dev/null @@ -1,99 +0,0 @@ -""" Collect HTML for individual results from a SERP -""" - -import time -import requests -from . import utils -from . import webutils as wu - - -def check_valid_url(result): - """Check if result has url and url is in a valid format""" - if 'url' in result: - return True if result['url'].startswith('http') else False - else: - return False - - -def scrape_results_html(results, serp_id, log, headers, ssh_tunnel, - save_dir='.', append_to=''): - """Scrape and save all unique, non-internal URLs parsed from the SERP - - Args: - save_dir (str, optional): Save results html as `save_dir/results_html/{serp_id}.json` - append_to (str, optional): Append results html to this file path - """ - - results_html = [] - if not results: - log.info(f'No results to scrape for serp_id {serp_id}') - else: - - results_wurl = [r for r in results if check_valid_url(r)] - - if results_wurl: - - # Prepare session - keep_headers = ['User-Agent'] - headers = {k:v for k,v in headers.items() if k in keep_headers} - if ssh_tunnel: - result_sesh = wu.start_sesh(headers=headers, proxy_port=ssh_tunnel.port) - else: - result_sesh = wu.start_sesh(headers=headers) - - # Get all unique result urls - result_urls = [] - unique_urls = set() - for result in results_wurl: - # If the result has a url and we haven't seen it yet - if result['url'] and result['url'] not in unique_urls: - # Take a subset of the keys - keep_keys = {'serp_id', 'serp_rank', 'url'} - res = {k:v for k,v in result.items() if k in keep_keys} - result_urls.append(res) - unique_urls.add(result['url']) - - # Scrape results HTML - for result in result_urls: - result = scrape_result_html(result_sesh, result, log, ssh_tunnel) - results_html.append(result) - - # Save results HTML - if append_to: - # Append to aggregate file - utils.write_lines(results_html, append_to) - else: - # Save new SERP-specific file - fp = os.path.join(save_dir, 'results_html', f'{serp_id}.json') - utils.write_lines(results_html, fp) - - -def scrape_result_html(result_sesh, result, log, ssh_tunnel): - resid = f"{result['serp_id']} | {result['url']}" - - try: - r = result_sesh.get(result['url'], timeout=15) - result['html'] = r.content.decode('utf-8', 'ignore') - - except requests.exceptions.TooManyRedirects: - result['html'] = 'error_redirects' - log.exception(f"Results | RedirectsErr | {resid}") - - except requests.exceptions.Timeout: - result['html'] = 'error_timeout' - log.exception(f"Results | TimeoutErr | {resid}") - - except requests.exceptions.ConnectionError: - result['html'] = 'error_connection' - log.exception(f"Results | ConnectionErr | {resid}") - - # SSH Tunnel may have died, reset SSH session - if ssh_tunnel: - ssh_tunnel.tunnel.kill() - ssh_tunnel.open_tunnel() - log.info('Results | Restarted SSH tunnel') - time.sleep(10) # Allow time to establish connection - - except Exception: - result['html'] = 'error_unknown' - log.exception(f"Results | Collection Error | {resid}") From d076e4f41b6486b75adf66945951afffba4daa8e Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 2 Apr 2025 09:44:52 -0700 Subject: 
[PATCH 054/101] fix: downgrade log warning to debug --- WebSearcher/searchers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index 72c791f..ae86474 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -90,7 +90,7 @@ def search(self, headers (Dict[str, str], optional): Custom headers to include in the request """ - self.log.warning('starting search config') + self.log.debug('starting search config') self.search_params = SearchParams.create({ 'qry': str(qry), 'loc': str(location) if not pd.isnull(location) else '', From e726acddb23292415d6877ba823cf955656b411a Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 2 Apr 2025 10:43:30 -0700 Subject: [PATCH 055/101] update: breaking change for log config, using logger kwargs --- WebSearcher/models/configs.py | 11 +++++++---- WebSearcher/searchers.py | 16 +++------------- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/WebSearcher/models/configs.py b/WebSearcher/models/configs.py index 99021cf..81e011d 100644 --- a/WebSearcher/models/configs.py +++ b/WebSearcher/models/configs.py @@ -15,9 +15,13 @@ def create(cls, config=None): return config or cls() class LogConfig(BaseConfig): - fp: str = '' - mode: str = 'a' - level: str = 'INFO' + console: bool = True + console_format: str = 'medium' + console_level: str = 'INFO' + file_name: str = '' + file_mode: str = 'a' + file_format: str = 'detailed' + file_level: str = 'INFO' class SeleniumConfig(BaseConfig): headless: bool = False @@ -45,7 +49,6 @@ def sesh(self) -> requests.Session: sesh.headers.update(self.headers) return sesh - class SearchMethod(Enum): REQUESTS = "requests" SELENIUM = "selenium" diff --git a/WebSearcher/searchers.py b/WebSearcher/searchers.py index ae86474..2147b39 100644 --- a/WebSearcher/searchers.py +++ b/WebSearcher/searchers.py @@ -35,7 +35,7 @@ def __init__(self, crawl_id (str, optional): A unique identifier for the crawl. Defaults to ''. 
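The switch to orjson in PATCH 052 above works because `orjson.dumps()` returns `bytes` rather than the `str` that `json.dumps()` produced, which is why the commit decodes before hashing the browser info and before appending JSON lines. A minimal sketch of that difference; the sample record and output file name here are illustrative only, not taken from the package:

```python
import json
import orjson

record = {"qry": "immigration news", "loc": "Boston, MA"}  # illustrative record

# json.dumps() returns str; orjson.dumps() returns bytes
assert isinstance(json.dumps(record), str)
assert isinstance(orjson.dumps(record), bytes)

# Decode before hashing or appending to a JSON-lines file
line = orjson.dumps(record).decode("utf-8")
with open("serps.json", "a") as outfile:  # illustrative file name
    outfile.write(f"{line}\n")

# Pretty-printing uses an option flag rather than an indent= argument
pretty = orjson.dumps(record, option=orjson.OPT_INDENT_2).decode("utf-8")
print(pretty)
```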
""" - # Initialize configuration + # Initialize config settings, log, and session data self.method = method.value if isinstance(method, SearchMethod) else method self.config = SearchConfig.create({ "method": SearchMethod.create(method), @@ -43,23 +43,14 @@ def __init__(self, "selenium": SeleniumConfig.create(selenium_config), "requests": RequestsConfig.create(requests_config), }) - - # Initialize session data + self.log = logger.Logger(**self.config.log.model_dump()).start(__name__) self.session_data = { "method": self.config.method.value, "version": WS_VERSION, "crawl_id": crawl_id, } - # Set a log file, prints to console by default - self.log = logger.Logger( - console=True if not self.config.log.fp else False, - console_level=self.config.log.level, - file_name=self.config.log.fp, - file_mode=self.config.log.mode, - file_level=self.config.log.level, - ).start(__name__) - + # Initialize searcher based on method if self.config.method == SearchMethod.SELENIUM: self.searcher = SeleniumDriver(config=self.config.selenium, logger=self.log) self.searcher.init_driver() @@ -70,7 +61,6 @@ def __init__(self, self.search_params = SearchParams.create() self.parsed = {'results': [], 'features': {}} - def search(self, qry: str, location: str = None, From 70e774f0b4ee3ffa16ffa5dca8bb130784005271 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 24 Apr 2025 18:59:52 +0000 Subject: [PATCH 056/101] build(deps): bump h11 from 0.14.0 to 0.16.0 Bumps [h11](https://github.com/python-hyper/h11) from 0.14.0 to 0.16.0. - [Commits](https://github.com/python-hyper/h11/compare/v0.14.0...v0.16.0) --- updated-dependencies: - dependency-name: h11 dependency-version: 0.16.0 dependency-type: indirect ... Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 34ccb6a..79638ee 100644 --- a/poetry.lock +++ b/poetry.lock @@ -558,14 +558,14 @@ typing = ["typing-extensions (>=4.12.2) ; python_version < \"3.11\""] [[package]] name = "h11" -version = "0.14.0" +version = "0.16.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" groups = ["main"] files = [ - {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, - {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, + {file = "h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86"}, + {file = "h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1"}, ] [[package]] From 592c69b27bb77a6954b300a6907d18bfda34ef76 Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 27 Apr 2025 15:58:24 -0700 Subject: [PATCH 057/101] update: ad component parsers --- WebSearcher/component_parsers/ads.py | 101 ++++++++++++++++++++++----- 1 file changed, 83 insertions(+), 18 deletions(-) diff --git a/WebSearcher/component_parsers/ads.py b/WebSearcher/component_parsers/ads.py index d38917c..53ab212 100644 --- a/WebSearcher/component_parsers/ads.py +++ b/WebSearcher/component_parsers/ads.py @@ -13,6 +13,16 @@ from .shopping_ads import parse_shopping_ads import bs4 +PARSED = { + 'type': 'ad', + 'sub_type': '', + 'sub_rank': 0, + 'title': '', + 'url': '', + 'cite': '', + 'text': '', +} + def parse_ads(cmpt: bs4.element.Tag) -> list: 
"""Parse ads from ad component""" @@ -33,6 +43,8 @@ def parse_ads(cmpt: bs4.element.Tag) -> list: parsed_list.extend(parse_shopping_ads(sub)) elif "uEierd" in sub_classes: parsed_list.append(parse_ad(sub)) + elif sub_type == 'carousel': + parsed_list = parse_ad_carousel(cmpt, sub_type) return parsed_list @@ -41,7 +53,8 @@ def classify_ad_type(cmpt: bs4.element.Tag) -> str: label_divs = { "legacy": webutils.find_all_divs(cmpt, 'div', {'class': 'ad_cclk'}), "secondary": webutils.find_all_divs(cmpt, 'div', {'class': 'd5oMvf'}), - "standard": webutils.find_all_divs(cmpt, 'div', {'class': ['uEierd', 'commercial-unit-desktop-top']}) + "standard": webutils.find_all_divs(cmpt, 'div', {'class': ['uEierd', 'commercial-unit-desktop-top']}), + "carousel": webutils.find_all_divs(cmpt, 'g-scrolling-carousel'), } for label, divs in label_divs.items(): if divs: @@ -49,12 +62,63 @@ def classify_ad_type(cmpt: bs4.element.Tag) -> str: return 'unknown' +def parse_ad_carousel(cmpt: bs4.element.Tag, sub_type: str, filter_visible: bool = True) -> list: + + def parse_ad_carousel_div(sub: bs4.element.Tag, sub_type: str, sub_rank: int) -> dict: + """Parse ad carousel div, seen 2025-02-06""" + parsed = PARSED.copy() + parsed['sub_type'] = sub_type + parsed['sub_rank'] = sub_rank + parsed['title'] = webutils.get_text(sub, 'div', {'class':'e7SMre'}) + parsed['url'] = webutils.get_link(sub) + parsed['text'] = webutils.get_text(sub, 'div', {"class":"vrAZpb"}) + parsed['cite'] = webutils.get_text(sub, 'div', {"class":"zpIwr"}) + parsed['visible'] = not (sub.has_attr('data-has-shown') and sub['data-has-shown'] == 'false') + return parsed + + def parse_ad_carousel_card(sub: bs4.element.Tag, sub_type: str, sub_rank: int) -> dict: + """Parse ad carousel card, seen 2024-09-21""" + parsed = PARSED.copy() + parsed['sub_type'] = sub_type + parsed['sub_rank'] = sub_rank + parsed['title'] = webutils.get_text(sub, 'div', {'class':'gCv54b'}) + parsed['url'] = webutils.get_link(sub, {"class": "KTsHxd"}) + parsed['text'] = webutils.get_text(sub, 'div', {"class":"VHpBje"}) + parsed['cite'] = webutils.get_text(sub, 'div', {"class":"j958Pd"}) + parsed['visible'] = not (sub.has_attr('data-viewurl') and sub['data-viewurl']) + return parsed + + ad_carousel_parsers = [ + {'find_kwargs': {'name': 'g-inner-card'}, + 'parser': parse_ad_carousel_card}, + {'find_kwargs': {'name': 'div', 'attrs': {'class': 'ZPze1e'}}, + 'parser': parse_ad_carousel_div} + ] + + output_list = [] + ad_carousel = cmpt.find('g-scrolling-carousel') + if ad_carousel: + for parser_details in ad_carousel_parsers: + parser_func = parser_details['parser'] + kwargs = parser_details['find_kwargs'] + sub_cmpts = webutils.find_all_divs(ad_carousel, **kwargs) + print(f"sub_cmpts: {len(sub_cmpts)}") + if sub_cmpts: + for sub_rank, sub in enumerate(sub_cmpts): + parsed = parser_func(sub, sub_type, sub_rank) + output_list.append(parsed) + + if filter_visible: + output_list = [{k:v for k,v in x.items() if k != 'visible'} for x in output_list if x['visible']] + return output_list + + def parse_ad(sub: bs4.element.Tag, sub_rank: int = 0) -> dict: """Parse details of a single ad subcomponent, similar to general""" - parsed = {"type": "ad", - "sub_type": "standard", - "sub_rank": sub_rank} - + parsed = PARSED.copy() + parsed["sub_type"] = "standard" + parsed["sub_rank"] = sub_rank + parsed['title'] = webutils.get_text(sub, 'div', {'role':'heading'}) parsed['url'] = webutils.get_link(sub, {"class":"sVXRqc"}) parsed['cite'] = webutils.get_text(sub, 'span', {"role":"text"}) @@ -96,13 
+160,14 @@ def parse_ad_menu(sub: bs4.element.Tag) -> list: def parse_ad_secondary(sub: bs4.element.Tag, sub_rank: int = 0) -> dict: """Parse details of a single ad subcomponent, similar to general""" + parsed = PARSED.copy() + parsed["sub_type"] = "secondary" + parsed["sub_rank"] = sub_rank - parsed = {"type": "ad", - "sub_type": "secondary", - "sub_rank": sub_rank} - parsed['title'] = sub.find('div', {'role':'heading'}).text - parsed['url'] = sub.find('div', {'class':'d5oMvf'}).find('a')['href'] - parsed['cite'] = sub.find('span', {'class':'gBIQub'}).text + parsed['title'] = webutils.get_text(sub, 'div', {'role':'heading'}) + link_div = sub.find('div', {'class':'d5oMvf'}) + parsed['url'] = webutils.get_link(link_div) if link_div else '' + parsed['cite'] = webutils.get_text(sub, 'span', {'class':'gBIQub'}) # Take the top div with this class, should be main result abstract text_divs = sub.find_all('div', {'class':'yDYNvb'}) @@ -123,14 +188,14 @@ def parse_ad_secondary(sub: bs4.element.Tag, sub_rank: int = 0) -> dict: def parse_ad_legacy(sub: bs4.element.Tag, sub_rank: int = 0) -> dict: """[legacy] Parse details of a single ad subcomponent, similar to general""" - - parsed = {"type": "ad", - "sub_type": "legacy", - "sub_rank": sub_rank} + parsed = PARSED.copy() + parsed["sub_type"] = "legacy" + parsed["sub_rank"] = sub_rank + header = sub.find('div', {'class':'ad_cclk'}) - parsed['title'] = header.find('h3').text - parsed['url'] = header.find('cite').text - parsed['text'] = sub.find('div', {'class':'ads-creative'}).text + parsed['title'] = webutils.get_text(header, 'h3') + parsed['url'] = webutils.get_text(header, 'cite') + parsed['text'] = webutils.get_text(sub, 'div', {'class':'ads-creative'}) bottom_text = sub.find('ul') if bottom_text: From 2ca408153a8c03be145d1f2ff800b803befc1390 Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 27 Apr 2025 15:58:37 -0700 Subject: [PATCH 058/101] version: 0.6.5.dev0 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index 9eb7a83..5201135 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.4" +__version__ = "0.6.5.dev0" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index 0e936df..8bad132 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.4" +version = "0.6.5.dev0" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. 
Robertson", email = ""}] keywords = ["web", "search", "parser"] From 0f9dc401640e2084430aff484736b241e42ee62f Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 27 Apr 2025 16:17:10 -0700 Subject: [PATCH 059/101] update: videos component parser --- WebSearcher/component_parsers/ads.py | 1 + WebSearcher/component_parsers/videos.py | 22 ++++++++++++---------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/WebSearcher/component_parsers/ads.py b/WebSearcher/component_parsers/ads.py index 53ab212..b2aac52 100644 --- a/WebSearcher/component_parsers/ads.py +++ b/WebSearcher/component_parsers/ads.py @@ -6,6 +6,7 @@ - added new div class for text field - added labels (e.g., "Provides abortions") from , appended to text field +2025-04-27: added carousel sub_type, global parsed output """ diff --git a/WebSearcher/component_parsers/videos.py b/WebSearcher/component_parsers/videos.py index 950d849..cd374d3 100644 --- a/WebSearcher/component_parsers/videos.py +++ b/WebSearcher/component_parsers/videos.py @@ -1,8 +1,9 @@ """ Parsers for video components Changelog -2021-05-08: added find_all for divs with class 'VibNM' -2021-05-08: added adjustment for new cite and timestamp +2024-05-08: added find_all for divs with class 'VibNM' +2024-05-08: added adjustment for new cite and timestamp +2025-04-27: added div subcomponent class and sub_type labels """ @@ -23,24 +24,25 @@ def parse_videos(cmpt) -> list: # Get known div structures divs = [] name_attrs = [ - {'name':'g-inner-card'}, - {'name':'div', 'attrs':{'class':'VibNM'}}, - {'name':'div', 'attrs':{'class':'mLmaBd'}}, - {'name':'div', 'attrs':{'class':'RzdJxc'}}, + ({'name':'g-inner-card'}, 'unspecified-0'), + ({'name':'div', 'attrs':{'class':'VibNM'}}, 'unspecified-1'), + ({'name':'div', 'attrs':{'class':'mLmaBd'}}, 'unspecified-2'), + ({'name':'div', 'attrs':{'class':'RzdJxc'}}, 'unspecified-3'), + ({'name':'div', 'attrs':{'class':'sHEJob'}}, 'vertical'), ] - for kwargs in name_attrs: + for kwargs, sub_type in name_attrs: divs = webutils.find_all_divs(cmpt, **kwargs) if divs: break divs = list(filter(None, divs)) if divs: - return [parse_video(div, i) for i, div in enumerate(divs)] + return [parse_video(div, sub_type, i) for i, div in enumerate(divs)] else: return [{'type': 'videos', 'sub_rank': 0, 'error': 'No subcomponents found'}] -def parse_video(sub, sub_rank=0) -> dict: +def parse_video(sub, sub_type: str, sub_rank=0) -> dict: """Parse a videos subcomponent Args: @@ -52,6 +54,7 @@ def parse_video(sub, sub_rank=0) -> dict: parsed = { 'type': 'videos', + 'sub_type': sub_type, 'sub_rank': sub_rank, 'url': get_url(sub), 'title': webutils.get_text(sub, 'div', {'role':'heading'}), @@ -82,7 +85,6 @@ def parse_video(sub, sub_rank=0) -> dict: return parsed - def get_url(sub): """Get video URL by filtering for non-hash links""" all_urls = sub.find_all('a') From 502a025acad207f89fd4adc1f749919c6f68cb07 Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 27 Apr 2025 16:17:25 -0700 Subject: [PATCH 060/101] version: 0.6.5.dev1 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index 5201135..c70f874 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.5.dev0" +__version__ = "0.6.5.dev1" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index 8bad132..dff6630 100644 
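The videos parser updated in PATCH 059 above pairs each candidate div selector with a `sub_type` label and keeps the first selector that matches anything. A rough standalone sketch of that fallback pattern, using plain BeautifulSoup calls in place of the package's `webutils.find_all_divs` helper; the HTML below is invented for the example:

```python
import bs4

html = """
<div id="cmpt">
  <div class="sHEJob">video one</div>
  <div class="sHEJob">video two</div>
</div>
"""
cmpt = bs4.BeautifulSoup(html, "html.parser")

# Ordered (find_all kwargs, sub_type) pairs: the first selector that matches wins
name_attrs = [
    ({"name": "g-inner-card"}, "unspecified-0"),
    ({"name": "div", "attrs": {"class": "VibNM"}}, "unspecified-1"),
    ({"name": "div", "attrs": {"class": "sHEJob"}}, "vertical"),
]

divs, sub_type = [], None
for kwargs, candidate in name_attrs:
    divs = cmpt.find_all(**kwargs)
    if divs:
        sub_type = candidate
        break

parsed = [
    {"type": "videos", "sub_type": sub_type, "sub_rank": i, "title": div.get_text()}
    for i, div in enumerate(divs)
]
print(parsed)  # two 'vertical' subcomponents
```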
--- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.5.dev0" +version = "0.6.5.dev1" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From cc2395c5847ee0e9135031979f74208f5e4fb683 Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 27 Apr 2025 17:03:07 -0700 Subject: [PATCH 061/101] update: discussions and forums classifier --- WebSearcher/classifiers/main.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/WebSearcher/classifiers/main.py b/WebSearcher/classifiers/main.py index 70cb570..8eba449 100644 --- a/WebSearcher/classifiers/main.py +++ b/WebSearcher/classifiers/main.py @@ -14,6 +14,7 @@ def classify(cmpt: bs4.element.Tag) -> str: # Ordered list of classifiers to try component_classifiers = [ ClassifyMain.top_stories, # Check top stories + ClassifyMain.discussions_and_forums, # Check discussions and forums ClassifyHeaderText.classify, # Check levels 2 & 3 header text ClassifyMain.news_quotes, # Check news quotes ClassifyMain.img_cards, # Check image cards @@ -40,6 +41,12 @@ def classify(cmpt: bs4.element.Tag) -> str: return cmpt_type + @staticmethod + def discussions_and_forums(cmpt: bs4.element.Tag) -> str: + conditions = [ + cmpt.find("div", {"class": "IFnjPb"}), + ] + return 'discussions_and_forums' if all(conditions) else "unknown" @staticmethod def available_on(cmpt: bs4.element.Tag) -> str: From cedc7d2ced5faea4c3a0c8d1150aef026ce830fe Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 27 Apr 2025 17:03:38 -0700 Subject: [PATCH 062/101] update: extract more divs for top_bar layout --- WebSearcher/extractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebSearcher/extractors.py b/WebSearcher/extractors.py index 5b1202e..5481d7a 100644 --- a/WebSearcher/extractors.py +++ b/WebSearcher/extractors.py @@ -203,7 +203,7 @@ def extract_from_top_bar(self, drop_tags: set = {}) -> list: top_bar_divs = Extractor.extract_from_top_bar_divs(self.layout_divs['top-bars']) column.extend(top_bar_divs) - rso_layout_divs = self.layout_divs['rso'].find_all('div', {'class':'sATSHe'}) + rso_layout_divs = self.layout_divs['rso'].find_all('div', {'class':['sATSHe','vtSz8d', 'cUnQKe','g']}) if rso_layout_divs: self.layout_label = 'top-bars-divs' layout_column = [div for div in rso_layout_divs if div.name not in drop_tags] From 1c68ea84f51300720a12b15f1948715bdc3c3800 Mon Sep 17 00:00:00 2001 From: gitronald Date: Sun, 27 Apr 2025 17:04:22 -0700 Subject: [PATCH 063/101] version: 0.6.5.dev2 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index c70f874..cbc0ed6 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.5.dev1" +__version__ = "0.6.5.dev2" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index dff6630..5d2edd7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.5.dev1" +version = "0.6.5.dev2" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. 
Robertson", email = ""}] keywords = ["web", "search", "parser"] From 29d62c7df3c23e9d4939da0f6a08592d81a2e702 Mon Sep 17 00:00:00 2001 From: gitronald Date: Mon, 28 Apr 2025 11:02:21 -0700 Subject: [PATCH 064/101] fix: drop debug print and fix print var --- WebSearcher/component_parsers/ads.py | 1 - WebSearcher/component_parsers/images.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/WebSearcher/component_parsers/ads.py b/WebSearcher/component_parsers/ads.py index b2aac52..885f51c 100644 --- a/WebSearcher/component_parsers/ads.py +++ b/WebSearcher/component_parsers/ads.py @@ -103,7 +103,6 @@ def parse_ad_carousel_card(sub: bs4.element.Tag, sub_type: str, sub_rank: int) - parser_func = parser_details['parser'] kwargs = parser_details['find_kwargs'] sub_cmpts = webutils.find_all_divs(ad_carousel, **kwargs) - print(f"sub_cmpts: {len(sub_cmpts)}") if sub_cmpts: for sub_rank, sub in enumerate(sub_cmpts): parsed = parser_func(sub, sub_type, sub_rank) diff --git a/WebSearcher/component_parsers/images.py b/WebSearcher/component_parsers/images.py index 27f932d..74c8026 100644 --- a/WebSearcher/component_parsers/images.py +++ b/WebSearcher/component_parsers/images.py @@ -121,7 +121,7 @@ def get_image_url_from_attrs(sub): try: url = func(sub) if url.startswith('data:image'): - raise ValueError(f"Data URL: {img_src}") + raise ValueError(f"Data URL: {url}") else: return url except Exception as e: From fa411b86dee3e2920284ea4eeb299b2ce6a34d45 Mon Sep 17 00:00:00 2001 From: gitronald Date: Mon, 28 Apr 2025 11:02:41 -0700 Subject: [PATCH 065/101] update: expand general classifier classes --- WebSearcher/classifiers/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebSearcher/classifiers/main.py b/WebSearcher/classifiers/main.py index 8eba449..43a2803 100644 --- a/WebSearcher/classifiers/main.py +++ b/WebSearcher/classifiers/main.py @@ -75,7 +75,7 @@ def general(cmpt: bs4.element.Tag) -> str: "format-01": cmpt.attrs["class"] == ["g"], "format-02": ( ("g" in cmpt.attrs["class"]) & any(s in ["Ww4FFb"] for s in cmpt.attrs["class"]) ), - "format-03": any(s in ["hlcw0c", "MjjYud"] for s in cmpt.attrs["class"]), + "format-03": any(s in ["hlcw0c", "MjjYud", "PmEWq"] for s in cmpt.attrs["class"]), "format-04": cmpt.find('div', {'class': ['g', 'Ww4FFb']}), } else: From 2f9bb28fd6dfdd2395012d5d5de278d73a3a5912 Mon Sep 17 00:00:00 2001 From: gitronald Date: Mon, 28 Apr 2025 11:03:40 -0700 Subject: [PATCH 066/101] update: extract from top bar for 2025 serps --- WebSearcher/extractors.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/WebSearcher/extractors.py b/WebSearcher/extractors.py index 5481d7a..6900187 100644 --- a/WebSearcher/extractors.py +++ b/WebSearcher/extractors.py @@ -202,8 +202,19 @@ def extract_from_top_bar(self, drop_tags: set = {}) -> list: top_bar_divs = Extractor.extract_from_top_bar_divs(self.layout_divs['top-bars']) column.extend(top_bar_divs) - - rso_layout_divs = self.layout_divs['rso'].find_all('div', {'class':['sATSHe','vtSz8d', 'cUnQKe','g']}) + # No duplicates, but missing data + # rso_layout_divs = self.layout_divs['rso'].find_all('div', {'class':'sATSHe'}) + + div_classes = [ + 'cUnQKe', # people also ask + 'g', # general + 'Lv2Cle', # images-medium + 'oIk2Cb', # searches_related + 'Ww4FFb', # discussions_and_forums + 'vtSz8d', # videos + ] + rso_layout_divs = self.layout_divs['rso'].find_all('div', attrs={'class': div_classes}, recursive=True) + if rso_layout_divs: self.layout_label = 'top-bars-divs' 
layout_column = [div for div in rso_layout_divs if div.name not in drop_tags] From 5373a852eb11bac9b312c0dd17a411afd1050e06 Mon Sep 17 00:00:00 2001 From: gitronald Date: Mon, 28 Apr 2025 11:06:44 -0700 Subject: [PATCH 067/101] update: expand images sub cmpt class list and title/url parsing --- WebSearcher/component_parsers/images.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/WebSearcher/component_parsers/images.py b/WebSearcher/component_parsers/images.py index 74c8026..7e9bc51 100644 --- a/WebSearcher/component_parsers/images.py +++ b/WebSearcher/component_parsers/images.py @@ -1,3 +1,10 @@ +""" Parsers for image components + +Changelog +2025-04-28: added div subcomponent class and sub_type labels + +""" + from ..webutils import get_text, get_link, get_div def parse_images(cmpt) -> list: @@ -25,7 +32,7 @@ def parse_images(cmpt) -> list: parsed_list.extend(parsed_subs) else: # Medium images with titles and urls - subs = cmpt.find_all('div', {'class':'eA0Zlc'}) + subs = cmpt.find_all('div', {'class': ['eA0Zlc', 'vCUuC']}) parsed_subs = [parse_image_medium(sub, sub_rank + len(parsed_list)) for sub_rank, sub in enumerate(subs)] parsed_list.extend(parsed_subs) @@ -63,9 +70,14 @@ def parse_image_medium(sub, sub_rank=0) -> dict: """ title_div = get_div(sub, 'a', {'class':'EZAeBe'}) - title = get_text(title_div) if title_div else get_img_alt(sub) + title = get_text(title_div) if title_div else get_text(sub, 'span', {'class':'Yt787'}) url = get_link(sub) if title_div else get_img_url(sub) + if not title: + title = get_img_alt(sub) + if not url: + url = get_link(sub, attrs={'class':['EZAeBe', 'ddkIM']}) + return { "type": "images", "sub_type": "medium", From 036e67a6d108a5c9756ff2eed742a69e3ff56a65 Mon Sep 17 00:00:00 2001 From: gitronald Date: Mon, 28 Apr 2025 11:09:48 -0700 Subject: [PATCH 068/101] update: reduce doc strings --- WebSearcher/component_parsers/images.py | 37 ++++--------------------- 1 file changed, 5 insertions(+), 32 deletions(-) diff --git a/WebSearcher/component_parsers/images.py b/WebSearcher/component_parsers/images.py index 7e9bc51..ac4907c 100644 --- a/WebSearcher/component_parsers/images.py +++ b/WebSearcher/component_parsers/images.py @@ -8,14 +8,7 @@ from ..webutils import get_text, get_link, get_div def parse_images(cmpt) -> list: - """Parse an image component - - Args: - cmpt (bs4 object): an image component - - Returns: - list: list of parsed subcomponent dictionaries - """ + """Parse an images component""" parsed_list = [] @@ -42,14 +35,7 @@ def parse_images(cmpt) -> list: return parsed_list def parse_image_multimedia(sub, sub_rank=0) -> dict: - """Parse an image subcomponent - - Args: - sub (bs4 object): an image subcomponent - - Returns: - dict : parsed subresult - """ + """Parse an images multimedia subcomponent""" return { "type": "images", "sub_type": "multimedia", @@ -60,14 +46,7 @@ def parse_image_multimedia(sub, sub_rank=0) -> dict: } def parse_image_medium(sub, sub_rank=0) -> dict: - """Parse an image subcomponent - - Args: - sub (bs4 object): an image subcomponent - - Returns: - dict : parsed subresult - """ + """Parse an images medium subcomponent""" title_div = get_div(sub, 'a', {'class':'EZAeBe'}) title = get_text(title_div) if title_div else get_text(sub, 'span', {'class':'Yt787'}) @@ -89,14 +68,8 @@ def parse_image_medium(sub, sub_rank=0) -> dict: } def parse_image_small(sub, sub_rank=0) -> dict: - """Parse an image subcomponent - - Args: - sub (bs4 object): an image subcomponent - - Returns: - dict : parsed 
subresult - """ + """Parse an images small subcomponent""" + return { "type": "images", "sub_type": "small", From 205bba576da3b674ad5299bbf0df13a2c1302b77 Mon Sep 17 00:00:00 2001 From: gitronald Date: Mon, 28 Apr 2025 11:25:06 -0700 Subject: [PATCH 069/101] version: 0.6.5.dev3 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index cbc0ed6..9c3b696 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.5.dev2" +__version__ = "0.6.5.dev3" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index 5d2edd7..2f43c50 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.5.dev2" +version = "0.6.5.dev3" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From e23e70bc486daec7a064fe9e3286d85ed24745f5 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 29 Apr 2025 08:35:48 -0700 Subject: [PATCH 070/101] update: more restrictive discussions classifier --- WebSearcher/classifiers/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebSearcher/classifiers/main.py b/WebSearcher/classifiers/main.py index 43a2803..6131c39 100644 --- a/WebSearcher/classifiers/main.py +++ b/WebSearcher/classifiers/main.py @@ -44,7 +44,7 @@ def classify(cmpt: bs4.element.Tag) -> str: @staticmethod def discussions_and_forums(cmpt: bs4.element.Tag) -> str: conditions = [ - cmpt.find("div", {"class": "IFnjPb"}), + cmpt.find("div", {"class": "IFnjPb", "role": "heading"}), ] return 'discussions_and_forums' if all(conditions) else "unknown" From 41cfba2b9202157da852c09ad5565cc7bd1bd8d0 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 29 Apr 2025 08:36:17 -0700 Subject: [PATCH 071/101] update: expand classes for video cmpt extraction --- WebSearcher/extractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebSearcher/extractors.py b/WebSearcher/extractors.py index 6900187..58ba5f7 100644 --- a/WebSearcher/extractors.py +++ b/WebSearcher/extractors.py @@ -116,7 +116,6 @@ def extract_main(self): # if shopping_ads: # self.components.add_component(shopping_ads, section='main', type='shopping_ads') - def extract_main_ads_top(self): """Extract the main ads section of the SERP""" ads = self.soup.find('div', {'id':'tads'}) @@ -212,6 +211,7 @@ def extract_from_top_bar(self, drop_tags: set = {}) -> list: 'oIk2Cb', # searches_related 'Ww4FFb', # discussions_and_forums 'vtSz8d', # videos + 'uVMCKf', # videos ] rso_layout_divs = self.layout_divs['rso'].find_all('div', attrs={'class': div_classes}, recursive=True) From 2d017015cd03c7d6f0754d032ed4c197722d7cd3 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 11:02:02 -0700 Subject: [PATCH 072/101] fix: no empty whitespace in filter_empty_divs func --- WebSearcher/webutils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebSearcher/webutils.py b/WebSearcher/webutils.py index 4489468..a36c2b4 100644 --- a/WebSearcher/webutils.py +++ b/WebSearcher/webutils.py @@ -128,7 +128,7 @@ def find_all_divs(soup: BeautifulSoup, name: str, attrs: dict = {}, filter_empty def filter_empty_divs(divs): divs = [c for c in divs if c] - divs = [c for c in divs if c.text != ''] + divs = [c 
for c in divs if c.text.strip() != ''] return divs def find_children(soup, name: str, attrs: dict = {}, filter_empty: bool = False): From 9d66539c15fd14b7f075494983a6b0624fcbfad6 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 11:02:56 -0700 Subject: [PATCH 073/101] update: more knowledge panel identifiers --- WebSearcher/classifiers/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/WebSearcher/classifiers/main.py b/WebSearcher/classifiers/main.py index 6131c39..4130fd3 100644 --- a/WebSearcher/classifiers/main.py +++ b/WebSearcher/classifiers/main.py @@ -150,6 +150,7 @@ def knowledge_panel(cmpt: bs4.element.Tag) -> str: cmpt.find("h1", {"class": "VW3apb"}), cmpt.find("div", {"class": ["knowledge-panel", "knavi", "kp-blk", "kp-wholepage-osrp"]}), cmpt.find("div", {"aria-label": "Featured results", "role": "complementary"}), + cmpt.find("div", {"jscontroller": "qTdDb"}), webutils.check_dict_value(cmpt.attrs, "jscontroller", "qTdDb") ] return 'knowledge' if any(conditions) else "unknown" From 85f5766db982ad3565ae81ee06af757ef6b0e42f Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 11:03:09 -0700 Subject: [PATCH 074/101] fix: count sub ranks for standard ads --- WebSearcher/component_parsers/ads.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/component_parsers/ads.py b/WebSearcher/component_parsers/ads.py index 885f51c..26d7480 100644 --- a/WebSearcher/component_parsers/ads.py +++ b/WebSearcher/component_parsers/ads.py @@ -38,12 +38,12 @@ def parse_ads(cmpt: bs4.element.Tag) -> list: parsed_list = [parse_ad_secondary(sub, sub_rank) for sub_rank, sub in enumerate(subs)] elif sub_type == 'standard': subs = webutils.find_all_divs(cmpt, 'div', {'class': ['uEierd', 'commercial-unit-desktop-top']}) - for sub in subs: + for sub_rank, sub in enumerate(subs): sub_classes = sub.attrs.get("class", []) if "commercial-unit-desktop-top" in sub_classes: parsed_list.extend(parse_shopping_ads(sub)) elif "uEierd" in sub_classes: - parsed_list.append(parse_ad(sub)) + parsed_list.append(parse_ad(sub, sub_rank=sub_rank)) elif sub_type == 'carousel': parsed_list = parse_ad_carousel(cmpt, sub_type) return parsed_list From ac79df03ae57df31ac475b2ed9bd85df4fd0d9c6 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 11:05:23 -0700 Subject: [PATCH 075/101] update: result types dictionaries --- WebSearcher/models/cmpt_mappings.py | 185 ++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 WebSearcher/models/cmpt_mappings.py diff --git a/WebSearcher/models/cmpt_mappings.py b/WebSearcher/models/cmpt_mappings.py new file mode 100644 index 0000000..616bbad --- /dev/null +++ b/WebSearcher/models/cmpt_mappings.py @@ -0,0 +1,185 @@ +""" +Metadata about WebSearcher result types and subtypes. +This provides documentation and structure for the various result types parsed by WebSearcher. 
+""" + +# Header result types with descriptions and subtypes +HEADER_RESULT_TYPES = { + "notice": { + "description": "Special notices and suggestions shown at the top of search results", + "sub_types": [ + "query_edit", + "query_edit_no_results", + "query_suggestion", + "location_choose_area", + "location_use_precise_location", + "language_tip", + ], + }, + "top_image_carousel": { + "description": "Carousel of images displayed at the top of search results", + "sub_types": [], + }, +} + +# Main result types with descriptions and subtypes +MAIN_RESULT_TYPES = { + "ad": { + "description": "Advertisements displayed in search results", + "sub_types": ["standard", "legacy", "secondary", "submenu"], + }, + "available_on": { + "description": "Where entertainment content is available to stream or purchase", + "sub_types": [], + }, + "banner": { + "description": "Banner notifications shown at top of results", + "sub_types": [], + }, + "discussions_and_forums": { + "description": "Forum and discussion board results", + "sub_types": [], + }, + "general": { + "description": "Standard web search results", + "sub_types": [ + "video", + "submenu", + "submenu_mini", + "submenu_rating", + "submenu_scholarly", + "submenu_product", + "subresult", + ], + }, + "general_questions": { + "description": "General results with related questions", + "sub_types": [], + }, + "images": { + "description": "Image search results", + "sub_types": ["multimedia", "medium", "small"], + }, + "knowledge": { + "description": "Knowledge panels and featured snippets", + "sub_types": [ + "ai_overview", + "featured_results", + "featured_snippet", + "unit_converter", + "sports", + "weather", + "finance", + "dictionary", + "translate", + "calculator", + "election", + "panel", + ], + }, + "latest_from": { + "description": "Latest news results from specific sources", + "sub_types": [], + }, + "local_news": { + "description": "News results specific to a location", + "sub_types": [], + }, + "local_results": { + "description": "Map-based local business results", + "sub_types": ["places", "locations", "businesses"], # Dynamically generated + }, + "map_results": {"description": "Map-only results", "sub_types": []}, + "news_quotes": { + "description": "Quote snippets from news articles", + "sub_types": [], + }, + "notice": { + "description": "Special notices about searches", + "sub_types": [ + "query_edit", + "query_edit_no_results", + "query_suggestion", + "location_choose_area", + "location_use_precise_location", + "language_tip", + ], + }, + "people_also_ask": { + "description": "Related questions that people search for", + "sub_types": [], + }, + "perspectives": {"description": "Opinion and perspective results", "sub_types": []}, + "scholarly_articles": {"description": "Google Scholar results", "sub_types": []}, + "searches_related": { + "description": "Related search terms", + "sub_types": [ + "additional_searches", + "related_searches", + ], # Dynamically generated + }, + "shopping_ads": {"description": "Product shopping advertisements", "sub_types": []}, + "top_image_carousel": { + "description": "Carousel of images displayed at top of page", + "sub_types": [], + }, + "top_stories": {"description": "Featured news stories", "sub_types": []}, + "twitter_cards": { + "description": "Twitter content displayed in cards", + "sub_types": [], + }, + "twitter_result": {"description": "Individual Twitter result", "sub_types": []}, + "videos": {"description": "Video results", "sub_types": []}, + "view_more_news": {"description": "News result 
expansion links", "sub_types": []}, + "knowledge_rhs": { + "description": "Knowledge panels in right-hand sidebar", + "sub_types": [], + }, + "unknown": {"description": "Unclassified components", "sub_types": []}, +} + +# Footer result types with descriptions and subtypes +FOOTER_RESULT_TYPES = { + "img_cards": {"description": "Image cards displayed in footer", "sub_types": []}, + "searches_related": { + "description": "Related searches displayed in footer", + "sub_types": [ + "additional_searches", + "related_searches", + ], # Dynamically generated + }, + "discover_more": {"description": "'Discover more' suggestions", "sub_types": []}, + "general": { + "description": "General results in footer", + "sub_types": [ + "video", + "submenu", + "submenu_mini", + "submenu_rating", + "submenu_scholarly", + "submenu_product", + "subresult", + ], + }, + "people_also_ask": {"description": "Related questions in footer", "sub_types": []}, + "omitted_notice": { + "description": "Notices about filtered results", + "sub_types": [], + }, +} + +# Special types not directly linked to parsers +SPECIAL_RESULT_TYPES = { + "unclassified": { + "description": "Default type in the BaseResult model", + "sub_types": [], + }, +} + +# Combined dictionary of all result types +ALL_RESULT_TYPES = { + **HEADER_RESULT_TYPES, + **MAIN_RESULT_TYPES, + **FOOTER_RESULT_TYPES, + **SPECIAL_RESULT_TYPES, +} From a737aafa96c84c39c89b7ba92b8387140954689c Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 11:07:49 -0700 Subject: [PATCH 076/101] move: extractors to dir --- WebSearcher/{ => extractors}/extractors.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename WebSearcher/{ => extractors}/extractors.py (100%) diff --git a/WebSearcher/extractors.py b/WebSearcher/extractors/extractors.py similarity index 100% rename from WebSearcher/extractors.py rename to WebSearcher/extractors/extractors.py From b6be243a985201a7fbe3b1961cca233168d5da6a Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 11:08:29 -0700 Subject: [PATCH 077/101] rename: extractors code --- WebSearcher/extractors/{extractors.py => __init__.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename WebSearcher/extractors/{extractors.py => __init__.py} (100%) diff --git a/WebSearcher/extractors/extractors.py b/WebSearcher/extractors/__init__.py similarity index 100% rename from WebSearcher/extractors/extractors.py rename to WebSearcher/extractors/__init__.py From 52c79f616b5149b4b313377783d82f0936cca92d Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 11:09:28 -0700 Subject: [PATCH 078/101] add: breakout extractor functions into files by section --- WebSearcher/extractors/__init__.py | 379 +-------------------- WebSearcher/extractors/extractor_footer.py | 54 +++ WebSearcher/extractors/extractor_header.py | 33 ++ WebSearcher/extractors/extractor_main.py | 227 ++++++++++++ WebSearcher/extractors/extractor_rhs.py | 43 +++ 5 files changed, 375 insertions(+), 361 deletions(-) create mode 100644 WebSearcher/extractors/extractor_footer.py create mode 100644 WebSearcher/extractors/extractor_header.py create mode 100644 WebSearcher/extractors/extractor_main.py create mode 100644 WebSearcher/extractors/extractor_rhs.py diff --git a/WebSearcher/extractors/__init__.py b/WebSearcher/extractors/__init__.py index 58ba5f7..59252a6 100644 --- a/WebSearcher/extractors/__init__.py +++ b/WebSearcher/extractors/__init__.py @@ -1,370 +1,27 @@ -from .components import Component, ComponentList -from . import utils -from . 
import webutils -from . import logger -log = logger.Logger().start(__name__) import bs4 +from ..components import ComponentList +from .extractor_rhs import ExtractorRightHandSide +from .extractor_main import ExtractorMain +from .extractor_header import ExtractorHeader +from .extractor_footer import ExtractorFooter +from .. import logger +log = logger.Logger().start(__name__) class Extractor: def __init__(self, soup: bs4.BeautifulSoup): self.soup = soup self.components = ComponentList() - self.rhs = {} - self.layout_divs = { - "rso": None, - "top-bars": None, - "left-bar": None, - } - self.layouts = { - "rso": False, - "top-bars": False, - "left-bar": False, - "standard": False, - "no-rso": False, - } - self.layout_label = None - self.layout_extractors = { - "standard": self.extract_from_standard, - "top-bars": self.extract_from_top_bar, - "left-bar": self.extract_from_left_bar, - "no-rso": self.extract_from_no_rso - } + self.rhs_handler = ExtractorRightHandSide(self.soup, self.components) + self.header_handler = ExtractorHeader(self.soup, self.components) + self.main_handler = ExtractorMain(self.soup, self.components) + self.footer_handler = ExtractorFooter(self.soup, self.components) def extract_components(self): - log.debug("Extracting Components") - self.extract_rhs() - self.extract_header() - self.extract_main() - self.extract_footer() - self.append_rhs() - log.debug(f"Extracted {self.components.cmpt_rank_counter:,} components") - - # -------------------------------------------------------------------------- - # Right Hand Sidebar Components - # -------------------------------------------------------------------------- - - def extract_rhs(self): - """Extract the Right Hand Side (RHS) Knowledge Panel. Can appear in arbitrary order, must extract first.""" - rhs_kws = ('div', {'id': 'rhs'}) - rhs = self.soup.find(*rhs_kws).extract() if self.soup.find(*rhs_kws) else None - if rhs: - rhs_layouts = { - 'rhs_complementary': rhs if webutils.check_dict_value(rhs.attrs, "role", "complementary") else None, - 'rhs_knowledge': rhs.find('div', {'class': ['kp-wholepage', 'knowledge-panel', 'TzHB6b']}), - } - rhs_layout = next((layout for layout, component in rhs_layouts.items() if component), None) - if rhs_layout: - log.debug(f"rhs_layout: {rhs_layout}") - self.rhs = {"elem": rhs_layouts[rhs_layout], - "section": "rhs", - "type": "knowledge_rhs"} - else: - log.debug(f"no rhs_layout") - - - def append_rhs(self): - """Append the RHS Knowledge Panel to the components list at the end""" - if self.rhs: - log.debug(f"appending rhs") - self.components.add_component(**self.rhs) - self.rhs = None - - - # -------------------------------------------------------------------------- - # Header Components - # -------------------------------------------------------------------------- - - def extract_header(self): - """Extract the header section, often a carousel of images or other suggestions.""" - self.extract_top_bar() - self.extract_notices() - - - def extract_top_bar(self): - """Extract the top bar section, often a carousel of images or other suggestions.""" - top_bar = self.soup.find('div', {'id':'appbar'}) - if top_bar: - has_img = top_bar.find(lambda tag: tag.has_attr('src') and not tag.has_attr('data-src')) - if top_bar.find('g-scrolling-carousel') and has_img: - self.components.add_component(top_bar, section='header', type='top_image_carousel') - - - def extract_notices(self): - """Append notices to the components list at the end""" - notices = webutils.find_all_divs(self.soup, "div", {"id": "oFNiHe"}) - 
notices = webutils.filter_empty_divs(notices) - log.debug(f"notices: {len(notices)}") - for notice in notices: - self.components.add_component(notice, section="header", type="notice") - - # -------------------------------------------------------------------------- - # Main Components - # -------------------------------------------------------------------------- - - def extract_main(self): - """Extract the main results sections of the SERP""" - # self.extract_main_shopping_ads() - self.extract_main_ads_top() - self.extract_main_components() - self.extract_main_ads_bottom() - - - # def extract_main_shopping_ads(self): - # """Extract the main shopping ads section of the SERP""" - # shopping_ads = self.soup.find('div', {'class': 'commercial-unit-desktop-top'}) - # if shopping_ads: - # self.components.add_component(shopping_ads, section='main', type='shopping_ads') - - def extract_main_ads_top(self): - """Extract the main ads section of the SERP""" - ads = self.soup.find('div', {'id':'tads'}) - if ads and webutils.get_text(ads): - # Filter if already extracted as shopping ads - # if not ads.find('div', {'class': 'commercial-unit-desktop-top'}): - self.components.add_component(ads, section='main', type='ad') - - - def extract_main_components(self, drop_tags: set={'script', 'style', None}): - """Extract main components based on SERP layout""" - log.debug("Extracting main column components") - self.check_layout_main() - try: - layout_extractor = self.layout_extractors[self.layout_label] - column = layout_extractor(drop_tags) - for component in column: - if Extractor.is_valid_main_component(component): - self.components.add_component(component, section='main') - except KeyError: - raise ValueError(f"no extractor for layout_label: {self.layout_label}") - log.debug(f"Extracted main components: {self.components.cmpt_rank_counter:,}") - - - def extract_main_ads_bottom(self): - """Extract the main ads section of the SERP""" - ads = self.soup.find('div', {'id':'tadsb'}) - if ads and webutils.get_text(ads): - self.components.add_component(ads, section='main', type='ad') - - # -------------------------------------------------------------------------- - # Layout Specifics - # -------------------------------------------------------------------------- - - - def check_layout_main(self): - """Divide and label the page layout""" - log.debug(f"Checking SERP layout") - - # Layout soup subsets - self.layout_divs['rso'] = self.soup.find('div', {'id':'rso'}) - self.layout_divs['left-bar'] = self.soup.find('div', {'class': 'OeVqAd'}) - self.layout_divs['top-bars'] = self.soup.find_all('div', {'class': ['XqFnDf', 'M8OgIe']}) - - # Layout classifications - self.layouts['rso'] = bool(self.layout_divs['rso']) - self.layouts['top-bars'] = bool(self.layout_divs['top-bars']) - self.layouts['left-bar'] = bool(self.layout_divs['left-bar']) - self.layouts['standard'] = (self.layouts['rso'] & - (not self.layouts['top-bars']) & - (not self.layouts['left-bar'])) - self.layouts['no-rso'] = not self.layouts['rso'] - - # Get layout label - label_matches = [k for k,v in self.layouts.items() if k !='rso' and v] - first_match = label_matches[0] if label_matches else None - self.layout_label = first_match - log.debug(f"layout: {self.layout_label}") - - - def extract_from_standard(self, drop_tags: set = {}) -> list: - - if self.layout_divs['rso'].find('div', {'id':'kp-wp-tab-overview'}): - log.debug("layout update: standard-alt-1") - self.layout_label = 'standard-alt' - column = self.layout_divs['rso'].find_all('div', {'class':'TzHB6b'}) 
- return column - - column = Extractor.extract_children(self.layout_divs['rso'], drop_tags) - column = [c for c in column if Extractor.is_valid_main_component(c)] - - if len(column) == 0: - log.debug("layout update: standard-alt-0") - self.layout_label = 'standard-alt' - divs = self.layout_divs['rso'].find_all('div', {'id':'kp-wp-tab-overview'}) - column = sum([div.find_all('div', {'class':'TzHB6b'}) for div in divs], []) - return column - - - def extract_from_top_bar(self, drop_tags: set = {}) -> list: - """Extract components from top-bars layout""" - column = [] - - top_bar_divs = Extractor.extract_from_top_bar_divs(self.layout_divs['top-bars']) - column.extend(top_bar_divs) - # No duplicates, but missing data - # rso_layout_divs = self.layout_divs['rso'].find_all('div', {'class':'sATSHe'}) - - div_classes = [ - 'cUnQKe', # people also ask - 'g', # general - 'Lv2Cle', # images-medium - 'oIk2Cb', # searches_related - 'Ww4FFb', # discussions_and_forums - 'vtSz8d', # videos - 'uVMCKf', # videos - ] - rso_layout_divs = self.layout_divs['rso'].find_all('div', attrs={'class': div_classes}, recursive=True) - - if rso_layout_divs: - self.layout_label = 'top-bars-divs' - layout_column = [div for div in rso_layout_divs if div.name not in drop_tags] - else: - self.layout_label = 'top-bars-children' - layout_column = Extractor.extract_children(self.layout_divs['rso'], drop_tags) - log.debug(f"layout update: {self.layout_label}") - - column.extend(layout_column) - return column - - @staticmethod - def extract_from_top_bar_divs(soup, drop_tags: set = {}) -> list: - output_list = [] - for top_bar in soup: - if webutils.check_dict_value(top_bar.attrs, "class", ["M8OgIe"]): - knowledge_divs = webutils.find_all_divs(top_bar, "div", {"jscontroller": ["qTdDb", "OWrb3e"]}) - output_list.extend(knowledge_divs) - log.debug(f"layout: M8OgIe divs: {len(knowledge_divs)}") - else: - output_list.append(top_bar) - return output_list - - - def extract_from_left_bar(self, drop_tags: set = {}) -> list: - """Extract components from left-bar layout""" - column = self.soup.find_all('div', {'class':'TzHB6b'}) - return column - - - def extract_from_no_rso(self, drop_tags: set = {}) -> list: - """Extract components from no-rso layout""" - log.debug("layout: no-rso") - column = [] - section1 = self.soup.find_all('div', {'class':'UDZeY OTFaAf'}) - for div in section1: - - # Conditional handling for Twitter result - if div.find('h2') and div.find('h2').text == "Twitter Results": - column.append(div.find('div').parent) - - # Conditional handling for g-section with header - elif div.find('g-section-with-header'): - column.append(div.find('g-section-with-header').parent) - - # Include divs with a "View more" type of button - elif div.find('g-more-link'): - column.append(div) - - # Include footer components that appear in the main column - elif div.find('div', {'class':'oIk2Cb'}): - column.append(div) - - else: - # Handle general results - for child in div.find_all('div', {'class':'g'}): - column.append(child) - - # Find section 2 results and append to column list - section2 = self.soup.find('div', {'class':'WvKfwe a3spGf'}) - if section2: - for child in section2.children: - column.append(child) - column = [c for c in column if c.name not in drop_tags] - return column - - - @staticmethod - def extract_children(soup: bs4.BeautifulSoup, drop_tags: set = {}) -> list: - """Extract children from BeautifulSoup, drop specific tags, flatten list""" - log.debug("layout: extracting children") - children = [] - for child in soup.children: - 
if child.name in drop_tags: - continue - if not child.attrs: - children.extend(child.contents) - else: - children.append(child) - return children - - - @staticmethod - def is_valid_main_component(c) -> bool: - """Check if a given component is neither empty nor a hidden survey""" - if not c: - return False - else: - drop_text = { - "Main results", # Remove empty rso component; hidden
header - "Twitter Results", # Remove empty Twitter component - "", # Remove empty divs - } - return c.text not in drop_text and not Extractor.is_hidden_survey(c) - - @staticmethod - def is_hidden_survey(element): - """Check if a component is a hidden survey component; no visual presence so filter out""" - conditions = [ - element.find('promo-throttler'), - webutils.check_dict_value(element.attrs, "class", ["ULSxyf"]), - ] - return all(conditions) - - - # -------------------------------------------------------------------------- - # Footer Components - # -------------------------------------------------------------------------- - - - def extract_footer(self): - """Extract the footer section of the SERP""" - log.debug("extracting footer components") - - footer_div = self.soup.find('div', {'id':'botstuff'}) - footer_component_list = [] - - # Check if footer div exists - if footer_div: - footer_component_divs = webutils.find_all_divs(self.soup, 'div', {'id':['bres', 'brs']}) - if footer_component_divs: - log.debug(f"found footer components: {len(footer_component_divs):,}") - - # Expand components by checking for nested divs - for footer_component_div in footer_component_divs: - expanded_divs = webutils.find_all_divs(footer_component_div, "div", {"class":"MjjYud"}) - if expanded_divs and len(expanded_divs) > 1: - footer_component_list.extend(expanded_divs) - else: - footer_component_list.append(footer_component_div) - - # Check for omitted notice - omitted_notice = self.soup.find('div', {'class':'ClPXac'}) - if omitted_notice: - footer_component_list.append(omitted_notice) - - footer_component_list = [e for e in footer_component_list if not Extractor.is_hidden_footer(e)] - log.debug(f'footer_component_list len: {len(footer_component_list)}') - - for footer_component in footer_component_list: - self.components.add_component(footer_component, section='footer') - - - @staticmethod - def is_hidden_footer(element): - """Check if a component is a hidden footer component; no visual presence so filter out""" - conditions = [ - # element.find("b", {"class":"uDuvJd"}), - element.find("span", {"class":"oUAcPd"}), - element.find("div", {"class": "RTaUke"}), - element.find("div", {"class": "KJ7Tg"}), - ] - return any(conditions) + log.debug(f"Extracting Components {'-'*50}") + self.rhs_handler.extract() + self.header_handler.extract() + self.main_handler.extract() + self.footer_handler.extract() + self.rhs_handler.append() + log.debug(f"total components: {self.components.cmpt_rank_counter:,}") diff --git a/WebSearcher/extractors/extractor_footer.py b/WebSearcher/extractors/extractor_footer.py new file mode 100644 index 0000000..ccf3397 --- /dev/null +++ b/WebSearcher/extractors/extractor_footer.py @@ -0,0 +1,54 @@ +import bs4 +from .. import webutils +from .. 
import logger + +log = logger.Logger().start(__name__) + +class ExtractorFooter: + def __init__(self, soup: bs4.BeautifulSoup, components): + self.soup = soup + self.components = components + + def extract(self): + """Extract the footer section of the SERP""" + + footer_div = self.soup.find('div', {'id':'botstuff'}) + footer_component_list = [] + + if footer_div: + footer_component_divs = webutils.find_all_divs( + self.soup, 'div', {'id': ['bres', 'brs']} + ) + if footer_component_divs: + log.debug(f"footer_components: {len(footer_component_divs):,}") + for footer_component_div in footer_component_divs: + expanded_divs = webutils.find_all_divs( + footer_component_div, "div", {"class": "MjjYud"} + ) + if expanded_divs and len(expanded_divs) > 1: + footer_component_list.extend(expanded_divs) + else: + footer_component_list.append(footer_component_div) + + omitted_notice = self.soup.find('div', {'class':'ClPXac'}) + if omitted_notice: + footer_component_list.append(omitted_notice) + + footer_component_list = [ + e for e in footer_component_list + if not ExtractorFooter.is_hidden_footer(e) + ] + log.debug(f'footer_components: {len(footer_component_list)}') + + for footer_component in footer_component_list: + self.components.add_component(footer_component, section='footer') + + @staticmethod + def is_hidden_footer(element): + """Filter out hidden footer components (no visual presence).""" + conditions = [ + element.find("span", {"class":"oUAcPd"}), + element.find("div", {"class": "RTaUke"}), + element.find("div", {"class": "KJ7Tg"}), + ] + return any(conditions) \ No newline at end of file diff --git a/WebSearcher/extractors/extractor_header.py b/WebSearcher/extractors/extractor_header.py new file mode 100644 index 0000000..7955d04 --- /dev/null +++ b/WebSearcher/extractors/extractor_header.py @@ -0,0 +1,33 @@ +import bs4 +from .. import webutils +from .. import logger + +log = logger.Logger().start(__name__) + +class ExtractorHeader: + def __init__(self, soup: bs4.BeautifulSoup, components): + self.soup = soup + self.components = components + self.exists = False + + def extract(self): + """Extract the header section: appbar and notices.""" + self.extract_appbar() + self.extract_notices() + + def extract_appbar(self): + """Extract the top bar section, often a carousel of images or other suggestions.""" + appbar = self.soup.find('div', {'id':'appbar'}) + if appbar: + has_img = appbar.find(lambda tag: tag.has_attr('src') and not tag.has_attr('data-src')) + if appbar.find('g-scrolling-carousel') and has_img: + self.components.add_component(appbar, section='header', type='top_image_carousel') + self.exists = True + + def extract_notices(self): + """Append notices to the components list at the end.""" + notices = webutils.find_all_divs(self.soup, "div", {"id": "oFNiHe"}, filter_empty=True) + if notices: + self.exists = True + for notice in notices: + self.components.add_component(notice, section="header", type="notice") \ No newline at end of file diff --git a/WebSearcher/extractors/extractor_main.py b/WebSearcher/extractors/extractor_main.py new file mode 100644 index 0000000..9f772df --- /dev/null +++ b/WebSearcher/extractors/extractor_main.py @@ -0,0 +1,227 @@ +import bs4 +from .. 
import webutils +from ..logger import Logger + +log = Logger().start(__name__) + +class ExtractorMain: + def __init__(self, soup: bs4.BeautifulSoup, components): + self.soup = soup + self.components = components + + # copied from Extractor.__init__ + self.layout_divs = { + "rso": None, + "top-bars": None, + "left-bar": None, + } + self.layouts = { + "top-bars": False, + "left-bar": False, + "standard": False, + "no-rso": False, + } + self.layout_label = None + self.layout_extractors = { + "standard": self.extract_from_standard, + "top-bars": self.extract_from_top_bar, + "left-bar": self.extract_from_left_bar, + "no-rso": self.extract_from_no_rso + } + + def extract(self): + self.get_layout() + self._ads_top() + self._main_column() + self._ads_bottom() + log.debug(f"main_components: {self.components.cmpt_rank_counter:,}") + + def get_layout(self): + """Divide and label the page layout""" + + # Layout soup subsets + layout_divs = {} + layout_divs['rso'] = self.soup.find('div', {'id':'rso'}) + layout_divs['left-bar'] = self.soup.find('div', {'class': 'OeVqAd'}) + # layout_divs['top-bars'] = self.soup.find_all('div', {'class': ['XqFnDf', 'M8OgIe']}) + + rcnt = self.soup.find('div', {'id':'rcnt'}) + layout_divs['top-bars'] = rcnt.find_all('div', {'class': ['XqFnDf', 'M8OgIe']}, recursive=False) + + # Layout classifications + layouts = {} + layouts['top-bars'] = bool(layout_divs['top-bars']) + layouts['left-bar'] = bool(layout_divs['left-bar']) + layouts['standard'] = ( + bool(layout_divs['rso']) & + (not layouts['top-bars']) & + (not layouts['left-bar']) + ) + layouts['no-rso'] = not bool(layout_divs['rso']) + + if layouts['top-bars'] and bool(layout_divs['rso']) and not layouts['left-bar']: + layout_label = 'standard' + else: + # Get layout label + label_matches = [k for k,v in layouts.items() if v] + layout_label = label_matches[0] if label_matches else None + + # Set layout details + log.debug(f"main_layout: {layout_label}") + self.layout_label = layout_label + self.layouts.update(layouts) + self.layout_divs.update(layout_divs) + + def _ads_top(self): + ads = self.soup.find('div', {'id':'tads'}) + if ads and webutils.get_text(ads): + ads.extract() + self.components.add_component(ads, section='main', type='ad') + + def _main_column(self, drop_tags: set = {'script', 'style', None}): + try: + extractor = self.layout_extractors[self.layout_label] + except KeyError: + raise ValueError(f"no extractor for layout_label: {self.layout_label}") + + column = extractor(drop_tags) + column = webutils.filter_empty_divs(column) + for c in column: + if ExtractorMain.is_valid(c): + self.components.add_component(c, section='main') + + def _ads_bottom(self): + ads = self.soup.find('div', {'id':'tadsb'}) + if ads and webutils.get_text(ads): + ads.extract() + self.components.add_component(ads, section='main', type='ad') + + def extract_from_standard(self, drop_tags:set={}) -> list: + + rso_div = self.layout_divs['rso'] + standard_layouts = { + "standard-0": rso_div.find('div', {'id':'kp-wp-tab-overview'}), + "standard-1": rso_div.find('div', {'id':'kp-wp-tab-Songs'}), + } + for layout_name, layout_div in standard_layouts.items(): + if layout_div: + if layout_div.find_all("div"): + return self._extract_from_standard(layout_name) + + # self.layout_label = layout_name + # return self._extract_from_standard(layout_name) + + col = ExtractorMain.extract_children(rso_div, drop_tags) + col = [c for c in col if ExtractorMain.is_valid(c)] + if not col: + self.layout_label = 'standard-2' + log.debug(f"main_layout: 
{self.layout_label} (update)") + divs = rso_div.find_all('div', {'id':'kp-wp-tab-overview'}) + col = sum([d.find_all('div', {'class':'TzHB6b'}) for d in divs], []) + return col + + def _extract_from_standard(self, sub_type:str = "") -> list: + + self.layout_label = sub_type + rso_div = self.layout_divs['rso'] + log.debug(f"main_layout: {self.layout_label} (update)") + + if self.layout_label == "standard-0": + column = [] + top_divs = ExtractorMain.extract_top_divs(self.layout_divs['top-bars']) or [] + main_divs = rso_div.find_all('div', {'class':'TzHB6b'}) or [] + column.extend(top_divs) + column.extend(main_divs) + log.debug(f"main_components: {len(column):,}") + return column + + if self.layout_label == "standard-1": + column = [] + top_divs = ExtractorMain.extract_top_divs(self.layout_divs['top-bars']) or [] + main_divs = rso_div.find('div', {'id':'kp-wp-tab-Songs'}).children or [] + column.extend(top_divs) + column.extend(main_divs) + column = [div for div in column if div.name not in {'script', 'style'}] + column = webutils.filter_empty_divs(column) + return column + + + def extract_from_top_bar(self, drop_tags:set={}) -> list: + out = [] + tops = ExtractorMain.extract_top_divs(self.layout_divs['top-bars']) + out.extend(tops) + + div_classes = [ + 'cUnQKe', # people also ask + 'g', # general + 'Lv2Cle', # images-medium + 'oIk2Cb', # searches_related + 'Ww4FFb', # discussions_and_forums + 'vtSz8d', # videos + 'uVMCKf', # videos + ] + + rso_divs = self.layout_divs['rso'].find_all('div', attrs={'class':div_classes}) + if rso_divs: + self.layout_label = 'top-bars-divs' + col = [div for div in rso_divs if div.name not in drop_tags] + else: + self.layout_label = 'top-bars-children' + col = ExtractorMain.extract_children(self.layout_divs['rso'], drop_tags) + log.debug(f"main_layout: {self.layout_label} (update)") + out.extend(col) + return out + + @staticmethod + def extract_top_divs(soup, drop_tags:set={}) -> list: + out = [] + for tb in soup: + if webutils.check_dict_value(tb.attrs, "class", ["M8OgIe"]): + kd = webutils.find_all_divs(tb, "div", {"jscontroller":["qTdDb","OWrb3e"]}) + out.extend(kd) + else: + out.append(tb) + return out + + def extract_from_left_bar(self, drop_tags:set={}) -> list: + return self.soup.find_all('div', {'class':'TzHB6b'}) + + def extract_from_no_rso(self, drop_tags:set={}) -> list: + out=[]; sec1=self.soup.find_all('div', {'class':'UDZeY OTFaAf'}) + for div in sec1: + if div.find('h2') and div.find('h2').text=="Twitter Results": + out.append(div.find('div').parent) + elif div.find('g-section-with-header'): + out.append(div.find('g-section-with-header').parent) + elif div.find('g-more-link'): + out.append(div) + elif div.find('div',{'class':'oIk2Cb'}): + out.append(div) + else: + out.extend(div.find_all('div',{'class':'g'})) + sec2=self.soup.find('div',{'class':'WvKfwe a3spGf'}) + if sec2: + out.extend(sec2.children) + return [c for c in out if c.name not in drop_tags] + + @staticmethod + def extract_children(soup, drop_tags:set={}) -> list: + cts=[] + for ch in soup.children: + if ch.name in drop_tags: continue + if not ch.attrs: cts.extend(ch.contents) + else: cts.append(ch) + return cts + + @staticmethod + def is_valid(c) -> bool: + if not c: return False + bad = {"Main results","Twitter Results",""} + if c.text in bad: return False + # hidden survey + cond = [ + c.find('promo-throttler'), + webutils.check_dict_value(c.attrs,"class",["ULSxyf"]) if 'attrs' in c else False, + ] + if all(cond): return False + return True \ No newline at end of file diff --git 
a/WebSearcher/extractors/extractor_rhs.py b/WebSearcher/extractors/extractor_rhs.py new file mode 100644 index 0000000..4fc013d --- /dev/null +++ b/WebSearcher/extractors/extractor_rhs.py @@ -0,0 +1,43 @@ +import bs4 +from .. import webutils +from .. import logger + +log = logger.Logger().start(__name__) + +class ExtractorRightHandSide: + def __init__(self, soup: bs4.BeautifulSoup, components): + self.soup = soup + self.components = components + self.rhs = {} + + def extract(self): + """Extract the RHS Knowledge Panel, if present.""" + rhs_div = self.soup.find('div', {'id': 'rhs'}) + if not rhs_div: + return + rhs_div.extract() + layout, div = self._get_layout(rhs_div) + if layout: + log.debug(f"rhs_layout: {layout}") + self.rhs = { + "elem": div, + "section": "rhs", + "type": "knowledge_rhs" + } + else: + log.debug("no rhs_layout") + + def append(self): + """Append the RHS panel as a component at the end.""" + if self.rhs: + log.debug("appending rhs") + self.components.add_component(**self.rhs) + self.rhs = {} + + def _get_layout(self, rhs_div): + rhs_layouts = { + 'rhs_complementary': rhs_div if webutils.check_dict_value(rhs_div.attrs, "role", "complementary") else None, + 'rhs_knowledge': rhs_div.find('div', {'class': ['kp-wholepage', 'knowledge-panel', 'TzHB6b']}) + } + found = next((name for name, node in rhs_layouts.items() if node), None) + return (found, rhs_div) if found else (None, rhs_div) \ No newline at end of file From a3b7c006233415143c7fbc783e4c702e6c6d3073 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 11:12:53 -0700 Subject: [PATCH 079/101] version: 0.6.5.dev4 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index 9c3b696..d0c3d64 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.5.dev3" +__version__ = "0.6.5.dev4" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index 2f43c50..6dfbf8f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.5.dev3" +version = "0.6.5.dev4" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. 
Robertson", email = ""}] keywords = ["web", "search", "parser"] From 1a32ee051950a9a27487f4179ae79fdfb1d0c5e4 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 17:38:15 -0700 Subject: [PATCH 080/101] add: recent_posts variant of top_stories --- WebSearcher/classifiers/header_text.py | 1 + WebSearcher/component_parsers/__init__.py | 2 ++ WebSearcher/component_parsers/recent_posts.py | 14 ++++++++++++++ 3 files changed, 17 insertions(+) create mode 100644 WebSearcher/component_parsers/recent_posts.py diff --git a/WebSearcher/classifiers/header_text.py b/WebSearcher/classifiers/header_text.py index 8c0c3c7..713f5e5 100644 --- a/WebSearcher/classifiers/header_text.py +++ b/WebSearcher/classifiers/header_text.py @@ -116,6 +116,7 @@ def _get_header_level_mapping(level) -> dict: "News", "Noticias", "Market news"], + "recent_posts": ["Recent posts"], "twitter": ["Twitter Results"], "videos": ["Videos"] } diff --git a/WebSearcher/component_parsers/__init__.py b/WebSearcher/component_parsers/__init__.py index aaff223..afb67a2 100644 --- a/WebSearcher/component_parsers/__init__.py +++ b/WebSearcher/component_parsers/__init__.py @@ -15,6 +15,7 @@ from .latest_from import parse_latest_from from .local_news import parse_local_news from .perspectives import parse_perspectives +from .recent_posts import parse_recent_posts from .local_results import parse_local_results from .map_results import parse_map_results @@ -57,6 +58,7 @@ ('news_quotes', parse_news_quotes, 'News Quotes'), ('people_also_ask', parse_people_also_ask, 'People Also Ask'), ('perspectives', parse_perspectives, 'Perspectives & Opinions'), + ('recent_posts', parse_recent_posts, 'Recent Posts'), ('scholarly_articles', parse_scholarly_articles, 'Scholar Articles'), ('searches_related', parse_searches_related, 'Related Searches'), ('shopping_ads', parse_shopping_ads, 'Shopping Ad'), diff --git a/WebSearcher/component_parsers/recent_posts.py b/WebSearcher/component_parsers/recent_posts.py new file mode 100644 index 0000000..ee0a24d --- /dev/null +++ b/WebSearcher/component_parsers/recent_posts.py @@ -0,0 +1,14 @@ +from .top_stories import parse_top_stories + +def parse_recent_posts(cmpt): + """Parse a "Recent posts" component + + These components have a similar carousel as Top Stories and Perspectives. 
+ + Args: + cmpt (bs4 object): A html component + + Returns: + dict : parsed result + """ + return parse_top_stories(cmpt, ctype='recent_posts') From f775eac6a268bc6a407a0f21c2e4ac2c7e93a02f Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 17:38:33 -0700 Subject: [PATCH 081/101] update: remove duplicate log --- WebSearcher/extractors/extractor_footer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/WebSearcher/extractors/extractor_footer.py b/WebSearcher/extractors/extractor_footer.py index ccf3397..abe2530 100644 --- a/WebSearcher/extractors/extractor_footer.py +++ b/WebSearcher/extractors/extractor_footer.py @@ -20,7 +20,6 @@ def extract(self): self.soup, 'div', {'id': ['bres', 'brs']} ) if footer_component_divs: - log.debug(f"footer_components: {len(footer_component_divs):,}") for footer_component_div in footer_component_divs: expanded_divs = webutils.find_all_divs( footer_component_div, "div", {"class": "MjjYud"} From e0edd4e6b013cda8ede766d44f5efff6ffdf7c9b Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 17:38:46 -0700 Subject: [PATCH 082/101] version: 0.6.5.dev5 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index d0c3d64..61e16df 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.5.dev4" +__version__ = "0.6.5.dev5" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index 6dfbf8f..34de6e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.5.dev4" +version = "0.6.5.dev5" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. 
Robertson", email = ""}] keywords = ["web", "search", "parser"] From bd02fa8983464f2bb6e13df06874848a64265e27 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 20:56:19 -0700 Subject: [PATCH 083/101] fix: missing comma --- WebSearcher/classifiers/header_text.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/WebSearcher/classifiers/header_text.py b/WebSearcher/classifiers/header_text.py index 713f5e5..b247af8 100644 --- a/WebSearcher/classifiers/header_text.py +++ b/WebSearcher/classifiers/header_text.py @@ -90,7 +90,8 @@ def _get_header_level_mapping(level) -> dict: "local_results": [ "Local Results", "Locations", - "Places", "Sitios" + "Places", + "Sitios", "Businesses", "locations", ], From 71e1a552bbb0683dfa741d1861fd528d24b2e216 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 8 May 2025 20:56:45 -0700 Subject: [PATCH 084/101] update: main column extractors --- WebSearcher/extractors/extractor_main.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/WebSearcher/extractors/extractor_main.py b/WebSearcher/extractors/extractor_main.py index 9f772df..af7b399 100644 --- a/WebSearcher/extractors/extractor_main.py +++ b/WebSearcher/extractors/extractor_main.py @@ -102,30 +102,30 @@ def extract_from_standard(self, drop_tags:set={}) -> list: standard_layouts = { "standard-0": rso_div.find('div', {'id':'kp-wp-tab-overview'}), "standard-1": rso_div.find('div', {'id':'kp-wp-tab-Songs'}), + "standard-2": rso_div.find('div', {'id':'kp-wp-tab-SportsStandings'}), } for layout_name, layout_div in standard_layouts.items(): if layout_div: if layout_div.find_all("div"): - return self._extract_from_standard(layout_name) + return self._extract_from_standard_sub_type(layout_name) - # self.layout_label = layout_name - # return self._extract_from_standard(layout_name) - + top_divs = ExtractorMain.extract_top_divs(self.layout_divs['top-bars']) or [] col = ExtractorMain.extract_children(rso_div, drop_tags) + col = top_divs + col col = [c for c in col if ExtractorMain.is_valid(c)] if not col: - self.layout_label = 'standard-2' + self.layout_label = 'standard-3' log.debug(f"main_layout: {self.layout_label} (update)") divs = rso_div.find_all('div', {'id':'kp-wp-tab-overview'}) col = sum([d.find_all('div', {'class':'TzHB6b'}) for d in divs], []) return col - def _extract_from_standard(self, sub_type:str = "") -> list: + def _extract_from_standard_sub_type(self, sub_type:str = "") -> list: self.layout_label = sub_type rso_div = self.layout_divs['rso'] log.debug(f"main_layout: {self.layout_label} (update)") - + if self.layout_label == "standard-0": column = [] top_divs = ExtractorMain.extract_top_divs(self.layout_divs['top-bars']) or [] @@ -144,6 +144,16 @@ def _extract_from_standard(self, sub_type:str = "") -> list: column = [div for div in column if div.name not in {'script', 'style'}] column = webutils.filter_empty_divs(column) return column + + if self.layout_label == "standard-2": + column = [] + top_divs = ExtractorMain.extract_top_divs(self.layout_divs['top-bars']) or [] + main_divs = rso_div.find('div', {'id':'kp-wp-tab-SportsStandings'}).children or [] + column.extend(top_divs) + column.extend(main_divs) + column = [div for div in column if div.name not in {'script', 'style'}] + column = webutils.filter_empty_divs(column) + return column def extract_from_top_bar(self, drop_tags:set={}) -> list: From 9955b7c591331277a62a34bf8c3477aec6bbcc70 Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 9 May 2025 09:10:32 -0700 Subject: 
[PATCH 085/101] update: bump h11 per dependabot --- poetry.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 34ccb6a..79638ee 100644 --- a/poetry.lock +++ b/poetry.lock @@ -558,14 +558,14 @@ typing = ["typing-extensions (>=4.12.2) ; python_version < \"3.11\""] [[package]] name = "h11" -version = "0.14.0" +version = "0.16.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" groups = ["main"] files = [ - {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, - {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, + {file = "h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86"}, + {file = "h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1"}, ] [[package]] From 951a5dac0c35d9f13b9733290e49fd7b69340cbd Mon Sep 17 00:00:00 2001 From: gitronald Date: Wed, 14 May 2025 18:59:17 -0700 Subject: [PATCH 086/101] fix: handle serps with no rcnt div --- WebSearcher/extractors/extractor_main.py | 3 +-- WebSearcher/webutils.py | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/WebSearcher/extractors/extractor_main.py b/WebSearcher/extractors/extractor_main.py index af7b399..70c9550 100644 --- a/WebSearcher/extractors/extractor_main.py +++ b/WebSearcher/extractors/extractor_main.py @@ -43,10 +43,9 @@ def get_layout(self): layout_divs = {} layout_divs['rso'] = self.soup.find('div', {'id':'rso'}) layout_divs['left-bar'] = self.soup.find('div', {'class': 'OeVqAd'}) - # layout_divs['top-bars'] = self.soup.find_all('div', {'class': ['XqFnDf', 'M8OgIe']}) rcnt = self.soup.find('div', {'id':'rcnt'}) - layout_divs['top-bars'] = rcnt.find_all('div', {'class': ['XqFnDf', 'M8OgIe']}, recursive=False) + layout_divs['top-bars'] = webutils.find_all_divs(rcnt, 'div', {'class': ['XqFnDf', 'M8OgIe']}) # Layout classifications layouts = {} diff --git a/WebSearcher/webutils.py b/WebSearcher/webutils.py index a36c2b4..f4db20f 100644 --- a/WebSearcher/webutils.py +++ b/WebSearcher/webutils.py @@ -122,6 +122,8 @@ def get_link_list(soup: BeautifulSoup, attrs: dict = {}, key: str = 'href', filt return [link.attrs.get(key, None) for link in links] if links else None def find_all_divs(soup: BeautifulSoup, name: str, attrs: dict = {}, filter_empty: bool = True) -> list: + if not soup: + return [] divs = soup.find_all(name, attrs) if attrs else soup.find_all(name) divs = filter_empty_divs(divs) if filter_empty else divs return divs From 22f639b2fcebbda71e6bf83161b31a486bb6ed69 Mon Sep 17 00:00:00 2001 From: gitronald Date: Thu, 15 May 2025 14:40:04 -0700 Subject: [PATCH 087/101] update: stricter news_quotes classification, more knowledge classifier signals --- WebSearcher/classifiers/main.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/WebSearcher/classifiers/main.py b/WebSearcher/classifiers/main.py index 4130fd3..616a539 100644 --- a/WebSearcher/classifiers/main.py +++ b/WebSearcher/classifiers/main.py @@ -1,9 +1,9 @@ +import bs4 from .. import logger log = logger.Logger().start(__name__) from .header_text import ClassifyHeaderText from .. 
import webutils -import bs4 class ClassifyMain: """Classify a component from the main section based on its bs4.element.Tag """ @@ -151,7 +151,8 @@ def knowledge_panel(cmpt: bs4.element.Tag) -> str: cmpt.find("div", {"class": ["knowledge-panel", "knavi", "kp-blk", "kp-wholepage-osrp"]}), cmpt.find("div", {"aria-label": "Featured results", "role": "complementary"}), cmpt.find("div", {"jscontroller": "qTdDb"}), - webutils.check_dict_value(cmpt.attrs, "jscontroller", "qTdDb") + webutils.check_dict_value(cmpt.attrs, "jscontroller", "qTdDb"), + cmpt.find('div', {'class':'obcontainer'}) ] return 'knowledge' if any(conditions) else "unknown" @@ -187,10 +188,9 @@ def top_stories(cmpt: bs4.element.Tag) -> str: @staticmethod def news_quotes(cmpt: bs4.element.Tag) -> str: """Classify top stories components""" - conditions = [ - cmpt.find("g-tray-header", role="heading"), - ] - return 'news_quotes' if all(conditions) else "unknown" + header_div = cmpt.find("g-tray-header", role="heading") + condition = webutils.get_text(header_div, strip=True) == "News quotes" + return 'news_quotes' if condition else "unknown" @staticmethod def twitter(cmpt: bs4.element.Tag) -> str: From ebcca84085ff55348699e46551e330c884969bfb Mon Sep 17 00:00:00 2001 From: gitronald Date: Sat, 17 May 2025 15:17:25 -0700 Subject: [PATCH 088/101] fix: stricter parsing for songs id div --- WebSearcher/extractors/extractor_main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebSearcher/extractors/extractor_main.py b/WebSearcher/extractors/extractor_main.py index 70c9550..7be0c19 100644 --- a/WebSearcher/extractors/extractor_main.py +++ b/WebSearcher/extractors/extractor_main.py @@ -100,7 +100,7 @@ def extract_from_standard(self, drop_tags:set={}) -> list: rso_div = self.layout_divs['rso'] standard_layouts = { "standard-0": rso_div.find('div', {'id':'kp-wp-tab-overview'}), - "standard-1": rso_div.find('div', {'id':'kp-wp-tab-Songs'}), + "standard-1": rso_div.find('div', {'id':'kp-wp-tab-cont-Songs', 'role':'tabpanel'}), "standard-2": rso_div.find('div', {'id':'kp-wp-tab-SportsStandings'}), } for layout_name, layout_div in standard_layouts.items(): From cc9a93884d6f6238b8965096643d148767c0d2d2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 23 May 2025 19:35:41 +0000 Subject: [PATCH 089/101] build(deps-dev): bump tornado from 6.4.2 to 6.5.1 Bumps [tornado](https://github.com/tornadoweb/tornado) from 6.4.2 to 6.5.1. - [Changelog](https://github.com/tornadoweb/tornado/blob/master/docs/releases.rst) - [Commits](https://github.com/tornadoweb/tornado/compare/v6.4.2...v6.5.1) --- updated-dependencies: - dependency-name: tornado dependency-version: 6.5.1 dependency-type: indirect ... Signed-off-by: dependabot[bot] --- poetry.lock | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/poetry.lock b/poetry.lock index 34ccb6a..8d5c7a8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1979,23 +1979,24 @@ files = [ [[package]] name = "tornado" -version = "6.4.2" +version = "6.5.1" description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed." 
optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "tornado-6.4.2-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:e828cce1123e9e44ae2a50a9de3055497ab1d0aeb440c5ac23064d9e44880da1"}, - {file = "tornado-6.4.2-cp38-abi3-macosx_10_9_x86_64.whl", hash = "sha256:072ce12ada169c5b00b7d92a99ba089447ccc993ea2143c9ede887e0937aa803"}, - {file = "tornado-6.4.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a017d239bd1bb0919f72af256a970624241f070496635784d9bf0db640d3fec"}, - {file = "tornado-6.4.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c36e62ce8f63409301537222faffcef7dfc5284f27eec227389f2ad11b09d946"}, - {file = "tornado-6.4.2-cp38-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca9eb02196e789c9cb5c3c7c0f04fb447dc2adffd95265b2c7223a8a615ccbf"}, - {file = "tornado-6.4.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:304463bd0772442ff4d0f5149c6f1c2135a1fae045adf070821c6cdc76980634"}, - {file = "tornado-6.4.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:c82c46813ba483a385ab2a99caeaedf92585a1f90defb5693351fa7e4ea0bf73"}, - {file = "tornado-6.4.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:932d195ca9015956fa502c6b56af9eb06106140d844a335590c1ec7f5277d10c"}, - {file = "tornado-6.4.2-cp38-abi3-win32.whl", hash = "sha256:2876cef82e6c5978fde1e0d5b1f919d756968d5b4282418f3146b79b58556482"}, - {file = "tornado-6.4.2-cp38-abi3-win_amd64.whl", hash = "sha256:908b71bf3ff37d81073356a5fadcc660eb10c1476ee6e2725588626ce7e5ca38"}, - {file = "tornado-6.4.2.tar.gz", hash = "sha256:92bad5b4746e9879fd7bf1eb21dce4e3fc5128d71601f80005afa39237ad620b"}, + {file = "tornado-6.5.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:d50065ba7fd11d3bd41bcad0825227cc9a95154bad83239357094c36708001f7"}, + {file = "tornado-6.5.1-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:9e9ca370f717997cb85606d074b0e5b247282cf5e2e1611568b8821afe0342d6"}, + {file = "tornado-6.5.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b77e9dfa7ed69754a54c89d82ef746398be82f749df69c4d3abe75c4d1ff4888"}, + {file = "tornado-6.5.1-cp39-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:253b76040ee3bab8bcf7ba9feb136436a3787208717a1fb9f2c16b744fba7331"}, + {file = "tornado-6.5.1-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:308473f4cc5a76227157cdf904de33ac268af770b2c5f05ca6c1161d82fdd95e"}, + {file = "tornado-6.5.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:caec6314ce8a81cf69bd89909f4b633b9f523834dc1a352021775d45e51d9401"}, + {file = "tornado-6.5.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:13ce6e3396c24e2808774741331638ee6c2f50b114b97a55c5b442df65fd9692"}, + {file = "tornado-6.5.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5cae6145f4cdf5ab24744526cc0f55a17d76f02c98f4cff9daa08ae9a217448a"}, + {file = "tornado-6.5.1-cp39-abi3-win32.whl", hash = "sha256:e0a36e1bc684dca10b1aa75a31df8bdfed656831489bc1e6a6ebed05dc1ec365"}, + {file = "tornado-6.5.1-cp39-abi3-win_amd64.whl", hash = "sha256:908e7d64567cecd4c2b458075589a775063453aeb1d2a1853eedb806922f568b"}, + {file = "tornado-6.5.1-cp39-abi3-win_arm64.whl", hash = "sha256:02420a0eb7bf617257b9935e2b754d1b63897525d8a289c9d65690d580b4dcf7"}, + {file = "tornado-6.5.1.tar.gz", hash = 
"sha256:84ceece391e8eb9b2b95578db65e920d2a61070260594819589609ba9bc6308c"}, ] [[package]] From 8daaec7a943840f10b95d04b6f98f8c2d5fdb6dc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 10 Jun 2025 10:04:13 +0000 Subject: [PATCH 090/101] build(deps): bump requests from 2.32.3 to 2.32.4 Bumps [requests](https://github.com/psf/requests) from 2.32.3 to 2.32.4. - [Release notes](https://github.com/psf/requests/releases) - [Changelog](https://github.com/psf/requests/blob/main/HISTORY.md) - [Commits](https://github.com/psf/requests/compare/v2.32.3...v2.32.4) --- updated-dependencies: - dependency-name: requests dependency-version: 2.32.4 dependency-type: indirect ... Signed-off-by: dependabot[bot] --- poetry.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 34ccb6a..941d541 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1742,19 +1742,19 @@ cffi = {version = "*", markers = "implementation_name == \"pypy\""} [[package]] name = "requests" -version = "2.32.3" +version = "2.32.4" description = "Python HTTP for Humans." optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, - {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, + {file = "requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c"}, + {file = "requests-2.32.4.tar.gz", hash = "sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422"}, ] [package.dependencies] certifi = ">=2017.4.17" -charset-normalizer = ">=2,<4" +charset_normalizer = ">=2,<4" idna = ">=2.5,<4" urllib3 = ">=1.21.1,<3" From a4a4a3239e9bef32676aa4d9950ef8c287ab7513 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 17 Jun 2025 03:45:08 +0000 Subject: [PATCH 091/101] build(deps): bump protobuf from 6.30.0 to 6.31.1 Bumps [protobuf](https://github.com/protocolbuffers/protobuf) from 6.30.0 to 6.31.1. - [Release notes](https://github.com/protocolbuffers/protobuf/releases) - [Changelog](https://github.com/protocolbuffers/protobuf/blob/main/protobuf_release.bzl) - [Commits](https://github.com/protocolbuffers/protobuf/compare/v6.30.0...v6.31.1) --- updated-dependencies: - dependency-name: protobuf dependency-version: 6.31.1 dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/poetry.lock b/poetry.lock index 34ccb6a..55752a3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1288,21 +1288,21 @@ wcwidth = "*" [[package]] name = "protobuf" -version = "6.30.0" +version = "6.31.1" description = "" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "protobuf-6.30.0-cp310-abi3-win32.whl", hash = "sha256:7337d76d8efe65ee09ee566b47b5914c517190196f414e5418fa236dfd1aed3e"}, - {file = "protobuf-6.30.0-cp310-abi3-win_amd64.whl", hash = "sha256:9b33d51cc95a7ec4f407004c8b744330b6911a37a782e2629c67e1e8ac41318f"}, - {file = "protobuf-6.30.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:52d4bb6fe76005860e1d0b8bfa126f5c97c19cc82704961f60718f50be16942d"}, - {file = "protobuf-6.30.0-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:7940ab4dfd60d514b2e1d3161549ea7aed5be37d53bafde16001ac470a3e202b"}, - {file = "protobuf-6.30.0-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:d79bf6a202a536b192b7e8d295d7eece0c86fbd9b583d147faf8cfeff46bf598"}, - {file = "protobuf-6.30.0-cp39-cp39-win32.whl", hash = "sha256:bb35ad251d222f03d6c4652c072dfee156be0ef9578373929c1a7ead2bd5492c"}, - {file = "protobuf-6.30.0-cp39-cp39-win_amd64.whl", hash = "sha256:501810e0eba1d327e783fde47cc767a563b0f1c292f1a3546d4f2b8c3612d4d0"}, - {file = "protobuf-6.30.0-py3-none-any.whl", hash = "sha256:e5ef216ea061b262b8994cb6b7d6637a4fb27b3fb4d8e216a6040c0b93bd10d7"}, - {file = "protobuf-6.30.0.tar.gz", hash = "sha256:852b675d276a7d028f660da075af1841c768618f76b90af771a8e2c29e6f5965"}, + {file = "protobuf-6.31.1-cp310-abi3-win32.whl", hash = "sha256:7fa17d5a29c2e04b7d90e5e32388b8bfd0e7107cd8e616feef7ed3fa6bdab5c9"}, + {file = "protobuf-6.31.1-cp310-abi3-win_amd64.whl", hash = "sha256:426f59d2964864a1a366254fa703b8632dcec0790d8862d30034d8245e1cd447"}, + {file = "protobuf-6.31.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:6f1227473dc43d44ed644425268eb7c2e488ae245d51c6866d19fe158e207402"}, + {file = "protobuf-6.31.1-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:a40fc12b84c154884d7d4c4ebd675d5b3b5283e155f324049ae396b95ddebc39"}, + {file = "protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:4ee898bf66f7a8b0bd21bce523814e6fbd8c6add948045ce958b73af7e8878c6"}, + {file = "protobuf-6.31.1-cp39-cp39-win32.whl", hash = "sha256:0414e3aa5a5f3ff423828e1e6a6e907d6c65c1d5b7e6e975793d5590bdeecc16"}, + {file = "protobuf-6.31.1-cp39-cp39-win_amd64.whl", hash = "sha256:8764cf4587791e7564051b35524b72844f845ad0bb011704c3736cce762d8fe9"}, + {file = "protobuf-6.31.1-py3-none-any.whl", hash = "sha256:720a6c7e6b77288b85063569baae8536671b39f15cc22037ec7045658d80489e"}, + {file = "protobuf-6.31.1.tar.gz", hash = "sha256:d8cac4c982f0b957a4dc73a80e2ea24fab08e679c0de9deb835f4a12d69aca9a"}, ] [[package]] From a05270efcd5e3601394fb4be8d04c18f74b3ea52 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 19 Jun 2025 04:51:01 +0000 Subject: [PATCH 092/101] build(deps): bump urllib3 from 2.3.0 to 2.5.0 Bumps [urllib3](https://github.com/urllib3/urllib3) from 2.3.0 to 2.5.0. - [Release notes](https://github.com/urllib3/urllib3/releases) - [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst) - [Commits](https://github.com/urllib3/urllib3/compare/2.3.0...2.5.0) --- updated-dependencies: - dependency-name: urllib3 dependency-version: 2.5.0 dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 34ccb6a..7a88f9a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2113,14 +2113,14 @@ websockets = "*" [[package]] name = "urllib3" -version = "2.3.0" +version = "2.5.0" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df"}, - {file = "urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d"}, + {file = "urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc"}, + {file = "urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760"}, ] [package.dependencies] From 0209fdba55fd70c6004f974ea7618ac55b420af9 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 14 Oct 2025 08:16:39 -0700 Subject: [PATCH 093/101] version: 0.6.5a0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 34de6e7..019cd14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.5.dev5" +version = "0.6.5a0" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] From b7cc70011c858c75e2e14468624c937c8dc1ecd1 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 14 Oct 2025 08:17:20 -0700 Subject: [PATCH 094/101] refactor: convert Footer methods to staticmethod --- WebSearcher/component_parsers/footer.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/WebSearcher/component_parsers/footer.py b/WebSearcher/component_parsers/footer.py index b60ff86..e45a044 100644 --- a/WebSearcher/component_parsers/footer.py +++ b/WebSearcher/component_parsers/footer.py @@ -2,13 +2,13 @@ class Footer: - @classmethod - def parse_image_cards(self, elem) -> list: + @staticmethod + def parse_image_cards(elem) -> list: subs = webutils.find_all_divs(elem, 'div', {'class':'g'}) - return [self.parse_image_card(sub, sub_rank) for sub_rank, sub in enumerate(subs)] + return [Footer.parse_image_card(sub, sub_rank) for sub_rank, sub in enumerate(subs)] - @classmethod - def parse_image_card(self, sub, sub_rank=0) -> dict: + @staticmethod + def parse_image_card(sub, sub_rank=0) -> dict: parsed = {'type':'img_cards', 'sub_rank':sub_rank} parsed['title'] = webutils.get_text(sub, "div", {'aria-level':"3", "role":"heading"}) images = sub.find_all('img') @@ -16,8 +16,8 @@ def parse_image_card(self, sub, sub_rank=0) -> dict: parsed['details'] = [{'text':i['alt'], 'url':i['src']} for i in images] return parsed - @classmethod - def parse_discover_more(self, elem) -> list: + @staticmethod + def parse_discover_more(elem) -> list: carousel = elem.find('g-scrolling-carousel') return [{ 'type':'discover_more', @@ -25,8 +25,8 @@ def parse_discover_more(self, elem) -> list: 'text': '|'.join(c.text for c in carousel.find_all('g-inner-card')) }] - @classmethod - def parse_omitted_notice(self, elem) -> list: + @staticmethod + def parse_omitted_notice(elem) -> list: return [{ 'type':'omitted_notice', 'sub_rank':0, From e66515c8ec4e9ecc4fb9c4e27727d99f6a14a996 Mon Sep 17 00:00:00 2001 From: 
gitronald Date: Tue, 14 Oct 2025 08:29:30 -0700 Subject: [PATCH 095/101] fix: update demo-search entry point to use typer app --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 019cd14..a814d57 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ homepage = "http://github.com/gitronald/WebSearcher" repository = "http://github.com/gitronald/WebSearcher" [project.scripts] -demo-search = 'scripts.demo_search:main' +demo-search = 'scripts.demo_search:app' [tool.poetry] packages = [{include = "WebSearcher"}] From bb938d5719d0aeb5db591795a5290ba1ea84bfa7 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 14 Oct 2025 08:29:41 -0700 Subject: [PATCH 096/101] update: version in __init__.py to match pyproject.toml --- WebSearcher/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index 61e16df..f8bdeb9 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.5.dev5" +__version__ = "0.6.5a0" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor From cc33dea9b326a29b648b6ce073f335de6b921afe Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 14 Oct 2025 08:47:42 -0700 Subject: [PATCH 097/101] update: default Chrome version to 141 --- README.md | 4 ++-- WebSearcher/models/configs.py | 2 +- WebSearcher/search_methods/selenium_searcher.py | 2 -- scripts/demo_search.py | 2 +- scripts/demo_searches.py | 2 +- 5 files changed, 5 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 677c408..ba79109 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,7 @@ drwxr-xr-x 2 user user 4.0K 2024-11-11 10:55 html/ ### Step by Step -Example search and parse pipeline: +Example search and parse pipeline (via requests): ```python import WebSearcher as ws @@ -143,7 +143,7 @@ se = ws.SearchEngine( "headless": False, "use_subprocess": False, "driver_executable_path": "", - "version_main": 133, + "version_main": 141, } ) ``` diff --git a/WebSearcher/models/configs.py b/WebSearcher/models/configs.py index 81e011d..5d6ea80 100644 --- a/WebSearcher/models/configs.py +++ b/WebSearcher/models/configs.py @@ -25,7 +25,7 @@ class LogConfig(BaseConfig): class SeleniumConfig(BaseConfig): headless: bool = False - version_main: int = 133 + version_main: int = 141 use_subprocess: bool = False driver_executable_path: str = "" diff --git a/WebSearcher/search_methods/selenium_searcher.py b/WebSearcher/search_methods/selenium_searcher.py index 1e67025..e4b1e16 100644 --- a/WebSearcher/search_methods/selenium_searcher.py +++ b/WebSearcher/search_methods/selenium_searcher.py @@ -136,14 +136,12 @@ def cleanup(self) -> bool: try: self.delete_cookies() self.close_all_windows() - # Finally quit the driver self.driver.quit() self.driver = None self.log.debug(f'Browser successfully closed') return True except Exception as e: self.log.warning(f'Failed to close browser: {e}') - # Force driver to be None so we create a fresh instance next time self.driver = None return False return True diff --git a/scripts/demo_search.py b/scripts/demo_search.py index 3debcaf..ebfdd1c 100644 --- a/scripts/demo_search.py +++ b/scripts/demo_search.py @@ -22,7 +22,7 @@ def main( data_dir: str = typer.Option(DEFAULT_DATA_DIR, help="Prefix for output files"), headless: bool = typer.Option(False, help="Run browser in headless mode"), use_subprocess: bool = typer.Option(False, help="Run 
browser in a separate subprocess"), - version_main: int = typer.Option(133, help="Main version of Chrome to use"), + version_main: int = typer.Option(141, help="Main version of Chrome to use"), ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), ) -> None: diff --git a/scripts/demo_searches.py b/scripts/demo_searches.py index 82eee67..f63f454 100644 --- a/scripts/demo_searches.py +++ b/scripts/demo_searches.py @@ -22,7 +22,7 @@ def main( data_dir: str = typer.Option(DEFAULT_DATA_DIR, help="Prefix for output files"), headless: bool = typer.Option(False, help="Run browser in headless mode"), use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), - version_main: int = typer.Option(133, help="Main version of Chrome to use"), + version_main: int = typer.Option(141, help="Main version of Chrome to use"), ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), ) -> None: From 1cb9ae5c274493d445b8c8069b6e1927103c9798 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 14 Oct 2025 08:54:55 -0700 Subject: [PATCH 098/101] update: bump requests to 2.32.4 and protobuf to 6.31.1 --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a814d57..16f8df9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ license = "GPL-3.0" readme = "README.md" requires-python = ">=3.10" dependencies = [ - "requests>=2.32.3", + "requests>=2.32.4", "lxml>=5.3.0", "beautifulsoup4>=4.12.3", "tldextract>=5.1.2", @@ -17,7 +17,7 @@ dependencies = [ "pandas>=2.2.3", "undetected-chromedriver>=3.5.5", "selenium>=4.9.0", - "protobuf (>=6.30.0,<7.0.0)", + "protobuf (>=6.31.1,<7.0.0)", "orjson (>=3.10.16,<4.0.0)", ] From 3cb10932d00a107d57c7534d8efbcd5f35ba6b37 Mon Sep 17 00:00:00 2001 From: gitronald Date: Tue, 14 Oct 2025 08:57:02 -0700 Subject: [PATCH 099/101] update: regenerate poetry.lock --- poetry.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 22c0970..369c990 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. 
[[package]] name = "annotated-types" @@ -515,7 +515,7 @@ description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" groups = ["main", "dev"] -markers = "python_version < \"3.11\"" +markers = "python_version == \"3.10\"" files = [ {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, @@ -1941,7 +1941,7 @@ description = "A lil' TOML parser" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version < \"3.11\"" +markers = "python_version == \"3.10\"" files = [ {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, {file = "tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6"}, @@ -2259,4 +2259,4 @@ h11 = ">=0.9.0,<1" [metadata] lock-version = "2.1" python-versions = ">=3.10" -content-hash = "684e3794b5ea4541fde5a46b9bf83f67cbeedcecf4cd969dce683ffc3210b382" +content-hash = "c571829b60451314f3df0749f1f8f8b553bdfe22d4e8a183c096335cfae000ae" From eb3cec9442463487e8d36cfea45b92f039a5ebf8 Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 5 Dec 2025 13:19:56 -0800 Subject: [PATCH 100/101] update: github actions readme section --- README.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/README.md b/README.md index ba79109..fa43bd2 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ Below are some details about recent updates. For a longer list, see the [Update - [Repair or Enhance a Parser](#repair-or-enhance-a-parser) - [Add a Parser](#add-a-parser) - [Testing](#testing) + - [GitHub Actions](#github-actions) - [Update Log](#update-log) - [Similar Packages](#similar-packages) - [License](#license) @@ -253,6 +254,22 @@ With the `-k` flag you can run a test for a specific html file: pytest -k "1684837514.html" ``` +--- +## GitHub Actions + +This repository uses GitHub Actions for automated publishing: + +**Release Workflow** (`.github/workflows/publish.yml`) +Automatically publishes to PyPI when a pull request is merged into `master`. The workflow: +- Triggers on merged PRs to `master` +- Builds the package using Poetry +- Publishes to PyPI using trusted publishing (no API tokens required) + +To release a new version: +1. Update the version in `pyproject.toml` +2. Create a PR to `master` +3. 
Once merged, the package is automatically published to PyPI + --- ## Update Log From a864e09ccf4318d0b05c154c3be24397ba18f52a Mon Sep 17 00:00:00 2001 From: gitronald Date: Fri, 5 Dec 2025 13:23:51 -0800 Subject: [PATCH 101/101] version: 0.6.5 --- WebSearcher/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index f8bdeb9..6380b2f 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.5a0" +__version__ = "0.6.5" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/pyproject.toml b/pyproject.toml index 16f8df9..a63593b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.5a0" +version = "0.6.5" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"]