diff --git a/README.md b/README.md index 677c408..fa43bd2 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ Below are some details about recent updates. For a longer list, see the [Update - [Repair or Enhance a Parser](#repair-or-enhance-a-parser) - [Add a Parser](#add-a-parser) - [Testing](#testing) + - [GitHub Actions](#github-actions) - [Update Log](#update-log) - [Similar Packages](#similar-packages) - [License](#license) @@ -119,7 +120,7 @@ drwxr-xr-x 2 user user 4.0K 2024-11-11 10:55 html/ ### Step by Step -Example search and parse pipeline: +Example search and parse pipeline (via requests): ```python import WebSearcher as ws @@ -143,7 +144,7 @@ se = ws.SearchEngine( "headless": False, "use_subprocess": False, "driver_executable_path": "", - "version_main": 133, + "version_main": 141, } ) ``` @@ -253,6 +254,22 @@ With the `-k` flag you can run a test for a specific html file: pytest -k "1684837514.html" ``` +--- +## GitHub Actions + +This repository uses GitHub Actions for automated publishing: + +**Release Workflow** (`.github/workflows/publish.yml`) +Automatically publishes to PyPI when a pull request is merged into `master`. The workflow: +- Triggers on merged PRs to `master` +- Builds the package using Poetry +- Publishes to PyPI using trusted publishing (no API tokens required) + +To release a new version: +1. Update the version in `pyproject.toml` +2. Create a PR to `master` +3. Once merged, the package is automatically published to PyPI + --- ## Update Log diff --git a/WebSearcher/__init__.py b/WebSearcher/__init__.py index 9eb7a83..6380b2f 100644 --- a/WebSearcher/__init__.py +++ b/WebSearcher/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.6.4" +__version__ = "0.6.5" from .searchers import SearchEngine from .parsers import parse_serp, FeatureExtractor from .extractors import Extractor diff --git a/WebSearcher/classifiers/header_text.py b/WebSearcher/classifiers/header_text.py index 8c0c3c7..b247af8 100644 --- a/WebSearcher/classifiers/header_text.py +++ b/WebSearcher/classifiers/header_text.py @@ -90,7 +90,8 @@ def _get_header_level_mapping(level) -> dict: "local_results": [ "Local Results", "Locations", - "Places", "Sitios" + "Places", + "Sitios", "Businesses", "locations", ], @@ -116,6 +117,7 @@ def _get_header_level_mapping(level) -> dict: "News", "Noticias", "Market news"], + "recent_posts": ["Recent posts"], "twitter": ["Twitter Results"], "videos": ["Videos"] } diff --git a/WebSearcher/classifiers/main.py b/WebSearcher/classifiers/main.py index 70cb570..616a539 100644 --- a/WebSearcher/classifiers/main.py +++ b/WebSearcher/classifiers/main.py @@ -1,9 +1,9 @@ +import bs4 from .. import logger log = logger.Logger().start(__name__) from .header_text import ClassifyHeaderText from .. 
import webutils -import bs4 class ClassifyMain: """Classify a component from the main section based on its bs4.element.Tag """ @@ -14,6 +14,7 @@ def classify(cmpt: bs4.element.Tag) -> str: # Ordered list of classifiers to try component_classifiers = [ ClassifyMain.top_stories, # Check top stories + ClassifyMain.discussions_and_forums, # Check discussions and forums ClassifyHeaderText.classify, # Check levels 2 & 3 header text ClassifyMain.news_quotes, # Check news quotes ClassifyMain.img_cards, # Check image cards @@ -40,6 +41,12 @@ def classify(cmpt: bs4.element.Tag) -> str: return cmpt_type + @staticmethod + def discussions_and_forums(cmpt: bs4.element.Tag) -> str: + conditions = [ + cmpt.find("div", {"class": "IFnjPb", "role": "heading"}), + ] + return 'discussions_and_forums' if all(conditions) else "unknown" @staticmethod def available_on(cmpt: bs4.element.Tag) -> str: @@ -68,7 +75,7 @@ def general(cmpt: bs4.element.Tag) -> str: "format-01": cmpt.attrs["class"] == ["g"], "format-02": ( ("g" in cmpt.attrs["class"]) & any(s in ["Ww4FFb"] for s in cmpt.attrs["class"]) ), - "format-03": any(s in ["hlcw0c", "MjjYud"] for s in cmpt.attrs["class"]), + "format-03": any(s in ["hlcw0c", "MjjYud", "PmEWq"] for s in cmpt.attrs["class"]), "format-04": cmpt.find('div', {'class': ['g', 'Ww4FFb']}), } else: @@ -143,7 +150,9 @@ def knowledge_panel(cmpt: bs4.element.Tag) -> str: cmpt.find("h1", {"class": "VW3apb"}), cmpt.find("div", {"class": ["knowledge-panel", "knavi", "kp-blk", "kp-wholepage-osrp"]}), cmpt.find("div", {"aria-label": "Featured results", "role": "complementary"}), - webutils.check_dict_value(cmpt.attrs, "jscontroller", "qTdDb") + cmpt.find("div", {"jscontroller": "qTdDb"}), + webutils.check_dict_value(cmpt.attrs, "jscontroller", "qTdDb"), + cmpt.find('div', {'class':'obcontainer'}) ] return 'knowledge' if any(conditions) else "unknown" @@ -179,10 +188,9 @@ def top_stories(cmpt: bs4.element.Tag) -> str: @staticmethod def news_quotes(cmpt: bs4.element.Tag) -> str: """Classify top stories components""" - conditions = [ - cmpt.find("g-tray-header", role="heading"), - ] - return 'news_quotes' if all(conditions) else "unknown" + header_div = cmpt.find("g-tray-header", role="heading") + condition = webutils.get_text(header_div, strip=True) == "News quotes" + return 'news_quotes' if condition else "unknown" @staticmethod def twitter(cmpt: bs4.element.Tag) -> str: diff --git a/WebSearcher/component_parsers/__init__.py b/WebSearcher/component_parsers/__init__.py index aaff223..afb67a2 100644 --- a/WebSearcher/component_parsers/__init__.py +++ b/WebSearcher/component_parsers/__init__.py @@ -15,6 +15,7 @@ from .latest_from import parse_latest_from from .local_news import parse_local_news from .perspectives import parse_perspectives +from .recent_posts import parse_recent_posts from .local_results import parse_local_results from .map_results import parse_map_results @@ -57,6 +58,7 @@ ('news_quotes', parse_news_quotes, 'News Quotes'), ('people_also_ask', parse_people_also_ask, 'People Also Ask'), ('perspectives', parse_perspectives, 'Perspectives & Opinions'), + ('recent_posts', parse_recent_posts, 'Recent Posts'), ('scholarly_articles', parse_scholarly_articles, 'Scholar Articles'), ('searches_related', parse_searches_related, 'Related Searches'), ('shopping_ads', parse_shopping_ads, 'Shopping Ad'), diff --git a/WebSearcher/component_parsers/ads.py b/WebSearcher/component_parsers/ads.py index d38917c..26d7480 100644 --- a/WebSearcher/component_parsers/ads.py +++ 
b/WebSearcher/component_parsers/ads.py @@ -6,6 +6,7 @@ - added new div class for text field - added labels (e.g., "Provides abortions") from , appended to text field +2025-04-27: added carousel sub_type, global parsed output """ @@ -13,6 +14,16 @@ from .shopping_ads import parse_shopping_ads import bs4 +PARSED = { + 'type': 'ad', + 'sub_type': '', + 'sub_rank': 0, + 'title': '', + 'url': '', + 'cite': '', + 'text': '', +} + def parse_ads(cmpt: bs4.element.Tag) -> list: """Parse ads from ad component""" @@ -27,12 +38,14 @@ def parse_ads(cmpt: bs4.element.Tag) -> list: parsed_list = [parse_ad_secondary(sub, sub_rank) for sub_rank, sub in enumerate(subs)] elif sub_type == 'standard': subs = webutils.find_all_divs(cmpt, 'div', {'class': ['uEierd', 'commercial-unit-desktop-top']}) - for sub in subs: + for sub_rank, sub in enumerate(subs): sub_classes = sub.attrs.get("class", []) if "commercial-unit-desktop-top" in sub_classes: parsed_list.extend(parse_shopping_ads(sub)) elif "uEierd" in sub_classes: - parsed_list.append(parse_ad(sub)) + parsed_list.append(parse_ad(sub, sub_rank=sub_rank)) + elif sub_type == 'carousel': + parsed_list = parse_ad_carousel(cmpt, sub_type) return parsed_list @@ -41,7 +54,8 @@ def classify_ad_type(cmpt: bs4.element.Tag) -> str: label_divs = { "legacy": webutils.find_all_divs(cmpt, 'div', {'class': 'ad_cclk'}), "secondary": webutils.find_all_divs(cmpt, 'div', {'class': 'd5oMvf'}), - "standard": webutils.find_all_divs(cmpt, 'div', {'class': ['uEierd', 'commercial-unit-desktop-top']}) + "standard": webutils.find_all_divs(cmpt, 'div', {'class': ['uEierd', 'commercial-unit-desktop-top']}), + "carousel": webutils.find_all_divs(cmpt, 'g-scrolling-carousel'), } for label, divs in label_divs.items(): if divs: @@ -49,12 +63,62 @@ def classify_ad_type(cmpt: bs4.element.Tag) -> str: return 'unknown' +def parse_ad_carousel(cmpt: bs4.element.Tag, sub_type: str, filter_visible: bool = True) -> list: + + def parse_ad_carousel_div(sub: bs4.element.Tag, sub_type: str, sub_rank: int) -> dict: + """Parse ad carousel div, seen 2025-02-06""" + parsed = PARSED.copy() + parsed['sub_type'] = sub_type + parsed['sub_rank'] = sub_rank + parsed['title'] = webutils.get_text(sub, 'div', {'class':'e7SMre'}) + parsed['url'] = webutils.get_link(sub) + parsed['text'] = webutils.get_text(sub, 'div', {"class":"vrAZpb"}) + parsed['cite'] = webutils.get_text(sub, 'div', {"class":"zpIwr"}) + parsed['visible'] = not (sub.has_attr('data-has-shown') and sub['data-has-shown'] == 'false') + return parsed + + def parse_ad_carousel_card(sub: bs4.element.Tag, sub_type: str, sub_rank: int) -> dict: + """Parse ad carousel card, seen 2024-09-21""" + parsed = PARSED.copy() + parsed['sub_type'] = sub_type + parsed['sub_rank'] = sub_rank + parsed['title'] = webutils.get_text(sub, 'div', {'class':'gCv54b'}) + parsed['url'] = webutils.get_link(sub, {"class": "KTsHxd"}) + parsed['text'] = webutils.get_text(sub, 'div', {"class":"VHpBje"}) + parsed['cite'] = webutils.get_text(sub, 'div', {"class":"j958Pd"}) + parsed['visible'] = not (sub.has_attr('data-viewurl') and sub['data-viewurl']) + return parsed + + ad_carousel_parsers = [ + {'find_kwargs': {'name': 'g-inner-card'}, + 'parser': parse_ad_carousel_card}, + {'find_kwargs': {'name': 'div', 'attrs': {'class': 'ZPze1e'}}, + 'parser': parse_ad_carousel_div} + ] + + output_list = [] + ad_carousel = cmpt.find('g-scrolling-carousel') + if ad_carousel: + for parser_details in ad_carousel_parsers: + parser_func = parser_details['parser'] + kwargs = parser_details['find_kwargs'] + 
sub_cmpts = webutils.find_all_divs(ad_carousel, **kwargs) + if sub_cmpts: + for sub_rank, sub in enumerate(sub_cmpts): + parsed = parser_func(sub, sub_type, sub_rank) + output_list.append(parsed) + + if filter_visible: + output_list = [{k:v for k,v in x.items() if k != 'visible'} for x in output_list if x['visible']] + return output_list + + def parse_ad(sub: bs4.element.Tag, sub_rank: int = 0) -> dict: """Parse details of a single ad subcomponent, similar to general""" - parsed = {"type": "ad", - "sub_type": "standard", - "sub_rank": sub_rank} - + parsed = PARSED.copy() + parsed["sub_type"] = "standard" + parsed["sub_rank"] = sub_rank + parsed['title'] = webutils.get_text(sub, 'div', {'role':'heading'}) parsed['url'] = webutils.get_link(sub, {"class":"sVXRqc"}) parsed['cite'] = webutils.get_text(sub, 'span', {"role":"text"}) @@ -96,13 +160,14 @@ def parse_ad_menu(sub: bs4.element.Tag) -> list: def parse_ad_secondary(sub: bs4.element.Tag, sub_rank: int = 0) -> dict: """Parse details of a single ad subcomponent, similar to general""" + parsed = PARSED.copy() + parsed["sub_type"] = "secondary" + parsed["sub_rank"] = sub_rank - parsed = {"type": "ad", - "sub_type": "secondary", - "sub_rank": sub_rank} - parsed['title'] = sub.find('div', {'role':'heading'}).text - parsed['url'] = sub.find('div', {'class':'d5oMvf'}).find('a')['href'] - parsed['cite'] = sub.find('span', {'class':'gBIQub'}).text + parsed['title'] = webutils.get_text(sub, 'div', {'role':'heading'}) + link_div = sub.find('div', {'class':'d5oMvf'}) + parsed['url'] = webutils.get_link(link_div) if link_div else '' + parsed['cite'] = webutils.get_text(sub, 'span', {'class':'gBIQub'}) # Take the top div with this class, should be main result abstract text_divs = sub.find_all('div', {'class':'yDYNvb'}) @@ -123,14 +188,14 @@ def parse_ad_secondary(sub: bs4.element.Tag, sub_rank: int = 0) -> dict: def parse_ad_legacy(sub: bs4.element.Tag, sub_rank: int = 0) -> dict: """[legacy] Parse details of a single ad subcomponent, similar to general""" - - parsed = {"type": "ad", - "sub_type": "legacy", - "sub_rank": sub_rank} + parsed = PARSED.copy() + parsed["sub_type"] = "legacy" + parsed["sub_rank"] = sub_rank + header = sub.find('div', {'class':'ad_cclk'}) - parsed['title'] = header.find('h3').text - parsed['url'] = header.find('cite').text - parsed['text'] = sub.find('div', {'class':'ads-creative'}).text + parsed['title'] = webutils.get_text(header, 'h3') + parsed['url'] = webutils.get_text(header, 'cite') + parsed['text'] = webutils.get_text(sub, 'div', {'class':'ads-creative'}) bottom_text = sub.find('ul') if bottom_text: diff --git a/WebSearcher/component_parsers/footer.py b/WebSearcher/component_parsers/footer.py index b60ff86..e45a044 100644 --- a/WebSearcher/component_parsers/footer.py +++ b/WebSearcher/component_parsers/footer.py @@ -2,13 +2,13 @@ class Footer: - @classmethod - def parse_image_cards(self, elem) -> list: + @staticmethod + def parse_image_cards(elem) -> list: subs = webutils.find_all_divs(elem, 'div', {'class':'g'}) - return [self.parse_image_card(sub, sub_rank) for sub_rank, sub in enumerate(subs)] + return [Footer.parse_image_card(sub, sub_rank) for sub_rank, sub in enumerate(subs)] - @classmethod - def parse_image_card(self, sub, sub_rank=0) -> dict: + @staticmethod + def parse_image_card(sub, sub_rank=0) -> dict: parsed = {'type':'img_cards', 'sub_rank':sub_rank} parsed['title'] = webutils.get_text(sub, "div", {'aria-level':"3", "role":"heading"}) images = sub.find_all('img') @@ -16,8 +16,8 @@ def 
parse_image_card(self, sub, sub_rank=0) -> dict: parsed['details'] = [{'text':i['alt'], 'url':i['src']} for i in images] return parsed - @classmethod - def parse_discover_more(self, elem) -> list: + @staticmethod + def parse_discover_more(elem) -> list: carousel = elem.find('g-scrolling-carousel') return [{ 'type':'discover_more', @@ -25,8 +25,8 @@ def parse_discover_more(self, elem) -> list: 'text': '|'.join(c.text for c in carousel.find_all('g-inner-card')) }] - @classmethod - def parse_omitted_notice(self, elem) -> list: + @staticmethod + def parse_omitted_notice(elem) -> list: return [{ 'type':'omitted_notice', 'sub_rank':0, diff --git a/WebSearcher/component_parsers/images.py b/WebSearcher/component_parsers/images.py index 27f932d..ac4907c 100644 --- a/WebSearcher/component_parsers/images.py +++ b/WebSearcher/component_parsers/images.py @@ -1,14 +1,14 @@ +""" Parsers for image components + +Changelog +2025-04-28: added div subcomponent class and sub_type labels + +""" + from ..webutils import get_text, get_link, get_div def parse_images(cmpt) -> list: - """Parse an image component - - Args: - cmpt (bs4 object): an image component - - Returns: - list: list of parsed subcomponent dictionaries - """ + """Parse an images component""" parsed_list = [] @@ -25,7 +25,7 @@ def parse_images(cmpt) -> list: parsed_list.extend(parsed_subs) else: # Medium images with titles and urls - subs = cmpt.find_all('div', {'class':'eA0Zlc'}) + subs = cmpt.find_all('div', {'class': ['eA0Zlc', 'vCUuC']}) parsed_subs = [parse_image_medium(sub, sub_rank + len(parsed_list)) for sub_rank, sub in enumerate(subs)] parsed_list.extend(parsed_subs) @@ -35,14 +35,7 @@ def parse_images(cmpt) -> list: return parsed_list def parse_image_multimedia(sub, sub_rank=0) -> dict: - """Parse an image subcomponent - - Args: - sub (bs4 object): an image subcomponent - - Returns: - dict : parsed subresult - """ + """Parse an images multimedia subcomponent""" return { "type": "images", "sub_type": "multimedia", @@ -53,19 +46,17 @@ def parse_image_multimedia(sub, sub_rank=0) -> dict: } def parse_image_medium(sub, sub_rank=0) -> dict: - """Parse an image subcomponent - - Args: - sub (bs4 object): an image subcomponent - - Returns: - dict : parsed subresult - """ + """Parse an images medium subcomponent""" title_div = get_div(sub, 'a', {'class':'EZAeBe'}) - title = get_text(title_div) if title_div else get_img_alt(sub) + title = get_text(title_div) if title_div else get_text(sub, 'span', {'class':'Yt787'}) url = get_link(sub) if title_div else get_img_url(sub) + if not title: + title = get_img_alt(sub) + if not url: + url = get_link(sub, attrs={'class':['EZAeBe', 'ddkIM']}) + return { "type": "images", "sub_type": "medium", @@ -77,14 +68,8 @@ def parse_image_medium(sub, sub_rank=0) -> dict: } def parse_image_small(sub, sub_rank=0) -> dict: - """Parse an image subcomponent - - Args: - sub (bs4 object): an image subcomponent - - Returns: - dict : parsed subresult - """ + """Parse an images small subcomponent""" + return { "type": "images", "sub_type": "small", @@ -121,7 +106,7 @@ def get_image_url_from_attrs(sub): try: url = func(sub) if url.startswith('data:image'): - raise ValueError(f"Data URL: {img_src}") + raise ValueError(f"Data URL: {url}") else: return url except Exception as e: diff --git a/WebSearcher/component_parsers/recent_posts.py b/WebSearcher/component_parsers/recent_posts.py new file mode 100644 index 0000000..ee0a24d --- /dev/null +++ b/WebSearcher/component_parsers/recent_posts.py @@ -0,0 +1,14 @@ +from .top_stories 
import parse_top_stories + +def parse_recent_posts(cmpt): + """Parse a "Recent posts" component + + These components have a similar carousel as Top Stories and Perspectives. + + Args: + cmpt (bs4 object): A html component + + Returns: + dict : parsed result + """ + return parse_top_stories(cmpt, ctype='recent_posts') diff --git a/WebSearcher/component_parsers/videos.py b/WebSearcher/component_parsers/videos.py index 950d849..cd374d3 100644 --- a/WebSearcher/component_parsers/videos.py +++ b/WebSearcher/component_parsers/videos.py @@ -1,8 +1,9 @@ """ Parsers for video components Changelog -2021-05-08: added find_all for divs with class 'VibNM' -2021-05-08: added adjustment for new cite and timestamp +2024-05-08: added find_all for divs with class 'VibNM' +2024-05-08: added adjustment for new cite and timestamp +2025-04-27: added div subcomponent class and sub_type labels """ @@ -23,24 +24,25 @@ def parse_videos(cmpt) -> list: # Get known div structures divs = [] name_attrs = [ - {'name':'g-inner-card'}, - {'name':'div', 'attrs':{'class':'VibNM'}}, - {'name':'div', 'attrs':{'class':'mLmaBd'}}, - {'name':'div', 'attrs':{'class':'RzdJxc'}}, + ({'name':'g-inner-card'}, 'unspecified-0'), + ({'name':'div', 'attrs':{'class':'VibNM'}}, 'unspecified-1'), + ({'name':'div', 'attrs':{'class':'mLmaBd'}}, 'unspecified-2'), + ({'name':'div', 'attrs':{'class':'RzdJxc'}}, 'unspecified-3'), + ({'name':'div', 'attrs':{'class':'sHEJob'}}, 'vertical'), ] - for kwargs in name_attrs: + for kwargs, sub_type in name_attrs: divs = webutils.find_all_divs(cmpt, **kwargs) if divs: break divs = list(filter(None, divs)) if divs: - return [parse_video(div, i) for i, div in enumerate(divs)] + return [parse_video(div, sub_type, i) for i, div in enumerate(divs)] else: return [{'type': 'videos', 'sub_rank': 0, 'error': 'No subcomponents found'}] -def parse_video(sub, sub_rank=0) -> dict: +def parse_video(sub, sub_type: str, sub_rank=0) -> dict: """Parse a videos subcomponent Args: @@ -52,6 +54,7 @@ def parse_video(sub, sub_rank=0) -> dict: parsed = { 'type': 'videos', + 'sub_type': sub_type, 'sub_rank': sub_rank, 'url': get_url(sub), 'title': webutils.get_text(sub, 'div', {'role':'heading'}), @@ -82,7 +85,6 @@ def parse_video(sub, sub_rank=0) -> dict: return parsed - def get_url(sub): """Get video URL by filtering for non-hash links""" all_urls = sub.find_all('a') diff --git a/WebSearcher/extractors.py b/WebSearcher/extractors.py deleted file mode 100644 index 5b1202e..0000000 --- a/WebSearcher/extractors.py +++ /dev/null @@ -1,359 +0,0 @@ -from .components import Component, ComponentList -from . import utils -from . import webutils -from . 
import logger -log = logger.Logger().start(__name__) -import bs4 - - -class Extractor: - def __init__(self, soup: bs4.BeautifulSoup): - self.soup = soup - self.components = ComponentList() - self.rhs = {} - self.layout_divs = { - "rso": None, - "top-bars": None, - "left-bar": None, - } - self.layouts = { - "rso": False, - "top-bars": False, - "left-bar": False, - "standard": False, - "no-rso": False, - } - self.layout_label = None - self.layout_extractors = { - "standard": self.extract_from_standard, - "top-bars": self.extract_from_top_bar, - "left-bar": self.extract_from_left_bar, - "no-rso": self.extract_from_no_rso - } - - def extract_components(self): - log.debug("Extracting Components") - self.extract_rhs() - self.extract_header() - self.extract_main() - self.extract_footer() - self.append_rhs() - log.debug(f"Extracted {self.components.cmpt_rank_counter:,} components") - - # -------------------------------------------------------------------------- - # Right Hand Sidebar Components - # -------------------------------------------------------------------------- - - def extract_rhs(self): - """Extract the Right Hand Side (RHS) Knowledge Panel. Can appear in arbitrary order, must extract first.""" - rhs_kws = ('div', {'id': 'rhs'}) - rhs = self.soup.find(*rhs_kws).extract() if self.soup.find(*rhs_kws) else None - if rhs: - rhs_layouts = { - 'rhs_complementary': rhs if webutils.check_dict_value(rhs.attrs, "role", "complementary") else None, - 'rhs_knowledge': rhs.find('div', {'class': ['kp-wholepage', 'knowledge-panel', 'TzHB6b']}), - } - rhs_layout = next((layout for layout, component in rhs_layouts.items() if component), None) - if rhs_layout: - log.debug(f"rhs_layout: {rhs_layout}") - self.rhs = {"elem": rhs_layouts[rhs_layout], - "section": "rhs", - "type": "knowledge_rhs"} - else: - log.debug(f"no rhs_layout") - - - def append_rhs(self): - """Append the RHS Knowledge Panel to the components list at the end""" - if self.rhs: - log.debug(f"appending rhs") - self.components.add_component(**self.rhs) - self.rhs = None - - - # -------------------------------------------------------------------------- - # Header Components - # -------------------------------------------------------------------------- - - def extract_header(self): - """Extract the header section, often a carousel of images or other suggestions.""" - self.extract_top_bar() - self.extract_notices() - - - def extract_top_bar(self): - """Extract the top bar section, often a carousel of images or other suggestions.""" - top_bar = self.soup.find('div', {'id':'appbar'}) - if top_bar: - has_img = top_bar.find(lambda tag: tag.has_attr('src') and not tag.has_attr('data-src')) - if top_bar.find('g-scrolling-carousel') and has_img: - self.components.add_component(top_bar, section='header', type='top_image_carousel') - - - def extract_notices(self): - """Append notices to the components list at the end""" - notices = webutils.find_all_divs(self.soup, "div", {"id": "oFNiHe"}) - notices = webutils.filter_empty_divs(notices) - log.debug(f"notices: {len(notices)}") - for notice in notices: - self.components.add_component(notice, section="header", type="notice") - - # -------------------------------------------------------------------------- - # Main Components - # -------------------------------------------------------------------------- - - def extract_main(self): - """Extract the main results sections of the SERP""" - # self.extract_main_shopping_ads() - self.extract_main_ads_top() - self.extract_main_components() - 
self.extract_main_ads_bottom() - - - # def extract_main_shopping_ads(self): - # """Extract the main shopping ads section of the SERP""" - # shopping_ads = self.soup.find('div', {'class': 'commercial-unit-desktop-top'}) - # if shopping_ads: - # self.components.add_component(shopping_ads, section='main', type='shopping_ads') - - - def extract_main_ads_top(self): - """Extract the main ads section of the SERP""" - ads = self.soup.find('div', {'id':'tads'}) - if ads and webutils.get_text(ads): - # Filter if already extracted as shopping ads - # if not ads.find('div', {'class': 'commercial-unit-desktop-top'}): - self.components.add_component(ads, section='main', type='ad') - - - def extract_main_components(self, drop_tags: set={'script', 'style', None}): - """Extract main components based on SERP layout""" - log.debug("Extracting main column components") - self.check_layout_main() - try: - layout_extractor = self.layout_extractors[self.layout_label] - column = layout_extractor(drop_tags) - for component in column: - if Extractor.is_valid_main_component(component): - self.components.add_component(component, section='main') - except KeyError: - raise ValueError(f"no extractor for layout_label: {self.layout_label}") - log.debug(f"Extracted main components: {self.components.cmpt_rank_counter:,}") - - - def extract_main_ads_bottom(self): - """Extract the main ads section of the SERP""" - ads = self.soup.find('div', {'id':'tadsb'}) - if ads and webutils.get_text(ads): - self.components.add_component(ads, section='main', type='ad') - - # -------------------------------------------------------------------------- - # Layout Specifics - # -------------------------------------------------------------------------- - - - def check_layout_main(self): - """Divide and label the page layout""" - log.debug(f"Checking SERP layout") - - # Layout soup subsets - self.layout_divs['rso'] = self.soup.find('div', {'id':'rso'}) - self.layout_divs['left-bar'] = self.soup.find('div', {'class': 'OeVqAd'}) - self.layout_divs['top-bars'] = self.soup.find_all('div', {'class': ['XqFnDf', 'M8OgIe']}) - - # Layout classifications - self.layouts['rso'] = bool(self.layout_divs['rso']) - self.layouts['top-bars'] = bool(self.layout_divs['top-bars']) - self.layouts['left-bar'] = bool(self.layout_divs['left-bar']) - self.layouts['standard'] = (self.layouts['rso'] & - (not self.layouts['top-bars']) & - (not self.layouts['left-bar'])) - self.layouts['no-rso'] = not self.layouts['rso'] - - # Get layout label - label_matches = [k for k,v in self.layouts.items() if k !='rso' and v] - first_match = label_matches[0] if label_matches else None - self.layout_label = first_match - log.debug(f"layout: {self.layout_label}") - - - def extract_from_standard(self, drop_tags: set = {}) -> list: - - if self.layout_divs['rso'].find('div', {'id':'kp-wp-tab-overview'}): - log.debug("layout update: standard-alt-1") - self.layout_label = 'standard-alt' - column = self.layout_divs['rso'].find_all('div', {'class':'TzHB6b'}) - return column - - column = Extractor.extract_children(self.layout_divs['rso'], drop_tags) - column = [c for c in column if Extractor.is_valid_main_component(c)] - - if len(column) == 0: - log.debug("layout update: standard-alt-0") - self.layout_label = 'standard-alt' - divs = self.layout_divs['rso'].find_all('div', {'id':'kp-wp-tab-overview'}) - column = sum([div.find_all('div', {'class':'TzHB6b'}) for div in divs], []) - return column - - - def extract_from_top_bar(self, drop_tags: set = {}) -> list: - """Extract components from top-bars 
layout""" - column = [] - - top_bar_divs = Extractor.extract_from_top_bar_divs(self.layout_divs['top-bars']) - column.extend(top_bar_divs) - - rso_layout_divs = self.layout_divs['rso'].find_all('div', {'class':'sATSHe'}) - if rso_layout_divs: - self.layout_label = 'top-bars-divs' - layout_column = [div for div in rso_layout_divs if div.name not in drop_tags] - else: - self.layout_label = 'top-bars-children' - layout_column = Extractor.extract_children(self.layout_divs['rso'], drop_tags) - log.debug(f"layout update: {self.layout_label}") - - column.extend(layout_column) - return column - - @staticmethod - def extract_from_top_bar_divs(soup, drop_tags: set = {}) -> list: - output_list = [] - for top_bar in soup: - if webutils.check_dict_value(top_bar.attrs, "class", ["M8OgIe"]): - knowledge_divs = webutils.find_all_divs(top_bar, "div", {"jscontroller": ["qTdDb", "OWrb3e"]}) - output_list.extend(knowledge_divs) - log.debug(f"layout: M8OgIe divs: {len(knowledge_divs)}") - else: - output_list.append(top_bar) - return output_list - - - def extract_from_left_bar(self, drop_tags: set = {}) -> list: - """Extract components from left-bar layout""" - column = self.soup.find_all('div', {'class':'TzHB6b'}) - return column - - - def extract_from_no_rso(self, drop_tags: set = {}) -> list: - """Extract components from no-rso layout""" - log.debug("layout: no-rso") - column = [] - section1 = self.soup.find_all('div', {'class':'UDZeY OTFaAf'}) - for div in section1: - - # Conditional handling for Twitter result - if div.find('h2') and div.find('h2').text == "Twitter Results": - column.append(div.find('div').parent) - - # Conditional handling for g-section with header - elif div.find('g-section-with-header'): - column.append(div.find('g-section-with-header').parent) - - # Include divs with a "View more" type of button - elif div.find('g-more-link'): - column.append(div) - - # Include footer components that appear in the main column - elif div.find('div', {'class':'oIk2Cb'}): - column.append(div) - - else: - # Handle general results - for child in div.find_all('div', {'class':'g'}): - column.append(child) - - # Find section 2 results and append to column list - section2 = self.soup.find('div', {'class':'WvKfwe a3spGf'}) - if section2: - for child in section2.children: - column.append(child) - column = [c for c in column if c.name not in drop_tags] - return column - - - @staticmethod - def extract_children(soup: bs4.BeautifulSoup, drop_tags: set = {}) -> list: - """Extract children from BeautifulSoup, drop specific tags, flatten list""" - log.debug("layout: extracting children") - children = [] - for child in soup.children: - if child.name in drop_tags: - continue - if not child.attrs: - children.extend(child.contents) - else: - children.append(child) - return children - - - @staticmethod - def is_valid_main_component(c) -> bool: - """Check if a given component is neither empty nor a hidden survey""" - if not c: - return False - else: - drop_text = { - "Main results", # Remove empty rso component; hidden
header - "Twitter Results", # Remove empty Twitter component - "", # Remove empty divs - } - return c.text not in drop_text and not Extractor.is_hidden_survey(c) - - @staticmethod - def is_hidden_survey(element): - """Check if a component is a hidden survey component; no visual presence so filter out""" - conditions = [ - element.find('promo-throttler'), - webutils.check_dict_value(element.attrs, "class", ["ULSxyf"]), - ] - return all(conditions) - - - # -------------------------------------------------------------------------- - # Footer Components - # -------------------------------------------------------------------------- - - - def extract_footer(self): - """Extract the footer section of the SERP""" - log.debug("extracting footer components") - - footer_div = self.soup.find('div', {'id':'botstuff'}) - footer_component_list = [] - - # Check if footer div exists - if footer_div: - footer_component_divs = webutils.find_all_divs(self.soup, 'div', {'id':['bres', 'brs']}) - if footer_component_divs: - log.debug(f"found footer components: {len(footer_component_divs):,}") - - # Expand components by checking for nested divs - for footer_component_div in footer_component_divs: - expanded_divs = webutils.find_all_divs(footer_component_div, "div", {"class":"MjjYud"}) - if expanded_divs and len(expanded_divs) > 1: - footer_component_list.extend(expanded_divs) - else: - footer_component_list.append(footer_component_div) - - # Check for omitted notice - omitted_notice = self.soup.find('div', {'class':'ClPXac'}) - if omitted_notice: - footer_component_list.append(omitted_notice) - - footer_component_list = [e for e in footer_component_list if not Extractor.is_hidden_footer(e)] - log.debug(f'footer_component_list len: {len(footer_component_list)}') - - for footer_component in footer_component_list: - self.components.add_component(footer_component, section='footer') - - - @staticmethod - def is_hidden_footer(element): - """Check if a component is a hidden footer component; no visual presence so filter out""" - conditions = [ - # element.find("b", {"class":"uDuvJd"}), - element.find("span", {"class":"oUAcPd"}), - element.find("div", {"class": "RTaUke"}), - element.find("div", {"class": "KJ7Tg"}), - ] - return any(conditions) diff --git a/WebSearcher/extractors/__init__.py b/WebSearcher/extractors/__init__.py new file mode 100644 index 0000000..59252a6 --- /dev/null +++ b/WebSearcher/extractors/__init__.py @@ -0,0 +1,27 @@ +import bs4 +from ..components import ComponentList +from .extractor_rhs import ExtractorRightHandSide +from .extractor_main import ExtractorMain +from .extractor_header import ExtractorHeader +from .extractor_footer import ExtractorFooter + +from .. 
import logger +log = logger.Logger().start(__name__) + +class Extractor: + def __init__(self, soup: bs4.BeautifulSoup): + self.soup = soup + self.components = ComponentList() + self.rhs_handler = ExtractorRightHandSide(self.soup, self.components) + self.header_handler = ExtractorHeader(self.soup, self.components) + self.main_handler = ExtractorMain(self.soup, self.components) + self.footer_handler = ExtractorFooter(self.soup, self.components) + + def extract_components(self): + log.debug(f"Extracting Components {'-'*50}") + self.rhs_handler.extract() + self.header_handler.extract() + self.main_handler.extract() + self.footer_handler.extract() + self.rhs_handler.append() + log.debug(f"total components: {self.components.cmpt_rank_counter:,}") diff --git a/WebSearcher/extractors/extractor_footer.py b/WebSearcher/extractors/extractor_footer.py new file mode 100644 index 0000000..abe2530 --- /dev/null +++ b/WebSearcher/extractors/extractor_footer.py @@ -0,0 +1,53 @@ +import bs4 +from .. import webutils +from .. import logger + +log = logger.Logger().start(__name__) + +class ExtractorFooter: + def __init__(self, soup: bs4.BeautifulSoup, components): + self.soup = soup + self.components = components + + def extract(self): + """Extract the footer section of the SERP""" + + footer_div = self.soup.find('div', {'id':'botstuff'}) + footer_component_list = [] + + if footer_div: + footer_component_divs = webutils.find_all_divs( + self.soup, 'div', {'id': ['bres', 'brs']} + ) + if footer_component_divs: + for footer_component_div in footer_component_divs: + expanded_divs = webutils.find_all_divs( + footer_component_div, "div", {"class": "MjjYud"} + ) + if expanded_divs and len(expanded_divs) > 1: + footer_component_list.extend(expanded_divs) + else: + footer_component_list.append(footer_component_div) + + omitted_notice = self.soup.find('div', {'class':'ClPXac'}) + if omitted_notice: + footer_component_list.append(omitted_notice) + + footer_component_list = [ + e for e in footer_component_list + if not ExtractorFooter.is_hidden_footer(e) + ] + log.debug(f'footer_components: {len(footer_component_list)}') + + for footer_component in footer_component_list: + self.components.add_component(footer_component, section='footer') + + @staticmethod + def is_hidden_footer(element): + """Filter out hidden footer components (no visual presence).""" + conditions = [ + element.find("span", {"class":"oUAcPd"}), + element.find("div", {"class": "RTaUke"}), + element.find("div", {"class": "KJ7Tg"}), + ] + return any(conditions) \ No newline at end of file diff --git a/WebSearcher/extractors/extractor_header.py b/WebSearcher/extractors/extractor_header.py new file mode 100644 index 0000000..7955d04 --- /dev/null +++ b/WebSearcher/extractors/extractor_header.py @@ -0,0 +1,33 @@ +import bs4 +from .. import webutils +from .. 
import logger + +log = logger.Logger().start(__name__) + +class ExtractorHeader: + def __init__(self, soup: bs4.BeautifulSoup, components): + self.soup = soup + self.components = components + self.exists = False + + def extract(self): + """Extract the header section: appbar and notices.""" + self.extract_appbar() + self.extract_notices() + + def extract_appbar(self): + """Extract the top bar section, often a carousel of images or other suggestions.""" + appbar = self.soup.find('div', {'id':'appbar'}) + if appbar: + has_img = appbar.find(lambda tag: tag.has_attr('src') and not tag.has_attr('data-src')) + if appbar.find('g-scrolling-carousel') and has_img: + self.components.add_component(appbar, section='header', type='top_image_carousel') + self.exists = True + + def extract_notices(self): + """Append notices to the components list at the end.""" + notices = webutils.find_all_divs(self.soup, "div", {"id": "oFNiHe"}, filter_empty=True) + if notices: + self.exists = True + for notice in notices: + self.components.add_component(notice, section="header", type="notice") \ No newline at end of file diff --git a/WebSearcher/extractors/extractor_main.py b/WebSearcher/extractors/extractor_main.py new file mode 100644 index 0000000..7be0c19 --- /dev/null +++ b/WebSearcher/extractors/extractor_main.py @@ -0,0 +1,236 @@ +import bs4 +from .. import webutils +from ..logger import Logger + +log = Logger().start(__name__) + +class ExtractorMain: + def __init__(self, soup: bs4.BeautifulSoup, components): + self.soup = soup + self.components = components + + # copied from Extractor.__init__ + self.layout_divs = { + "rso": None, + "top-bars": None, + "left-bar": None, + } + self.layouts = { + "top-bars": False, + "left-bar": False, + "standard": False, + "no-rso": False, + } + self.layout_label = None + self.layout_extractors = { + "standard": self.extract_from_standard, + "top-bars": self.extract_from_top_bar, + "left-bar": self.extract_from_left_bar, + "no-rso": self.extract_from_no_rso + } + + def extract(self): + self.get_layout() + self._ads_top() + self._main_column() + self._ads_bottom() + log.debug(f"main_components: {self.components.cmpt_rank_counter:,}") + + def get_layout(self): + """Divide and label the page layout""" + + # Layout soup subsets + layout_divs = {} + layout_divs['rso'] = self.soup.find('div', {'id':'rso'}) + layout_divs['left-bar'] = self.soup.find('div', {'class': 'OeVqAd'}) + + rcnt = self.soup.find('div', {'id':'rcnt'}) + layout_divs['top-bars'] = webutils.find_all_divs(rcnt, 'div', {'class': ['XqFnDf', 'M8OgIe']}) + + # Layout classifications + layouts = {} + layouts['top-bars'] = bool(layout_divs['top-bars']) + layouts['left-bar'] = bool(layout_divs['left-bar']) + layouts['standard'] = ( + bool(layout_divs['rso']) & + (not layouts['top-bars']) & + (not layouts['left-bar']) + ) + layouts['no-rso'] = not bool(layout_divs['rso']) + + if layouts['top-bars'] and bool(layout_divs['rso']) and not layouts['left-bar']: + layout_label = 'standard' + else: + # Get layout label + label_matches = [k for k,v in layouts.items() if v] + layout_label = label_matches[0] if label_matches else None + + # Set layout details + log.debug(f"main_layout: {layout_label}") + self.layout_label = layout_label + self.layouts.update(layouts) + self.layout_divs.update(layout_divs) + + def _ads_top(self): + ads = self.soup.find('div', {'id':'tads'}) + if ads and webutils.get_text(ads): + ads.extract() + self.components.add_component(ads, section='main', type='ad') + + def _main_column(self, drop_tags: set = 
{'script', 'style', None}): + try: + extractor = self.layout_extractors[self.layout_label] + except KeyError: + raise ValueError(f"no extractor for layout_label: {self.layout_label}") + + column = extractor(drop_tags) + column = webutils.filter_empty_divs(column) + for c in column: + if ExtractorMain.is_valid(c): + self.components.add_component(c, section='main') + + def _ads_bottom(self): + ads = self.soup.find('div', {'id':'tadsb'}) + if ads and webutils.get_text(ads): + ads.extract() + self.components.add_component(ads, section='main', type='ad') + + def extract_from_standard(self, drop_tags:set={}) -> list: + + rso_div = self.layout_divs['rso'] + standard_layouts = { + "standard-0": rso_div.find('div', {'id':'kp-wp-tab-overview'}), + "standard-1": rso_div.find('div', {'id':'kp-wp-tab-cont-Songs', 'role':'tabpanel'}), + "standard-2": rso_div.find('div', {'id':'kp-wp-tab-SportsStandings'}), + } + for layout_name, layout_div in standard_layouts.items(): + if layout_div: + if layout_div.find_all("div"): + return self._extract_from_standard_sub_type(layout_name) + + top_divs = ExtractorMain.extract_top_divs(self.layout_divs['top-bars']) or [] + col = ExtractorMain.extract_children(rso_div, drop_tags) + col = top_divs + col + col = [c for c in col if ExtractorMain.is_valid(c)] + if not col: + self.layout_label = 'standard-3' + log.debug(f"main_layout: {self.layout_label} (update)") + divs = rso_div.find_all('div', {'id':'kp-wp-tab-overview'}) + col = sum([d.find_all('div', {'class':'TzHB6b'}) for d in divs], []) + return col + + def _extract_from_standard_sub_type(self, sub_type:str = "") -> list: + + self.layout_label = sub_type + rso_div = self.layout_divs['rso'] + log.debug(f"main_layout: {self.layout_label} (update)") + + if self.layout_label == "standard-0": + column = [] + top_divs = ExtractorMain.extract_top_divs(self.layout_divs['top-bars']) or [] + main_divs = rso_div.find_all('div', {'class':'TzHB6b'}) or [] + column.extend(top_divs) + column.extend(main_divs) + log.debug(f"main_components: {len(column):,}") + return column + + if self.layout_label == "standard-1": + column = [] + top_divs = ExtractorMain.extract_top_divs(self.layout_divs['top-bars']) or [] + main_divs = rso_div.find('div', {'id':'kp-wp-tab-Songs'}).children or [] + column.extend(top_divs) + column.extend(main_divs) + column = [div for div in column if div.name not in {'script', 'style'}] + column = webutils.filter_empty_divs(column) + return column + + if self.layout_label == "standard-2": + column = [] + top_divs = ExtractorMain.extract_top_divs(self.layout_divs['top-bars']) or [] + main_divs = rso_div.find('div', {'id':'kp-wp-tab-SportsStandings'}).children or [] + column.extend(top_divs) + column.extend(main_divs) + column = [div for div in column if div.name not in {'script', 'style'}] + column = webutils.filter_empty_divs(column) + return column + + + def extract_from_top_bar(self, drop_tags:set={}) -> list: + out = [] + tops = ExtractorMain.extract_top_divs(self.layout_divs['top-bars']) + out.extend(tops) + + div_classes = [ + 'cUnQKe', # people also ask + 'g', # general + 'Lv2Cle', # images-medium + 'oIk2Cb', # searches_related + 'Ww4FFb', # discussions_and_forums + 'vtSz8d', # videos + 'uVMCKf', # videos + ] + + rso_divs = self.layout_divs['rso'].find_all('div', attrs={'class':div_classes}) + if rso_divs: + self.layout_label = 'top-bars-divs' + col = [div for div in rso_divs if div.name not in drop_tags] + else: + self.layout_label = 'top-bars-children' + col = 
ExtractorMain.extract_children(self.layout_divs['rso'], drop_tags) + log.debug(f"main_layout: {self.layout_label} (update)") + out.extend(col) + return out + + @staticmethod + def extract_top_divs(soup, drop_tags:set={}) -> list: + out = [] + for tb in soup: + if webutils.check_dict_value(tb.attrs, "class", ["M8OgIe"]): + kd = webutils.find_all_divs(tb, "div", {"jscontroller":["qTdDb","OWrb3e"]}) + out.extend(kd) + else: + out.append(tb) + return out + + def extract_from_left_bar(self, drop_tags:set={}) -> list: + return self.soup.find_all('div', {'class':'TzHB6b'}) + + def extract_from_no_rso(self, drop_tags:set={}) -> list: + out=[]; sec1=self.soup.find_all('div', {'class':'UDZeY OTFaAf'}) + for div in sec1: + if div.find('h2') and div.find('h2').text=="Twitter Results": + out.append(div.find('div').parent) + elif div.find('g-section-with-header'): + out.append(div.find('g-section-with-header').parent) + elif div.find('g-more-link'): + out.append(div) + elif div.find('div',{'class':'oIk2Cb'}): + out.append(div) + else: + out.extend(div.find_all('div',{'class':'g'})) + sec2=self.soup.find('div',{'class':'WvKfwe a3spGf'}) + if sec2: + out.extend(sec2.children) + return [c for c in out if c.name not in drop_tags] + + @staticmethod + def extract_children(soup, drop_tags:set={}) -> list: + cts=[] + for ch in soup.children: + if ch.name in drop_tags: continue + if not ch.attrs: cts.extend(ch.contents) + else: cts.append(ch) + return cts + + @staticmethod + def is_valid(c) -> bool: + if not c: return False + bad = {"Main results","Twitter Results",""} + if c.text in bad: return False + # hidden survey + cond = [ + c.find('promo-throttler'), + webutils.check_dict_value(c.attrs,"class",["ULSxyf"]) if 'attrs' in c else False, + ] + if all(cond): return False + return True \ No newline at end of file diff --git a/WebSearcher/extractors/extractor_rhs.py b/WebSearcher/extractors/extractor_rhs.py new file mode 100644 index 0000000..4fc013d --- /dev/null +++ b/WebSearcher/extractors/extractor_rhs.py @@ -0,0 +1,43 @@ +import bs4 +from .. import webutils +from .. import logger + +log = logger.Logger().start(__name__) + +class ExtractorRightHandSide: + def __init__(self, soup: bs4.BeautifulSoup, components): + self.soup = soup + self.components = components + self.rhs = {} + + def extract(self): + """Extract the RHS Knowledge Panel, if present.""" + rhs_div = self.soup.find('div', {'id': 'rhs'}) + if not rhs_div: + return + rhs_div.extract() + layout, div = self._get_layout(rhs_div) + if layout: + log.debug(f"rhs_layout: {layout}") + self.rhs = { + "elem": div, + "section": "rhs", + "type": "knowledge_rhs" + } + else: + log.debug("no rhs_layout") + + def append(self): + """Append the RHS panel as a component at the end.""" + if self.rhs: + log.debug("appending rhs") + self.components.add_component(**self.rhs) + self.rhs = {} + + def _get_layout(self, rhs_div): + rhs_layouts = { + 'rhs_complementary': rhs_div if webutils.check_dict_value(rhs_div.attrs, "role", "complementary") else None, + 'rhs_knowledge': rhs_div.find('div', {'class': ['kp-wholepage', 'knowledge-panel', 'TzHB6b']}) + } + found = next((name for name, node in rhs_layouts.items() if node), None) + return (found, rhs_div) if found else (None, rhs_div) \ No newline at end of file diff --git a/WebSearcher/models/cmpt_mappings.py b/WebSearcher/models/cmpt_mappings.py new file mode 100644 index 0000000..616bbad --- /dev/null +++ b/WebSearcher/models/cmpt_mappings.py @@ -0,0 +1,185 @@ +""" +Metadata about WebSearcher result types and subtypes. 
+This provides documentation and structure for the various result types parsed by WebSearcher. +""" + +# Header result types with descriptions and subtypes +HEADER_RESULT_TYPES = { + "notice": { + "description": "Special notices and suggestions shown at the top of search results", + "sub_types": [ + "query_edit", + "query_edit_no_results", + "query_suggestion", + "location_choose_area", + "location_use_precise_location", + "language_tip", + ], + }, + "top_image_carousel": { + "description": "Carousel of images displayed at the top of search results", + "sub_types": [], + }, +} + +# Main result types with descriptions and subtypes +MAIN_RESULT_TYPES = { + "ad": { + "description": "Advertisements displayed in search results", + "sub_types": ["standard", "legacy", "secondary", "submenu"], + }, + "available_on": { + "description": "Where entertainment content is available to stream or purchase", + "sub_types": [], + }, + "banner": { + "description": "Banner notifications shown at top of results", + "sub_types": [], + }, + "discussions_and_forums": { + "description": "Forum and discussion board results", + "sub_types": [], + }, + "general": { + "description": "Standard web search results", + "sub_types": [ + "video", + "submenu", + "submenu_mini", + "submenu_rating", + "submenu_scholarly", + "submenu_product", + "subresult", + ], + }, + "general_questions": { + "description": "General results with related questions", + "sub_types": [], + }, + "images": { + "description": "Image search results", + "sub_types": ["multimedia", "medium", "small"], + }, + "knowledge": { + "description": "Knowledge panels and featured snippets", + "sub_types": [ + "ai_overview", + "featured_results", + "featured_snippet", + "unit_converter", + "sports", + "weather", + "finance", + "dictionary", + "translate", + "calculator", + "election", + "panel", + ], + }, + "latest_from": { + "description": "Latest news results from specific sources", + "sub_types": [], + }, + "local_news": { + "description": "News results specific to a location", + "sub_types": [], + }, + "local_results": { + "description": "Map-based local business results", + "sub_types": ["places", "locations", "businesses"], # Dynamically generated + }, + "map_results": {"description": "Map-only results", "sub_types": []}, + "news_quotes": { + "description": "Quote snippets from news articles", + "sub_types": [], + }, + "notice": { + "description": "Special notices about searches", + "sub_types": [ + "query_edit", + "query_edit_no_results", + "query_suggestion", + "location_choose_area", + "location_use_precise_location", + "language_tip", + ], + }, + "people_also_ask": { + "description": "Related questions that people search for", + "sub_types": [], + }, + "perspectives": {"description": "Opinion and perspective results", "sub_types": []}, + "scholarly_articles": {"description": "Google Scholar results", "sub_types": []}, + "searches_related": { + "description": "Related search terms", + "sub_types": [ + "additional_searches", + "related_searches", + ], # Dynamically generated + }, + "shopping_ads": {"description": "Product shopping advertisements", "sub_types": []}, + "top_image_carousel": { + "description": "Carousel of images displayed at top of page", + "sub_types": [], + }, + "top_stories": {"description": "Featured news stories", "sub_types": []}, + "twitter_cards": { + "description": "Twitter content displayed in cards", + "sub_types": [], + }, + "twitter_result": {"description": "Individual Twitter result", "sub_types": []}, + "videos": 
{"description": "Video results", "sub_types": []}, + "view_more_news": {"description": "News result expansion links", "sub_types": []}, + "knowledge_rhs": { + "description": "Knowledge panels in right-hand sidebar", + "sub_types": [], + }, + "unknown": {"description": "Unclassified components", "sub_types": []}, +} + +# Footer result types with descriptions and subtypes +FOOTER_RESULT_TYPES = { + "img_cards": {"description": "Image cards displayed in footer", "sub_types": []}, + "searches_related": { + "description": "Related searches displayed in footer", + "sub_types": [ + "additional_searches", + "related_searches", + ], # Dynamically generated + }, + "discover_more": {"description": "'Discover more' suggestions", "sub_types": []}, + "general": { + "description": "General results in footer", + "sub_types": [ + "video", + "submenu", + "submenu_mini", + "submenu_rating", + "submenu_scholarly", + "submenu_product", + "subresult", + ], + }, + "people_also_ask": {"description": "Related questions in footer", "sub_types": []}, + "omitted_notice": { + "description": "Notices about filtered results", + "sub_types": [], + }, +} + +# Special types not directly linked to parsers +SPECIAL_RESULT_TYPES = { + "unclassified": { + "description": "Default type in the BaseResult model", + "sub_types": [], + }, +} + +# Combined dictionary of all result types +ALL_RESULT_TYPES = { + **HEADER_RESULT_TYPES, + **MAIN_RESULT_TYPES, + **FOOTER_RESULT_TYPES, + **SPECIAL_RESULT_TYPES, +} diff --git a/WebSearcher/models/configs.py b/WebSearcher/models/configs.py index 81e011d..5d6ea80 100644 --- a/WebSearcher/models/configs.py +++ b/WebSearcher/models/configs.py @@ -25,7 +25,7 @@ class LogConfig(BaseConfig): class SeleniumConfig(BaseConfig): headless: bool = False - version_main: int = 133 + version_main: int = 141 use_subprocess: bool = False driver_executable_path: str = "" diff --git a/WebSearcher/search_methods/selenium_searcher.py b/WebSearcher/search_methods/selenium_searcher.py index 1e67025..e4b1e16 100644 --- a/WebSearcher/search_methods/selenium_searcher.py +++ b/WebSearcher/search_methods/selenium_searcher.py @@ -136,14 +136,12 @@ def cleanup(self) -> bool: try: self.delete_cookies() self.close_all_windows() - # Finally quit the driver self.driver.quit() self.driver = None self.log.debug(f'Browser successfully closed') return True except Exception as e: self.log.warning(f'Failed to close browser: {e}') - # Force driver to be None so we create a fresh instance next time self.driver = None return False return True diff --git a/WebSearcher/webutils.py b/WebSearcher/webutils.py index 4489468..f4db20f 100644 --- a/WebSearcher/webutils.py +++ b/WebSearcher/webutils.py @@ -122,13 +122,15 @@ def get_link_list(soup: BeautifulSoup, attrs: dict = {}, key: str = 'href', filt return [link.attrs.get(key, None) for link in links] if links else None def find_all_divs(soup: BeautifulSoup, name: str, attrs: dict = {}, filter_empty: bool = True) -> list: + if not soup: + return [] divs = soup.find_all(name, attrs) if attrs else soup.find_all(name) divs = filter_empty_divs(divs) if filter_empty else divs return divs def filter_empty_divs(divs): divs = [c for c in divs if c] - divs = [c for c in divs if c.text != ''] + divs = [c for c in divs if c.text.strip() != ''] return divs def find_children(soup, name: str, attrs: dict = {}, filter_empty: bool = False): diff --git a/poetry.lock b/poetry.lock index 34ccb6a..369c990 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by 
Poetry 2.1.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. [[package]] name = "annotated-types" @@ -515,7 +515,7 @@ description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" groups = ["main", "dev"] -markers = "python_version < \"3.11\"" +markers = "python_version == \"3.10\"" files = [ {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, @@ -558,14 +558,14 @@ typing = ["typing-extensions (>=4.12.2) ; python_version < \"3.11\""] [[package]] name = "h11" -version = "0.14.0" +version = "0.16.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" groups = ["main"] files = [ - {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, - {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, + {file = "h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86"}, + {file = "h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1"}, ] [[package]] @@ -1288,21 +1288,21 @@ wcwidth = "*" [[package]] name = "protobuf" -version = "6.30.0" +version = "6.31.1" description = "" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "protobuf-6.30.0-cp310-abi3-win32.whl", hash = "sha256:7337d76d8efe65ee09ee566b47b5914c517190196f414e5418fa236dfd1aed3e"}, - {file = "protobuf-6.30.0-cp310-abi3-win_amd64.whl", hash = "sha256:9b33d51cc95a7ec4f407004c8b744330b6911a37a782e2629c67e1e8ac41318f"}, - {file = "protobuf-6.30.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:52d4bb6fe76005860e1d0b8bfa126f5c97c19cc82704961f60718f50be16942d"}, - {file = "protobuf-6.30.0-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:7940ab4dfd60d514b2e1d3161549ea7aed5be37d53bafde16001ac470a3e202b"}, - {file = "protobuf-6.30.0-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:d79bf6a202a536b192b7e8d295d7eece0c86fbd9b583d147faf8cfeff46bf598"}, - {file = "protobuf-6.30.0-cp39-cp39-win32.whl", hash = "sha256:bb35ad251d222f03d6c4652c072dfee156be0ef9578373929c1a7ead2bd5492c"}, - {file = "protobuf-6.30.0-cp39-cp39-win_amd64.whl", hash = "sha256:501810e0eba1d327e783fde47cc767a563b0f1c292f1a3546d4f2b8c3612d4d0"}, - {file = "protobuf-6.30.0-py3-none-any.whl", hash = "sha256:e5ef216ea061b262b8994cb6b7d6637a4fb27b3fb4d8e216a6040c0b93bd10d7"}, - {file = "protobuf-6.30.0.tar.gz", hash = "sha256:852b675d276a7d028f660da075af1841c768618f76b90af771a8e2c29e6f5965"}, + {file = "protobuf-6.31.1-cp310-abi3-win32.whl", hash = "sha256:7fa17d5a29c2e04b7d90e5e32388b8bfd0e7107cd8e616feef7ed3fa6bdab5c9"}, + {file = "protobuf-6.31.1-cp310-abi3-win_amd64.whl", hash = "sha256:426f59d2964864a1a366254fa703b8632dcec0790d8862d30034d8245e1cd447"}, + {file = "protobuf-6.31.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:6f1227473dc43d44ed644425268eb7c2e488ae245d51c6866d19fe158e207402"}, + {file = "protobuf-6.31.1-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:a40fc12b84c154884d7d4c4ebd675d5b3b5283e155f324049ae396b95ddebc39"}, + {file = "protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl", hash = 
"sha256:4ee898bf66f7a8b0bd21bce523814e6fbd8c6add948045ce958b73af7e8878c6"}, + {file = "protobuf-6.31.1-cp39-cp39-win32.whl", hash = "sha256:0414e3aa5a5f3ff423828e1e6a6e907d6c65c1d5b7e6e975793d5590bdeecc16"}, + {file = "protobuf-6.31.1-cp39-cp39-win_amd64.whl", hash = "sha256:8764cf4587791e7564051b35524b72844f845ad0bb011704c3736cce762d8fe9"}, + {file = "protobuf-6.31.1-py3-none-any.whl", hash = "sha256:720a6c7e6b77288b85063569baae8536671b39f15cc22037ec7045658d80489e"}, + {file = "protobuf-6.31.1.tar.gz", hash = "sha256:d8cac4c982f0b957a4dc73a80e2ea24fab08e679c0de9deb835f4a12d69aca9a"}, ] [[package]] @@ -1742,19 +1742,19 @@ cffi = {version = "*", markers = "implementation_name == \"pypy\""} [[package]] name = "requests" -version = "2.32.3" +version = "2.32.4" description = "Python HTTP for Humans." optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, - {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, + {file = "requests-2.32.4-py3-none-any.whl", hash = "sha256:27babd3cda2a6d50b30443204ee89830707d396671944c998b5975b031ac2b2c"}, + {file = "requests-2.32.4.tar.gz", hash = "sha256:27d0316682c8a29834d3264820024b62a36942083d52caf2f14c0591336d3422"}, ] [package.dependencies] certifi = ">=2017.4.17" -charset-normalizer = ">=2,<4" +charset_normalizer = ">=2,<4" idna = ">=2.5,<4" urllib3 = ">=1.21.1,<3" @@ -1941,7 +1941,7 @@ description = "A lil' TOML parser" optional = false python-versions = ">=3.8" groups = ["dev"] -markers = "python_version < \"3.11\"" +markers = "python_version == \"3.10\"" files = [ {file = "tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249"}, {file = "tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6"}, @@ -1979,23 +1979,24 @@ files = [ [[package]] name = "tornado" -version = "6.4.2" +version = "6.5.1" description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed." 
optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" groups = ["dev"] files = [ - {file = "tornado-6.4.2-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:e828cce1123e9e44ae2a50a9de3055497ab1d0aeb440c5ac23064d9e44880da1"}, - {file = "tornado-6.4.2-cp38-abi3-macosx_10_9_x86_64.whl", hash = "sha256:072ce12ada169c5b00b7d92a99ba089447ccc993ea2143c9ede887e0937aa803"}, - {file = "tornado-6.4.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a017d239bd1bb0919f72af256a970624241f070496635784d9bf0db640d3fec"}, - {file = "tornado-6.4.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c36e62ce8f63409301537222faffcef7dfc5284f27eec227389f2ad11b09d946"}, - {file = "tornado-6.4.2-cp38-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bca9eb02196e789c9cb5c3c7c0f04fb447dc2adffd95265b2c7223a8a615ccbf"}, - {file = "tornado-6.4.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:304463bd0772442ff4d0f5149c6f1c2135a1fae045adf070821c6cdc76980634"}, - {file = "tornado-6.4.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:c82c46813ba483a385ab2a99caeaedf92585a1f90defb5693351fa7e4ea0bf73"}, - {file = "tornado-6.4.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:932d195ca9015956fa502c6b56af9eb06106140d844a335590c1ec7f5277d10c"}, - {file = "tornado-6.4.2-cp38-abi3-win32.whl", hash = "sha256:2876cef82e6c5978fde1e0d5b1f919d756968d5b4282418f3146b79b58556482"}, - {file = "tornado-6.4.2-cp38-abi3-win_amd64.whl", hash = "sha256:908b71bf3ff37d81073356a5fadcc660eb10c1476ee6e2725588626ce7e5ca38"}, - {file = "tornado-6.4.2.tar.gz", hash = "sha256:92bad5b4746e9879fd7bf1eb21dce4e3fc5128d71601f80005afa39237ad620b"}, + {file = "tornado-6.5.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:d50065ba7fd11d3bd41bcad0825227cc9a95154bad83239357094c36708001f7"}, + {file = "tornado-6.5.1-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:9e9ca370f717997cb85606d074b0e5b247282cf5e2e1611568b8821afe0342d6"}, + {file = "tornado-6.5.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b77e9dfa7ed69754a54c89d82ef746398be82f749df69c4d3abe75c4d1ff4888"}, + {file = "tornado-6.5.1-cp39-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:253b76040ee3bab8bcf7ba9feb136436a3787208717a1fb9f2c16b744fba7331"}, + {file = "tornado-6.5.1-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:308473f4cc5a76227157cdf904de33ac268af770b2c5f05ca6c1161d82fdd95e"}, + {file = "tornado-6.5.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:caec6314ce8a81cf69bd89909f4b633b9f523834dc1a352021775d45e51d9401"}, + {file = "tornado-6.5.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:13ce6e3396c24e2808774741331638ee6c2f50b114b97a55c5b442df65fd9692"}, + {file = "tornado-6.5.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5cae6145f4cdf5ab24744526cc0f55a17d76f02c98f4cff9daa08ae9a217448a"}, + {file = "tornado-6.5.1-cp39-abi3-win32.whl", hash = "sha256:e0a36e1bc684dca10b1aa75a31df8bdfed656831489bc1e6a6ebed05dc1ec365"}, + {file = "tornado-6.5.1-cp39-abi3-win_amd64.whl", hash = "sha256:908e7d64567cecd4c2b458075589a775063453aeb1d2a1853eedb806922f568b"}, + {file = "tornado-6.5.1-cp39-abi3-win_arm64.whl", hash = "sha256:02420a0eb7bf617257b9935e2b754d1b63897525d8a289c9d65690d580b4dcf7"}, + {file = "tornado-6.5.1.tar.gz", hash = 
"sha256:84ceece391e8eb9b2b95578db65e920d2a61070260594819589609ba9bc6308c"}, ] [[package]] @@ -2113,14 +2114,14 @@ websockets = "*" [[package]] name = "urllib3" -version = "2.3.0" +version = "2.5.0" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df"}, - {file = "urllib3-2.3.0.tar.gz", hash = "sha256:f8c5449b3cf0861679ce7e0503c7b44b5ec981bec0d1d3795a07f1ba96f0204d"}, + {file = "urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc"}, + {file = "urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760"}, ] [package.dependencies] @@ -2258,4 +2259,4 @@ h11 = ">=0.9.0,<1" [metadata] lock-version = "2.1" python-versions = ">=3.10" -content-hash = "684e3794b5ea4541fde5a46b9bf83f67cbeedcecf4cd969dce683ffc3210b382" +content-hash = "c571829b60451314f3df0749f1f8f8b553bdfe22d4e8a183c096335cfae000ae" diff --git a/pyproject.toml b/pyproject.toml index 0e936df..a63593b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "WebSearcher" -version = "0.6.4" +version = "0.6.5" description = "Tools for conducting, collecting, and parsing web search" authors = [{name = "Ronald E. Robertson", email = ""}] keywords = ["web", "search", "parser"] @@ -8,7 +8,7 @@ license = "GPL-3.0" readme = "README.md" requires-python = ">=3.10" dependencies = [ - "requests>=2.32.3", + "requests>=2.32.4", "lxml>=5.3.0", "beautifulsoup4>=4.12.3", "tldextract>=5.1.2", @@ -17,7 +17,7 @@ dependencies = [ "pandas>=2.2.3", "undetected-chromedriver>=3.5.5", "selenium>=4.9.0", - "protobuf (>=6.30.0,<7.0.0)", + "protobuf (>=6.31.1,<7.0.0)", "orjson (>=3.10.16,<4.0.0)", ] @@ -26,7 +26,7 @@ homepage = "http://github.com/gitronald/WebSearcher" repository = "http://github.com/gitronald/WebSearcher" [project.scripts] -demo-search = 'scripts.demo_search:main' +demo-search = 'scripts.demo_search:app' [tool.poetry] packages = [{include = "WebSearcher"}] diff --git a/scripts/demo_search.py b/scripts/demo_search.py index 3debcaf..ebfdd1c 100644 --- a/scripts/demo_search.py +++ b/scripts/demo_search.py @@ -22,7 +22,7 @@ def main( data_dir: str = typer.Option(DEFAULT_DATA_DIR, help="Prefix for output files"), headless: bool = typer.Option(False, help="Run browser in headless mode"), use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), - version_main: int = typer.Option(133, help="Main version of Chrome to use"), + version_main: int = typer.Option(141, help="Main version of Chrome to use"), ai_expand: bool = typer.Option(True, help="Expand AI overviews if present"), driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), ) -> None: diff --git a/scripts/demo_searches.py b/scripts/demo_searches.py index 82eee67..f63f454 100644 --- a/scripts/demo_searches.py +++ b/scripts/demo_searches.py @@ -22,7 +22,7 @@ def main( data_dir: str = typer.Option(DEFAULT_DATA_DIR, help="Prefix for output files"), headless: bool = typer.Option(False, help="Run browser in headless mode"), use_subprocess: bool = typer.Option(False, help="Run browser in a separate subprocess"), - version_main: int = typer.Option(133, help="Main version of Chrome to use"), + version_main: int = typer.Option(141, help="Main version of Chrome to use"), ai_expand: bool = 
typer.Option(True, help="Expand AI overviews if present"), driver_executable_path: str = typer.Option("", help="Path to ChromeDriver executable"), ) -> None:
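
For context on the `[project.scripts]` change above (`demo-search = 'scripts.demo_search:app'`), the console script now points at a module-level Typer app rather than a bare function. Below is a minimal sketch of that pattern, assuming an `app = typer.Typer()` object exists in `scripts/demo_search.py`; only the `headless` and `version_main` options and their defaults are taken from the diff, while the `query` argument and the function body are illustrative placeholders.

```python
# Minimal sketch of a Typer console entry point matching
# `demo-search = 'scripts.demo_search:app'`. The module-level `app` object,
# the `query` argument, and the body are assumptions for illustration;
# only the option names and defaults shown here appear in the diff above.
import typer

app = typer.Typer()

@app.command()
def main(
    query: str = typer.Argument(..., help="Search query to run"),  # hypothetical
    headless: bool = typer.Option(False, help="Run browser in headless mode"),
    version_main: int = typer.Option(141, help="Main version of Chrome to use"),
) -> None:
    # Placeholder body: the real script wires these options into a search run.
    typer.echo(f"query={query!r} headless={headless} chrome_major={version_main}")

if __name__ == "__main__":
    app()
```

With an entry point like this, installing the package exposes a `demo-search` command, so something like `demo-search "my query" --headless --version-main 141` would dispatch to `main` through the Typer app.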