Merged
120 commits
63c4cc9
switching from requests to selenium
Feb 10, 2025
1d5e255
added code to expand ai overview text and urls
Feb 17, 2025
002386b
update: add lang arg to search using hl url param
gitronald Feb 26, 2025
4bf035b
version: 0.5.1.dev0
gitronald Feb 26, 2025
834769b
update: add lang to output
gitronald Feb 26, 2025
e09030b
update: add language to serp model
gitronald Feb 26, 2025
9377d75
version: 0.5.1.dev1
gitronald Feb 26, 2025
19a94f9
Update header_text en español v2.py
mariaelissat Mar 1, 2025
3010788
update: null arg handling
gitronald Mar 6, 2025
e547b88
version: 0.5.1.dev4
gitronald Mar 6, 2025
1ae2813
fix: canonical name to uule converter with protobuf
gitronald Mar 7, 2025
bddd4a1
update: more specific dir name for geotargets csv download
gitronald Mar 7, 2025
74a1487
version: 0.5.1.dev5
gitronald Mar 7, 2025
6a72f9e
Merge pull request #75 from gitronald/locations
gitronald Mar 7, 2025
e18eed5
version: 0.5.1
gitronald Mar 7, 2025
8ffbf34
Update WebSearcher/classifiers/header_text.py
gitronald Mar 7, 2025
10a60a5
Merge pull request #74 from mariaelissat/patch-2
gitronald Mar 7, 2025
a136095
update: formatting, drop repeated Video labels
gitronald Mar 7, 2025
f701a9e
version: 0.5.2.dev0
gitronald Mar 7, 2025
9d977b0
Merge branch 'master' into dev
gitronald Mar 7, 2025
425f27a
Merge branch 'selenium' into selenium-branch
gitronald Mar 9, 2025
6ff6262
Merge pull request #72 from EvanUp/selenium-branch
gitronald Mar 9, 2025
0730e6b
version: 0.5.2
gitronald Mar 9, 2025
d25f07e
Merge branch 'master' into dev
gitronald Mar 9, 2025
15adc0a
Merge branch 'master' into dev
gitronald Mar 9, 2025
56f9699
Merge branch 'dev' into selenium
gitronald Mar 9, 2025
7b4b95c
version: 0.6.0.dev0
gitronald Mar 9, 2025
acea022
update: dedupe args, add version_main for chromedriver launch
gitronald Mar 9, 2025
80403f2
update: poetry lock file
gitronald Mar 9, 2025
465c553
update: reorg selenium code
gitronald Mar 9, 2025
ea6eebc
update: specify args, headless not working locally
gitronald Mar 9, 2025
82bf507
update: collection code and selenium test
gitronald Mar 11, 2025
5ca2da7
update: save method variable along with metadata
gitronald Mar 11, 2025
b950989
update: handle null links in tw result
gitronald Mar 11, 2025
dc00990
version: 0.6.0.dev1
gitronald Mar 11, 2025
a7cfd5a
update: move driver init to search, add driver cleanup
gitronald Mar 11, 2025
2a86a5a
version: 0.6.0.dev2
gitronald Mar 11, 2025
76fa069
update: add parse both features and results options
gitronald Mar 19, 2025
6270f5d
version: 0.6.0.dev3
gitronald Mar 19, 2025
6752498
version: 0.6.0.dev4
gitronald Mar 26, 2025
165f9e3
update: condense args, use currently reliable default query
gitronald Mar 26, 2025
b77c413
update: use pydantic models for configs and defaults
gitronald Mar 26, 2025
9b925ae
update: model directory with multiple files, new BaseConfig model
gitronald Mar 26, 2025
4e87302
update: use baseconfig in searchconfig
gitronald Mar 26, 2025
d0a7aa9
update: clean log config, header as arg
gitronald Mar 26, 2025
2fc420d
update: use search params pydantic model
gitronald Mar 26, 2025
083282f
update: move selenium to new searchers dir
gitronald Mar 27, 2025
7ae02c6
update: model docs
gitronald Mar 27, 2025
b725ace
add: searches directory for diff methods
gitronald Mar 27, 2025
6d4642f
add: file for requests code, update outputs
gitronald Mar 27, 2025
bd2b76e
version: 0.6.0.dev5
gitronald Mar 28, 2025
0c959b9
update: cleaner selenium cleanup
gitronald Mar 28, 2025
82ef0db
update: consistent logging and serp handling
gitronald Mar 28, 2025
3fd8e19
update: simplify search logic, use SearchParams, ai expand logic in s…
gitronald Mar 28, 2025
b899271
Merge branch 'master' into dev
gitronald Mar 28, 2025
bdb5975
update: drop python version file, use python>=3.10 in pyproject
gitronald Mar 28, 2025
d5c7539
fix: selenium output reference
gitronald Mar 28, 2025
64ee056
update: demo scripts
gitronald Mar 28, 2025
f8337ca
Merge branch 'master' into dev
gitronald Mar 28, 2025
16ba005
update: timestamp before request, ai expand as search param, load sea…
gitronald Apr 1, 2025
51ee3b2
update: poetry lock
gitronald Apr 1, 2025
9e8a393
Merge branch 'master' into dev
gitronald Apr 1, 2025
6ee5209
update: using orjson for speed, must decode dumps to string
gitronald Apr 1, 2025
48ae902
update: archive result collector, ignore archive
gitronald Apr 1, 2025
9aaf210
Merge branch 'master' into dev
gitronald Apr 1, 2025
d076e4f
fix: downgrade log warning to debug
gitronald Apr 2, 2025
e726acd
update: breaking change for log config, using logger kwargs
gitronald Apr 2, 2025
fef3595
Merge branch 'master' into dev
gitronald Apr 2, 2025
70e774f
build(deps): bump h11 from 0.14.0 to 0.16.0
dependabot[bot] Apr 24, 2025
592c69b
update: ad component parsers
gitronald Apr 27, 2025
2ca4081
version: 0.6.5.dev0
gitronald Apr 27, 2025
0f9dc40
update: videos component parser
gitronald Apr 27, 2025
502a025
version: 0.6.5.dev1
gitronald Apr 27, 2025
cc2395c
update: discussions and forums classifier
gitronald Apr 28, 2025
cedc7d2
update: extract more divs for top_bar layout
gitronald Apr 28, 2025
1c68ea8
version: 0.6.5.dev2
gitronald Apr 28, 2025
29d62c7
fix: drop debug print and fix print var
gitronald Apr 28, 2025
fa411b8
update: expand general classifier classes
gitronald Apr 28, 2025
2f9bb28
update: extract from top bar for 2025 serps
gitronald Apr 28, 2025
5373a85
update: expand images sub cmpt class list and title/url parsing
gitronald Apr 28, 2025
036e67a
update: reduce doc strings
gitronald Apr 28, 2025
205bba5
version: 0.6.5.dev3
gitronald Apr 28, 2025
e23e70b
update: more restrictive discussions classifier
gitronald Apr 29, 2025
41cfba2
update: expand classes for video cmpt extraction
gitronald Apr 29, 2025
2d01701
fix: no empty whitespace in filter_empty_divs func
gitronald May 8, 2025
9d66539
update: more knowledge panel identifiers
gitronald May 8, 2025
85f5766
fix: count sub ranks for standard ads
gitronald May 8, 2025
ac79df0
update: result types dictionaries
gitronald May 8, 2025
a737aaf
move: extractors to dir
gitronald May 8, 2025
b6be243
rename: extractors code
gitronald May 8, 2025
52c79f6
add: breakout extractor functions into files by section
gitronald May 8, 2025
a3b7c00
version: 0.6.5.dev4
gitronald May 8, 2025
1a32ee0
add: recent_posts variant of top_stories
gitronald May 9, 2025
f775eac
update: remove duplicate log
gitronald May 9, 2025
e0edd4e
version: 0.6.5.dev5
gitronald May 9, 2025
bd02fa8
fix: missing comma
gitronald May 9, 2025
71e1a55
update: main column extractors
gitronald May 9, 2025
9955b7c
update: bump h11 per dependabot
gitronald May 9, 2025
951a5da
fix: handle serps with no rcnt div
gitronald May 15, 2025
22f639b
update: stricter news_quotes classification, more knowledge classifie…
gitronald May 15, 2025
ebcca84
fix: stricter parsing for songs id div
gitronald May 17, 2025
cc9a938
build(deps-dev): bump tornado from 6.4.2 to 6.5.1
dependabot[bot] May 23, 2025
8daaec7
build(deps): bump requests from 2.32.3 to 2.32.4
dependabot[bot] Jun 10, 2025
a4a4a32
build(deps): bump protobuf from 6.30.0 to 6.31.1
dependabot[bot] Jun 17, 2025
a05270e
build(deps): bump urllib3 from 2.3.0 to 2.5.0
dependabot[bot] Jun 19, 2025
0209fdb
version: 0.6.5a0
gitronald Oct 14, 2025
b7cc700
refactor: convert Footer methods to staticmethod
gitronald Oct 14, 2025
e66515c
fix: update demo-search entry point to use typer app
gitronald Oct 14, 2025
bb938d5
update: version in __init__.py to match pyproject.toml
gitronald Oct 14, 2025
cc33dea
update: default Chrome version to 141
gitronald Oct 14, 2025
eaa4314
merge: dependabot PR #78 (h11 0.14.0 → 0.16.0)
gitronald Oct 14, 2025
db590a5
merge: dependabot PR #79 (tornado 6.4.2 → 6.5.1)
gitronald Oct 14, 2025
8b7e6a6
merge: dependabot PR #80 (requests 2.32.3 → 2.32.4)
gitronald Oct 14, 2025
5d28346
merge: dependabot PR #81 (protobuf 6.30.0 → 6.31.1)
gitronald Oct 14, 2025
aedc28a
merge: dependabot PR #82 (urllib3 2.3.0 → 2.5.0)
gitronald Oct 14, 2025
f83ef9f
merge: dependabot updates (h11, tornado, requests, protobuf, urllib3)
gitronald Oct 14, 2025
1cb9ae5
update: bump requests to 2.32.4 and protobuf to 6.31.1
gitronald Oct 14, 2025
3cb1093
update: regenerate poetry.lock
gitronald Oct 14, 2025
eb3cec9
update: github actions readme section
gitronald Dec 5, 2025
a864e09
version: 0.6.5
gitronald Dec 5, 2025
21 changes: 19 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -47,6 +47,7 @@ Below are some details about recent updates. For a longer list, see the [Update
- [Repair or Enhance a Parser](#repair-or-enhance-a-parser)
- [Add a Parser](#add-a-parser)
- [Testing](#testing)
- [GitHub Actions](#github-actions)
- [Update Log](#update-log)
- [Similar Packages](#similar-packages)
- [License](#license)
@@ -119,7 +120,7 @@ drwxr-xr-x 2 user user 4.0K 2024-11-11 10:55 html/

### Step by Step

Example search and parse pipeline:
Example search and parse pipeline (via requests):

```python
import WebSearcher as ws
@@ -143,7 +144,7 @@ se = ws.SearchEngine(
"headless": False,
"use_subprocess": False,
"driver_executable_path": "",
"version_main": 133,
"version_main": 141,
}
)
```
@@ -253,6 +254,22 @@ With the `-k` flag you can run a test for a specific html file:
pytest -k "1684837514.html"
```

---
## GitHub Actions

This repository uses GitHub Actions for automated publishing:

**Release Workflow** (`.github/workflows/publish.yml`)
Automatically publishes to PyPI when a pull request is merged into `master`. The workflow:
- Triggers on merged PRs to `master`
- Builds the package using Poetry
- Publishes to PyPI using trusted publishing (no API tokens required)

To release a new version:
1. Update the version in `pyproject.toml`
2. Create a PR to `master`
3. Once merged, the package is automatically published to PyPI
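
A merge-triggered trusted-publishing workflow along these lines could be sketched as follows. This is a hypothetical reconstruction from the description above, not the actual contents of `.github/workflows/publish.yml`; action versions and step names are assumptions:

```yaml
# Hypothetical sketch of a publish workflow; the real file may differ.
name: Publish to PyPI
on:
  pull_request:
    types: [closed]
    branches: [master]
jobs:
  publish:
    # Run only when the PR was actually merged, not just closed
    if: github.event.pull_request.merged == true
    runs-on: ubuntu-latest
    permissions:
      id-token: write   # required for PyPI trusted publishing (no API token)
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.10"
      - run: pipx install poetry
      - run: poetry build
      - uses: pypa/gh-action-pypi-publish@release/v1
```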

---
## Update Log

2 changes: 1 addition & 1 deletion WebSearcher/__init__.py
@@ -1,4 +1,4 @@
__version__ = "0.6.4"
__version__ = "0.6.5"
from .searchers import SearchEngine
from .parsers import parse_serp, FeatureExtractor
from .extractors import Extractor
4 changes: 3 additions & 1 deletion WebSearcher/classifiers/header_text.py
@@ -90,7 +90,8 @@ def _get_header_level_mapping(level) -> dict:
"local_results": [
"Local Results",
"Locations",
"Places", "Sitios"
"Places",
"Sitios",
"Businesses",
"locations",
],
@@ -116,6 +117,7 @@ def _get_header_level_mapping(level) -> dict:
"News",
"Noticias",
"Market news"],
"recent_posts": ["Recent posts"],
"twitter": ["Twitter Results"],
"videos": ["Videos"]
}
22 changes: 15 additions & 7 deletions WebSearcher/classifiers/main.py
@@ -1,9 +1,9 @@
import bs4
from .. import logger
log = logger.Logger().start(__name__)

from .header_text import ClassifyHeaderText
from .. import webutils
import bs4

class ClassifyMain:
"""Classify a component from the main section based on its bs4.element.Tag """
@@ -14,6 +14,7 @@ def classify(cmpt: bs4.element.Tag) -> str:
# Ordered list of classifiers to try
component_classifiers = [
ClassifyMain.top_stories, # Check top stories
ClassifyMain.discussions_and_forums, # Check discussions and forums
ClassifyHeaderText.classify, # Check levels 2 & 3 header text
ClassifyMain.news_quotes, # Check news quotes
ClassifyMain.img_cards, # Check image cards
@@ -40,6 +41,12 @@ def classify(cmpt: bs4.element.Tag) -> str:

return cmpt_type

@staticmethod
def discussions_and_forums(cmpt: bs4.element.Tag) -> str:
conditions = [
cmpt.find("div", {"class": "IFnjPb", "role": "heading"}),
]
return 'discussions_and_forums' if all(conditions) else "unknown"

@staticmethod
def available_on(cmpt: bs4.element.Tag) -> str:
@@ -68,7 +75,7 @@ def general(cmpt: bs4.element.Tag) -> str:
"format-01": cmpt.attrs["class"] == ["g"],
"format-02": ( ("g" in cmpt.attrs["class"]) &
any(s in ["Ww4FFb"] for s in cmpt.attrs["class"]) ),
"format-03": any(s in ["hlcw0c", "MjjYud"] for s in cmpt.attrs["class"]),
"format-03": any(s in ["hlcw0c", "MjjYud", "PmEWq"] for s in cmpt.attrs["class"]),
"format-04": cmpt.find('div', {'class': ['g', 'Ww4FFb']}),
}
else:
@@ -143,7 +150,9 @@ def knowledge_panel(cmpt: bs4.element.Tag) -> str:
cmpt.find("h1", {"class": "VW3apb"}),
cmpt.find("div", {"class": ["knowledge-panel", "knavi", "kp-blk", "kp-wholepage-osrp"]}),
cmpt.find("div", {"aria-label": "Featured results", "role": "complementary"}),
webutils.check_dict_value(cmpt.attrs, "jscontroller", "qTdDb")
cmpt.find("div", {"jscontroller": "qTdDb"}),
webutils.check_dict_value(cmpt.attrs, "jscontroller", "qTdDb"),
cmpt.find('div', {'class':'obcontainer'})
]
return 'knowledge' if any(conditions) else "unknown"

@@ -179,10 +188,9 @@ def top_stories(cmpt: bs4.element.Tag) -> str:
@staticmethod
def news_quotes(cmpt: bs4.element.Tag) -> str:
"""Classify top stories components"""
conditions = [
cmpt.find("g-tray-header", role="heading"),
]
return 'news_quotes' if all(conditions) else "unknown"
header_div = cmpt.find("g-tray-header", role="heading")
condition = webutils.get_text(header_div, strip=True) == "News quotes"
return 'news_quotes' if condition else "unknown"

@staticmethod
def twitter(cmpt: bs4.element.Tag) -> str:
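
The classifier changes in this file can be exercised standalone. Below is a simplified sketch using bs4: the `IFnjPb` heading check and the exact-match "News quotes" condition come straight from the diff, but the function names and the toy HTML are illustrative only — real SERP components carry far more markup:

```python
import bs4

def classify_discussions_and_forums(cmpt: bs4.element.Tag) -> str:
    # Component is discussions-and-forums if it contains a heading div
    # with the IFnjPb class (per the classifier added in this PR).
    found = cmpt.find("div", {"class": "IFnjPb", "role": "heading"})
    return "discussions_and_forums" if found else "unknown"

def classify_news_quotes(cmpt: bs4.element.Tag) -> str:
    # Stricter check from the diff: the g-tray-header text must read
    # exactly "News quotes", not merely exist.
    header = cmpt.find("g-tray-header", role="heading")
    text = header.get_text(strip=True) if header else ""
    return "news_quotes" if text == "News quotes" else "unknown"

html = '<div><div class="IFnjPb" role="heading">Discussions and forums</div></div>'
cmpt = bs4.BeautifulSoup(html, "html.parser").div
print(classify_discussions_and_forums(cmpt))  # discussions_and_forums
```

Ordering matters in the real `ClassifyMain.classify`: the new check runs before the header-text classifier, so a matching component is labeled before more generic rules fire.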
2 changes: 2 additions & 0 deletions WebSearcher/component_parsers/__init__.py
@@ -15,6 +15,7 @@
from .latest_from import parse_latest_from
from .local_news import parse_local_news
from .perspectives import parse_perspectives
from .recent_posts import parse_recent_posts

from .local_results import parse_local_results
from .map_results import parse_map_results
@@ -57,6 +58,7 @@
('news_quotes', parse_news_quotes, 'News Quotes'),
('people_also_ask', parse_people_also_ask, 'People Also Ask'),
('perspectives', parse_perspectives, 'Perspectives & Opinions'),
('recent_posts', parse_recent_posts, 'Recent Posts'),
('scholarly_articles', parse_scholarly_articles, 'Scholar Articles'),
('searches_related', parse_searches_related, 'Related Searches'),
('shopping_ads', parse_shopping_ads, 'Shopping Ad'),
105 changes: 85 additions & 20 deletions WebSearcher/component_parsers/ads.py
@@ -6,13 +6,24 @@
- added new div class for text field
- added labels (e.g., "Provides abortions") from <span class="mXsQRe">, appended to text field

2025-04-27: added carousel sub_type, global parsed output

"""

from .. import webutils
from .shopping_ads import parse_shopping_ads
import bs4

PARSED = {
'type': 'ad',
'sub_type': '',
'sub_rank': 0,
'title': '',
'url': '',
'cite': '',
'text': '',
}

def parse_ads(cmpt: bs4.element.Tag) -> list:
"""Parse ads from ad component"""

@@ -27,12 +38,14 @@ def parse_ads(cmpt: bs4.element.Tag) -> list:
parsed_list = [parse_ad_secondary(sub, sub_rank) for sub_rank, sub in enumerate(subs)]
elif sub_type == 'standard':
subs = webutils.find_all_divs(cmpt, 'div', {'class': ['uEierd', 'commercial-unit-desktop-top']})
for sub in subs:
for sub_rank, sub in enumerate(subs):
sub_classes = sub.attrs.get("class", [])
if "commercial-unit-desktop-top" in sub_classes:
parsed_list.extend(parse_shopping_ads(sub))
elif "uEierd" in sub_classes:
parsed_list.append(parse_ad(sub))
parsed_list.append(parse_ad(sub, sub_rank=sub_rank))
elif sub_type == 'carousel':
parsed_list = parse_ad_carousel(cmpt, sub_type)
return parsed_list


@@ -41,20 +54,71 @@ def classify_ad_type(cmpt: bs4.element.Tag) -> str:
label_divs = {
"legacy": webutils.find_all_divs(cmpt, 'div', {'class': 'ad_cclk'}),
"secondary": webutils.find_all_divs(cmpt, 'div', {'class': 'd5oMvf'}),
"standard": webutils.find_all_divs(cmpt, 'div', {'class': ['uEierd', 'commercial-unit-desktop-top']})
"standard": webutils.find_all_divs(cmpt, 'div', {'class': ['uEierd', 'commercial-unit-desktop-top']}),
"carousel": webutils.find_all_divs(cmpt, 'g-scrolling-carousel'),
}
for label, divs in label_divs.items():
if divs:
return label
return 'unknown'


def parse_ad_carousel(cmpt: bs4.element.Tag, sub_type: str, filter_visible: bool = True) -> list:

def parse_ad_carousel_div(sub: bs4.element.Tag, sub_type: str, sub_rank: int) -> dict:
"""Parse ad carousel div, seen 2025-02-06"""
parsed = PARSED.copy()
parsed['sub_type'] = sub_type
parsed['sub_rank'] = sub_rank
parsed['title'] = webutils.get_text(sub, 'div', {'class':'e7SMre'})
parsed['url'] = webutils.get_link(sub)
parsed['text'] = webutils.get_text(sub, 'div', {"class":"vrAZpb"})
parsed['cite'] = webutils.get_text(sub, 'div', {"class":"zpIwr"})
parsed['visible'] = not (sub.has_attr('data-has-shown') and sub['data-has-shown'] == 'false')
return parsed

def parse_ad_carousel_card(sub: bs4.element.Tag, sub_type: str, sub_rank: int) -> dict:
"""Parse ad carousel card, seen 2024-09-21"""
parsed = PARSED.copy()
parsed['sub_type'] = sub_type
parsed['sub_rank'] = sub_rank
parsed['title'] = webutils.get_text(sub, 'div', {'class':'gCv54b'})
parsed['url'] = webutils.get_link(sub, {"class": "KTsHxd"})
parsed['text'] = webutils.get_text(sub, 'div', {"class":"VHpBje"})
parsed['cite'] = webutils.get_text(sub, 'div', {"class":"j958Pd"})
parsed['visible'] = not (sub.has_attr('data-viewurl') and sub['data-viewurl'])
return parsed

ad_carousel_parsers = [
{'find_kwargs': {'name': 'g-inner-card'},
'parser': parse_ad_carousel_card},
{'find_kwargs': {'name': 'div', 'attrs': {'class': 'ZPze1e'}},
'parser': parse_ad_carousel_div}
]

output_list = []
ad_carousel = cmpt.find('g-scrolling-carousel')
if ad_carousel:
for parser_details in ad_carousel_parsers:
parser_func = parser_details['parser']
kwargs = parser_details['find_kwargs']
sub_cmpts = webutils.find_all_divs(ad_carousel, **kwargs)
if sub_cmpts:
for sub_rank, sub in enumerate(sub_cmpts):
parsed = parser_func(sub, sub_type, sub_rank)
output_list.append(parsed)

if filter_visible:
output_list = [{k:v for k,v in x.items() if k != 'visible'} for x in output_list if x['visible']]
return output_list


def parse_ad(sub: bs4.element.Tag, sub_rank: int = 0) -> dict:
"""Parse details of a single ad subcomponent, similar to general"""
parsed = {"type": "ad",
"sub_type": "standard",
"sub_rank": sub_rank}
parsed = PARSED.copy()
parsed["sub_type"] = "standard"
parsed["sub_rank"] = sub_rank

parsed['title'] = webutils.get_text(sub, 'div', {'role':'heading'})
parsed['url'] = webutils.get_link(sub, {"class":"sVXRqc"})
parsed['cite'] = webutils.get_text(sub, 'span', {"role":"text"})
@@ -96,13 +160,14 @@ def parse_ad_menu(sub: bs4.element.Tag) -> list:

def parse_ad_secondary(sub: bs4.element.Tag, sub_rank: int = 0) -> dict:
"""Parse details of a single ad subcomponent, similar to general"""
parsed = PARSED.copy()
parsed["sub_type"] = "secondary"
parsed["sub_rank"] = sub_rank

parsed = {"type": "ad",
"sub_type": "secondary",
"sub_rank": sub_rank}
parsed['title'] = sub.find('div', {'role':'heading'}).text
parsed['url'] = sub.find('div', {'class':'d5oMvf'}).find('a')['href']
parsed['cite'] = sub.find('span', {'class':'gBIQub'}).text
parsed['title'] = webutils.get_text(sub, 'div', {'role':'heading'})
link_div = sub.find('div', {'class':'d5oMvf'})
parsed['url'] = webutils.get_link(link_div) if link_div else ''
parsed['cite'] = webutils.get_text(sub, 'span', {'class':'gBIQub'})

# Take the top div with this class, should be main result abstract
text_divs = sub.find_all('div', {'class':'yDYNvb'})
@@ -123,14 +188,14 @@

def parse_ad_legacy(sub: bs4.element.Tag, sub_rank: int = 0) -> dict:
"""[legacy] Parse details of a single ad subcomponent, similar to general"""

parsed = {"type": "ad",
"sub_type": "legacy",
"sub_rank": sub_rank}
parsed = PARSED.copy()
parsed["sub_type"] = "legacy"
parsed["sub_rank"] = sub_rank

header = sub.find('div', {'class':'ad_cclk'})
parsed['title'] = header.find('h3').text
parsed['url'] = header.find('cite').text
parsed['text'] = sub.find('div', {'class':'ads-creative'}).text
parsed['title'] = webutils.get_text(header, 'h3')
parsed['url'] = webutils.get_text(header, 'cite')
parsed['text'] = webutils.get_text(sub, 'div', {'class':'ads-creative'})

bottom_text = sub.find('ul')
if bottom_text:
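
The new `parse_ad_carousel` dispatches over an ordered list of `(find_kwargs, parser)` pairs, one per carousel card format. A simplified standalone sketch of that dispatch pattern — class and tag names are taken from the diff, but the parsers here extract only a title, and the helper names are illustrative:

```python
import bs4

def parse_inner_card(sub, sub_rank):
    # Simplified stand-in for parse_ad_carousel_card (format seen 2024-09-21)
    return {"sub_type": "carousel", "sub_rank": sub_rank,
            "title": sub.get_text(strip=True)}

def parse_carousel_div(sub, sub_rank):
    # Simplified stand-in for parse_ad_carousel_div (format seen 2025-02-06)
    return {"sub_type": "carousel", "sub_rank": sub_rank,
            "title": sub.get_text(strip=True)}

CAROUSEL_PARSERS = [
    {"find_kwargs": {"name": "g-inner-card"}, "parser": parse_inner_card},
    {"find_kwargs": {"name": "div", "attrs": {"class": "ZPze1e"}},
     "parser": parse_carousel_div},
]

def parse_carousel(cmpt: bs4.element.Tag) -> list:
    carousel = cmpt.find("g-scrolling-carousel")
    if not carousel:
        return []
    parsed = []
    # Try each card format; enumerate supplies the sub_rank ordering
    for details in CAROUSEL_PARSERS:
        subs = carousel.find_all(**details["find_kwargs"])
        for sub_rank, sub in enumerate(subs):
            parsed.append(details["parser"](sub, sub_rank))
    return parsed

html = ('<div><g-scrolling-carousel>'
        '<g-inner-card>Ad one</g-inner-card>'
        '<g-inner-card>Ad two</g-inner-card>'
        '</g-scrolling-carousel></div>')
soup = bs4.BeautifulSoup(html, "html.parser")
print(parse_carousel(soup.div))  # two dicts, sub_rank 0 and 1
```

The real function additionally carries a `visible` flag per card and, with `filter_visible=True`, drops hidden cards and strips the flag before returning.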
18 changes: 9 additions & 9 deletions WebSearcher/component_parsers/footer.py
@@ -2,31 +2,31 @@

class Footer:

@classmethod
def parse_image_cards(self, elem) -> list:
@staticmethod
def parse_image_cards(elem) -> list:
subs = webutils.find_all_divs(elem, 'div', {'class':'g'})
return [self.parse_image_card(sub, sub_rank) for sub_rank, sub in enumerate(subs)]
return [Footer.parse_image_card(sub, sub_rank) for sub_rank, sub in enumerate(subs)]

@classmethod
def parse_image_card(self, sub, sub_rank=0) -> dict:
@staticmethod
def parse_image_card(sub, sub_rank=0) -> dict:
parsed = {'type':'img_cards', 'sub_rank':sub_rank}
parsed['title'] = webutils.get_text(sub, "div", {'aria-level':"3", "role":"heading"})
images = sub.find_all('img')
if images:
parsed['details'] = [{'text':i['alt'], 'url':i['src']} for i in images]
return parsed

@classmethod
def parse_discover_more(self, elem) -> list:
@staticmethod
def parse_discover_more(elem) -> list:
carousel = elem.find('g-scrolling-carousel')
return [{
'type':'discover_more',
'sub_rank':0,
'text': '|'.join(c.text for c in carousel.find_all('g-inner-card'))
}]

@classmethod
def parse_omitted_notice(self, elem) -> list:
@staticmethod
def parse_omitted_notice(elem) -> list:
return [{
'type':'omitted_notice',
'sub_rank':0,
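
The `Footer` refactor above converts `@classmethod` declarations (whose first parameter was misleadingly named `self`) to `@staticmethod`, since none of the methods use class state. A minimal illustration of why the two are interchangeable here — class and method names below are invented for the example:

```python
class Before:
    @classmethod
    def parse(self, items):
        # Declared @classmethod, so `self` is actually the class object;
        # it is only used to reach a sibling method.
        return [self.parse_one(i) for i in items]

    @classmethod
    def parse_one(self, item):
        return {"text": item}

class After:
    @staticmethod
    def parse(items):
        # No implicit first argument; sibling methods are referenced
        # explicitly by class name, as the refactored Footer does.
        return [After.parse_one(i) for i in items]

    @staticmethod
    def parse_one(item):
        return {"text": item}

print(Before.parse(["a"]) == After.parse(["a"]))  # True
```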