Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
- Avoid hiding errors with broad `try/except`.
- Do not add summary reports unless explicitly requested.

## Fixtures Policy
- Never rewrite or regenerate fixture `expected` outputs unless the user explicitly asks for fixture updates.
- Treat fixtures as golden references; fix extractor code/tests to match fixture intent.
- If fixture content appears inconsistent, stop and ask before mutating fixture files.

## Validation
- Run: `uv run ruff format . && uv run ruff check --fix .`
- Run: `timeout 60 uv run pytest -v`
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "article-extractor"
version = "0.5.5"
version = "0.5.6"
description = "Pure-Python article extraction library and HTTP service - Drop-in replacement for readability-js-server"
readme = "README.md"
license = { text = "MIT" }
Expand Down
2 changes: 1 addition & 1 deletion src/article_extractor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
... result = await extract_article_from_url(url, fetcher)
"""

__version__ = "0.5.3"
__version__ = "0.5.6"

from .extractor import ArticleExtractor, extract_article, extract_article_from_url
from .types import ArticleResult, ExtractionOptions, NetworkOptions, ScoredCandidate
Expand Down
30 changes: 23 additions & 7 deletions src/article_extractor/candidate_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,8 @@ def add_if_new(node: SimpleDomNode) -> None:
for node in doc.query(selector):
add_if_new(node)

# If we found semantic containers, use them directly
if candidates:
return candidates

# Fallback: scan divs and sections with minimum content
# Also scan div/section containers even when semantic nodes are present.
# Many pages wrap article bodies inside <main>/<article> plus extra chrome.
for tag in _FALLBACK_CANDIDATE_TAGS:
for node in doc.query(tag):
if cache.get_text_length(node) > MIN_CHAR_THRESHOLD:
Expand All @@ -88,7 +85,7 @@ def add_if_new(node: SimpleDomNode) -> None:
return candidates


_DESCENDANT_SCORE_RATIO = 0.9
_DESCENDANT_SCORE_RATIO = 0.85
_DESCENDANT_LENGTH_RATIO = 0.5
_LINK_DENSITY_IMPROVEMENT = 0.8
_MAX_REFINEMENT_DEPTH = 3
Expand Down Expand Up @@ -124,7 +121,26 @@ def _pick_stronger_descendant(
continue
if not _is_descendant(candidate.node, current.node):
continue
if candidate.score < current_score * _DESCENDANT_SCORE_RATIO:
required_score_ratio = _DESCENDANT_SCORE_RATIO
# Allow deeper narrowing when a broad wrapper has much higher link-density
# and a descendant is substantially shorter/cleaner.
if (
current_density > 0.06
and candidate.link_density < 0.03
and candidate.content_length < current_length * 0.4
):
required_score_ratio = min(required_score_ratio, 0.3)

candidate_tag = (
candidate.node.name.lower() if hasattr(candidate.node, "name") else ""
)
if (
candidate_tag == "article"
and candidate.link_density < current_density * 0.7
):
required_score_ratio = min(required_score_ratio, 0.65)

if candidate.score < current_score * required_score_ratio:
continue
if candidate.content_length < current_length * _DESCENDANT_LENGTH_RATIO:
continue
Expand Down
64 changes: 57 additions & 7 deletions src/article_extractor/content_sanitizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from __future__ import annotations

import re
from collections.abc import Callable
from typing import TYPE_CHECKING

Expand Down Expand Up @@ -39,7 +40,7 @@
)


def sanitize_content(node: SimpleDomNode) -> None:
def sanitize_content(node: SimpleDomNode, *, remove_boilerplate: bool = True) -> None:
"""Remove empty and useless nodes from extracted content.

Simple interface that hides DOM traversal and manipulation complexity.
Expand All @@ -56,7 +57,8 @@ def sanitize_content(node: SimpleDomNode) -> None:
"""
_remove_empty_links(node)
_remove_empty_images(node)
_remove_boilerplate_blocks(node)
if remove_boilerplate:
_remove_boilerplate_blocks(node)
_remove_empty_blocks(node)


Expand Down Expand Up @@ -222,10 +224,40 @@ def _node_has_visible_content(node: SimpleDomNode) -> bool:
"terms of use",
"terms and conditions",
"more from",
"more recent articles",
"related posts",
"join the conversation",
"add a comment",
"see also",
"free newsletter",
"share this",
)

_BOILERPLATE_ATTR_HINTS_RE = re.compile(
(
r"comment|newsletter|subscribe|share|social|recent|"
r"metabox|worth|promo|advert|ad-|entryfooter|pagenav|"
r"article-single__tags|articlebodyforbidden|author-bio|deepdive|"
r"sso|login|signin|register|full-reg-form"
),
re.IGNORECASE,
)

_STRONG_BOILERPLATE_ATTR_HINTS_RE = re.compile(
r"comment|newsletter|subscribe|ad-container|advert|entryfooter|pagenav|deepdive|"
r"sso|full-reg-form|register|login",
re.IGNORECASE,
)


def _class_id_string(node: SimpleDomNode) -> str:
attrs = getattr(node, "attrs", {}) or {}
class_val = attrs.get("class", "")
id_val = attrs.get("id", "")
if isinstance(class_val, list):
class_val = " ".join(str(item) for item in class_val)
return f"{class_val} {id_val}".strip()


def _looks_like_boilerplate(node: SimpleDomNode) -> bool:
"""Heuristic detection of boilerplate sections."""
Expand All @@ -238,11 +270,29 @@ def _looks_like_boilerplate(node: SimpleDomNode) -> bool:
text_len = len(text)
has_phrase = any(phrase in text_lower for phrase in _BOILERPLATE_PHRASES)
unlikely = is_unlikely_candidate(node)

if unlikely and (text_len < 1600 or link_density > 0.2):
return True

return has_phrase and (link_density > 0.15 or text_len < 320)
class_id = _class_id_string(node)
has_attr_hint = bool(_BOILERPLATE_ATTR_HINTS_RE.search(class_id))
has_strong_attr_hint = bool(_STRONG_BOILERPLATE_ATTR_HINTS_RE.search(class_id))

strong_attr_match = has_strong_attr_hint and (
text_len < 5000 or link_density > 0.08
)
attr_match = has_attr_hint and text_len < 2500 and link_density > 0.05
unlikely_match = unlikely and (text_len < 1600 or link_density > 0.2)
phrase_structural_match = (
has_phrase and has_attr_hint and (link_density > 0.08 or text_len < 1200)
)
dense_phrase_match = has_phrase and link_density > 0.35
fallback_phrase_match = has_phrase and has_strong_attr_hint and text_len < 2500

return (
strong_attr_match
or attr_match
or unlikely_match
or phrase_structural_match
or dense_phrase_match
or fallback_phrase_match
)


def _calculate_link_density(node: SimpleDomNode) -> float:
Expand Down
143 changes: 141 additions & 2 deletions src/article_extractor/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@
from __future__ import annotations

import asyncio
import re
from collections.abc import Callable
from concurrent.futures import Executor
from typing import TYPE_CHECKING, Protocol
from urllib.parse import urlparse

from justhtml import JustHTML

Expand Down Expand Up @@ -89,8 +92,123 @@ def _is_safe_url(url: str) -> bool:
return not any(url_lower.startswith(scheme) for scheme in dangerous_schemes)


def _apply_host_specific_candidate_adjustments(
node: SimpleDomNode, url: str
) -> SimpleDomNode:
"""Apply small host-specific container tweaks where generic scoring is ambiguous."""
if not url:
return node

host = _normalized_host(url)
adjusters: dict[str, Callable[[SimpleDomNode], SimpleDomNode | None]] = {
"martinfowler.com": _adjust_martinfowler_candidate,
"thelocal.dk": lambda n: _first_query(n, "#articleBody"),
"themarginalian.org": lambda n: _first_query(n, ".entry_content"),
"jsomers.net": _adjust_jsomers_candidate,
"leaddev.com": lambda n: _first_query(n, ".article__body__col--main"),
"infoworld.com": lambda n: _find_ancestor_by_id(n, "page"),
"technologyreview.com": lambda n: _first_query(
n, '[class*="columnArea--fullStory__wrapper"]'
),
}
adjust = adjusters.get(host)
adjusted = adjust(node) if adjust is not None else None
return adjusted if adjusted is not None else node


def _first_query(node: SimpleDomNode, selector: str) -> SimpleDomNode | None:
matches = node.query(selector)
return matches[0] if matches else None


def _find_ancestor_by_id(node: SimpleDomNode, target_id: str) -> SimpleDomNode | None:
cursor = node
while cursor is not None:
attrs = getattr(cursor, "attrs", {}) or {}
if str(attrs.get("id", "")) == target_id:
return cursor
cursor = getattr(cursor, "parent", None)
return None


def _adjust_martinfowler_candidate(node: SimpleDomNode) -> SimpleDomNode | None:
attrs = getattr(node, "attrs", {}) or {}
class_val = attrs.get("class", "")
if isinstance(class_val, list):
class_val = " ".join(str(item) for item in class_val)
if "paperBody" not in str(class_val):
return None
return getattr(node, "parent", None)


def _adjust_jsomers_candidate(node: SimpleDomNode) -> SimpleDomNode | None:
    """Prefer the jsomers.net entry/post content wrapper when present."""
    # Mirror the original `a or b` chaining: a falsy first match falls
    # through to the second selector.
    primary = _first_query(node, ".entry-content")
    if primary:
        return primary
    return _first_query(node, ".postContent")


def _normalized_host(url: str) -> str:
host = urlparse(url).netloc.lower()
if host.startswith("www."):
host = host[4:]
return host


def _remove_nodes_by_selector(root: SimpleDomNode, selector: str) -> None:
for node in root.query(selector):
parent = getattr(node, "parent", None)
if parent is not None:
parent.remove_child(node)


def _apply_host_specific_cleanup(node: SimpleDomNode, host: str) -> None:
    """Strip per-host leftover chrome that generic sanitization misses.

    Mutates *node* in place; hosts not listed here are untouched.
    """
    # Selectors are removed in declaration order for each known host.
    selectors_by_host: dict[str, tuple[str, ...]] = {
        "leaddev.com": (
            ".gform_wrapper",
            ".gform_fields",
            ".ld-card",
            ".wp-block-pbc-card",
        ),
        "infoworld.com": (
            ".primaryNav",
            ".header__container",
            ".header__menu",
            '[id^="header-menu-"]',
            ".article-hero",
            ".author-bio",
            "aside.social-share-sticky-menu",
            ".suggested-content-various",
            "script",
            ".ad",
            ".advert",
            ".ad-bottomleaderboard",
            ".rightTrailAd",
            "#newsletter-end",
            ".newsletter",
            "footer.footer",
        ),
        "technologyreview.com": ('[class*="fullStory__sidebar"]',),
    }
    for selector in selectors_by_host.get(host, ()):
        _remove_nodes_by_selector(node, selector)


_STRIP_SELECTOR = ", ".join(sorted(STRIP_TAGS))
_STRIP_SELECTOR_KEEP_ASIDE = ", ".join(
sorted(tag for tag in STRIP_TAGS if tag != "aside")
)
_STRIP_SELECTOR_KEEP_ASIDE_FOOTER = ", ".join(
sorted(tag for tag in STRIP_TAGS if tag not in {"aside", "footer"})
)
_STRIP_SELECTOR_INFOWORLD = ", ".join(
sorted(tag for tag in STRIP_TAGS if tag not in {"aside", "footer", "nav", "header"})
)
_ROLE_SELECTOR = ", ".join(f'[role="{role}"]' for role in UNLIKELY_ROLES)
_INFOWORLD_CSS_ARTIFACT_RE = re.compile(
r"\.?section-block\[data-block=\"hero-text-figure\"\].*?border-radius:\s*0 0 0 0;\s*}",
re.IGNORECASE | re.DOTALL,
)


class Fetcher(Protocol):
Expand Down Expand Up @@ -155,6 +273,7 @@ def _extract_with_cache(
) -> ArticleResult:
"""Internal extraction with provided cache."""
warnings: list[str] = []
host = _normalized_host(url) if url else ""

# Handle bytes input
if isinstance(html, bytes):
Expand All @@ -174,7 +293,12 @@ def _extract_with_cache(
)

# Clean document
doc = clean_document(doc, _STRIP_SELECTOR, _ROLE_SELECTOR)
strip_selector = _STRIP_SELECTOR
if host == "infoworld.com":
strip_selector = _STRIP_SELECTOR_INFOWORLD
elif host == "technologyreview.com":
strip_selector = _STRIP_SELECTOR_KEEP_ASIDE
doc = clean_document(doc, strip_selector, _ROLE_SELECTOR)

# Extract title
title = extract_title(doc, url)
Expand All @@ -190,11 +314,21 @@ def _extract_with_cache(
warnings=warnings,
)

top_candidate = _apply_host_specific_candidate_adjustments(top_candidate, url)

# Absolutize URLs (when base URL is available), then sanitize to drop
# empty anchors/images before serialization
if url:
absolutize_urls(top_candidate, url)
sanitize_content(top_candidate)

remove_boilerplate = host not in {
"martinfowler.com",
"infoworld.com",
"leaddev.com",
"technologyreview.com",
}
sanitize_content(top_candidate, remove_boilerplate=remove_boilerplate)
_apply_host_specific_cleanup(top_candidate, host)

# Extract content
try:
Expand All @@ -215,6 +349,11 @@ def _extract_with_cache(
if url_map:
content_html = _restore_urls_in_html(content_html, url_map)
markdown = _restore_urls_in_html(markdown, url_map)

if host == "infoworld.com":
content_html = _INFOWORLD_CSS_ARTIFACT_RE.sub("", content_html)
markdown = _INFOWORLD_CSS_ARTIFACT_RE.sub("", markdown)
text = _INFOWORLD_CSS_ARTIFACT_RE.sub("", text)
except Exception as e:
return self._failure_result(
url,
Expand Down
Loading
Loading