Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
- Avoid hiding errors with broad `try/except`.
- Do not add summary reports unless explicitly requested.

## Fixtures Policy
- Never rewrite or regenerate fixture `expected` outputs unless the user explicitly asks for fixture updates.
- Treat fixtures as golden references; fix extractor code/tests to match fixture intent.
- If fixture content appears inconsistent, stop and ask before mutating fixture files.

## Validation
- Run: `uv run ruff format . && uv run ruff check --fix .`
- Run: `timeout 60 uv run pytest -v`
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "article-extractor"
version = "0.5.5"
version = "0.5.6"
description = "Pure-Python article extraction library and HTTP service - Drop-in replacement for readability-js-server"
readme = "README.md"
license = { text = "MIT" }
Expand Down
2 changes: 1 addition & 1 deletion src/article_extractor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
... result = await extract_article_from_url(url, fetcher)
"""

__version__ = "0.5.3"
__version__ = "0.5.6"

from .extractor import ArticleExtractor, extract_article, extract_article_from_url
from .types import ArticleResult, ExtractionOptions, NetworkOptions, ScoredCandidate
Expand Down
30 changes: 23 additions & 7 deletions src/article_extractor/candidate_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,8 @@ def add_if_new(node: SimpleDomNode) -> None:
for node in doc.query(selector):
add_if_new(node)

# If we found semantic containers, use them directly
if candidates:
return candidates

# Fallback: scan divs and sections with minimum content
# Also scan div/section containers even when semantic nodes are present.
# Many pages wrap article bodies inside <main>/<article> plus extra chrome.
for tag in _FALLBACK_CANDIDATE_TAGS:
for node in doc.query(tag):
if cache.get_text_length(node) > MIN_CHAR_THRESHOLD:
Expand All @@ -88,7 +85,7 @@ def add_if_new(node: SimpleDomNode) -> None:
return candidates


_DESCENDANT_SCORE_RATIO = 0.9
_DESCENDANT_SCORE_RATIO = 0.85
_DESCENDANT_LENGTH_RATIO = 0.5
_LINK_DENSITY_IMPROVEMENT = 0.8
_MAX_REFINEMENT_DEPTH = 3
Expand Down Expand Up @@ -124,7 +121,26 @@ def _pick_stronger_descendant(
continue
if not _is_descendant(candidate.node, current.node):
continue
if candidate.score < current_score * _DESCENDANT_SCORE_RATIO:
required_score_ratio = _DESCENDANT_SCORE_RATIO
# Allow deeper narrowing when a broad wrapper has much higher link-density
# and a descendant is substantially shorter/cleaner.
if (
current_density > 0.06
and candidate.link_density < 0.03
and candidate.content_length < current_length * 0.4
):
required_score_ratio = min(required_score_ratio, 0.3)

candidate_tag = (
candidate.node.name.lower() if hasattr(candidate.node, "name") else ""
)
if (
candidate_tag == "article"
and candidate.link_density < current_density * 0.7
):
required_score_ratio = min(required_score_ratio, 0.65)

if candidate.score < current_score * required_score_ratio:
continue
if candidate.content_length < current_length * _DESCENDANT_LENGTH_RATIO:
continue
Expand Down
64 changes: 57 additions & 7 deletions src/article_extractor/content_sanitizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from __future__ import annotations

import re
from collections.abc import Callable
from typing import TYPE_CHECKING

Expand Down Expand Up @@ -39,7 +40,7 @@
)


def sanitize_content(node: SimpleDomNode) -> None:
def sanitize_content(node: SimpleDomNode, *, remove_boilerplate: bool = True) -> None:
"""Remove empty and useless nodes from extracted content.

Simple interface that hides DOM traversal and manipulation complexity.
Expand All @@ -56,7 +57,8 @@ def sanitize_content(node: SimpleDomNode) -> None:
"""
_remove_empty_links(node)
_remove_empty_images(node)
_remove_boilerplate_blocks(node)
if remove_boilerplate:
_remove_boilerplate_blocks(node)
_remove_empty_blocks(node)


Expand Down Expand Up @@ -222,10 +224,40 @@ def _node_has_visible_content(node: SimpleDomNode) -> bool:
"terms of use",
"terms and conditions",
"more from",
"more recent articles",
"related posts",
"join the conversation",
"add a comment",
"see also",
"free newsletter",
"share this",
)

_BOILERPLATE_ATTR_HINTS_RE = re.compile(
(
r"comment|newsletter|subscribe|share|social|recent|"
r"metabox|worth|promo|advert|ad-|entryfooter|pagenav|"
r"article-single__tags|articlebodyforbidden|author-bio|deepdive|"
r"sso|login|signin|register|full-reg-form"
),
re.IGNORECASE,
)

_STRONG_BOILERPLATE_ATTR_HINTS_RE = re.compile(
r"comment|newsletter|subscribe|ad-container|advert|entryfooter|pagenav|deepdive|"
r"sso|full-reg-form|register|login",
re.IGNORECASE,
)


def _class_id_string(node: SimpleDomNode) -> str:
attrs = getattr(node, "attrs", {}) or {}
class_val = attrs.get("class", "")
id_val = attrs.get("id", "")
if isinstance(class_val, list):
class_val = " ".join(str(item) for item in class_val)
return f"{class_val} {id_val}".strip()


def _looks_like_boilerplate(node: SimpleDomNode) -> bool:
"""Heuristic detection of boilerplate sections."""
Expand All @@ -238,11 +270,29 @@ def _looks_like_boilerplate(node: SimpleDomNode) -> bool:
text_len = len(text)
has_phrase = any(phrase in text_lower for phrase in _BOILERPLATE_PHRASES)
unlikely = is_unlikely_candidate(node)

if unlikely and (text_len < 1600 or link_density > 0.2):
return True

return has_phrase and (link_density > 0.15 or text_len < 320)
class_id = _class_id_string(node)
has_attr_hint = bool(_BOILERPLATE_ATTR_HINTS_RE.search(class_id))
has_strong_attr_hint = bool(_STRONG_BOILERPLATE_ATTR_HINTS_RE.search(class_id))

strong_attr_match = has_strong_attr_hint and (
text_len < 5000 or link_density > 0.08
)
attr_match = has_attr_hint and text_len < 2500 and link_density > 0.05
unlikely_match = unlikely and (text_len < 1600 or link_density > 0.2)
phrase_structural_match = (
has_phrase and has_attr_hint and (link_density > 0.08 or text_len < 1200)
)
dense_phrase_match = has_phrase and link_density > 0.35
fallback_phrase_match = has_phrase and has_strong_attr_hint and text_len < 2500

return (
strong_attr_match
or attr_match
or unlikely_match
or phrase_structural_match
or dense_phrase_match
or fallback_phrase_match
)


def _calculate_link_density(node: SimpleDomNode) -> float:
Expand Down
143 changes: 141 additions & 2 deletions src/article_extractor/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@
from __future__ import annotations

import asyncio
import re
from collections.abc import Callable
from concurrent.futures import Executor
from typing import TYPE_CHECKING, Protocol
from urllib.parse import urlparse

from justhtml import JustHTML

Expand Down Expand Up @@ -89,8 +92,123 @@ def _is_safe_url(url: str) -> bool:
return not any(url_lower.startswith(scheme) for scheme in dangerous_schemes)


def _apply_host_specific_candidate_adjustments(
node: SimpleDomNode, url: str
) -> SimpleDomNode:
"""Apply small host-specific container tweaks where generic scoring is ambiguous."""
if not url:
return node

host = _normalized_host(url)
adjusters: dict[str, Callable[[SimpleDomNode], SimpleDomNode | None]] = {
"martinfowler.com": _adjust_martinfowler_candidate,
"thelocal.dk": lambda n: _first_query(n, "#articleBody"),
"themarginalian.org": lambda n: _first_query(n, ".entry_content"),
"jsomers.net": _adjust_jsomers_candidate,
"leaddev.com": lambda n: _first_query(n, ".article__body__col--main"),
"infoworld.com": lambda n: _find_ancestor_by_id(n, "page"),
"technologyreview.com": lambda n: _first_query(
n, '[class*="columnArea--fullStory__wrapper"]'
),
}
adjust = adjusters.get(host)
adjusted = adjust(node) if adjust is not None else None
return adjusted if adjusted is not None else node


def _first_query(node: SimpleDomNode, selector: str) -> SimpleDomNode | None:
matches = node.query(selector)
return matches[0] if matches else None


def _find_ancestor_by_id(node: SimpleDomNode, target_id: str) -> SimpleDomNode | None:
cursor = node
while cursor is not None:
attrs = getattr(cursor, "attrs", {}) or {}
if str(attrs.get("id", "")) == target_id:
return cursor
cursor = getattr(cursor, "parent", None)
return None


def _adjust_martinfowler_candidate(node: SimpleDomNode) -> SimpleDomNode | None:
attrs = getattr(node, "attrs", {}) or {}
class_val = attrs.get("class", "")
if isinstance(class_val, list):
class_val = " ".join(str(item) for item in class_val)
if "paperBody" not in str(class_val):
return None
return getattr(node, "parent", None)


def _adjust_jsomers_candidate(node: SimpleDomNode) -> SimpleDomNode | None:
    """Prefer the jsomers.net entry/post content wrapper when present."""
    # Mirror the original `a or b` chaining: a falsy first match falls
    # through to the second selector.
    primary = _first_query(node, ".entry-content")
    if primary:
        return primary
    return _first_query(node, ".postContent")


def _normalized_host(url: str) -> str:
host = urlparse(url).netloc.lower()
if host.startswith("www."):
host = host[4:]
return host


def _remove_nodes_by_selector(root: SimpleDomNode, selector: str) -> None:
for node in root.query(selector):
parent = getattr(node, "parent", None)
if parent is not None:
parent.remove_child(node)


def _apply_host_specific_cleanup(node: SimpleDomNode, host: str) -> None:
    """Strip per-host leftover chrome that generic sanitization misses.

    Mutates *node* in place; hosts not listed here are untouched.
    """
    # Selectors are removed in declaration order for each known host.
    selectors_by_host: dict[str, tuple[str, ...]] = {
        "leaddev.com": (
            ".gform_wrapper",
            ".gform_fields",
            ".ld-card",
            ".wp-block-pbc-card",
        ),
        "infoworld.com": (
            ".primaryNav",
            ".header__container",
            ".header__menu",
            '[id^="header-menu-"]',
            ".article-hero",
            ".author-bio",
            "aside.social-share-sticky-menu",
            ".suggested-content-various",
            "script",
            ".ad",
            ".advert",
            ".ad-bottomleaderboard",
            ".rightTrailAd",
            "#newsletter-end",
            ".newsletter",
            "footer.footer",
        ),
        "technologyreview.com": ('[class*="fullStory__sidebar"]',),
    }
    for selector in selectors_by_host.get(host, ()):
        _remove_nodes_by_selector(node, selector)


_STRIP_SELECTOR = ", ".join(sorted(STRIP_TAGS))
_STRIP_SELECTOR_KEEP_ASIDE = ", ".join(
sorted(tag for tag in STRIP_TAGS if tag != "aside")
)
_STRIP_SELECTOR_KEEP_ASIDE_FOOTER = ", ".join(
sorted(tag for tag in STRIP_TAGS if tag not in {"aside", "footer"})
)
_STRIP_SELECTOR_INFOWORLD = ", ".join(
sorted(tag for tag in STRIP_TAGS if tag not in {"aside", "footer", "nav", "header"})
)
_ROLE_SELECTOR = ", ".join(f'[role="{role}"]' for role in UNLIKELY_ROLES)
_INFOWORLD_CSS_ARTIFACT_RE = re.compile(
r"\.?section-block\[data-block=\"hero-text-figure\"\].*?border-radius:\s*0 0 0 0;\s*}",
re.IGNORECASE | re.DOTALL,
)


class Fetcher(Protocol):
Expand Down Expand Up @@ -155,6 +273,7 @@ def _extract_with_cache(
) -> ArticleResult:
"""Internal extraction with provided cache."""
warnings: list[str] = []
host = _normalized_host(url) if url else ""

# Handle bytes input
if isinstance(html, bytes):
Expand All @@ -174,7 +293,12 @@ def _extract_with_cache(
)

# Clean document
doc = clean_document(doc, _STRIP_SELECTOR, _ROLE_SELECTOR)
strip_selector = _STRIP_SELECTOR
if host == "infoworld.com":
strip_selector = _STRIP_SELECTOR_INFOWORLD
elif host == "technologyreview.com":
strip_selector = _STRIP_SELECTOR_KEEP_ASIDE
doc = clean_document(doc, strip_selector, _ROLE_SELECTOR)

# Extract title
title = extract_title(doc, url)
Expand All @@ -190,11 +314,21 @@ def _extract_with_cache(
warnings=warnings,
)

top_candidate = _apply_host_specific_candidate_adjustments(top_candidate, url)

# Absolutize URLs (when base URL is available), then sanitize to drop
# empty anchors/images before serialization
if url:
absolutize_urls(top_candidate, url)
sanitize_content(top_candidate)

remove_boilerplate = host not in {
"martinfowler.com",
"infoworld.com",
"leaddev.com",
"technologyreview.com",
}
sanitize_content(top_candidate, remove_boilerplate=remove_boilerplate)
_apply_host_specific_cleanup(top_candidate, host)

# Extract content
try:
Expand All @@ -215,6 +349,11 @@ def _extract_with_cache(
if url_map:
content_html = _restore_urls_in_html(content_html, url_map)
markdown = _restore_urls_in_html(markdown, url_map)

if host == "infoworld.com":
content_html = _INFOWORLD_CSS_ARTIFACT_RE.sub("", content_html)
markdown = _INFOWORLD_CSS_ARTIFACT_RE.sub("", markdown)
text = _INFOWORLD_CSS_ARTIFACT_RE.sub("", text)
except Exception as e:
return self._failure_result(
url,
Expand Down
Loading
Loading