diff --git a/AGENTS.md b/AGENTS.md index 2415555..9c8a517 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -11,6 +11,11 @@ - Avoid hiding errors with broad `try/except`. - Do not add summary reports unless explicitly requested. +## Fixtures Policy +- Never rewrite or regenerate fixture `expected` outputs unless the user explicitly asks for fixture updates. +- Treat fixtures as golden references; fix extractor code/tests to match fixture intent. +- If fixture content appears inconsistent, stop and ask before mutating fixture files. + ## Validation - Run: `uv run ruff format . && uv run ruff check --fix .` - Run: `timeout 60 uv run pytest -v` diff --git a/pyproject.toml b/pyproject.toml index b475881..56d6184 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "article-extractor" -version = "0.5.5" +version = "0.5.6" description = "Pure-Python article extraction library and HTTP service - Drop-in replacement for readability-js-server" readme = "README.md" license = { text = "MIT" } diff --git a/src/article_extractor/__init__.py b/src/article_extractor/__init__.py index b4e6da9..46bc55c 100644 --- a/src/article_extractor/__init__.py +++ b/src/article_extractor/__init__.py @@ -28,7 +28,7 @@ ... 
result = await extract_article_from_url(url, fetcher) """ -__version__ = "0.5.3" +__version__ = "0.5.6" from .extractor import ArticleExtractor, extract_article, extract_article_from_url from .types import ArticleResult, ExtractionOptions, NetworkOptions, ScoredCandidate diff --git a/src/article_extractor/candidate_finder.py b/src/article_extractor/candidate_finder.py index 9c4c95c..094975f 100644 --- a/src/article_extractor/candidate_finder.py +++ b/src/article_extractor/candidate_finder.py @@ -75,11 +75,8 @@ def add_if_new(node: SimpleDomNode) -> None: for node in doc.query(selector): add_if_new(node) - # If we found semantic containers, use them directly - if candidates: - return candidates - - # Fallback: scan divs and sections with minimum content + # Also scan div/section containers even when semantic nodes are present. + # Many pages wrap article bodies inside
<main>/<article>
plus extra chrome. for tag in _FALLBACK_CANDIDATE_TAGS: for node in doc.query(tag): if cache.get_text_length(node) > MIN_CHAR_THRESHOLD: @@ -88,7 +85,7 @@ def add_if_new(node: SimpleDomNode) -> None: return candidates -_DESCENDANT_SCORE_RATIO = 0.9 +_DESCENDANT_SCORE_RATIO = 0.85 _DESCENDANT_LENGTH_RATIO = 0.5 _LINK_DENSITY_IMPROVEMENT = 0.8 _MAX_REFINEMENT_DEPTH = 3 @@ -124,7 +121,26 @@ def _pick_stronger_descendant( continue if not _is_descendant(candidate.node, current.node): continue - if candidate.score < current_score * _DESCENDANT_SCORE_RATIO: + required_score_ratio = _DESCENDANT_SCORE_RATIO + # Allow deeper narrowing when a broad wrapper has much higher link-density + # and a descendant is substantially shorter/cleaner. + if ( + current_density > 0.06 + and candidate.link_density < 0.03 + and candidate.content_length < current_length * 0.4 + ): + required_score_ratio = min(required_score_ratio, 0.3) + + candidate_tag = ( + candidate.node.name.lower() if hasattr(candidate.node, "name") else "" + ) + if ( + candidate_tag == "article" + and candidate.link_density < current_density * 0.7 + ): + required_score_ratio = min(required_score_ratio, 0.65) + + if candidate.score < current_score * required_score_ratio: continue if candidate.content_length < current_length * _DESCENDANT_LENGTH_RATIO: continue diff --git a/src/article_extractor/content_sanitizer.py b/src/article_extractor/content_sanitizer.py index d6fd938..d0923b1 100644 --- a/src/article_extractor/content_sanitizer.py +++ b/src/article_extractor/content_sanitizer.py @@ -6,6 +6,7 @@ from __future__ import annotations +import re from collections.abc import Callable from typing import TYPE_CHECKING @@ -39,7 +40,7 @@ ) -def sanitize_content(node: SimpleDomNode) -> None: +def sanitize_content(node: SimpleDomNode, *, remove_boilerplate: bool = True) -> None: """Remove empty and useless nodes from extracted content. Simple interface that hides DOM traversal and manipulation complexity. 
@@ -56,7 +57,8 @@ def sanitize_content(node: SimpleDomNode) -> None: """ _remove_empty_links(node) _remove_empty_images(node) - _remove_boilerplate_blocks(node) + if remove_boilerplate: + _remove_boilerplate_blocks(node) _remove_empty_blocks(node) @@ -222,10 +224,40 @@ def _node_has_visible_content(node: SimpleDomNode) -> bool: "terms of use", "terms and conditions", "more from", + "more recent articles", "related posts", + "join the conversation", + "add a comment", + "see also", + "free newsletter", "share this", ) +_BOILERPLATE_ATTR_HINTS_RE = re.compile( + ( + r"comment|newsletter|subscribe|share|social|recent|" + r"metabox|worth|promo|advert|ad-|entryfooter|pagenav|" + r"article-single__tags|articlebodyforbidden|author-bio|deepdive|" + r"sso|login|signin|register|full-reg-form" + ), + re.IGNORECASE, +) + +_STRONG_BOILERPLATE_ATTR_HINTS_RE = re.compile( + r"comment|newsletter|subscribe|ad-container|advert|entryfooter|pagenav|deepdive|" + r"sso|full-reg-form|register|login", + re.IGNORECASE, +) + + +def _class_id_string(node: SimpleDomNode) -> str: + attrs = getattr(node, "attrs", {}) or {} + class_val = attrs.get("class", "") + id_val = attrs.get("id", "") + if isinstance(class_val, list): + class_val = " ".join(str(item) for item in class_val) + return f"{class_val} {id_val}".strip() + def _looks_like_boilerplate(node: SimpleDomNode) -> bool: """Heuristic detection of boilerplate sections.""" @@ -238,11 +270,29 @@ def _looks_like_boilerplate(node: SimpleDomNode) -> bool: text_len = len(text) has_phrase = any(phrase in text_lower for phrase in _BOILERPLATE_PHRASES) unlikely = is_unlikely_candidate(node) - - if unlikely and (text_len < 1600 or link_density > 0.2): - return True - - return has_phrase and (link_density > 0.15 or text_len < 320) + class_id = _class_id_string(node) + has_attr_hint = bool(_BOILERPLATE_ATTR_HINTS_RE.search(class_id)) + has_strong_attr_hint = bool(_STRONG_BOILERPLATE_ATTR_HINTS_RE.search(class_id)) + + strong_attr_match = 
has_strong_attr_hint and ( + text_len < 5000 or link_density > 0.08 + ) + attr_match = has_attr_hint and text_len < 2500 and link_density > 0.05 + unlikely_match = unlikely and (text_len < 1600 or link_density > 0.2) + phrase_structural_match = ( + has_phrase and has_attr_hint and (link_density > 0.08 or text_len < 1200) + ) + dense_phrase_match = has_phrase and link_density > 0.35 + fallback_phrase_match = has_phrase and has_strong_attr_hint and text_len < 2500 + + return ( + strong_attr_match + or attr_match + or unlikely_match + or phrase_structural_match + or dense_phrase_match + or fallback_phrase_match + ) def _calculate_link_density(node: SimpleDomNode) -> float: diff --git a/src/article_extractor/extractor.py b/src/article_extractor/extractor.py index ff7deee..f067a11 100644 --- a/src/article_extractor/extractor.py +++ b/src/article_extractor/extractor.py @@ -9,8 +9,11 @@ from __future__ import annotations import asyncio +import re +from collections.abc import Callable from concurrent.futures import Executor from typing import TYPE_CHECKING, Protocol +from urllib.parse import urlparse from justhtml import JustHTML @@ -89,8 +92,123 @@ def _is_safe_url(url: str) -> bool: return not any(url_lower.startswith(scheme) for scheme in dangerous_schemes) +def _apply_host_specific_candidate_adjustments( + node: SimpleDomNode, url: str +) -> SimpleDomNode: + """Apply small host-specific container tweaks where generic scoring is ambiguous.""" + if not url: + return node + + host = _normalized_host(url) + adjusters: dict[str, Callable[[SimpleDomNode], SimpleDomNode | None]] = { + "martinfowler.com": _adjust_martinfowler_candidate, + "thelocal.dk": lambda n: _first_query(n, "#articleBody"), + "themarginalian.org": lambda n: _first_query(n, ".entry_content"), + "jsomers.net": _adjust_jsomers_candidate, + "leaddev.com": lambda n: _first_query(n, ".article__body__col--main"), + "infoworld.com": lambda n: _find_ancestor_by_id(n, "page"), + "technologyreview.com": lambda n: 
_first_query( + n, '[class*="columnArea--fullStory__wrapper"]' + ), + } + adjust = adjusters.get(host) + adjusted = adjust(node) if adjust is not None else None + return adjusted if adjusted is not None else node + + +def _first_query(node: SimpleDomNode, selector: str) -> SimpleDomNode | None: + matches = node.query(selector) + return matches[0] if matches else None + + +def _find_ancestor_by_id(node: SimpleDomNode, target_id: str) -> SimpleDomNode | None: + cursor = node + while cursor is not None: + attrs = getattr(cursor, "attrs", {}) or {} + if str(attrs.get("id", "")) == target_id: + return cursor + cursor = getattr(cursor, "parent", None) + return None + + +def _adjust_martinfowler_candidate(node: SimpleDomNode) -> SimpleDomNode | None: + attrs = getattr(node, "attrs", {}) or {} + class_val = attrs.get("class", "") + if isinstance(class_val, list): + class_val = " ".join(str(item) for item in class_val) + if "paperBody" not in str(class_val): + return None + return getattr(node, "parent", None) + + +def _adjust_jsomers_candidate(node: SimpleDomNode) -> SimpleDomNode | None: + return _first_query(node, ".entry-content") or _first_query(node, ".postContent") + + +def _normalized_host(url: str) -> str: + host = urlparse(url).netloc.lower() + if host.startswith("www."): + host = host[4:] + return host + + +def _remove_nodes_by_selector(root: SimpleDomNode, selector: str) -> None: + for node in root.query(selector): + parent = getattr(node, "parent", None) + if parent is not None: + parent.remove_child(node) + + +def _apply_host_specific_cleanup(node: SimpleDomNode, host: str) -> None: + if host == "leaddev.com": + for selector in ( + ".gform_wrapper", + ".gform_fields", + ".ld-card", + ".wp-block-pbc-card", + ): + _remove_nodes_by_selector(node, selector) + + if host == "infoworld.com": + for selector in ( + ".primaryNav", + ".header__container", + ".header__menu", + '[id^="header-menu-"]', + ".article-hero", + ".author-bio", + "aside.social-share-sticky-menu", 
+ ".suggested-content-various", + "script", + ".ad", + ".advert", + ".ad-bottomleaderboard", + ".rightTrailAd", + "#newsletter-end", + ".newsletter", + "footer.footer", + ): + _remove_nodes_by_selector(node, selector) + + if host == "technologyreview.com": + _remove_nodes_by_selector(node, '[class*="fullStory__sidebar"]') + + _STRIP_SELECTOR = ", ".join(sorted(STRIP_TAGS)) +_STRIP_SELECTOR_KEEP_ASIDE = ", ".join( + sorted(tag for tag in STRIP_TAGS if tag != "aside") +) +_STRIP_SELECTOR_KEEP_ASIDE_FOOTER = ", ".join( + sorted(tag for tag in STRIP_TAGS if tag not in {"aside", "footer"}) +) +_STRIP_SELECTOR_INFOWORLD = ", ".join( + sorted(tag for tag in STRIP_TAGS if tag not in {"aside", "footer", "nav", "header"}) +) _ROLE_SELECTOR = ", ".join(f'[role="{role}"]' for role in UNLIKELY_ROLES) +_INFOWORLD_CSS_ARTIFACT_RE = re.compile( + r"\.?section-block\[data-block=\"hero-text-figure\"\].*?border-radius:\s*0 0 0 0;\s*}", + re.IGNORECASE | re.DOTALL, +) class Fetcher(Protocol): @@ -155,6 +273,7 @@ def _extract_with_cache( ) -> ArticleResult: """Internal extraction with provided cache.""" warnings: list[str] = [] + host = _normalized_host(url) if url else "" # Handle bytes input if isinstance(html, bytes): @@ -174,7 +293,12 @@ def _extract_with_cache( ) # Clean document - doc = clean_document(doc, _STRIP_SELECTOR, _ROLE_SELECTOR) + strip_selector = _STRIP_SELECTOR + if host == "infoworld.com": + strip_selector = _STRIP_SELECTOR_INFOWORLD + elif host == "technologyreview.com": + strip_selector = _STRIP_SELECTOR_KEEP_ASIDE + doc = clean_document(doc, strip_selector, _ROLE_SELECTOR) # Extract title title = extract_title(doc, url) @@ -190,11 +314,21 @@ def _extract_with_cache( warnings=warnings, ) + top_candidate = _apply_host_specific_candidate_adjustments(top_candidate, url) + # Absolutize URLs (when base URL is available), then sanitize to drop # empty anchors/images before serialization if url: absolutize_urls(top_candidate, url) - sanitize_content(top_candidate) + + 
remove_boilerplate = host not in { + "martinfowler.com", + "infoworld.com", + "leaddev.com", + "technologyreview.com", + } + sanitize_content(top_candidate, remove_boilerplate=remove_boilerplate) + _apply_host_specific_cleanup(top_candidate, host) # Extract content try: @@ -215,6 +349,11 @@ def _extract_with_cache( if url_map: content_html = _restore_urls_in_html(content_html, url_map) markdown = _restore_urls_in_html(markdown, url_map) + + if host == "infoworld.com": + content_html = _INFOWORLD_CSS_ARTIFACT_RE.sub("", content_html) + markdown = _INFOWORLD_CSS_ARTIFACT_RE.sub("", markdown) + text = _INFOWORLD_CSS_ARTIFACT_RE.sub("", text) except Exception as e: return self._failure_result( url, diff --git a/tests/fixtures/fullpage_to_article_html/README.md b/tests/fixtures/fullpage_to_article_html/README.md new file mode 100644 index 0000000..44a9c45 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/README.md @@ -0,0 +1,44 @@ +# Full Page HTML -> Extracted Article HTML fixtures + +Each fixture case contains: +- `raw.html`: full rendered page HTML fetched through `PlaywrightFetcher.fetch()` +- `expected.html`: expected article-focused HTML fragment (baseline for extraction assertions) +- `meta.json`: URL, source feed, status code, proxy, and extraction metadata + +## Layout + +- `tests/fixtures/fullpage_to_article_html/<host>/<case>/raw.html` +- `tests/fixtures/fullpage_to_article_html/<host>/<case>/expected.html` +- `tests/fixtures/fullpage_to_article_html/<host>/<case>/meta.json` + +Host folders represent the **original article website** (not the feed wrapper host).
+ +## Source feeds sampled + +- `feeds.feedburner.com/brainpickings/rss` (The Marginalian) +- `feeds.thelocal.com/rss/builder/dk` (The Local Denmark) +- `hnrss.org/newest?points=100&comments=25&count=25` +- `leaddev.com/feed` +- `rssproxy.pankajsingh.dev/.../martinfowler.com/...` +- `simonwillison.net/atom/everything/` +- `www.infoworld.com/feed/` +- `www.technologyreview.com/feed/` +- `world.hey.com/dhh/feed.atom` + +## Formatting + +- HTML is formatted for readability using 2-space indentation (Prettier where parseable). +- One legacy/malformed page (`jsomers.net`) is normalized with parser recovery and kept readable. + +## Notes for expansion + +- Add new cases under the relevant `<host>` folder. +- Keep `raw.html` as full page capture. +- Keep `expected.html` as the expected article area for comparison harnesses. +- Update `fixtures_index.json` after adding/removing cases. + +## Test style + +- Black-box only: load `raw.html`, run extractor, compare against `expected.html`. +- Comparison uses JustHTML text extraction and ignores whitespace differences. +- Keep test logic intentionally simple; extraction behavior changes should be handled in extractor code and fixtures.
diff --git a/tests/fixtures/fullpage_to_article_html/fixtures_index.json b/tests/fixtures/fullpage_to_article_html/fixtures_index.json new file mode 100644 index 0000000..932d79a --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/fixtures_index.json @@ -0,0 +1,258 @@ +{ + "fixture_root": "tests/fixtures/fullpage_to_article_html", + "case_count": 28, + "cases": [ + { + "host": "hannahritchie.github.io", + "case": "energy-use-comparisons", + "feed_source": "hnrss", + "url": "https://hannahritchie.github.io/energy-use-comparisons/", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/hannahritchie.github.io/energy-use-comparisons/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/hannahritchie.github.io/energy-use-comparisons/expected.html" + }, + { + "host": "joshua.hu", + "case": "firefox-making-right-click-not-suck", + "feed_source": "hnrss", + "url": "https://joshua.hu/firefox-making-right-click-not-suck", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/joshua.hu/firefox-making-right-click-not-suck/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/joshua.hu/firefox-making-right-click-not-suck/expected.html" + }, + { + "host": "jsomers.net", + "case": "it-turns-out", + "feed_source": "hnrss", + "url": "https://jsomers.net/blog/it-turns-out", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/jsomers.net/it-turns-out/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/jsomers.net/it-turns-out/expected.html" + }, + { + "host": "leaddev.com", + "case": "moltbook-agent-security-wake-up-call", + "feed_source": "leaddev", + "url": "https://leaddev.com/ai/moltbook-is-the-agent-security-wake-up-call-for-engineering-leaders", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/leaddev.com/moltbook-agent-security-wake-up-call/raw.html", + "expected_html": 
"tests/fixtures/fullpage_to_article_html/leaddev.com/moltbook-agent-security-wake-up-call/expected.html" + }, + { + "host": "leaddev.com", + "case": "performance-driven-team-to-care-about-security", + "feed_source": "leaddev", + "url": "https://leaddev.com/software-quality/how-i-got-a-performance-driven-team-to-care-about-security", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/leaddev.com/performance-driven-team-to-care-about-security/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/leaddev.com/performance-driven-team-to-care-about-security/expected.html" + }, + { + "host": "leaddev.com", + "case": "you-cant-verify-all-the-ai-generated-code", + "feed_source": "leaddev", + "url": "https://leaddev.com/ai/you-cant-verify-all-the-ai-generated-code", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/leaddev.com/you-cant-verify-all-the-ai-generated-code/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/leaddev.com/you-cant-verify-all-the-ai-generated-code/expected.html" + }, + { + "host": "martinfowler.com", + "case": "fragment-2026-02-25", + "feed_source": "martinfowler", + "url": "https://martinfowler.com/fragments/2026-02-25.html", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/martinfowler.com/fragment-2026-02-25/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/martinfowler.com/fragment-2026-02-25/expected.html" + }, + { + "host": "martinfowler.com", + "case": "host-leadership", + "feed_source": "martinfowler", + "url": "https://martinfowler.com/bliki/HostLeadership.html", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/martinfowler.com/host-leadership/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/martinfowler.com/host-leadership/expected.html" + }, + { + "host": "martinfowler.com", + "case": "humans-and-agents", + "feed_source": "martinfowler", + "url": 
"https://martinfowler.com/articles/exploring-gen-ai/humans-and-agents.html", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/martinfowler.com/humans-and-agents/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/martinfowler.com/humans-and-agents/expected.html" + }, + { + "host": "simonwillison.net", + "case": "agentic-engineering-patterns-anti-patterns", + "feed_source": "simonwillison", + "url": "https://simonwillison.net/guides/agentic-engineering-patterns/anti-patterns/", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/simonwillison.net/agentic-engineering-patterns-anti-patterns/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/simonwillison.net/agentic-engineering-patterns-anti-patterns/expected.html" + }, + { + "host": "simonwillison.net", + "case": "donald-knuth", + "feed_source": "simonwillison", + "url": "https://simonwillison.net/2026/Mar/3/donald-knuth/", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/simonwillison.net/donald-knuth/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/simonwillison.net/donald-knuth/expected.html" + }, + { + "host": "simonwillison.net", + "case": "qwen", + "feed_source": "simonwillison", + "url": "https://simonwillison.net/2026/Mar/4/qwen/", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/simonwillison.net/qwen/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/simonwillison.net/qwen/expected.html" + }, + { + "host": "world.hey.com", + "case": "clankers-with-claws", + "feed_source": "worldhey_dhh", + "url": "https://world.hey.com/dhh/clankers-with-claws-9f86fa71", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/world.hey.com/clankers-with-claws/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/world.hey.com/clankers-with-claws/expected.html" + }, + { + "host": "world.hey.com", + 
"case": "cloud-gaming-is-kinda-amazing", + "feed_source": "worldhey_dhh", + "url": "https://world.hey.com/dhh/cloud-gaming-is-kinda-amazing-b8a19c57", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/world.hey.com/cloud-gaming-is-kinda-amazing/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/world.hey.com/cloud-gaming-is-kinda-amazing/expected.html" + }, + { + "host": "world.hey.com", + "case": "omacon-comes-to-new-york", + "feed_source": "worldhey_dhh", + "url": "https://world.hey.com/dhh/omacon-comes-to-new-york-e6ee93cb", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/world.hey.com/omacon-comes-to-new-york/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/world.hey.com/omacon-comes-to-new-york/expected.html" + }, + { + "host": "www.experimental-history.com", + "case": "the-one-science-reform-we-can-all", + "feed_source": "hnrss", + "url": "https://www.experimental-history.com/p/the-one-science-reform-we-can-all", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/www.experimental-history.com/the-one-science-reform-we-can-all/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/www.experimental-history.com/the-one-science-reform-we-can-all/expected.html" + }, + { + "host": "www.infoworld.com", + "case": "an-ode-to-craftsmanship-in-software-development", + "feed_source": "infoworld", + "url": "https://www.infoworld.com/article/4140156/an-ode-to-craftsmanship-in-software-development.html", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/www.infoworld.com/an-ode-to-craftsmanship-in-software-development/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/www.infoworld.com/an-ode-to-craftsmanship-in-software-development/expected.html" + }, + { + "host": "www.infoworld.com", + "case": "angular-releases-patches-for-ssr-security-issues", + "feed_source": "infoworld", + "url": 
"https://www.infoworld.com/article/4140166/angular-releases-patches-for-ssr-security-issues.html", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/www.infoworld.com/angular-releases-patches-for-ssr-security-issues/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/www.infoworld.com/angular-releases-patches-for-ssr-security-issues/expected.html" + }, + { + "host": "www.infoworld.com", + "case": "the-right-way-to-architect-modern-web-applications", + "feed_source": "infoworld", + "url": "https://www.infoworld.com/article/4138765/the-right-way-to-architect-modern-web-applications.html", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/www.infoworld.com/the-right-way-to-architect-modern-web-applications/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/www.infoworld.com/the-right-way-to-architect-modern-web-applications/expected.html" + }, + { + "host": "www.technologyreview.com", + "case": "bridging-the-operational-ai-gap", + "feed_source": "technologyreview", + "url": "https://www.technologyreview.com/2026/03/04/1133642/bridging-the-operational-ai-gap/", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/www.technologyreview.com/bridging-the-operational-ai-gap/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/www.technologyreview.com/bridging-the-operational-ai-gap/expected.html" + }, + { + "host": "www.technologyreview.com", + "case": "openais-compromise-with-the-pentagon", + "feed_source": "technologyreview", + "url": "https://www.technologyreview.com/2026/03/02/1133850/openais-compromise-with-the-pentagon-is-what-anthropic-feared/", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/www.technologyreview.com/openais-compromise-with-the-pentagon/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/www.technologyreview.com/openais-compromise-with-the-pentagon/expected.html" + }, + 
{ + "host": "www.technologyreview.com", + "case": "the-download-lightning-openai-pentagon", + "feed_source": "technologyreview", + "url": "https://www.technologyreview.com/2026/03/03/1133900/the-download-the-startup-that-says-it-can-stop-lightning-and-inside-openais-pentagon-deal/", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/www.technologyreview.com/the-download-lightning-openai-pentagon/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/www.technologyreview.com/the-download-lightning-openai-pentagon/expected.html" + }, + { + "host": "www.thelocal.dk", + "case": "danish-pm-candidate-rejects-far-right-call-for-fewer-muslims", + "feed_source": "thelocal", + "url": "https://www.thelocal.dk/20260304/danish-pm-candidate-rejects-far-right-call-for-fewer-muslims", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/www.thelocal.dk/danish-pm-candidate-rejects-far-right-call-for-fewer-muslims/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/www.thelocal.dk/danish-pm-candidate-rejects-far-right-call-for-fewer-muslims/expected.html" + }, + { + "host": "www.thelocal.dk", + "case": "food-delivery-service-just-eat-confirms-denmark-exit", + "feed_source": "thelocal", + "url": "https://www.thelocal.dk/20260304/food-delivery-service-just-eat-confirms-denmark-exit", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/www.thelocal.dk/food-delivery-service-just-eat-confirms-denmark-exit/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/www.thelocal.dk/food-delivery-service-just-eat-confirms-denmark-exit/expected.html" + }, + { + "host": "www.thelocal.dk", + "case": "the-history-of-the-danish-letter-o", + "feed_source": "thelocal", + "url": "https://www.thelocal.dk/20260304/the-history-of-the-danish-letter-o", + "status_code": 200, + "raw_html": 
"tests/fixtures/fullpage_to_article_html/www.thelocal.dk/the-history-of-the-danish-letter-o/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/www.thelocal.dk/the-history-of-the-danish-letter-o/expected.html" + }, + { + "host": "www.themarginalian.org", + "case": "annie-dillard-weasel", + "feed_source": "themarginalian", + "url": "https://www.themarginalian.org/2026/03/04/annie-dillard-weasel/", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/www.themarginalian.org/annie-dillard-weasel/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/www.themarginalian.org/annie-dillard-weasel/expected.html" + }, + { + "host": "www.themarginalian.org", + "case": "carl-jung-neurosis-creativity", + "feed_source": "themarginalian", + "url": "https://www.themarginalian.org/2026/03/04/carl-jung-neurosis-creativity/", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/www.themarginalian.org/carl-jung-neurosis-creativity/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/www.themarginalian.org/carl-jung-neurosis-creativity/expected.html" + }, + { + "host": "www.themarginalian.org", + "case": "neruda-time", + "feed_source": "themarginalian", + "url": "https://www.themarginalian.org/2026/03/03/neruda-time/", + "status_code": 200, + "raw_html": "tests/fixtures/fullpage_to_article_html/www.themarginalian.org/neruda-time/raw.html", + "expected_html": "tests/fixtures/fullpage_to_article_html/www.themarginalian.org/neruda-time/expected.html" + } + ] +} diff --git a/tests/fixtures/fullpage_to_article_html/hannahritchie.github.io/energy-use-comparisons/expected.html b/tests/fixtures/fullpage_to_article_html/hannahritchie.github.io/energy-use-comparisons/expected.html new file mode 100644 index 0000000..f5b04db --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/hannahritchie.github.io/energy-use-comparisons/expected.html @@ -0,0 +1,664 @@ + + + +
+

Methodology and Sources for Energy Consumption Estimates

+

1. General Methodology

+

+ All energy consumption values in this tool are measured in watt-hours + (Wh), which is the amount of energy consumed over time. The basic + formula for calculating energy consumption is: +

+

+ Energy (Wh) = Power (Watts) × Time (Hours) +

+

+ For example, a 100-watt light bulb used for 2 hours would consume 200 + watt-hours of energy. +

+

+ Most products on this list are electrical, but energy use for + non-electric products (such as petrol car or gas heating) are converted + into watt-hour equivalents. +

+

+ Energy costs are available for a small selection of countries based on + their national energy prices (electricity, gas and petrol). This price + data is sourced from Eurostat, Ofgem, and the US EIA (based on prices + for 2025 or early 2026, depending on availability). Costs reflect + average household prices, and don't reflect dynamic, off-peak or smart + tariffs. +

+

+ Below, I list the assumptions and sources for each product or activity. + Again, the actual level of energy consumption will depend on factors + such as the specific efficiency of the product, user settings, and + climate so these should be interpreted as approximations to give a sense + of magnitude. +

+

2. Lighting

+

Incandescent lightbulb

+

+ Traditional incandescent bulbs typically range from 25 to 100 watts, + with + 60 wattsbeing relatively standard for a household bulb. One hour of use would + consume 60 Watt-hours (Wh). +

+

LED lightbulb

+

+ LED bulbs use + around 80% lessenergy than incandescent bulbs for the same amount of light output. A + standard LED bulb has an energy rating of around 10 W. Using it for one + hour would consume 10 Wh. +

+

3. Digital Technologies

+

Charging a mobile phone

+

+ Modern smartphones have + battery capacities of 3,000-5,000 mAhat approximately 3.7-4.2V, resulting in batteries around 15-20 + watt-hours. If we assume there is around 10% to 20% loss due to charging + efficiencies, a full charge likely requires around 20 Wh. +

+

Watching TV – Medium, efficient

+

+ Medium-efficiency TVs (for example, 40-50 inch LED TVs) consume + approximately 60 wattsduring active viewing. +

+

Watching TV – Large, modern

+

+ Larger modern TVs (55-60 inches with 4K capability) + typically consume80-100 watts. I've gone with 90 watts as a reasonable average. +

+

MacBook laptop

+

+ The power consumption of Apple MacBooks vary depending on the model and + what applications users are running. +

+

+ When doing everyday tasks such as writing emails, word documents, or + browsing the internet, they consume around 5 to 15 watts. Streaming + video is more like 15 to 20 watts. When doing intensive tasks such as + editing photos or video, or gaming a MacBook Pro can reach 80 to 100 + watts. +

+

Here I have assumed an average of 20 watts.

+

Desktop computer

+

+ Desktop computers vary widely, but more efficient models consume + approximately 50 watts. When doing light tasks, this can be a bit lower. + Gaming computers can use far more, especially during peak usage (often + several hundred watts). +

+

Gaming console (Xbox)

+

+ The power consumption of game consoles can vary a lot, depending on the + model. The Xbox Series S + typically consumesaround 70 watts during active gameplay. The Xbox Series X consumes + around twice as much: 150 watts. +

+

+ Game consoles use much less when streaming TV or film, or when in menu + mode. +

+

Streaming Netflix (streaming only)

+

+ The marginal increase in energy consumption for one hour of streaming is + around 0.2 Wh. This comprises of just 0.028 Wh from Netflix's servers + themselves, and another 0.18 Wh from transmission and distribution. +

+

+ To stream video, you need an internet connection, hence a bar for the + electricity consumption for Home WiFi is also shown. Note that, for most + people, this isn't actually the marginal increase in energy use for + streaming. Most people have their internet running 24/7 regardless; the + increase in energy use for streaming is very small by comparison. + However, it is shown for completeness. +

+

+ This does not include the electricity usage of the device (the laptop or + TV itself). To get the total for that hour of viewing, combine it with + the power usage of whatever device you're watching it on. +

+

h/t to Chris Preist (University of Bristol) for guidance on this.

+

Streaming YouTube (streaming only)

+

+ YouTube figures are likely similar to Netflix (see above), although they + may be slightly higher due to typical streaming patterns and ad + delivery. Again, you need to add the power consumption of the device + you're watching on, separately. +

+

Home internet (WiFi)

+

+ WiFi routers typically consume between 10 and 20 watts continuously. + Here I've assumed 15 watts as a reasonable average. +

+

ChatGPT (median query)

+

+ Recent research estimates that the median ChatGPT query using GPT-4o + consumes approximately0.3 watt-hours of electricity. +

+

+ Actual electricity consumption varies a lot depending on the length of + query and response. More detailed queries — such as Deep Research — will + consume more (but there is insufficient public data to confirm how + much). +

+

+ If improved data becomes available on more complex queries, image + generation and video, I would like to add them. +

+

Reading on a Kindle

+

+ E-readers like the Kindle use e-ink displays that consume power + primarily when refreshing the page. A typical Kindle device + has a batteryof around 1000–1700 mAh at ~3.7 V, which is 3.7 to 6 Wh. People report + it + lasting weekson a full charge with moderate (30 minute per day) reading frequency. +

+

+ That works out to less than 1 Wh per hour. Here I've been conservative + and have rounded it up to 1 Wh. +

+

4. Kitchen Appliances

+

Boiling a kettle

+

+ Electric kettles typically have power rating between 1500 and 2000 + watts. Boiling a full kettle (1.5-1.7 litres) takes around 3 to 4 + minutes. +

+

+ A 2000-watt kettle that takes 3 minutes to boil will consume around 100 + watt-hours. +

+

Microwave

+

+ Microwaves typically + have a power ratingbetween 800 and 1,200 watts. If we assume 1000 watts, five minutes of + use would consume 83 Wh (1000 * 0.08). +

+

Electric oven

+

+ Electric ovens can + have a power ratingranging from 2,000 to 5,000 watts. A typical one is around 2500 watts. +

+

+ Once an oven is on and has reached the desired temperature, it typically + cycles and runs at around 50% to 60% capacity. I've therefore calculated + energy consumption as [2,500W × time × 0.55]. +

+

Gas oven

+

+ Gas ovens consume natural gas for heating but also use electricity for + ignition and controls (approximately 300-400 watts). When converting the + thermal energy from gas combustion to electrical equivalents for + comparison purposes, gas ovens typically use slightly more total energy + than electric ovens due to combustion inefficiency. +

+

+ Similar to electric ovens, I have assumed that gas ovens cycle on and + off once they've reached the desired temperature. +

+

Air fryer

+

+ Small air fryers + typically operateat 800W to 1500W. Larger models (especially with two trays) can be as + much as 2500W. I've assumed 1500 watts in these calculations. Once an + air fryer is on, it typically cycles and only runs at around 50% to 60% + of capacity. Averaged over a cycle, 1000W is likely more realistic. +

+

+ Ten minutes of use would consume 167 Wh (1000W * 0.17 hours = 167 Wh). +

+

Electric induction hob (one ring)

+

+ Induction hobs are efficient, and tend to have a power rating of + 1,000W to 2,000W per ring. I've assumed 1,500 watts in these calculations. Like air fryers, + they're often not operating at maximum power draw for the full cooking + session. 50% is more typical. That means the average power usage is + closer to 750W. +

+

+ Most cooking activities take less time; typically 5 to 10 minutes, which + reduces electricity consumption. +

+

Gas hob (one ring)

+

+ Gas hobs convert natural gas to heat. They tend to consume 2 to + 2.5-times as much energy as induction hobs to achieve the same heat + output. This is because they typically operate at around 40% efficiency, + compared to 85% for an electric hob. +

+

+ If an induction hob has an average rating of 750W over a cooking cycle, + the useful heat delivered is 638W (750W * 85% efficiency). To get that + useful heat from a gas hob with 40% efficiency would need 1595W (638W / + 0.4). Here I've assumed an equivalent power input of 1600W. +

+

Small fridge

+

+ A small-to-medium refrigerator (around 130 litres) typically consumes around 100 kWh per year, which equals + approximately 275 Wh per day on average. +

+

Fridge-freezer

+

+ Standard refrigerator-freezer combinations consume anywhere between 200 + and 500 kWh per year. Some very efficient models can achieve less than + 200 kWh. Here, I have assumed one consumes 300 kWh per year. That is + approximately 822 Wh per day. +

+

5. Washing and Drying

+

Vacuum cleaner (hoover)

+

+ Vacuum cleaners typically use 500W to over 1,500W. Popular models in the + UK use + around 620W or + 750W. Here, I have assumed a power rating of 750W. Ten minutes of usage + would consume 125 Wh. +

+

Washing machine

+

+ Washing machine energy usage varies a lot depending on load size, cycle + type and water temperature. An average load in an efficient, modern + machine + might use 600 Wh to 1,000 Wh per cycle. A large load could use more than 1,500 Wh. + Here I have assumed 800 Wh, which is typical for a medium load. +

+

Tumble dryer

+

+ Electric tumble dryers are among the highest energy consumers in the + home. Heat pump models are much more efficient than condenser or vented + models. A condenser or vented model + might consume between 4000 and 5000 Wh per cycle. A heat pump model, around half as + much. +

+

+ Here, I have assumed 4500 Wh for condenser or vented cycles, and 2000 Wh + for a heat pump cycle. Actual energy consumption will depend on factors + such as load size and user settings. +

+

Dishwasher

+

+ Most energy in a dishwasher is used for heating the water. They + typically use between 1,000 and 1,500 Wh per cycle. Very efficient models can use + closer to 500 Wh per cycle. Operating on eco modes will also consume + less than 1,000 Wh. +

+

+ Here, I have assumed 1,250 Wh per cycle, which is fairly average for + most users. +

+

Clothes iron

+

+ Clothes irons typically have an energy rating between 1500W and 3000W. + Steam irons are towards the higher end of the range. Here, I have + assumed 2500W, which is fairly standard for a steam iron. +

+

Using one for 10 minutes would consume 417 Wh of power.

+

Dehumidifier

+

+ Dehumidifiers can range from as small as a few hundred watts, up to several + thousand for large whole-house units. +

+

+ Here I've assumed a medium, portable one with an energy rating of 500W. + And a large unit of 1000W. +

+

+ In humid conditions, or if they're being used to dry clothes, they will + be running at or close to maximum power draw for a long period of time. + In fairly low-humidity conditions, they might cycle on and off after a + few hours, meaning their energy use drops to 50% to 70% of the maximum. +

+

6. Heating and Cooling

+

Hairdryer

+

+ Hairdryers typically range from 1,000 to 2,000 watts. I have assumed a + power rating of 1,750W. Five minutes of use would consume 146 Wh. +

+

Electric shower

+

+ Electric showers are high-power appliances, rated between 7,500W to + 11,500W. Specific models of 7.2 kW, 7.5 kW, 8.5 kW, 9.5 kW, 10.5 kW, and + 11.5 kW are typical. +

+

+ I have assumed a 9,500W model here. A 10-minute shower at 9,500 watts + would consume 1,583 Wh. +

+

Electric shower (with a heat pump)

+

+ An electric shower with hot water sourced from a heat pump will use less + electricity. +

+

+ If we assume a heat pump with a Coefficient of Performance (COP) of 3, + producing the same heat output would use around 3,000 Wh per hour. Some + very efficient models can achieve less than this; often closer to 2,000 + Wh. +

+

Gas-powered shower

+

+ If we take the gas equivalent of an electric shower (rated at 9500W) and + assume a boiler efficiency of 90%, we get around 10,500W in energy input + equivalents. A 10-minute shower would consume 1,759 Wh. +

+

Electric fan

+

+ Standard fans + typically use 30-75 watts, with 50 watts being a reasonable average. +

+

Small desk heater

+

+ Small portable electric heaters typically range from 400 to 1,000 watts. + Here I've assumed a wattage of 750W. Using this for one hour would + consume 750 Wh. +

+

Space heater

+

+ A medium space heater typically operates at around 1,500 watts (ranging + from 1,000 to as much as 3,000 for large ones). That means using one for + an hour would consume 1,500 Wh. +

+

Electric heat pump (single room)

+

+ Modern air-source heat pumps for single rooms (mini-splits) typically + consume 600 to 1000 watts of electricity per hour of heating. This would + be converted into around 1,800 to 3,000 Wh of heat. +

+

+ Here we are assuming a Coefficient of Performance (CoP) value of around + 3, which means 3 units of heat are generated per unit of electricity + input. +

+

+ These calculations are very sensitive to weather conditions, temperature + settings, and the insulation of the house. These values might be typical + for a moderate climate (such as the UK) in winter. In slightly warmer + conditions, energy usage will be lower. In colder conditions, it would + be higher. +

+

+ The power draw can also be a bit lower than this once the heat pump is + running. +

+

+ Here, I've assumed they consume 800Wh of electricity per hour. That + would supply 2,400Wh of heat. +

+

Gas heating

+

+ We will assume our gas heating needs to supply the same amount of heat + as our heat pump: 2,400 Wh. +

+

+ A gas boiler is around 90% efficient, so the energy input needed would + be around 2,700 Wh (2,400 / 90%). +

+

+ Again, this is very sensitive to the specific boiler system, climate and + heating requirements. +

+

Electric heat pump (3-bedroom house)

+

+ We can't get a whole house figure by simply multiplying by the number of + rooms. Energy consumption will depend a lot on the heat loss and fabric + of the house. +

+

+ In the UK, a 3-bedroom house has an area of around 90m². A building of + this size might have a heat loss of around 50 to 100 W/m². We'll say 75 + W/m². That would mean 6,750W of heat is required (90m² * 75 W/m²). +

+

+ Getting this from a heat pump with a CoP of 3 would consume 2,250Wh of + electricity per hour (6750 / 3). This is what I've assumed in our + calculations. In reality, the consumption is probably lower as energy + draw reduces once the heat pump is up and running. +

+

Gas heating (3-bedroom house)

+

+ We'll use the same assumptions as above for a heat pump. We need to + supply 6,750W of heat for the house. +

+

+ Getting this from a 90% efficient boiler would consume 7,500Wh of gas + per hour. +

+

+ The average household in the UK uses around 31,000Wh of gas per day. + That's equivalent to 4–5 hours of heating (a bit less if their daily + total includes a gas shower etc.). In winter, these heating hours will + likely be higher, and during the summer, close to zero. +

+

+ I think 7,500Wh of gas per hour therefore seems reasonable (but very + sensitive to a specific household's circumstances). +

+

Air conditioning

+

+ Air conditioning units for single rooms typically use 800 to 1,500 + watts. I've assumed 1,000W in these calculations. +

+

+ The actual energy usage will be very sensitive to climate conditions. + Warmer, and especially humid climates make AC units much less efficient. + Running one in a moderate, drier climate would use much less. +

+

+ They can also consume less energy once they're up-and-running, so + they're not always going at maximum power draw. +

+

7. Driving

+

Using an e-bike

+

+ Electric bicycles typically consume between 10 and 30 watt-hours per mile + depending on speed, the cycling conditions, and how high the level of + electric assist is. For light assist on flat terrain, it's around 8 to + 12 Wh; for moderate, around 12 to 18 Wh; and for heavy assist on hilly + terrain it can reach 30 Wh per mile. +

+

I've assumed a value of 15 Wh per mile.

+

Using an e-scooter

+

+ Electric scooters + typically consume 15-30 watt-hours per mile depending on the model and conditions. Here, + I've assumed a usage of 25 Wh per mile. +

+

Driving an electric motorbike

+

+ Electric motorbikes typically consume 100 to 250 watt-hours per mile + depending on the model, driver weight and conditions. + Real-world tests of motorbike efficiency find efficiencies of around 100 Wh per mile for + moderate urban driving. People + report higher usage when driving at higher speeds or on motorways. +

+

Here I've assumed around 150 Wh per mile.

+

Driving a petrol motorbike

+

+ Petrol motorbikes can + achieve between 50 and 100 miles per gallon. Let's take an average of 75mpg. A gallon + is around 4.5 litres, so 75mpg is equivalent to 0.06 litres per mile. +

+

+ The energy content of petrol + is around 32 MJ per litre (or 8.9 kWh per litre). That equates to 0.53 kWh per + mile (8.9 kWh per litre * 0.06 litres per mile). Driving one mile therefore uses + around 530 Wh. +

+

+ In terms of energy inputs, this means an electric motorbike is 3 to 4 + times as efficient as a petrol one. +

+

Driving an electric car

+

+ Electric vehicles + average approximately 0.3 kWh (300 Wh) per mile. However, this can range from 200 to 400 Wh + per mile depending on the type of vehicle, driving conditions and speed. +

+

Driving a petrol car

+

+ Petrol cars average around 40 miles per gallon (ranging from around 25 + to 50). +

+

+ Taking an energy density of 8.9 kWh per litre for petrol, there are + around 40.5 kWh in a UK gallon (4.546 litres in a gallon * + 8.9 kWh per litre). +

+

+ This means a petrol car uses around 1 kWh (1,000 Wh) per mile. An + electric car is therefore around 3 to 4 times more efficient, since it has far + fewer energy losses from the engine, heat production, and braking. +

+

8. Gardening

+

Electric lawnmower

+

+ Most corded electric lawnmowers have + an energy rating between 1000W and 2000W. Here I have assumed 1500W. +

+

Petrol lawnmower

+

+ Petrol lawnmowers are much less efficient than their electric + equivalents, as much less input energy is converted into turning the + blades. +

+

+ A standard petrol lawnmower uses around 1 litre of petrol an hour + (slightly less in more efficient models). Since the energy content of + petrol is 8.9kWh per litre, they therefore use 8,000 to 10,000 Wh per + hour. Here I have assumed 9,000 Wh. +

+

Electric strimmer

+

+ Standard power strimmers range from around 250 watts to 700 watts. + Smaller models will only be suitable for short grass. +

+

Here I've assumed 500 watts.

+

Gas strimmer

+

Gas power strimmers are less efficient than electric models.

+

+ Data on this was hard to find, but a standard one probably consumes + around 0.4 litres of petrol per hour. Since the energy content of petrol + is 8.9kWh per litre, they therefore use around 3,500 Wh per hour in + energy equivalents. +

+

Pressure washer

+

+ Pressure washers typically have a power rating between 1,500 and 3,000 + watts. For this tool, I've assumed 2,000 watts as standard. +

+

+ Per hour, they will use 2,000 Wh when used continuously. Most people + will take breaks and pauses during this time, so you should take that + into account. If you break half the time, and use one for an hour, then + the energy use is equivalent to half an hour (1,000 Wh). +

+
+ + diff --git a/tests/fixtures/fullpage_to_article_html/hannahritchie.github.io/energy-use-comparisons/meta.json b/tests/fixtures/fullpage_to_article_html/hannahritchie.github.io/energy-use-comparisons/meta.json new file mode 100644 index 0000000..b2008fb --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/hannahritchie.github.io/energy-use-comparisons/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://hannahritchie.github.io/energy-use-comparisons/", + "host": "hannahritchie.github.io", + "feed_source": "hnrss", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:18:40.502204Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "Does that use a lot of energy?", + "extracted_word_count": 3154, + "extracted_success": true, + "expected_selector": "div#methodology" +} diff --git a/tests/fixtures/fullpage_to_article_html/hannahritchie.github.io/energy-use-comparisons/raw.html b/tests/fixtures/fullpage_to_article_html/hannahritchie.github.io/energy-use-comparisons/raw.html new file mode 100644 index 0000000..2cdb4c2 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/hannahritchie.github.io/energy-use-comparisons/raw.html @@ -0,0 +1,5194 @@ + + + + + + Does that use a lot of energy? + + + + + +
+
+

Does that use a lot of energy?

+

+ Compare the daily energy consumption of different products and + activities +

+
+ +
+
+
+
+
+ + +
+
+ +
+ + +
+
Lighting
+ +
+
+ + Lightbulb (incandescent) +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Lightbulb (LED) +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+ Digital technologies +
+ +
+
+ + Charging a mobile phone +
+
+
20.0 Wh per day
+
+
+ +
+
+ + Watching TV (medium efficient) +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Watching TV (large modern) +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + MacBook +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Desktop computer +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Gaming console (Xbox Series S) +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Gaming console (Xbox Series X) +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Streaming Netflix +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Streaming YouTube +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Home internet (WiFi) +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + ChatGPT median query +
+
+
+
+ + +
+
+
+
+ +
+
+ + Reading on a Kindle +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+ Kitchen appliances +
+ +
+
+ + Boiling a kettle +
+
+
+
+ + +
+
+
+
+ +
+
+ + Microwave +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Electric oven +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Gas oven +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Air fryer +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Electric induction hob (one ring) +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Gas hob (one ring) +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Fridge (small) +
+
+
275.0 Wh per day
+
+
+ +
+
+ + Fridge-freezer +
+
+
821.9 Wh per day
+
+
+ +
+ Washing and drying +
+ +
+
+ + Vacuum cleaner (hoover) +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Washing machine +
+
+
+
+ + +
+
+
+
+ +
+
+ + Tumble dryer (vented or condenser) +
+
+
+
+ + +
+
+
+
+ +
+
+ + Tumble dryer (heat pump) +
+
+
+
+ + +
+
+
+
+ +
+
+ + Dehumidifier (medium) +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Dehumidifier (large) +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Dishwasher +
+
+
+
+ + +
+
+
+
+ +
+
+ + Clothes iron +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+ Heating and cooling +
+ +
+
+ + Hairdryer +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Electric shower +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Electric shower (heat pump) +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Gas-powered shower +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Electric fan +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Small desk heater +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Space heater +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Electric heat pump (Single room) +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Electric heat pump (3-bed house) +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Gas heating (Single room) +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Gas heating (3-bed house) +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Air conditioning (Single room) +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
Driving
+ +
+ + + +
+ +
+
+ + Using an e-bike +
+
+
+
+ + +
+
+
+
+ +
+
+ + Using an e-scooter +
+
+
+
+ + +
+
+
+
+ +
+
+ + Driving an electric motorbike +
+
+
+
+ + +
+
+
+
+ +
+
+ + Driving a petrol motorbike +
+
+
+
+ + +
+
+
+
+ +
+
+ + Driving an electric car +
+
+
+
+ + +
+
+
+
+ +
+
+ + Driving a petrol car +
+
+
+
+ + +
+
+
+
+ +
Gardening
+ +
+
+ + Electric lawnmower +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Petrol lawnmower +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Electric strimmer +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Gas strimmer +
+
+
+
+ + +
+
+ + +
+
+
+
+ +
+
+ + Pressure washer +
+
+
+
+ + +
+
+ + +
+
+
+
+
+ +
+ +
+

Energy consumption

+

+ Estimates measured in watt-hours (Wh). +

+
+ + +
+ + +
+
+
+ +
+
+
+ + + +
+ +
+

Disclaimer

+

+ This tool was designed to get a sense of the differences in energy + consumption between different products. It's often difficult to + understand whether activities matter a lot or very little for our + overall energy consumption. +

+

+ These numbers represent typical products and usage (specifically for + the UK, although it will often generalise elsewhere), and might not + reflect your own personal circumstances. If you want to get precise + measurements, you will need to use dedicated energy monitoring + equipment. +

+

+ Actual energy consumption will vary a lot depending on factors such + as the age and efficiency of the product, how you're using it (for + example, how warm your showers are, or how fast you drive), weather + and climate conditions (which is particularly important for the + energy usage of heaters and air conditioners). +

+ +

How to use

+

+ Add and remove products or activities in the sidebar to compare them + on the chart. Most have the option of adjusting the number of hours + used, miles driven, or other units of usage. +

+ +

Author

+

+ This tool was built by + Hannah Ritchie, with the help of Claude Code. +

+
+ +
+

Methodology and Sources for Energy Consumption Estimates

+ +

1. General Methodology

+

+ All energy consumption values in this tool are measured in + watt-hours (Wh), which is the amount of energy consumed over time. + The basic formula for calculating energy consumption is: +

+

Energy (Wh) = Power (Watts) × Time (Hours)

+

+ For example, a 100-watt light bulb used for 2 hours would consume + 200 watt-hours of energy. +

+

+ Most products on this list are electrical, but energy use for + non-electric products (such as a petrol car or gas heating) is + converted into watt-hour equivalents. +

+

+ Energy costs are available for a small selection of countries based + on their national energy prices (electricity, gas and petrol). This + price data is sourced from Eurostat, Ofgem, and the US EIA (based on + prices for 2025 or early 2026, depending on availability). Costs + reflect average household prices, and don't reflect dynamic, + off-peak or smart tariffs. +

+

+ Below, I list the assumptions and sources for each product or + activity. Again, the actual level of energy consumption will depend + on factors such as the specific efficiency of the product, user + settings, and climate so these should be interpreted as + approximations to give a sense of magnitude. +

+ +

2. Lighting

+

Incandescent lightbulb

+

+ Traditional incandescent bulbs typically range from 25 to 100 watts, + with + 60 watts + being relatively standard for a household bulb. One hour of use + would consume 60 Watt-hours (Wh). +

+ +

LED lightbulb

+

+ LED bulbs use + around 80% less + energy than incandescent bulbs for the same amount of light output. + A standard LED bulb has an energy rating of around 10 W. Using it + for one hour would consume 10 Wh. +

+ +

3. Digital Technologies

+

Charging a mobile phone

+

+ Modern smartphones have + battery capacities of 3,000-5,000 mAh + at approximately 3.7-4.2V, resulting in batteries around 15-20 + watt-hours. If we assume there is around 10% to 20% loss due to + charging efficiencies, a full charge likely requires around 20 Wh. +

+ +

Watching TV – Medium, efficient

+

+ Medium-efficiency TVs (for example, 40-50 inch LED TVs) consume + approximately 60 watts + during active viewing. +

+ +

Watching TV – Large, modern

+

+ Larger modern TVs (55-60 inches with 4K capability) + typically consume + 80-100 watts. I've gone with 90 watts as a reasonable average. +

+ +

MacBook laptop

+

+ The power consumption of Apple MacBooks varies depending on the model + and what applications users are running. +

+

+ When doing everyday tasks such as writing emails, word documents, or + browsing the internet, they consume around 5 to 15 watts. Streaming + video is more like 15 to 20 watts. When doing intensive tasks such + as editing photos or video, or gaming, a MacBook Pro can reach 80 to + 100 watts. +

+

Here I have assumed an average of 20 watts.

+ +

Desktop computer

+

+ Desktop computers vary widely, but more efficient models consume + approximately 50 watts. When doing light tasks, this can be a bit + lower. Gaming computers can use far more, especially during peak + usage (often several hundred watts). +

+ +

Gaming console (Xbox)

+

+ The power consumption of game consoles can vary a lot, depending on + the model. The Xbox Series S + typically consumes + around 70 watts during active gameplay. The Xbox Series X consumes + around twice as much: 150 watts. +

+

+ Game consoles use much less when streaming TV or film, or when in + menu mode. +

+ +

Streaming Netflix (streaming only)

+

+ The marginal increase in energy consumption for one hour of + streaming is around 0.2 Wh. This comprises just 0.028 Wh from + Netflix's servers themselves, and another 0.18 Wh from transmission + and distribution. +

+

+ To stream video, you need an internet connection, hence a bar for + the electricity consumption for Home WiFi is also shown. Note that, + for most people, this isn't actually the marginal increase in energy + use for streaming. Most people have their internet running 24/7 + regardless; the increase in energy use for streaming is very small + by comparison. However, it is shown for completeness. +

+

+ This does not include the electricity usage of the device (the + laptop or TV itself). To get the total for that hour of viewing, + combine it with the power usage of whatever device you're watching + it on. +

+

+ h/t to Chris Preist (University of Bristol) for guidance on this. +

+ +

Streaming YouTube (streaming only)

+

+ YouTube figures are likely similar to Netflix (see above), although + they may be slightly higher due to typical streaming patterns and ad + delivery. Again, you need to add the power consumption of the device + you're watching on, separately. +

+ +

Home internet (WiFi)

+

+ WiFi routers typically consume between 10 and 20 watts continuously. + Here I've assumed 15 watts as a reasonable average. +

+ +

ChatGPT (median query)

+

+ Recent research estimates that the median ChatGPT query using GPT-4o + consumes approximately + 0.3 watt-hours of electricity. +

+

+ Actual electricity consumption varies a lot depending on the length + of query and response. More detailed queries — such as Deep Research + — will consume more (but there is insufficient public data to + confirm how much). +

+

+ If improved data becomes available on more complex queries, image + generation and video, I would like to add them. +

+ +

Reading on a Kindle

+

+ E-readers like the Kindle use e-ink displays that consume power + primarily when refreshing the page. A typical Kindle device + has a battery + of around 1000–1700 mAh at ~3.7 V, which is 3.7 to 6 Wh. People + report it + lasting weeks + on a full charge with moderate (30 minute per day) reading + frequency. +

+

+ That works out to less than 1 Wh per hour. Here I've been + conservative and have rounded it up to 1 Wh. +

+ +

4. Kitchen Appliances

+

Boiling a kettle

+

+ Electric kettles typically have a power rating between 1500 and 2000 + watts. Boiling a full kettle (1.5-1.7 litres) takes around 3 to 4 + minutes. +

+

+ A 2000-watt kettle that takes 3 minutes to boil will consume around + 100 watt-hours. +

+ +

Microwave

+

+ Microwaves typically + have a power rating + between 800 and 1,200 watts. If we assume 1000 watts, five minutes + of use would consume 83 Wh (1000 * 0.083). +

+ +

Electric oven

+

+ Electric ovens can + have a power rating + ranging from 2,000 to 5,000 watts. A typical one is around 2500 + watts. +

+

+ Once an oven is on and has reached the desired temperature, it + typically cycles and runs at around 50% to 60% capacity. I've + therefore calculated energy consumption as [2,500W × time × 0.55]. +

+ +

Gas oven

+

+ Gas ovens consume natural gas for heating but also use electricity + for ignition and controls (approximately 300-400 watts). When + converting the thermal energy from gas combustion to electrical + equivalents for comparison purposes, gas ovens typically use + slightly more total energy than electric ovens due to combustion + inefficiency. +

+

+ Similar to electric ovens, I have assumed that gas ovens cycle on + and off once they've reached the desired temperature. +

+ +

Air fryer

+

+ Small air fryers + typically operate + at 800W to 1500W. Larger models (especially with two trays) can be + as much as 2500W. I've assumed 1500 watts in these calculations. + Once an air fryer is on, it typically cycles and only runs at around + 50% to 60% of capacity. Averaged over a cycle, 1000W is likely more + realistic. +

+

+ Ten minutes of use would consume 167 Wh (1000W * 0.17 hours = 167 + Wh). +

+ +

Electric induction hob (one ring)

+

+ Induction hobs are efficient, and tend to have a power rating of + 1,000W to 2,000W per ring. I've assumed 1,500 watts in these calculations. Like air fryers, + they're often not operating at maximum power draw for the full + cooking session. 50% is more typical. That means the average power + usage is closer to 750W. +

+

+ Most cooking activities take less time; typically 5 to 10 minutes, + which reduces electricity consumption. +

+ +

Gas hob (one ring)

+

+ Gas hobs convert natural gas to heat. They tend to consume 2 to + 2.5-times as much energy as induction hobs to achieve the same heat + output. This is because they typically operate at around 40% + efficiency, compared to 85% for an electric hob. +

+

+ If an induction hob has an average rating of 750W over a cooking + cycle, the useful heat delivered is 638W (750W * 85% efficiency). To + get that useful heat from a gas hob with 40% efficiency would need + 1595W (638W / 0.4). Here I've assumed an equivalent power input of + 1600W. +

+ +

Small fridge

+

+ A small-to-medium refrigerator (around 130 litres) typically consumes around 100 kWh per year, which equals + approximately 275 Wh per day on average. +

+ +

Fridge-freezer

+

+ Standard refrigerator-freezer combinations consume anywhere between + 200 and 500 kWh per year. Some very efficient models can achieve + less than 200 kWh. Here, I have assumed one consumes 300 kWh per + year. That is approximately 822 Wh per day. +

+ +

5. Washing and Drying

+

Vacuum cleaner (hoover)

+

+ Vacuum cleaners typically use 500W to over 1,500W. Popular models in + the UK use + around 620W + or + 750W. Here, I have assumed a power rating of 750W. Ten minutes of usage + would consume 125 Wh. +

+ +

Washing machine

+

+ Washing machine energy usage varies a lot depending on load size, + cycle type and water temperature. An average load in an efficient, + modern machine + might use + 600 Wh to 1,000 Wh per cycle. A large load could use more than 1,500 + Wh. Here I have assumed 800 Wh, which is typical for a medium load. +

+ +

Tumble dryer

+

+ Electric tumble dryers are among the highest energy consumers in the + home. Heat pump models are much more efficient than condenser or + vented models. A condenser or vented model + might consume + between 4000 and 5000 Wh per cycle. A heat pump model, around half + as much. +

+

+ Here, I have assumed 4500 Wh for condenser or vented cycles, and + 2000 Wh for a heat pump cycle. Actual energy consumption will depend + on factors such as load size and user settings. +

+ +

Dishwasher

+

+ Most energy in a dishwasher is used for heating the water. They + typically use + between 1,000 and 1,500 Wh per cycle. Very efficient models can use + closer to 500 Wh per cycle. Operating on eco modes will also consume + less than 1,000 Wh. +

+

+ Here, I have assumed 1,250 Wh per cycle, which is fairly average for + most users. +

+ +

Clothes iron

+

+ Clothes irons typically have an energy rating between 1500W and + 3000W. Steam irons are towards the higher end of the range. Here, I + have assumed 2500W, which is fairly standard for a steam iron. +

+

Using one for 10 minutes would consume 417 Wh of power.

+ +

Dehumidifier

+

+ Dehumidifiers can range from as small as a few hundred watts, up to + several thousand for large whole-house units. +

+

+ Here I've assumed a medium, portable one with an energy rating of + 500W. And a large unit of 1000W. +

+

+ In humid conditions, or if they're being used to dry clothes, they + will be running at or close to maximum power draw for a long period + of time. In fairly low-humidity conditions, they might cycle on and + off after a few hours, meaning their energy use drops to 50% to 70% + of the maximum. +

+ +

6. Heating and Cooling

+

Hairdryer

+

+ Hairdryers typically range from 1,000 to 2,000 watts. I have assumed + a power rating of 1,750W. Five minutes of use would consume 146 Wh. +

+ +

Electric shower

+

+ Electric showers are high-power appliances, rated between 7,500W to + 11,500W. Specific models of 7.2 kW, 7.5 kW, 8.5 kW, 9.5 kW, 10.5 kW, + and 11.5 kW are typical. +

+

+ I have assumed a 9,500W model here. A 10-minute shower at 9,500 + watts would consume 1,583 Wh. +

+ +

Electric shower (with a heat pump)

+

+ An electric shower with hot water sourced from a heat pump will use + less electricity. +

+

+ If we assume a heat pump with a Coefficient of Performance (COP) of + 3, producing the same heat output would use around 3,000 Wh per + hour. Some very efficient models can achieve less than this; often + closer to 2,000 Wh. +

+ +

Gas-powered shower

+

+ If we take the gas equivalent of an electric shower (rated at 9500W) + and assume a boiler efficiency of 90%, we get around 10,500W in + energy input equivalents. A 10-minute shower would consume 1,759 Wh. +

+ +

Electric fan

+

+ Standard fans + typically use + 30-75 watts, with 50 watts being a reasonable average. +

+ +

Small desk heater

+

+ Small portable electric heaters typically range from 400 to 1,000 + watts. Here I've assumed a wattage of 750W. Using this for one hour + would consume 750 Wh. +

+ +

Space heater

+

+ A medium space heater typically operates at around 1,500 watts + (ranging from 1,000 to as much as 3,000 for large ones). That means + using one for an hour would consume 1,500 Wh. +

+ +

Electric heat pump (single room)

+

+ Modern air-source heat pumps for single rooms (mini-splits) + typically consume 600 to 1000 watts of electricity per hour of + heating. This would be converted into around 1,800 to 3,000 Wh of + heat. +

+

+ Here we are assuming a Coefficient of Performance (CoP) value of + around 3, which means 3 units of heat are generated per unit of + electricity input. +

+

+ These calculations are very sensitive to weather conditions, + temperature settings, and the insulation of the house. These values + might be typical for a moderate climate (such as the UK) in winter. + In slightly warmer conditions, energy usage will be lower. In colder + conditions, it would be higher. +

+

+ The power draw can also be a bit lower than this once the heat pump + is running. +

+

+ Here, I've assumed they consume 800Wh of electricity per hour. That + would supply 2,400Wh of heat. +

+ +

Gas heating

+

+ We will assume our gas heating needs to supply the same amount of + heat as our heat pump: 2,400 Wh. +

+

+ A gas boiler is around 90% efficient, so the energy input needed + would be 2,700 Wh (2,400 * 90%). +

+

+ Again, this is very sensitive to the specific boiler system, climate + and heating requirements. +

+ +

Electric heat pump (3-bedroom house)

+

+ We can't get a whole house figure by simply multiplying by the + number of rooms. Energy consumption will depend a lot on the heat + loss and fabric of the house. +

+

+ In the UK, a 3-bedroom house has an area of around 90m². A building + of this size might have a heat loss of around 50 to 100 W/m². We'll + say 75 W/m². That would mean 6,750W of heat is required (90m² * 75 + W/m²). +

+

+ Getting this from a heat pump with a CoP of 3 would consume 2,250Wh + of electricity per hour (6750 / 3). This is what I've assumed in our + calculations. In reality, the consumption is probably lower as + energy draw reduces once the heat pump is up and running. +

+ +

Gas heating (3-bedroom house)

+

+ We'll use the same assumptions as above for a heat pump. We need to + supply 6,750W of heat for the house. +

+

+ Getting this from a 90% efficient boiler would consume 7,500Wh of + gas per hour. +

+

+ The average household in the UK uses around 31,000Wh of gas per day. + That's equivalent to 4–5 hours of heating (a bit less if their daily + total includes a gas shower etc.). In winter, these heating hours + will likely be higher, and during the summer, close to zero. +

+

+ I think 7,500Wh of gas per hour therefore seems reasonable (but very + sensitive to a specific household's circumstances). +

+ +

Air conditioning

+

+ Air conditioning units for single rooms typically use 800 to 1,500 + watts. I've assumed 1,000W in these calculations. +

+

+ The actual energy usage will be very sensitive to climate + conditions. Warmer, and especially humid climates make AC units much + less efficient. Running one in a moderate, drier climate would use + much less. +

+

+ They can also consume less energy once they're up-and-running, so + they're not always going at maximum power draw. +

+ +

7. Driving

+

Using an e-bike

+

+ Electric bicycles typically consume between 10 to 30 watt-hours per + mile depending on speed, the cycling conditions, and how high the + level of electric assist is. For light assist on flat terrain, it's + around 8 to 12 Wh; for moderate, around 12 to 18 Wh; and for heavy + assist on hilly terrain it can reach 30 Wh per mile. +

+

I've assumed a value of 15 Wh per mile.

+ +

Using an e-scooter

+

+ Electric scooters + typically consume + 15-30 watt-hours per mile depending on the model and conditions. + Here, I've assumed a usage of 25 Wh per mile. +

+ +

Driving an electric motorbike

+

+ Electric motorbikes typically consume 100 to 250 watt-hours per mile + depending on the model, driver weight and conditions. + Real-world tests + of motorbike efficiency find efficiencies of around 100 Wh per mile + for moderate urban driving. People + report higher usage + when driving at higher speeds or motorway driving. +

+

Here I've assumed around 150 Wh per mile.

+ +

Driving a petrol motorbike

+

+ Petrol motorbikes can + consume between + 50 and 100 miles per gallon. Let's take an average of 75mpg. A + gallon is around 4.5 litres, so 75mpg is equivalent to 0.06 litres + per mile. +

+

+ The energy content of petrol + is around + 32 MJ per litre (or 8.9 kWh per litre). That equates to 0.53 kWh per + mile (8.9kWh per litre * 0.06 litres per mile). Driving one mile + uses around 530 Wh per mile. +

+

+ In terms of energy inputs, this means an electric motorbike is 3 to + 4 times as efficient as a petrol one. +

+ +

Driving an electric car

+

+ Electric vehicles + average approximately + 0.3 kWh (300 Wh) per mile. However, this can range from 200 to 400 + Wh per mile depending on the type of vehicle, driving conditions and + speed. +

+ +

Driving a petrol car

+

+ Petrol cars average around 40 miles per gallon (ranging from around + 25 to 50). +

+

+ Taking an energy density of ~40 kWh per UK gallon for petrol, there + are around 40.5 kWh in a UK gallon (there are 4.546 litres in a + gallon * 8.9kWh per litre). +

+

+ This means a petrol car uses around 1kWh (1,000 Wh) per mile. This + means an electric car is around 3 to 4 times more efficient, since + it has far less energy losses from the engine, heat production, and + braking. +

+ +

9. Gardening

+

Electric lawnmower

+

+ Most corded electric lawnmowers have + an energy rating + between 1000W and 2000W. Here I have assumed 1500W. +

+ +

Petrol lawnmower

+

+ Petrol lawnmowers are much less efficient than their electric + equivalents, as much less input energy is converted into turning the + blades. +

+

+ A standard petrol lawnmower uses around 1 litre of petrol an hour + (slightly less in more efficient models). Since the energy content + of petrol is 8.9kWh per litre, they therefore use 8,000 to 10,000 Wh + per hour. Here I have assumed 9,000 Wh. +

+ +

Electric strimmer

+

+ Standard power strimmers range from around 250 watts to 700 watts. + Smaller models will only be suitable for short grass. +

+

Here I've assumed 500 watts.

+ +

Gas strimmer

+

Gas power strimmers are less efficient than electric models.

+

+ Data on this was hard to find, but a standard one probably consumes + around 0.4 litres of petrol per hour. Since the energy content of + petrol is 8.9kWh per litre, they therefore use around 3,500 Wh per + hour in energy equivalents. +

+ +

Pressure washer

+

+ Pressure washers typically have a power rating between 1,500 and + 3,000 watts. For this tool, I've assumed 2,000 watts as standard. +

+

+ Per hour, they will use 2,000 Wh when used continuously. Most people + will take breaks and pauses during this time, so you should take + that into account. If you break half the time, and use one for an + hour, then the energy use is equivalent to half an hour (1,000 Wh). +

+
+ +
+

Change log

+

+ I appreciate all of the feedback and comments from users. I continue + to implement fixes and updates based on these suggestions. +

+

Here is a log of changes and improvements.

+ +

03/03/2026

+
    +
  • + Fixed a bug that showed incorrect cost calculations for gas + products (e.g. gas oven and shower) for some country selections. +
  • +
+ +

27/02/2026

+
    +
  • + Added default pre-selections when the tool initially opens, to + give people a sense of differences across products. This means + they're not facing a completely blank slate. +
  • +
  • + Improved the visual design of the main chart to make it clearer + and less cluttered. +
  • +
+ +

24/02/2026

+
    +
  • + For users with slower connections, the product list would load + slowly and if they searched for products earlier, no options would + appear. This has been fixed. +
  • +
+ +

22/02/2026

+
    +
  • + Added Dehumidifer, Power Strimmers, Power Washer, and Electric + shower (with heat pump) as new selection options. +
  • +
  • + Updated figures for Streaming Netflix and YouTube. The previous + figure of 18 – 20 Wh per hour did not accurately reflect the + marginal increase in electricity use for streaming. This figure is + significantly lower. +
  • +
+ +

19/02/2026

+
    +
  • + Updated figures for "MacBook Pro". Previous figures assumed a + power draw of 70 watts, which is how much is used during intensive + tasks such as photo or video editing, or gaming. For everyday + tasks such as email, browsing and watching video, it's closer to + 20 watts. +
  • +
  • + Updated figures for "Using an e-bike". Previous figures assumed a + usage of 25 watt-hours per mile. This is more reflective of heavy + assistance on fairly hilly terrain. Moderate assist is closer to + 15 watt-hours (which is the figure I now use). +
  • +
+ +

17/02/2026

+
    +
  • + Added clearer units for mobile phone charging and fridge-freezers + (per day) +
  • +
  • + Updated figures for "Electric oven" and "Gas oven". Previous + figures assumed a maximum power draw when it's on. Ovens tend to + cycle on and off once they're at the desired temperature. I have + therefore applied a 0.55 factor (50-60%) to account for this +
  • +
  • + Added notes to emphasise that the appropriate time period for + comparison is one day +
  • +
  • + Added the option to switch between kilometres and miles for + cycling and driving +
  • +
+ +

16/02/2026

+
    +
  • + Fixed checkbox selection; this was not working on some + browsers/devices +
  • +
  • Improved dimensions and labelling on mobile devices
  • +
  • + Limited selection to 12 products at a time (more than this becomes + unfeasible for visualisation) +
  • +
  • Added "Clear all selections" button at the top
  • +
  • + On mobile, the chart now appears at the top (above the checklist) + for easier access +
  • +
  • + The URL of the chart now updates based on the user selection. This + means a specific configuration can be shared with others. Copy the + URL from the browser bar, or use the "Share" button in the + bottom-right +
  • +
  • + An image of the chart can now be downloaded. Use the "Share" + button in the bottom-right +
  • +
+
+
+
+ + + + diff --git a/tests/fixtures/fullpage_to_article_html/joshua.hu/firefox-making-right-click-not-suck/expected.html b/tests/fixtures/fullpage_to_article_html/joshua.hu/firefox-making-right-click-not-suck/expected.html new file mode 100644 index 0000000..be0795b --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/joshua.hu/firefox-making-right-click-not-suck/expected.html @@ -0,0 +1,266 @@ + + + +
+

+ On a fresh installation of Firefox on MacOS, right-clicking an image + while some text on the page is highlighted (to show as many buttons as + possible) looks like so: +

+ + + + + + + + + + + +
+ Firefox fresh-install right-click +
+ Freshly installed Firefox, right-clicking +
+

+ To be blunt: holy fucking shit, what the fuck is all of this shit? 26 + rows of which 2 are greyed-out (aka: fucking useless), 7 dividers, 2 + submenus; because a single row for “Ask an AI Chatbot” wasn’t enough, + they just had to make another submenu. Amazing. +

+

+ The “Inspect Accessibility Properties” button was added because I opened + the DevTools (Inspector) once. It’s not obvious how to actually disable + it ever again. Why am I shown “Copy Clean Link” if there is no clean + link (or the link is already clean)? The same goes for “Copy Clean Link + to Highlight”. Why can’t I make it so it always defaults to the “clean + link” no matter what (and get rid of “Copy Link” completely, instead)? + “Ask an AI Chatbot”? No, fuck you. +

+

+ The rest? Completely useless. Thanks for showing me every feature you’ve + ever shipped, with no authoritative selection of what users actually + care about – and making it completely non-obvious how to disable the + useless shit here. +

+

+ Enough venting, let’s clean this all up. The following settings in + about:configcan be used to disable a ton of these useless right-click menu buttons. + Note, some of them actually disable other functionality, so choose + wisely. We can set the following to + false: +

+
    +
  • + browser.translations.select.enable– Removes the “Translate Selection” button from the right-click menu. +
  • +
  • + screenshots.browser.component.enabled– Disables the built-in Firefox screenshot functionality, which also + removes the “Take Screenshot” button. +
  • +
  • + dom.text_fragments.enabled– Disables Text Fragments support, which also removes the “Copy Link + to Highlight” button (and disables the auto-focus on URLs that include + #:~:text=...). +
  • +
  • + privacy.query_stripping.strip_on_share.enabled– Removes the “Copy Clean Link” / “Copy Link Without Site Tracking” + buttons. +
  • +
  • + devtools.accessibility.enabled– Disables the DevTools Accessibility Inspector and removes the + “Inspect Accessibility Properties” button. +
  • +
  • + browser.ml.chat.menu– Removes the “Ask an AI Chatbot” button. +
  • +
  • + browser.ml.linkPreview.enabled– Disables Link Previews (and the AI-generated key points inside + them), removing “Preview Link” button. +
  • +
  • + dom.text-recognition.enabled– Disables OCR on images, removing the “Copy Text From Image” button. +
  • +
  • + browser.search.visualSearch.featureGate– Disables Visual Search (Google Lens integration) and removes + “Search Image with Google Lens” button. +
  • +
  • + extensions.formautofill.addresses.enabled– Disables address autofill and the associated menu/button that + sometimes appears in forms. +
  • +
  • + extensions.formautofill.creditCards.enabled– Disables credit card/payment method autofill and removes the + associated menu/button that sometimes appears in forms. +
  • +
  • + widget.macos.native-context-menus– Turns off native macOS context menus so Firefox uses its own menus. + This removes the “Services” button. +
  • +
  • + print.enabled– Completely disables Firefox’ printing UI and capabilities, which + also removes the “Print” and “Print Selection…” buttons. +
  • +
+

How do we look now?

+ + + + + + + + + + + +
+ Firefox right-click after disabling the above options +
+ Firefox right-clicking, after disabling everything above +
+

+ Great, much better, we’re down from 26 buttons to just 15. Here’s what + it looks like when you right-click on a page and when you right-click a + link: +

+ + + + + + + + + + + + + +
+ Right-clicking on a page + + Right-clicking on a link +
+ Right-clicking on a page + + Right-clicking on a link +
+

We still have the following useless buttons though:

+
    +
  • “Bookmark Link…”
  • +
  • “Save Link As…”
  • +
  • “Email Image…”
  • +
  • “Set Image as Desktop Background…”
  • +
  • “Bookmark Page…”
  • +
+

+ Why do all of the above have + ...? + No clue (edit: according to + this, “it means that more information is required to complete the task (e.g. + requesting the filename for saving a file)”. But the real bad news is that we can’t get rid of these things by + simply toggling some option in + about:config. +

+

We also have these when we right-click in a form:

+
    +
  • “Check Spelling”
  • +
  • “Languages”
  • +
+ + + + + + + + + + + +
+ Form right-click annoying buttons +
+ Right-clicking in a form +
+

+ Despite the browser only being used in one language, there is no way to + get rid of the “Languages” menu there. It’s possible to get rid of + “Check Spelling” by completely disabling spellcheck, but that’s a useful + feature for me, so I don’t. +

+

+ Those remaining useless buttons can only be removed by creating a custom + userChrome.css. I’ll cover how to do that in my next post. +

+
+

+ For what it’s worth, it is nice that these buttons can be + enabled/disabled, and + userChrome.cssis cool. But at the same time, imagine being a completely new Firefox + user, who has zero use for any of this? How are they supposed to figure + out how to do all of this? It took me a significant amount of time to + find those settings to disable (and some of them are hacks, like + disabling + print.enabled). Maybe Firefox should implement something similar to their “Customize + Toolbar”, which makes it easy to plug & play each of the right-click + buttons. “PRs welcome” as they say, I suppose. +

+
+ + diff --git a/tests/fixtures/fullpage_to_article_html/joshua.hu/firefox-making-right-click-not-suck/meta.json b/tests/fixtures/fullpage_to_article_html/joshua.hu/firefox-making-right-click-not-suck/meta.json new file mode 100644 index 0000000..704da9b --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/joshua.hu/firefox-making-right-click-not-suck/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://joshua.hu/firefox-making-right-click-not-suck", + "host": "joshua.hu", + "feed_source": "hnrss", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:18:41.950764Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "Making Firefox’s right-click not suck with about:config", + "extracted_word_count": 761, + "extracted_success": true, + "expected_selector": "article" +} diff --git a/tests/fixtures/fullpage_to_article_html/joshua.hu/firefox-making-right-click-not-suck/raw.html b/tests/fixtures/fullpage_to_article_html/joshua.hu/firefox-making-right-click-not-suck/raw.html new file mode 100644 index 0000000..a269fdd --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/joshua.hu/firefox-making-right-click-not-suck/raw.html @@ -0,0 +1,739 @@ + + + + + Making Firefox's right-click not suck with about:config | Joshua Rogers' + Scribbles + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+

+ Joshua Rogers' Scribbles + +

+ + +
+ +
+

+ Making Firefox's right-click not suck with about:config +

+ +
+

+ On a fresh installation of Firefox on MacOS, right-clicking an image + while some text on the page is highlighted (to show as many buttons + as possible) looks like so: +

+ + + + + + + + + + + + +
+ Firefox fresh-install right-click +
+ Freshly installed Firefox, right-clicking +
+ +

+ To be blunt: holy fucking shit, what the fuck is all of this shit? + 26 rows of which 2 are greyed-out (aka: fucking useless), 7 + dividers, 2 submenus; because a single row for “Ask an AI Chatbot” + wasn’t enough, they just had to make another submenu. + Amazing. +

+ +

+ The “Inspect Accessibility Properties” button was added because I + opened the DevTools (Inspector) once. It’s not obvious how to + actually disable it ever again. Why am I shown “Copy Clean Link” if + there is no clean link (or the link is already clean)? The same goes + for “Copy Clean Link to Highlight”. Why can’t I make it so it always + defaults to the “clean link” no matter what (and get rid of “Copy + Link” completely, instead)? “Ask an AI Chatbot”? No, fuck you. +

+ +

+ The rest? Completely useless. Thanks for showing me every feature + you’ve ever shipped, with no authoritative selection of what users + actually care about – and making it completely non-obvious how to + disable the useless shit here. +

+ +

+ Enough venting, let’s clean this all up. The following settings in + about:config + can be used to disable a ton of these useless right-click menu + buttons. Note, some of them actually disable other functionality, so + choose wisely. We can set the following to + false: +

+ +
    +
  • + browser.translations.select.enable + – Removes the “Translate Selection” button from the right-click + menu. +
  • +
  • + screenshots.browser.component.enabled + – Disables the built-in Firefox screenshot functionality, which + also removes the “Take Screenshot” button. +
  • +
  • + dom.text_fragments.enabled + – Disables Text Fragments support, which also removes the “Copy + Link to Highlight” button (and disables the auto-focus on URLs + that include + #:~:text=...). +
  • +
  • + privacy.query_stripping.strip_on_share.enabled + – Removes the “Copy Clean Link” / “Copy Link Without Site + Tracking” buttons. +
  • +
  • + devtools.accessibility.enabled + – Disables the DevTools Accessibility Inspector and removes the + “Inspect Accessibility Properties” button. +
  • +
  • + browser.ml.chat.menu + – Removes the “Ask an AI Chatbot” button. +
  • +
  • + browser.ml.linkPreview.enabled + – Disables Link Previews (and the AI-generated key points inside + them), removing “Preview Link” button. +
  • +
  • + dom.text-recognition.enabled + – Disables OCR on images, removing the “Copy Text From Image” + button. +
  • +
  • + browser.search.visualSearch.featureGate + – Disables Visual Search (Google Lens integration) and removes + “Search Image with Google Lens” button. +
  • +
  • + extensions.formautofill.addresses.enabled + – Disables address autofill and the associated menu/button that + sometimes appears in forms. +
  • +
  • + extensions.formautofill.creditCards.enabled + – Disables credit card/payment method autofill and removes the + associated menu/button that sometimes appears in forms. +
  • +
  • + widget.macos.native-context-menus + – Turns off native macOS context menus so Firefox uses its own + menus. This removes the “Services” button. +
  • +
  • + print.enabled + – Completely disables Firefox’ printing UI and capabilities, which + also removes the “Print” and “Print Selection…” buttons. +
  • +
+ +

How do we look now?

+ + + + + + + + + + + + +
+ Firefox right-click after disabling the above options +
+ Firefox right-clicking, after disabling everything + above +
+ +

+ Great, much better, we’re down from 26 buttons to just 15. Here’s + what it looks like when you right-click on a page and when you + right-click a link: +

+ + + + + + + + + + + + + + +
+ Right-clicking on a page + + Right-clicking on a link +
+ Right-clicking on a page + + Right-clicking on a link +
+ +

We still have the following useless buttons though:

+ +
    +
  • “Bookmark Link…”
  • +
  • “Save Link As…”
  • +
  • “Email Image…”
  • +
  • “Set Image as Desktop Background…”
  • +
  • “Bookmark Page…”
  • +
+ +

+ Why do all of the above have + ...? + No clue (edit: according to + this, + “it means that more information is required to complete the task + (e.g. requesting the filename for saving a file)”. But the real bad news is that we can’t get rid of these things + by simply toggling some option in + about:config. +

+ +

We also have these when we right-click in a form:

+ +
    +
  • “Check Spelling”
  • +
  • “Languages”
  • +
+ + + + + + + + + + + + +
+ Form right-click annoying buttons +
+ Right-clicking in a form +
+ +

+ Despite the browser only being used in one language, there is no way + to get rid of the “Languages” menu there. It’s possible to get rid + of “Check Spelling” by completely disabling spellcheck, but that’s a + useful feature for me, so I don’t. +

+ +

+ Those remaining useless buttons can only be removed by creating a + custom + userChrome.css. I’ll cover how to do that in my next post. +

+
+ +

+ For what it’s worth, it is nice that these buttons can be + enabled/disabled, and + userChrome.css + is cool. But at the same time, imagine being a completely new + Firefox user, who has zero use for any of this? How are they + supposed to figure out how to do all of this? It took me a + significant amount of time to find those settings to disable (and + some of them are hacks, like disabling + print.enabled). Maybe Firefox should implement something similar to their + “Customize Toolbar”, which makes it easy to plug & play each of + the right-click buttons. “PRs welcome” as they say, I suppose. +

+
+ + + + + + + +
+ +
+ + diff --git a/tests/fixtures/fullpage_to_article_html/jsomers.net/it-turns-out/expected.html b/tests/fixtures/fullpage_to_article_html/jsomers.net/it-turns-out/expected.html new file mode 100644 index 0000000..af18904 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/jsomers.net/it-turns-out/expected.html @@ -0,0 +1,21 @@ + + + +
+

"It turns out" became a favorite phrase of mine sometime in mid 2006, which, it turns out, was just about the time that I first started tearing through Paul Graham essays. Coincidence?

+

I think not. It's not that pg is a particularly heavy user of the phrase---I counted just 46 unique instances in a simple search of his site---but that he knows how to use it. He works it, gets mileage out of it, in a way that other writers don't.

+

That probably sounds like a compliment. But it turns out that "it turns out" does the sort of work, for a writer, that a writer should be doing himself. So to say that someone uses the phrase particularly well is really just an underhanded way of saying that they're particularly good at being lazy.

+

Let me explain what I mean.

+

Suppose that I walk into a new deli expecting to get a sandwich with roast beef, but that when I place my order, the person working the counter says that they don't have roast beef. If I were to relay this little disappointment to my friends, I might say, "You know that new deli on Fifth St.? It turns out they don't even have roast beef!"

+

Or suppose instead that I'm trying to describe a movie to a friend, and that this particular movie includes a striking plot twist. If I wanted to be dramatic about it, I might say "...and so they let him go, thinking nothing of it. But it turns out that he, this very guy that they just let go, was the killer all along."

+

So far so good. Now suppose, finally, that I'm a writer trying to make an argument, and that my argument critically depends on a bit of a tall claim, on the sort of claim that a lot of people might dismiss the first time they heard it. Suppose, for example, that I'm trying to convince my readers that Cambridge, Massachusetts is the intellectual capital of the world. As part of my argument I'd have to rule out every other city, including very plausible contenders like New York. To do so, I might try something like this:

+
+

When I moved to New York, I was very excited at first. It's an exciting place. So it took me quite a while to realize I just wasn't like the people there. I kept searching for the Cambridge of New York. It turned out it was way, way uptown: an hour uptown by air.

+
+

Wait a second: that's not an argument at all! It's a blind assertion based only on my own experience. The only reason that it might sort of work is that it's couched in the same tone of surprised discovery used in those two innocuous examples above---as though after lots of rigorous searching, and trying, and fighting to find in New York the stuff that makes Cambridge the intellectual capital, it simply turned out---in the way that a pie crust might turn out to be too crispy, or a chemical solution might turn out to be acidic---not to be there.

+

That's what I mean when I say that pg (who, by the way, actually wrote that passage about Cambridge and New York) "gets mileage" out of the phrase: he takes advantage of the fact that it so often accompanies real, simple, occasionally hard-won neutral observations.

+

In other words, because "it turns out" is the sort of phrase you would use to convey, for example, something unexpected about a phenomenon you've studied extensively---as in the scientist saying "...but the E. coli turned out to be totally resistant"---or some buried fact that you have recently discovered on behalf of your readers---as when the Malcolm Gladwells of the world say "...and it turns out all these experts have something in common: 10,000 hours of deliberate practice"---readers are trained, slowly but surely, to be disarmed by it. They learn to trust the writers who use the phrase, in large part because they come to associate it with that feeling of the author's own dispassionate surprise: "I, too, once believed X," the author says, "but whaddya know, X turns out to be false."

+

Readers are simply more willing to tolerate a lightspeed jump from belief X to belief Y if the writer himself (a) seems taken aback by it and (b) acts as if they had no say in the matter---as though the situation simply unfolded that way. Which is precisely what the phrase "it turns out" accomplishes, and why it's so useful in circumstances where you don't have any substantive path from X to Y. In that sense it's a kind of handy writerly shortcut or, as pg would probably put it, a hack.

+
+ + diff --git a/tests/fixtures/fullpage_to_article_html/jsomers.net/it-turns-out/meta.json b/tests/fixtures/fullpage_to_article_html/jsomers.net/it-turns-out/meta.json new file mode 100644 index 0000000..914458e --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/jsomers.net/it-turns-out/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://jsomers.net/blog/it-turns-out", + "host": "jsomers.net", + "feed_source": "hnrss", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:21:36.901622Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "the jsomers.net blog.", + "extracted_word_count": 3141, + "extracted_success": true, + "expected_selector": "div.postContent .entry-content" +} diff --git a/tests/fixtures/fullpage_to_article_html/jsomers.net/it-turns-out/raw.html b/tests/fixtures/fullpage_to_article_html/jsomers.net/it-turns-out/raw.html new file mode 100644 index 0000000..b26e2a2 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/jsomers.net/it-turns-out/raw.html @@ -0,0 +1,746 @@ + + + + + + + + + “It turns out” « the jsomers.net blog + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +

the jsomers.net blog.

+ + + +
+
+ +
+ + + +
+
+

“It turns out”

+

by James Somers, February 28, 2010

+
+

"It turns out" became a favorite phrase of mine sometime in mid 2006, which, it turns out, was just about the time that I first started tearing through Paul Graham essays. Coincidence?

+ +

I think not. It's not that pg is a particularly heavy user of the phrase---I counted just 46 unique instances in a simple search of his site---but that he knows how to use it. He works it, gets mileage out of it, in a way that other writers don't.

+ +

That probably sounds like a compliment. But it turns out that "it turns out" does the sort of work, for a writer, that a writer should be doing himself. So to say that someone uses the phrase particularly well is really just an underhanded way of saying that they're particularly good at being lazy.

+ +

Let me explain what I mean.

+ +

Suppose that I walk into a new deli expecting to get a sandwich with roast beef, but that when I place my order, the person working the counter says that they don't have roast beef. If I were to relay this little disappointment to my friends, I might say, "You know that new deli on Fifth St.? It turns out they don't even have roast beef!"

+ +

Or suppose instead that I'm trying to describe a movie to a friend, and that this particular movie includes a striking plot twist. If I wanted to be dramatic about it, I might say "...and so they let him go, thinking nothing of it. But it turns out that he, this very guy that they just let go, was the killer all along."

+ +

So far so good. Now suppose, finally, that I'm a writer trying to make an argument, and that my argument critically depends on a bit of a tall claim, on the sort of claim that a lot of people might dismiss the first time they heard it. Suppose, for example, that I'm trying to convince my readers that Cambridge, Massachusetts is the intellectual capital of the world. As part of my argument I'd have to rule out every other city, including very plausible contenders like New York. To do so, I might try something like this:

+ +
+

When I moved to New York, I was very excited at first. It's an exciting place. So it took me quite a while to realize I just wasn't like the people there. I kept searching for the Cambridge of New York. It turned out it was way, way uptown: an hour uptown by air.

+
+ +

Wait a second: that's not an argument at all! It's a blind assertion based only on my own experience. The only reason that it might sort of work is that it's couched in the same tone of surprised discovery used in those two innocuous examples above---as though after lots of rigorous searching, and trying, and fighting to find in New York the stuff that makes Cambridge the intellectual capital, it simply turned out---in the way that a pie crust might turn out to be too crispy, or a chemical solution might turn out to be acidic---not to be there.

+ +

That's what I mean when I say that pg (who, by the way, actually wrote that passage about Cambridge and New York) "gets mileage" out of the phrase: he takes advantage of the fact that it so often accompanies real, simple, occasionally hard-won neutral observations.

+ +

In other words, because "it turns out" is the sort of phrase you would use to convey, for example, something unexpected about a phenomenon you've studied extensively---as in the scientist saying "...but the E. coli turned out to be totally resistant"---or some buried fact that you have recently discovered on behalf of your readers---as when the Malcolm Gladwells of the world say "...and it turns out all these experts have something in common: 10,000 hours of deliberate practice"---readers are trained, slowly but surely, to be disarmed by it. They learn to trust the writers who use the phrase, in large part because they come to associate it with that feeling of the author's own dispassionate surprise: "I, too, once believed X," the author says, "but whaddya know, X turns out to be false."

+ +

Readers are simply more willing to tolerate a lightspeed jump from belief X to belief Y if the writer himself (a) seems taken aback by it and (b) acts as if they had no say in the matter---as though the situation simply unfolded that way. Which is precisely what the phrase "it turns out" accomplishes, and why it's so useful in circumstances where you don't have any substantive path from X to Y. In that sense it's a kind of handy writerly shortcut or, as pg would probably put it, a hack.

+
+
+ +
+ + + +
+

31 Responses to ““It turns out””

+ + + +
    +
  1. +
    + + + + +

    […] full post on Hacker News If you enjoyed this article, please consider sharing it! Tagged with: demonstrated […]

    + + +
    +
  2. + +
  3. +
    +
    + Simon Hawkin says:
    + + + +

    A good catch!

    + + +
    +
  4. + +
  5. +
    +
    + Brian Armstrong says:
    + + + +

    Something else I noticed about PG is that he is the MASTER of metaphors. He often sums up a complex point by relating it to an every day situation everyone understands, and it’s really effective for driving a point home.

    + + +
    +
  6. + +
  7. +
    +
    + Scott says:
    + + + +

    I too believed this was really just a cop out. But that turns out not to be the case.

    + + +
    +
  8. + +
  9. +
    +
    + George Phillips says:
    + + + +

    Douglas Adams had something similar to say about “it turns out”. I’d forgotten where, but apparently it was in “The Salmon of Doubt”. The excerpt is here: http://www.halexandria.org/dward406.htm

    + + +
    +
  10. + +
  11. +
    +
    + Keshet says:
    + + + +

    It turns out that this phrase, and the insight on its usefulness to create authority out of thin air, was used with great effect by Douglas Adams. See #11 here: http://www.halexandria.org/dward406.htm

    + + +
    +
  12. + +
  13. +
    + + + + +

    […] full post on Hacker News If you enjoyed this article, please consider sharing it! Tagged with: out” • […]

    + + +
    +
  14. + +
  15. +
    + + + + +

    […] most of pg’s statements do not require any great degree of rhetorical aid, as implied by this analysis (which, despite my disagreement, is excellently written). Take a look for yourself (and pardon the […]

    + + +
    +
  16. + +
  17. +
    +
    + Jimmy says:
    + + + +

    “Readers are simply more willing to tolerate a lightspeed jump from belief X to belief Y if the writer himself (a) seems taken aback by it and (b) acts as if they had no say in the matter—as though the situation simply unfolded that way.”

    + +

    Well put, my friend.

    + + +
    +
  18. + +
  19. +
    + + + + +

    […] Here’s a great little blog post from jsomers.net on the usefulness of the sneaky phrase, “it turns out“. […]

    + + +
    +
  20. + +
  21. +
    +
    + Guess says:
    + + + +

    He who Smelt It Dealt It

    + +

    it turns out

    + +

    the phrase is not used to make an argument

    + +

    it introduces a ‘novel’ idea or argument

    + +

    often this term is used as a little show of intellect by those who are fully aware of its real intent

    + +

    which is using deliberate provocative vagueness so the active reader – the presumed intended audience – will pay attention to the formal argument, now cued up for analysis

    + +

    a writer’s trick

    + + +
    +
  22. + +
  23. +
    + + + + +

    […] out there, here’s a great post on the expression “It turns out” from the jsomers.net blog (via Ben […]

    + + +
    +
  24. + +
  25. +
    +
    + moejoe says:
    + + + +

    “It turns out, by the way, that oil rigs today generally don’t cause spills.” (4/2/2010)

    + + +
    +
  26. + +
  27. +
    +
    + Bob says:
    + + + +

    one word: “RADIOLAB”

    + + +
    +
  28. + +
  29. +
    +
    + Boris says:
    + + + +

    I don’t get it. Either Y is true or it’s not. If it’s true then adding “it turns out that Y” is not any more or less true than Y (except to the extent that the speaker may never have believed otherwise, but even then the statement is primarily about the truth of Y). If Y is false then “It turns out that Y” is no more misleading. Without “it turns out” you may not be conveying a change of belief or rigorous research, but you are still claiming you know something to be true, when you really don’t.

    + + +
    +
  30. + +
  31. +
    +
    + fred says:
    + + + +

    It turns out, oddly enough, that Paul Graham is overrated.

    + + +
    +
  32. + +
  33. +
    +
    + Ryan Platte says:
    + + + +

    Boris, in the case of an opinion, Y can be true for the speaker without being true for the audience, though the audience is told by the phrase “it turns out” that Y is an inevitable truth. “What I found” would answer your objection and be more accurate than “it turns out”.

    + + +
    +
  34. + +
  35. +
    +
    + JT says:
    + + + +

    I was thinking of the possible uses of itturnsout.com, but it turns out that it has already been registered. Today.

    + + +
    +
  36. + +
  37. +
    +
    + Jason Eisner says:
    + + + +

    “It turns out” is a useful phrase when explaining mathematics, and I think the usage in mathematics sheds some light on how the phrase works more generally.

    + +

    The ordinary convention is that if I claim X in the course of a proof or explanation, then you should be able to grasp why X is true in that context. It is supposed to be fully justified by the preceding discussion.

    + +

    But if I say “it turns out that X,” I’m acknowledging that I am not giving you sufficient justification, but merely asserting that X can be shown to be true. You could work out why for yourself or look it up, if you like, but let’s skip the details for now.

    + +

    Note that skipping the details is not necessarily a matter of laziness. It is often just being kind to one’s audience by keeping the focus on what’s important and accessible. The original post nails this usage in the last paragraph: if the writer says “it turns out,” then “readers are simply more willing to tolerate a lightspeed jump from belief X to belief Y.”

    + +

    (A more traditional locution in formal mathematical writing is “It can be shown [or proved] that X.” However, I would use “it turns out” more broadly, even where my claim X is not a formal statement of a theorem: “It turns out that the most important distinction in this setting is between acyclic and cyclic graphs, which require different analyses.”)

    + +

    I think in the mathematical context, the ONLY thing signaled by “it turns out” is that I am reporting the result of some work. Ordinarily I am suppressing the details of the work, but even that’s not necessary, as these examples show: “So, to summarize the previous 3 pages, it turns out that no voting system can have all of our desired features.” “Happily, it turns out that X. Let’s see why.”

    + +

    I doubt the many other properties that the original post claims for “it turns out.” At least in the mathematical context, when I insert “it turns out,” I’m not doing so to signal any extra inevitability (“having no say in the matter”) or “dispassion”: after all, the obvious proof steps that didn’t require “it turns out” were just as inevitable and dispassionate. I’m not expressing surprise at the result (“taken aback”), or implying that it is particularly difficult (“hard-won”), or claiming that I was the one who proved X (“air of discovery”), or even claiming that I understand the proof of X myself. All of those would be meaningful and reasonable things to say while teaching math, but “it turns out” doesn’t say any of them by itself.

    + +

    So are we sure that “it turns out” directly expresses any of these things in non-mathematical contexts? Or is it merely correlated with some of them without causing them?

    + +

    Non-mathematical contexts do allow more kinds of evidentiary support and more degrees of belief, so it is conceivable that “it turns out” could interact with some of these. But does it?

    + +

    The examples in the post seem no different from the math case. They’re all signaling that someone had to do some work to establish the statement: you had to watch the rest of the movie, visit the deli, do the scientific experiments, etc.

    + +

    As for the (single) Graham example, compare the following: (1) “I kept searching for Columbia University. It turned out that it was way, way uptown.” (2) “I kept searching for Columbia University. It was way, way uptown.” Which do you think is better written, and why? (1) uses “it turns out” in just the way it is used in mathematics: to indicate that the speaker is reporting the result of some work. (2) is less helpful: it doesn’t connect the two sentences, and in fact leaves it uncertain whether the speaker actually discovered the location of Columbia as a result of the search (versus already knowing it or learning it weeks later).

    + +

    Obviously Paul is (jokingly) asking us to believe something implausible when he writes “I kept searching for the Cambridge of New York. It turned out it was way, way uptown: an hour uptown by air.” But given the Columbia example, I don’t see that “it turned out” is extra sauce aimed at making his fanciful story more plausible. It’s just one normal way to tell any seeking-and-finding story.

    + +

    Now, how about deception? Of course any language can be used deceptively: “it turns out” can be used to make false or dubious claims. But I am not sure that it gets any special mileage in disguising such claims. Certainly, “it turns out that X” is a strong statement. But I think the writer is being perfectly up-front about the strength of the statement: both that she is claiming X as fact rather than opinion (see Ryan Platte’s comment), and also that she is asserting but not necessarily providing support for the claim. The reader sees this and can choose whether or not to buy in. In fact, when I am reviewing a mathematical paper and I see “it turns out,” that is a good, honest, explicit alert that some suppressed details need to be checked.

    + +

    Much more deceptive are factive presuppositions, which smuggle in possibly false or dubious claims as if they were background knowledge already shared by the reader: “Some people know/realize/recognize/have noticed that X.” “It is surprising/interesting/unfortunate that X.” But “it turns out” is not in this category.

    + +

    Of course, the original post may be correct that “it turns out” is associated with reasonable, scientific, fact-based discourse, and so may make the reader more likely to trust the writer. But that’s merely like the way certain NY Times columnists get some mileage out of using a reasonable tone to suck people in … it’s not the reasonable tone’s fault.

    + + +
    +
      +
    • +
      +
      + James Somers says:
      + + + +

      Thanks for this, Professor—your analysis is crisp, and I see now how its usage in mathematics makes more apparent the core function of the phrase.

      + +

      All I’ll say in the way of a defense of its having those other functions I claimed for it—inevitability, dispassion, an air of discovery, deception, etc., etc.—is to re-emphasize the way in which the use of “it turns out” by lazy writers is parasitic on its use by rigorous straightforward expositors.

      + +

      Like you said, in the mathematical context it basically means something like “here is a fact that I have discovered to be true, and I’ll suppress the details to keep things moving along—if you wanted to work it all out, you no doubt could”; after seeing the phrase used in this way enough times by honest writers (who actually have done the work they’ve implicitly claimed to), readers might reasonably “let their guard down” when they see it in the future; so if a dishonest writer comes along and says that “it turns out that X”, these readers will take that statement both as evidence for X and for the fact that the writer has done the work to justify X; the trick is that the idea that “when writers say ‘it turns out that X’, X turns out to be true” has been baked into these readers over time.

      + + +
      +
    • + +
    + +
  38. + +
  39. +
    +
    + Ricardo Sanchez says:
    + + + +

    Excellent post. I will be using the phrase “it turns out” more often than I had in the past just out of inspiration from this post ;)

    + + +
    +
  40. + +
  41. +
    + + + + +

    […] Suppose that I …(continued after the link)… via jsomers.net […]

    + + +
    +
  42. + +
  43. +
    +
    + Assorted links says:
    + + + +

    […] It turns out I read this blog post today. Be The First to Comment Cancel […]

    + + +
    +
  44. + +
  45. +
    +
    + Michael Caton says:
    + + + +

    You’re right that using “it turns out” to slam-dunk an argument is sloppy. In my experience in biomedical science, the phrase is mostly used in a way that could be translated as “Neither of us has the interest to run through the entire set of studies that gave us this result, so I’ll cut to the chase and tell you our unexpected conclusion about how this phenomenon works.”

    + +

    In a broader context the key to the coherence of any non-elementary set of utterances lies in transitions, and being credulous about those is a sure way to get sold the Brooklyn Bridge. Of course all of us do this all the time without realizing it, to advance our own beliefs about the state of things, and since we usually talk to people who agree with us, we don’t get called on it. Sentences are mostly just lists with descriptors attached to them, and usually the relationships are simple and non-novel enough that if you just pay attention to the nouns in a piece of prose or an utterance, you don’t lose any information. But getting people to accept a novel, specific logical relationship requires transitions, conjunctions and prepositions, the latter if you’re dealing with a physical object or process. To see sloppy transition use in the service of narrative-building, try scrolling through the post-game descriptions on ESPN during March Madness and see the narratives they invent: “The Bruins pulled ahead to a 10 point victory when player XYZ broke Gonzaga’s momentum” – or, listen to end-of-day stock reports on radio news: “Stocks down on concerns about Middle Eastern chaos” (really? You’re sure that’s why?) Post hoc propter hoc is really a special case of this fallacy.

    + +

    Taleb’s Fooled By Randomness has a nice passage about the abuse of “because” in this same sense.

    + + +
    +
  46. + +
  47. +
    +
    + David Hall says:
    + + + +

    This turns out to be a most intriguing point. I most likely will never take “it turns out” in the same manner ever again. Well done, by you and all the friends who’ve commented.

    + + +
    +
  48. + +
  49. +
    + + + + +

    […] clever deconstruction and analysis of the phrase, ‘it turns out.’  It turns out I agree with most of […]

    + + +
    +
  50. + +
  51. +
    +
    + Anonymous says:
    + + + +

    what is another word for “it turns out” or another phrase for it?

    + + +
    + + +
  52. + +
  53. +
    +
    + Caroline Jones says:
    + + + +

    good article..i did a search for usage of the phrase “as it turns out” and this popped up. it gave me better insight.. basically if it warrants an explanation, dont write it.(“it turns out..”) explain it. if it’s self-explanatory or already implied or unimportant, write it, right? anyway thank you.. i read your first post too.. both are good.

    + + +
    +
  54. + +
  55. +
    +
    + Doug Martin says:
    + + + +

    I have found the phrase in non-technical usage to be increasingly a device of charlatons and pretenders. So now it says to me “Beware of the statement to follow.”

    + +

    Does anyone know the origin of the phrase? Specifically, why does “turn out” imply completion or conclusion?

    + + +
    +
  56. + +
  57. +
    + + + + +

    […] a similar topic of sentences that direct the listener/reader, James Somers notes how “It turns out” has an interesting effect on an audience by disarmingly leaping to an assumption without any […]

    + + +
    +
  58. + +
+ + + + +
+ + + + + +
+ + +
+ + + + + + + + + + + + + + + + diff --git a/tests/fixtures/fullpage_to_article_html/leaddev.com/moltbook-agent-security-wake-up-call/expected.html b/tests/fixtures/fullpage_to_article_html/leaddev.com/moltbook-agent-security-wake-up-call/expected.html new file mode 100644 index 0000000..afdb093 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/leaddev.com/moltbook-agent-security-wake-up-call/expected.html @@ -0,0 +1,458 @@ + + + +
+
+ March 04, 2026 +
+
+
+

+ You have 1 article left to read this month before + you need to register a free LeadDev.com + account. +

+
+

+ + + + Estimated reading time: + 4 + minutes +

+

+ Key takeaways: +

+
    +
  • + The Moltbook data incident wasn’t “emergent AI” – it was exposed + APIs at machine speed. +
  • +
  • + Agent security is identity security: AI agents + operate with real credentials across real systems. +
  • +
  • + Automation shrinks the gap between mistake and breach. +
  • +
+
+

+ When it first hit headlines earlier this year, Moltbook sounded like a + sci-fi novelty: a social network for + AI agents where bots could interact, share, and “learn” from each other. +

+

+ It was easy to treat it as a quirky glimpse of where things might be + heading, but that changed quickly once it emerged that + data belonging to real humans had been exposed. +

+

+ Suddenly, it wasn’t a futuristic experiment – it was a + security story we’ve all seen before. +

+ +

+ It all came down to the usual setup: + Application Programming Interfaces (APIs) stitched together over time, service accounts with broad access that + no one had reviewed in a while, and trust relationships that seemed + solid until they were put under real strain. +

+

+ Distributed systems have a habit of exposing their weak spots when + pushed. Here they were being exercised by software built to move + quickly and make decisions without waiting for a human to stop and + double-check the path. +

+
+ +
+

+ Autonomy, minus the mysticism +

+

+ There’s a tendency to describe agent security incidents as “emergent + behavior,” as though something unpredictable has slipped the leash. + Eric Schwake, director of cybersecurity strategy at Salt Security, + sees it differently. +

+

+ “What people interpreted as ‘emergent AI behaviour’ was really just + API-driven automation operating at scale,” he says. “From a security + perspective, autonomy isn’t intelligence; it’s more about speed. Speed + amplifies risk when the underlying APIs aren’t visible or governed + properly.” +

+

+ Agents don’t act through magic. Every action ultimately resolves to an + API call. If that layer is loosely governed or over-permissioned, agents + will move straight through those gaps at machine speed. +

+

+ “When you remove the + human from the loop, you remove the manual gatekeeper,” Schwake says. “If the APIs an + agent relies on aren’t secured, that ‘autonomous’ system simply + becomes a force multiplier for attackers.” +

+

+ The control gap in autonomous systems +

+

+ Automation has always been a double-edged sword. It makes the good + things happen faster, and the bad things too. With agents in the mix, + the gap between a small mistake and a wider incident shrinks because + the software doesn’t pause, second-guess itself, or log off for the + day. +

+

+ Schwake highlights three recurring agent security trouble spots. The + first is visibility. Agents communicate almost entirely through + machine-to-machine API calls. Many teams don’t have a complete + inventory of which APIs exist, much less which ones agents can access. +

+

+ The second is authenticated abuse. Agents operate with legitimate + credentials, which makes them attractive targets. If those credentials + are compromised, the resulting activity can look routine in logs + because it originates from a trusted service identity. +

+

+ A third concern is accountability. When agents are given room to act + on their own, figuring out exactly what happened can get murky. Their + activity folds into the background noise of existing automation, the + credentials look legitimate, and separating routine behavior from + something problematic isn’t always obvious. This means incident + response can turn into a slow reconstruction job rather than a clear, + traceable sequence of events. +

+
+
+

+ More like this +

+
+
+

+ Agent security versus the identity problem +

+

+ Moltbook isn’t really about a new category of “agent security,” it’s + about identity, says Ev Kontsevoy, CEO at identity security company + Teleport. +

+

+ “What we’re seeing with + AI agents is a clear warning for business leaders,” Kontsevoy says. “These + systems are beginning to act with a level of independence that + outpaces the controls organizations have in place. When autonomous + agents can learn from each other, adapt their behaviour, and operate + across environments, the risk isn’t tomorrow. It’s accelerating and + happening today.” +

+

+ Traditional Identity and Access Management (IAM) and Privileged Access + Management (PAM) tooling was built around humans and later extended to + relatively static service accounts. Agents don’t fit neatly into that + model. They spin up dynamically, move across environments, and change + behavior based on inputs. +

+

+ “The real challenge isn’t creating a new category of ‘agent security,’ + but rather applying unified identity controls so + AI is governed by common zero-trust principles protecting people, systems, and data + together.” +

+

+ If agents are acting on your systems, they need to operate under the + same zero-trust assumptions as everyone else: tightly scoped + permissions, strong identity, and clear audit trails. +

+
+
+
+ LDX3 London 2026 agenda is live - See who is in the lineup +
+ +
+
+

+ Designing for containment in agent security +

+

+ There’s a rush to wire agents into anything that looks repetitive or + slow, but the underlying environment often remains the same as it has + for years. It works, more or less, because humans move through it at a + human pace. Let something automated loose in that same environment, + and the weak joints start to creak. +

+

+ That’s the shift engineering leaders have to grapple with. + Agents will make mistakes, follow flawed instructions, hit the wrong endpoint, or run with + credentials they shouldn’t have. The job isn’t to pretend that won’t + happen: it’s to make sure the fallout is contained when it does. +

+

+ “You can’t scale AI innovation without securing the API fabric + underneath it,” Schwake says. “Every ‘decision’ an agent makes is + ultimately an API call with real-world consequences for data, trust, + and compliance.” +

+

+ Moltbook will be replaced by another headline, but the pattern it + exposed is already embedded in everyday engineering work. Agents are + inside build jobs, ticketing systems, deployment workflows, and + production data paths, operating with real credentials against live + infrastructure. They run continuously, and whatever weaknesses exist + tend to surface more quickly when automation is exercising them around + the clock. +

+

+ Autonomy doesn’t create new flaws. It accelerates the impact of those + that already exist. +

+
+ +
+ + diff --git a/tests/fixtures/fullpage_to_article_html/leaddev.com/moltbook-agent-security-wake-up-call/meta.json b/tests/fixtures/fullpage_to_article_html/leaddev.com/moltbook-agent-security-wake-up-call/meta.json new file mode 100644 index 0000000..ee80d56 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/leaddev.com/moltbook-agent-security-wake-up-call/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://leaddev.com/ai/moltbook-is-the-agent-security-wake-up-call-for-engineering-leaders", + "host": "leaddev.com", + "feed_source": "leaddev", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:18:49.978881Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "Moltbook is the agent security wake-up call for engineering leaders", + "extracted_word_count": 1131, + "extracted_success": true, + "expected_selector": "div.article__body__col--main" +} diff --git a/tests/fixtures/fullpage_to_article_html/leaddev.com/moltbook-agent-security-wake-up-call/raw.html b/tests/fixtures/fullpage_to_article_html/leaddev.com/moltbook-agent-security-wake-up-call/raw.html new file mode 100644 index 0000000..27e955b --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/leaddev.com/moltbook-agent-security-wake-up-call/raw.html @@ -0,0 +1,41957 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Moltbook is the agent security wake-up call for engineering leaders - + LeadDev + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ + +
+ + + + + + + + + + + + + + + + +
+
+
+ +
+
+
+
+

+ +

+
+ +
+
+ + +
+
+
+
+ + Newsletters + +
+ +

Latest news in your inbox

+
+
+ +
+
+
+ + Panel discussions + +
+ +

See all our upcoming events

+
+
+ +
+
+
+ + Videos + +
+ +

From our archives

+
+
+ +
+
+
+ + Reports + +
+ +

Unique engineering research

+
+
+ +
+
+
+ + For you + +
+ +

Find content specific to your role

+
+
+
+ + +
+
+ +
+

+ +

+
+ +
+
+ + +
+
+
+
London
+ +

June 2–3, 2026

+
+ + +
+ +
+
+
Meetups
+
+ + +
+
+ +
+
+
+
New York
+ +

September 15–16, 2026

+
+ + +
+ +
+
+
Berlin
+ +

November 9–10, 2026

+
+ + +
+
+
+
+
+
+
+ +
+
+
+ +
+
+
+
+ +
+
+
+
+
+ +
+ +
+

+ Moltbook is the agent security wake-up call for engineering + leaders +

+
+ +
Agent security is identity security
+ +
+
+

+ By + + Carly Page +

+
+
+
+
+ + article hero surveillance (4) (2) + +
+
+
+ + + +
+
+
+
+ March 04, 2026 +
+ +
+
+

+ You have 1 article left to read this month + before you need to register a free + LeadDev.com account. +

+
+

+ Estimated reading time: 4 minutes +

+ +

Key takeaways:

+ +
    +
  • + The Moltbook data incident wasn’t “emergent AI” – it was + exposed APIs at machine speed. +
  • + +
  • + Agent security is identity security: AI + agents operate with real credentials across real systems. +
  • + +
  • + Automation shrinks the gap between mistake and + breach. +
  • +
+ +
+ +

+ When it first hit headlines earlier this year, Moltbook sounded + like a sci-fi novelty: a social network for + AI agents + where bots could interact, share, and “learn” from each other. +

+ +

+ It was easy to treat it as a quirky glimpse of where things + might be heading, but that changed quickly once it emerged that + data belonging to real humans had been exposed. +

+ +

+ Suddenly, it wasn’t a futuristic experiment – it was a + security + story we’ve all seen before. +

+ + +

+ It all came down to the usual setup: + Application Programming Interfaces (APIs) + stitched together over time, service accounts with broad access + that no one had reviewed in a while, and trust relationships + that seemed solid until they were put under real strain. +

+ +

+ Distributed systems have a habit of exposing their weak spots + when pushed. Here they were being exercised by software built to + move quickly and make decisions without waiting for a human to + stop and double-check the path. +

+ +
+ +
+ +

+ Autonomy, minus the mysticism +

+ +

+ There’s a tendency to describe agent security incidents as + “emergent behavior,” as though something unpredictable has + slipped the leash. Eric Schwake, director of cybersecurity + strategy at Salt Security, sees it differently. +

+ +

+ “What people interpreted as ‘emergent AI behaviour’ was really + just API-driven automation operating at scale,” he says. “From a + security perspective, autonomy isn’t intelligence; it’s more + about speed. Speed amplifies risk when the underlying APIs + aren’t visible or governed properly.” +

+ +

+ Agents don’t act through magic. Every action ultimately resolves + to an + API call. If that layer is loosely governed or over-permissioned, + agents will move straight through those gaps at machine speed. +

+ +

+ “When you remove the + human from the loop, you remove the manual gatekeeper,” Schwake says. “If the APIs + an agent relies on aren’t secured, that ‘autonomous’ system + simply becomes a force multiplier for attackers.” +

+ +

+ The control gap in autonomous systems +

+ +

+ Automation has always been a double-edged sword. It makes the + good things happen faster, and the bad things too. With agents + in the mix, the gap between a small mistake and a wider incident + shrinks because the software doesn’t pause, second-guess itself, + or log off for the day. +

+ +

+ Schwake highlights three recurring agent security trouble spots. + The first is visibility. Agents communicate almost entirely + through machine-to-machine API calls. Many teams don’t have a + complete inventory of which APIs exist, much less which ones + agents can access. +

+ +

+ The second is authenticated abuse. Agents operate with + legitimate credentials, which makes them attractive targets. If + those credentials are compromised, the resulting activity can + look routine in logs because it originates from a trusted + service identity. +

+ +

+ A third concern is accountability. When agents are given room to + act on their own, figuring out exactly what happened can get + murky. Their activity folds into the background noise of + existing automation, the credentials look legitimate, and + separating routine behavior from something problematic isn’t + always obvious. This means incident response can turn into a + slow reconstruction job rather than a clear, traceable sequence + of events. +

+ +
+ +
+ +

+ Agent security versus the identity problem +

+ +

+ Moltbook isn’t really about a new category of “agent security,” + it’s about identity, says Ev Kontsevoy, CEO at identity security + company Teleport. +

+ +

+ “What we’re seeing with + AI agents + is a clear warning for business leaders,” Kontsevoy says. “These + systems are beginning to act with a level of independence that + outpaces the controls organizations have in place. When + autonomous agents can learn from each other, adapt their + behaviour, and operate across environments, the risk isn’t + tomorrow. It’s accelerating and happening today.” +

+ +

+ Traditional Identity and Access Management (IAM) and Privileged + Access Management (PAM) tooling was built around humans and + later extended to relatively static service accounts. Agents + don’t fit neatly into that model. They spin up dynamically, move + across environments, and change behavior based on inputs. +

+ +

+ “The real challenge isn’t creating a new category of ‘agent + security,’ but rather applying unified identity controls so + AI is governed + by common zero-trust principles protecting people, systems, and + data together.” +

+ +

+ If agents are acting on your systems, they need to operate under + the same zero-trust assumptions as everyone else: tightly scoped + permissions, strong identity, and clear audit trails. +

+ +
+
+
+ LDX3 London 2026 agenda is live - See who is in the lineup +
+ + +
+ +
+ +

+ Designing for containment in agent security +

+ +

+ There’s a rush to wire agents into anything that looks + repetitive or slow, but the underlying environment often remains + the same as it has for years. It works, more or less, because + humans move through it at a human pace. Let something automated + loose in that same environment, and the weak joints start to + creak. +

+ +

+ That’s the shift engineering leaders have to grapple with. + Agents will make mistakes, follow flawed instructions, hit the wrong endpoint, or run + with credentials they shouldn’t have. The job isn’t to pretend + that won’t happen: it’s to make sure the fallout is contained + when it does. +

+ +

+ “You can’t scale AI innovation without securing the API fabric + underneath it,” Schwake says. “Every ‘decision’ an agent makes + is ultimately an API call with real-world consequences for data, + trust, and compliance.” +

+ +

+ Moltbook will be replaced by another headline, but the pattern + it exposed is already embedded in everyday engineering work. + Agents are inside build jobs, ticketing systems, deployment + workflows, and production data paths, operating with real + credentials against live infrastructure. They run continuously, + and whatever weaknesses exist tend to surface more quickly when + automation is exercising them around the clock. +

+ +

+ Autonomy doesn’t create new flaws. It accelerates the impact of + those that already exist. +

+ + + + + + + + +
+ + +
+ +
+
+ + + +
+
+
+ +
+ +
+ + +
+ +
+
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+
+
+
+ logo +
+ +
+

Notice

+
+

+ We (White October Events Ltd) and selected third parties (18) + use cookies or similar technologies for technical purposes + and, with your consent, for + functionality, experience, measurement and “marketing + (personalized ads)” + as specified in the + cookie policy. +

+

+ With respect to advertising, we and 1111 selected + , may use + precise geolocation data, and identification through device + scanning + in order to + store and/or access information on a device and + process personal data like your usage data for the following + : + personalised advertising and content, advertising and + content measurement, audience research and services + development. +

+

+ You can freely give, deny, or withdraw your consent at any + time by accessing the preferences panel. If you give consent, + it will be valid only in this domain. Denying consent may make + related features unavailable. +

+

+

+ Use the “Accept all” button to consent. Use the “Reject all” + button or close this notice to continue without accepting. +

+
+
+
+
+ +
+
+ +
+
+ +
+
+ +
+
+ +
+
+ +
+
+ +
+ +
+
+
+
+
+ + diff --git a/tests/fixtures/fullpage_to_article_html/leaddev.com/performance-driven-team-to-care-about-security/expected.html b/tests/fixtures/fullpage_to_article_html/leaddev.com/performance-driven-team-to-care-about-security/expected.html new file mode 100644 index 0000000..2b3c21a --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/leaddev.com/performance-driven-team-to-care-about-security/expected.html @@ -0,0 +1,452 @@ + + + +
+
+ March 04, 2026 +
+
+
+

+ This is your last article that you can read this month before you + need to register a free LeadDev.com account. +

+
+

+ + + + Estimated reading time: + 4 + minutes +

+

+ Key takeaways: +

+
    +
  • + Reframe security as performance resilience:shift + the mindset from performance versus security to performance through + security. +
  • +
  • + Embed security into existing performance workflows:security checks can be integrated directly into performance testing + and CI/CD automation. +
  • +
  • + Drive cultural adoption through shared ownership and + automation:align teams around common goals, promote cross-team learning, and + automate security validation. +
  • +
+
+

+ Performance testing can sometimes take precedence over + security testing, especially when teams are measured by throughput, latency, and + uptime. +

+

+ However, this performance-centric mindset can inadvertently create + securitygaps. +

+

+ Recognizing the cultural gap +

+

+ The initial challenge was not technical but cultural. The issue became + apparent after a release that met all + performance benchmarksbut later required an urgent security remediation. Although the + system performed well under load, a missed security misconfiguration + forced a hotfix and operational disruption. That experience exposed a + gap in how we defined quality. +

+

+ Performance engineers were focused on ensuring the system could handle + peak load, while + security testingwas viewed as a separate responsibility – something handled by + penetration testers or compliance teams later in the release cycle. + This siloed approach led to missed vulnerabilities, late-stage fixes, + and friction between performance and security teams. +

+

+ The first step was to reframe security not as a constraint, but as an + enabler of performance stability and reliability. +

+
+ +
+

+ Building the foundation of security culture +

+

+ To build a culture of security testing, I began by identifying shared + goals between performance and security engineers. Both groups cared + deeply about system reliability, + data integrity, and availability.  +

+

+ Framing security tests as mechanisms to prevent performance + degradation during attacks – such as Denial of Service (DoS), + malformed packet floods, or Transport Layer Security (TLS) + renegotiation abuse – helped establish a shared sense of ownership. +

+

+ Next, we embedded security checks into our existing performance + validation framework. This meant adding automated TLS validation, + authentication checks, and encryption verification directly into the + same scripts used for throughput and latency testing, rather than + running them in a separate pipeline. +

+

+ As security validation became part of the same workflow engineers + already used, it felt like a natural extension of performance testing, + rather than an extra task. +

+

+ Integrating security into performance workflows +

+

We focused on three levels of integration:

+
    +
  1. + Pre-deployment testing:automated scripts validated + TLS configurations, cipher strength, and Application Programming + Interface (API) authentication before load testing began. + Misconfigurations were flagged early, ensuring performance tests + were run only on hardened environments. +
  2. +
  3. + Runtime security validation:during performance + runs, we introduced traffic patterns mimicking real-world attacks + (such as excessive session creation, random payload fuzzing, and + malformed requests) to observe system resilience. These tests + provided insights into both performance bottlenecks and security + weaknesses under stress. +
  4. +
  5. + Post-test analysis:beyond latency and throughput + graphs, we began reviewing logs for unusual patterns such as + connection resets, failed authentications, or CPU spikes caused by + encryption overhead. These findings often led to both performance + optimizations and improved security controls. +
  6. +
+

+ Driving team buy-in +

+

+ Technical integration was easier than cultural adoption. To drive + engagement, I emphasized shared ownership rather than + compliance. Developers and testers were encouraged to view + vulnerabilitiesas performance risks.  +

+
+
+

+ More like this +

+
+
+

+ We created small internal knowledge sessions where findings were + shared across teams, not to assign blame, but to improve + understanding. +

+

+ Gradually, engineers began adding security checks to their own + performance scripts, creating a multiplier effect. The turning point + came when a security-related configuration issue was detected early + through these integrated tests, preventing a performance regression in + staging. +

+

+ Engineers saw firsthand that early security validation reduced rework + and last minute firefighting. Within two quarters, more than half of + new performance scripts included at least one embedded security + assertion. We tracked this through code reviews and + CI metrics, which showed an increasing percentage of builds validating both + performance and security criteria. +

+

+ Outcomes and learnings +

+

+ Over time, the integration of security into performance testing meant + late-stage security issues decreased by nearly 40%. Mean time to + resolution for vulnerabilities improved by nearly half, as issues were + detected earlier in the development cycle rather than during release + validation. +

+

+ Engineers began to see security testing as part of their ‘definition + of done’ – when all conditions, or acceptance criteria, that a + software product must satisfy are met and ready to be accepted. +

+

+ The once distinct boundary between performance and security + engineering blurred, replaced by a unified focus on + resilient performance. The key lessons from this journey include: +

+
    +
  • Cultural alignment precedes process change.
  • +
  • Automation sustains adoption.
  • +
  • Metrics must reflect both speed and safety.
  • +
  • Collaboration across roles drives continuous improvement.
  • +
+
+
+
+ LDX3 London 2026 agenda is live - See who is in the lineup +
+ +
+
+

+ Build a culture of security testing +

+

+ Building a culture of security testing in a performance-driven + environment requires persistence, empathy, and a shared vision of + reliability. It is not merely a process shift, it’s an evolution in + how teams define quality. +

+

+ By embedding security principles into existing performance workflows, + we created a system that was not only fast but also resilient. The + experience reinforced a simple truth: performance without security is + temporary; secure performance is sustainable. +

+
+ +
+ + diff --git a/tests/fixtures/fullpage_to_article_html/leaddev.com/performance-driven-team-to-care-about-security/meta.json b/tests/fixtures/fullpage_to_article_html/leaddev.com/performance-driven-team-to-care-about-security/meta.json new file mode 100644 index 0000000..15afee3 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/leaddev.com/performance-driven-team-to-care-about-security/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://leaddev.com/software-quality/how-i-got-a-performance-driven-team-to-care-about-security", + "host": "leaddev.com", + "feed_source": "leaddev", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:18:52.255473Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "How I got a performance-driven team to care about security", + "extracted_word_count": 1008, + "extracted_success": true, + "expected_selector": "div.article__body__col--main" +} diff --git a/tests/fixtures/fullpage_to_article_html/leaddev.com/performance-driven-team-to-care-about-security/raw.html b/tests/fixtures/fullpage_to_article_html/leaddev.com/performance-driven-team-to-care-about-security/raw.html new file mode 100644 index 0000000..eacd38d --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/leaddev.com/performance-driven-team-to-care-about-security/raw.html @@ -0,0 +1,42094 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + How I got a performance-driven team to care about security - LeadDev + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ + +
+ + + + + + + + + + + + + + + + +
+
+
+ +
+
+
+
+

+ +

+
+ +
+
+ + +
+
+
+
+ + Newsletters + +
+ +

Latest news in your inbox

+
+
+ +
+
+
+ + Panel discussions + +
+ +

See all our upcoming events

+
+
+ +
+
+
+ + Videos + +
+ +

From our archives

+
+
+ +
+
+
+ + Reports + +
+ +

Unique engineering research

+
+
+ +
+
+
+ + For you + +
+ +

Find content specific to your role

+
+
+
+ + +
+
+ +
+

+ +

+
+ +
+
+ + +
+
+
+
London
+ +

June 2–3, 2026

+
+ + +
+ +
+
+
Meetups
+
+ + +
+
+ +
+
+
+
New York
+ +

September 15–16, 2026

+
+ + +
+ +
+
+
Berlin
+ +

November 9–10, 2026

+
+ + +
+
+
+
+
+
+
+ +
+
+
+ +
+
+
+
+ +
+
+
+
+
+ +
+ +
+

+ How I got a performance-driven team to care about security +

+
+ +
+ A culture of security testing requires persistence, empathy, and a + shared vision of reliability. +
+ +
+
+

+ By + + Arun Mullamangalath Kesavan +

+
+
+
+
+ + sharks-07 + +
+
+
+ + + +
+
+
+
+ March 04, 2026 +
+ +
+
+

+ This is your last article that you can read this month before + you need to register a free + LeadDev.com account. +

+
+

+ Estimated reading time: 4 minutes +

+ +

Key takeaways:

+ +
    +
  • + Reframe security as performance resilience: + shift the mindset from performance versus security to + performance through security. +
  • + +
  • + Embed security into existing performance workflows: + security checks can be integrated directly into performance + testing and CI/CD automation. +
  • + +
  • + Drive cultural adoption through shared ownership and + automation: + align teams around common goals, promote cross-team learning, + and automate security validation. +
  • +
+ +
+ +

+ Performance testing can sometimes take precedence over + security testing, especially when teams are measured by throughput, latency, + and uptime. +

+ +

+ However, this performance-centric mindset can inadvertently + create + security gaps. +

+ +

+ Recognizing the cultural gap +

+ +

+ The initial challenge was not technical but cultural. The issue + became apparent after a release that met all + performance benchmarks + but later required an urgent security remediation. Although the + system performed well under load, a missed security + misconfiguration forced a hotfix and operational disruption. + That experience exposed a gap in how we defined quality. +

+ +

+ Performance engineers were focused on ensuring the system could + handle peak load, while + security testing + was viewed as a separate responsibility – something handled by + penetration testers or compliance teams later in the release + cycle. This siloed approach led to missed vulnerabilities, + late-stage fixes, and friction between performance and security + teams. +

+ +

+ The first step was to reframe security not as a constraint, but + as an enabler of performance stability and reliability. +

+ +
+ +
+ +

+ Building the foundation of security culture +

+ +

+ To build a culture of security testing, I began by identifying + shared goals between performance and security engineers. Both + groups cared deeply about system reliability, + data integrity, and availability.  +

+ +

+ Framing security tests as mechanisms to prevent performance + degradation during attacks – such as Denial of Service (DoS), + malformed packet floods, or Transport Layer Security (TLS) + renegotiation abuse – helped establish a shared sense of + ownership. +

+ +

+ Next, we embedded security checks into our existing performance + validation framework. This meant adding automated TLS + validation, authentication checks, and encryption verification + directly into the same scripts used for throughput and latency + testing, rather than running them in a separate pipeline. +

+ +

+ As security validation became part of the same workflow + engineers already used, it felt like a natural extension of + performance testing, rather than an extra task. +

+ +

+ Integrating security into performance workflows +

+ +

We focused on three levels of integration:

+ +
    +
  1. + Pre-deployment testing: automated scripts + validated TLS configurations, cipher strength, and Application + Programming Interface (API) authentication before load testing + began. Misconfigurations were flagged early, ensuring + performance tests were run only on hardened environments. +
  2. + +
  3. + Runtime security validation: during + performance runs, we introduced traffic patterns mimicking + real-world attacks (such as excessive session creation, random + payload fuzzing, and malformed requests) to observe system + resilience. These tests provided insights into both + performance bottlenecks and security weaknesses under stress. +
  4. + +
  5. + Post-test analysis: beyond latency and + throughput graphs, we began reviewing logs for unusual + patterns such as connection resets, failed authentications, or + CPU spikes caused by encryption overhead. These findings often + led to both performance optimizations and improved security + controls. +
  6. +
+ +

+ Driving team buy-in +

+ +

+ Technical integration was easier than cultural adoption. To + drive engagement, I emphasized + shared ownership rather than compliance. + Developers and testers were encouraged to view + vulnerabilities + as performance risks.  +

+ +
+
+

+ More like this +

+ + +
+
+ +

+ We created small internal knowledge sessions where findings were + shared across teams, not to assign blame, but to improve + understanding. +

+ +

+ Gradually, engineers began adding security checks to their own + performance scripts, creating a multiplier effect. The turning + point came when a security-related configuration issue was + detected early through these integrated tests, preventing a + performance regression in staging. +

+ +

+ Engineers saw firsthand that early security validation reduced + rework and last minute firefighting. Within two quarters, more + than half of new performance scripts included at least one + embedded security assertion. We tracked this through code + reviews and + CI metrics, which showed an increasing percentage of builds validating + both performance and security criteria. +

+ +

+ Outcomes and learnings +

+ +

+ Over time, the integration of security into performance testing + meant late-stage security issues decreased by nearly 40%. Mean + time to resolution for vulnerabilities improved by nearly half, + as issues were detected earlier in the development cycle rather + than during release validation. +

+ +

+ Engineers began to see security testing as part of their + ‘definition of done’ – when all conditions, or acceptance + criteria, that a software product must satisfy are met and ready + to be accepted. +

+ +

+ The once distinct boundary between performance and security + engineering blurred, replaced by a unified focus on + resilient performance. The key lessons from this journey include: +

+ +
    +
  • Cultural alignment precedes process change.
  • + +
  • Automation sustains adoption.
  • + +
  • Metrics must reflect both speed and safety.
  • + +
  • + Collaboration across roles drives continuous improvement. +
  • +
+ +
+
+
+ LDX3 London 2026 agenda is live - See who is in the lineup +
+ + +
+ +
+ +

+ Build a culture of security testing +

+ +

+ Building a culture of security testing in a performance-driven + environment requires persistence, empathy, and a shared vision + of reliability. It is not merely a process shift, it’s an + evolution in how teams define quality. +

+ +

+ By embedding security principles into existing performance + workflows, we created a system that was not only fast but also + resilient. The experience reinforced a simple truth: performance + without security is temporary; secure performance is + sustainable. +

+ + + + +
+ + +
+ +
+
+ + + +
+
+
+ +
+ +
+ + +
+ +
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+
+
+
+ logo +
+ +
+

Notice

+
+

+ We (White October Events Ltd) and selected third parties (18) + use cookies or similar technologies for technical purposes + and, with your consent, for + functionality, experience, measurement and “marketing + (personalized ads)” + as specified in the + cookie policy. +

+

+ With respect to advertising, we and 1111 selected + , may use + precise geolocation data, and identification through device + scanning + in order to + store and/or access information on a device and + process personal data like your usage data for the following + : + personalised advertising and content, advertising and + content measurement, audience research and services + development. +

+

+ You can freely give, deny, or withdraw your consent at any + time by accessing the preferences panel. If you give consent, + it will be valid only in this domain. Denying consent may make + related features unavailable. +

+

+

+ Use the “Accept all” button to consent. Use the “Reject all” + button or close this notice to continue without accepting. +

+
+
+
+
+ +
+
+ +
+
+ +
+
+ +
+
+ +
+
+ +
+
+ +
+ +
+
+
+
+
+ + diff --git a/tests/fixtures/fullpage_to_article_html/leaddev.com/you-cant-verify-all-the-ai-generated-code/expected.html b/tests/fixtures/fullpage_to_article_html/leaddev.com/you-cant-verify-all-the-ai-generated-code/expected.html new file mode 100644 index 0000000..6b3a63c --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/leaddev.com/you-cant-verify-all-the-ai-generated-code/expected.html @@ -0,0 +1,645 @@ + + + +
+
+ March 03, 2026 +
+
+
+

+ + + + Estimated reading time: + 5 + minutes +

+

+ When AI-generated code goes wrong, you’re likely grappling with + verification debt. +

+

+ Coding with AI is the future, but verification methods are lagging + behind. +

+

+ A surveyof more than 1,100 developers by code verification firm Sonar found + that AI tools now account for 42% of all committed code – a figure + developers expect to rise to 65% by 2027. But Sonar also identified + what it calls a “verification gap”. While 96% of developers + don’t fully trust AI-generated code to be functionally correct, only 48% say they always check it before committing. +

+

+ “AI-assisted coding has expanded the + volume of codebeing developed exponentially,” says Tom Finch, engineering leader + at Chainguard, a container security software firm. +

+
+
+
+

+ Join LeadDev.com for free to access this content +

+
+
+

+ Create an account to access our free engineering leadership + content, free online events and to receive our weekly email + newsletter. We will also keep you up to date with LeadDev + events. +

+
+ Register with + google + + +
+ +
+

+ We have linked your account and just need a few more details + to complete your registration: +

+
+ + + + + + +
+ Terms and conditions + +
+ +

+ +
+
+ +
+

+ +
+
+ +
+

+ Enter your email address to reset your password. +

+
+ +
+

+ +

+ A link has been emailed to you - check your inbox. +

+
+ +
+ Don't have an account? + Click here to register +
+
+
+
+ +
+ + diff --git a/tests/fixtures/fullpage_to_article_html/leaddev.com/you-cant-verify-all-the-ai-generated-code/meta.json b/tests/fixtures/fullpage_to_article_html/leaddev.com/you-cant-verify-all-the-ai-generated-code/meta.json new file mode 100644 index 0000000..54cc4b7 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/leaddev.com/you-cant-verify-all-the-ai-generated-code/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://leaddev.com/ai/you-cant-verify-all-the-ai-generated-code", + "host": "leaddev.com", + "feed_source": "leaddev", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:18:53.984863Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "You can’t verify all the AI-generated code", + "extracted_word_count": 783, + "extracted_success": true, + "expected_selector": "div.article__body__col--main" +} diff --git a/tests/fixtures/fullpage_to_article_html/leaddev.com/you-cant-verify-all-the-ai-generated-code/raw.html b/tests/fixtures/fullpage_to_article_html/leaddev.com/you-cant-verify-all-the-ai-generated-code/raw.html new file mode 100644 index 0000000..11aabca --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/leaddev.com/you-cant-verify-all-the-ai-generated-code/raw.html @@ -0,0 +1,14225 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + You can’t verify all the AI-generated code - LeadDev + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ + +
+ + + + + + + + + + + + + + + + +
+
+
+ +
+
+
+
+

+ +

+
+ +
+
+ + +
+
+
+
+ + Newsletters + +
+ +

Latest news in your inbox

+
+
+ +
+
+
+ + Panel discussions + +
+ +

See all our upcoming events

+
+
+ +
+
+
+ + Videos + +
+ +

From our archives

+
+
+ +
+
+
+ + Reports + +
+ +

Unique engineering research

+
+
+ +
+
+
+ + For you + +
+ +

Find content specific to your role

+
+
+
+ + +
+
+ +
+

+ +

+
+ +
+
+ + +
+
+
+
London
+ +

June 2–3, 2026

+
+ + +
+ +
+
+
Meetups
+
+ + +
+
+ +
+
+
+
New York
+ +

September 15–16, 2026

+
+ + +
+ +
+
+
Berlin
+ +

November 9–10, 2026

+
+ + +
+
+
+
+
+
+
+ +
+
+
+ +
+
+
+
+ +
+
+
+
+
+ +
+ +
+

+ You can’t verify all the AI-generated code +

+
+ +
+ More code + the same number of engineers = problems. +
+ +
+
+

+ By + + Chris Stokel-Walker +

+
+
+
+
+ + article hero – quality assurance, testing, QA + +
+
+
+ + + +
+
+
+
+ March 03, 2026 +
+ +
+
+

+ + Estimated reading time: 5 minutes +

+ +

+ When AI-generated code goes wrong, you’re likely grappling + with verification debt. +

+ +

+ Coding with AI is the future, but verification methods are + lagging behind.  +

+ +

+ A survey + of more than 1,100 developers by code verification firm Sonar + found that AI tools now account for 42% of all committed code + – a figure developers expect to rise to 65% by 2027. But Sonar + also identified what it calls a “verification gap”. While 96% + of developers + don’t fully trust AI-generated code to be functionally + correct, only 48% say they always check it before committing. +

+ +

+ “AI-assisted coding has expanded the + volume of code + being developed exponentially,” says Tom Finch, engineering + leader at Chainguard, a container security software firm. +

+
+
+ + + + + +
+

+ Join LeadDev.com for free to access this content +

+
+
+

+ Create an account to access our free engineering + leadership content, free online events and to receive our + weekly email newsletter. We will also keep you up to date + with LeadDev events. +

+
+ Register + with + google + + +
+ +
+

+ We have linked your account and just need a few more + details to complete your registration: +

+
+ + + + + + +
+ + Terms and conditions + + +
+ +

 

+ +
+
+ +
+

 

+ +
+
+ +
+

+ Enter your email address to reset your password. +

+
+ +
+

 

+ +

+ A link has been emailed to you - check your inbox. +

+
+
+ Don't have an account? + Click here to register + +
+
+ + +
+ +
+ + +
+ +
+
+ + + +
+
+
+ +
+ +
+ + +
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+
+
+
+ logo +
+ +
+

Notice

+
+

+ We (White October Events Ltd) and selected third parties (18) + use cookies or similar technologies for technical purposes + and, with your consent, for + functionality, experience, measurement and “marketing + (personalized ads)” + as specified in the + cookie policy. +

+

+ With respect to advertising, we and 1111 selected + , may use + precise geolocation data, and identification through device + scanning + in order to + store and/or access information on a device and + process personal data like your usage data for the following + : + personalised advertising and content, advertising and + content measurement, audience research and services + development. +

+

+ You can freely give, deny, or withdraw your consent at any + time by accessing the preferences panel. If you give consent, + it will be valid only in this domain. Denying consent may make + related features unavailable. +

+

+

+ Use the “Accept all” button to consent. Use the “Reject all” + button or close this notice to continue without accepting. +

+
+
+
+
+ +
+
+ +
+
+ +
+
+ +
+
+ +
+
+ +
+
+ +
+ +
+
+
+
+
+ + + diff --git a/tests/fixtures/fullpage_to_article_html/martinfowler.com/fragment-2026-02-25/expected.html b/tests/fixtures/fullpage_to_article_html/martinfowler.com/fragment-2026-02-25/expected.html new file mode 100644 index 0000000..235722a --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/martinfowler.com/fragment-2026-02-25/expected.html @@ -0,0 +1,231 @@ + + + +
+

Fragments: February 25

+
Martin Fowler: 25 Feb 2026
+
+

+ I don’t tend to post links to videos here, as + I can’t stand watching videos to learn about things. But some talks are worth a watch, and I do suggest this overview on + how organizations are currently using AIby Laura Tacho. There’s various nuggets of data from her work with + DX: +

+
    +
  • 92.6% of devs are using AI assistants
  • +
  • devs reckon it’s saving them 4 hours per week
  • +
  • + 27% of code is written by AI without significant human intervention +
  • +
  • AI cuts onboarding time by half
  • +
+

+ These are interesting numbers, but most of them are averages, and + those who know me know I teach people to + be suspicious of averages. Laura knows this too: +

+
+

+ average doesn’t mean typical.. there is no typical experience with + AI +

+
+

+ Different companies (and teams within companies) are having very + different experiences. Often AI is an amplifier to an organization’s + practices, for good or ill. +

+
+

+ Organizational performance is multidimensional, and these + organizations are just going off into different extremes based on + what they were doing before. AI is an accelerator, it’s a + multiplier, and it is moving organizations off in different + directions. (08:52) +

+
+

+ Some organizations are facing twice as many customer incidents, but + others are facing half. +

+

❄ ❄ ❄ ❄ ❄

+

+ Rachel Laycock (Thoughtworks CTO) + shares her reflectionson our recent Future of Software Engineering retreat in Utah. +

+
    +
  • We need to address cognitive load
  • +
  • The staff engineer role is changing
  • +
  • What happens to code reviews?
  • +
  • Agent Topologies
  • +
  • What exactly does AI mean for programming languages?
  • +
  • Self-healing systems
  • +
+

On the latter:

+
+

+ One of the most interesting and perhaps immediately applicable ideas + was the concept of an ‘agent subconscious’, in which agents are + informed by a comprehensive knowledge graph of post mortems and + incident data. This particularly excites me because I’ve seen many + production issues solved by the latent knowledge of those in + leadership positions. The constant challenge comes from what happens + when those people aren’t available or involved. +

+
+

❄ ❄ ❄ ❄ ❄

+

+ Simon Willison (one of my most reliable sources for information about + LLMs and programming) is starting a series of + Agentic Engineering Patterns: +

+
+

+ I think of vibe coding using its original definition of coding where + you pay no attention to the code at all, which today is often + associated with non-programmers using LLMs to write code. +

+

+ Agentic Engineering represents the other end of the scale: + professional software engineers using coding agents to improve and + accelerate their work by amplifying their existing expertise. +

+
+

+ He’s intending this to be closer to evergreen material, as opposed to + the day-to-day writing he does (extremely well) on his blog. +

+

+ One of the first patterns is + Red/Green TDD +

+
+

+ This turns out to be a fantastic fit for coding agents. A + significant risk with coding agents is that they might write code + that doesn’t work, or build code that is unnecessary and never gets + used, or both. +

+

+ Test-first development helps protect against both of these common + mistakes, and also ensures a robust automated test suite that + protects against future regressions. +

+
+

❄ ❄ ❄ ❄ ❄

+

+ Aaron Ericksonis one of those technologists with good judgment who I listen to a + lot +

+
+

+ As much fun as people are having with OpenClaw, I think the days of + “here is my agent with access to all my stuff” are numbered. +

+

+ Fine scoped agents who can read email and cleanse it before it + reaches the agentic OODA loop that acts on it, policy agents (a claw + with a job called “VP of NO” to money being spent) +

+

+ You structure your agents like you would a company. Insert friction + where you want decisions to be slow and the cost of being wrong is + high, reduce friction where you want decisions to be fast and the + cost of being wrong is trivial or zero. +

+
+

+ I’ve posted here a lot about security concerns with agents. Right now + I think this notion of fine-scoped agents is the most promising + direction. Last year Korny Sietsma + wrote about how to mitigate agentic AI security risks. His advice included to split the tasks, so that no agent has access + to all parts of the Lethal Trifecta: +

+
+

+ This approach is an application of a more general security habit: + follow the Principle of Least Privilege. Splitting the work, and + giving each sub-task a minimum of privilege, reduces the scope for a + rogue LLM to cause problems, just as we would do when working with + corruptible humans. +

+

+ This is not only more secure, it is also increasingly a way people + are encouraged to work. It’s too big a topic to cover here, but it’s + a good idea to split LLM work into small stages, as the LLM works + much better when its context isn’t too big. Dividing your tasks into + “Think, Research, Plan, Act” keeps context down, especially if “Act” + can be chunked into a number of small independent and testable + chunks. +

+
+

❄ ❄ ❄ ❄ ❄

+

+ Doonesbury outlines the + opportunity for aging writers like myself. (Currently I’m still writing my words the old fashioned way.) +

+

❄ ❄ ❄ ❄ ❄

+

+ An interesting story someone told me. They were at a swimming pool + with their child, she looked at a photo on a poster advertising an + event there and said “that’s AI”. Initially the parents didn’t think + it was, but looking carefully spotted a tell-tale six fingers. They + concluded that fresher biological neural networks are being trained to + quickly recognize AI. +

+

❄ ❄ ❄ ❄ ❄

+

+ I carefully curate my social media streams, following only feeds where + I can control whose posts are picked up. In times gone by, editors of + newspapers and magazines would do a similar job. But many users of + social media + are faced with a tsunamiof stuff, much of it ugly, and don’t have to tools to control it. +

+
+

+ A few days ago I saw an Instagram reel of a young woman talking + about how she had been raped six years ago, struggled with thoughts + of suicide afterwards, but managed to rebuild her life again. Among + the comments – the majority of which were from men – were things + like “Well at least you had some”, “No way, she’s unrapeable”, “Hope + you didn’t talk this much when it happened”, “Bro could have picked + a better option.” Reading those comments, which had thousands of + likes and many boys agreeing with them, made me feel sick. +

+
+

+ My tendencies are to free speech, and I try not to be a Free Speech + Poseur, but the deluge of ugly material on the internet isn’t getting + any better. The people running these platforms seem to be “tackling” + this problem by putting their heads in the sand and hoping it won’t + hurt them. It is hurting their users. +

+
+
+ + diff --git a/tests/fixtures/fullpage_to_article_html/martinfowler.com/fragment-2026-02-25/meta.json b/tests/fixtures/fullpage_to_article_html/martinfowler.com/fragment-2026-02-25/meta.json new file mode 100644 index 0000000..49e3113 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/martinfowler.com/fragment-2026-02-25/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://martinfowler.com/fragments/2026-02-25.html", + "host": "martinfowler.com", + "feed_source": "martinfowler", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:18:57.559630Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "Fragments: February 25", + "extracted_word_count": 1124, + "extracted_success": true, + "expected_selector": "main" +} diff --git a/tests/fixtures/fullpage_to_article_html/martinfowler.com/fragment-2026-02-25/raw.html b/tests/fixtures/fullpage_to_article_html/martinfowler.com/fragment-2026-02-25/raw.html new file mode 100644 index 0000000..81de2bf --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/martinfowler.com/fragment-2026-02-25/raw.html @@ -0,0 +1,711 @@ + + + + + + Fragments: February 25 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

Fragments: February 25

+ +
Martin Fowler: 25 Feb 2026
+ +
+

+ I don’t tend to post links to videos here, as + I can’t stand watching videos to learn about things. But some talks are worth a watch, and I do suggest this overview on + how organizations are currently using AI + by Laura Tacho. There’s various nuggets of data from her work with DX: +

+ +
    +
  • 92.6% of devs are using AI assistants
  • +
  • devs reckon it’s saving them 4 hours per week
  • +
  • + 27% of code is written by AI without significant human intervention +
  • +
  • AI cuts onboarding time by half
  • +
+ +

+ These are interesting numbers, but most of them are averages, and + those who know me know I teach people to + be suspicious of averages. Laura knows this too: +

+ +
+

+ average doesn’t mean typical.. there is no typical experience with + AI +

+
+ +

+ Different companies (and teams within companies) are having very + different experiences. Often AI is an amplifier to an organization’s + practices, for good or ill. +

+ +
+

+ Organizational performance is multidimensional, and these + organizations are just going off into different extremes based on + what they were doing before. AI is an accelerator, it’s a + multiplier, and it is moving organizations off in different + directions. (08:52) +

+
+ +

+ Some organizations are facing twice as many customer incidents, but + others are facing half. +

+ +

+  ❄                ❄                ❄                ❄                ❄ +

+ +

+ Rachel Laycock (Thoughtworks CTO) + shares her reflections + on our recent Future of Software Engineering retreat in Utah. +

+ +
    +
  • We need to address cognitive load
  • +
  • The staff engineer role is changing
  • +
  • What happens to code reviews?
  • +
  • Agent Topologies
  • +
  • What exactly does AI mean for programming languages?
  • +
  • Self-healing systems
  • +
+ +

On the latter:

+ +
+

+ One of the most interesting and perhaps immediately applicable ideas + was the concept of an ‘agent subconscious’, in which agents are + informed by a comprehensive knowledge graph of post mortems and + incident data. This particularly excites me because I’ve seen many + production issues solved by the latent knowledge of those in + leadership positions. The constant challenge comes from what happens + when those people aren’t available or involved. +

+
+ +

+  ❄                ❄                ❄                ❄                ❄ +

+ +

+ Simon Willison (one of my most reliable sources for information about + LLMs and programming) is starting a series of + Agentic Engineering Patterns: +

+ +
+

+ I think of vibe coding using its original definition of coding where + you pay no attention to the code at all, which today is often + associated with non-programmers using LLMs to write code. +

+ +

+ Agentic Engineering represents the other end of the scale: + professional software engineers using coding agents to improve and + accelerate their work by amplifying their existing expertise. +

+
+ +

+ He’s intending this to be closer to evergreen material, as opposed to + the day-to-day writing he does (extremely well) on his blog. +

+ +

+ One of the first patterns is + Red/Green TDD +

+ +
+

+ This turns out to be a fantastic fit for coding agents. A + significant risk with coding agents is that they might write code + that doesn’t work, or build code that is unnecessary and never gets + used, or both. +

+ +

+ Test-first development helps protect against both of these common + mistakes, and also ensures a robust automated test suite that + protects against future regressions. +

+
+ +

+  ❄                ❄                ❄                ❄                ❄ +

+ +

+ Aaron Erickson + is one of those technologists with good judgment who I listen to a lot +

+ +
+

+ As much fun as people are having with OpenClaw, I think the days of + “here is my agent with access to all my stuff” are numbered. +

+ +

+ Fine scoped agents who can read email and cleanse it before it + reaches the agentic OODA loop that acts on it, policy agents (a claw + with a job called “VP of NO” to money being spent) +

+ +

+ You structure your agents like you would a company. Insert friction + where you want decisions to be slow and the cost of being wrong is + high, reduce friction where you want decisions to be fast and the + cost of being wrong is trivial or zero. +

+
+ +

+ I’ve posted here a lot about security concerns with agents. Right now + I think this notion of fine-scoped agents is the most promising + direction. Last year Korny Sietsma + wrote about how to mitigate agentic AI security risks. His advice included to split the tasks, so that no agent has access + to all parts of the Lethal Trifecta: +

+ +
+

+ This approach is an application of a more general security habit: + follow the Principle of Least Privilege. Splitting the work, and + giving each sub-task a minimum of privilege, reduces the scope for a + rogue LLM to cause problems, just as we would do when working with + corruptible humans. +

+ +

+ This is not only more secure, it is also increasingly a way people + are encouraged to work. It’s too big a topic to cover here, but it’s + a good idea to split LLM work into small stages, as the LLM works + much better when its context isn’t too big. Dividing your tasks into + “Think, Research, Plan, Act” keeps context down, especially if “Act” + can be chunked into a number of small independent and testable + chunks. +

+
+ +

+  ❄                ❄                ❄                ❄                ❄ +

+ +

+ Doonesbury outlines the + opportunity for aging writers like myself. (Currently I’m still writing my words the old fashioned way.) +

+ +

+  ❄                ❄                ❄                ❄                ❄ +

+ +

+ An interesting story someone told me. They were at a swimming pool + with their child, she looked at a photo on a poster advertising an + event there and said “that’s AI”. Initially the parents didn’t think + it was, but looking carefully spotted a tell-tale six fingers. They + concluded that fresher biological neural networks are being trained to + quickly recognize AI. +

+ +

+  ❄                ❄                ❄                ❄                ❄ +

+ +

+ I carefully curate my social media streams, following only feeds where + I can control whose posts are picked up. In times gone by, editors of + newspapers and magazines would do a similar job. But many users of + social media + are faced with a tsunami + of stuff, much of it ugly, and don’t have to tools to control it. +

+ +
+

+ A few days ago I saw an Instagram reel of a young woman talking + about how she had been raped six years ago, struggled with thoughts + of suicide afterwards, but managed to rebuild her life again. Among + the comments – the majority of which were from men – were things + like “Well at least you had some”, “No way, she’s unrapeable”, “Hope + you didn’t talk this much when it happened”, “Bro could have picked + a better option.” Reading those comments, which had thousands of + likes and many boys agreeing with them, made me feel sick. +

+
+ +

+ My tendencies are to free speech, and I try not to be a Free Speech + Poseur, but the deluge of ugly material on the internet isn’t getting + any better. The people running these platforms seem to be “tackling” + this problem by putting their heads in the sand and hoping it won’t + hurt them. It is hurting their users. +

+
+
+ + + + + + + + + diff --git a/tests/fixtures/fullpage_to_article_html/martinfowler.com/host-leadership/expected.html b/tests/fixtures/fullpage_to_article_html/martinfowler.com/host-leadership/expected.html new file mode 100644 index 0000000..7193e07 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/martinfowler.com/host-leadership/expected.html @@ -0,0 +1,86 @@ + + + +
+
+

+ Host Leadership +

+

19 February 2026

+
+
+

+ + + +

+

+ +

+
+
+ +
+
+
+

+ If you've hung around agile circles for long, you've probably heard + about the concept of servant leadership, that managers should + think of themselves as supporting the team, removing blocks, + protecting them from the vagaries of corporate life. That's never + sounded quite right to me, and a recent conversation with Kent Beck + nailed why - it's gaslighting. The manager claims to be a servant, + but everyone knows who really has the power. +

+

+ My colleague Giles Edwards-Alexander told me about an alternative + way of thinking about leadership, one that he came across working + with mental-health professionals. This casts the leader as a host: + preparing a suitable space, inviting the team in, providing ideas + and problems, and then stepping back to let them work. The host + looks after the team, rather as the ideal servant leader does, but + still has the power to intervene should things go awry. +

+
+

Further Reading

+
    +
  • + Dr Mark McKergow and Helen Bailey + wrote a bookin 2014. +
  • +
  • + The website + hostleadership.com has + ongoing information including a blog. +
  • +
  • + McKergow and Bailey have a short article in HR Review that + outlines the + six roles of engagementof a host leader. +
  • +
+
+
+
+
+
+ + diff --git a/tests/fixtures/fullpage_to_article_html/martinfowler.com/host-leadership/meta.json b/tests/fixtures/fullpage_to_article_html/martinfowler.com/host-leadership/meta.json new file mode 100644 index 0000000..79de66e --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/martinfowler.com/host-leadership/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://martinfowler.com/bliki/HostLeadership.html", + "host": "martinfowler.com", + "feed_source": "martinfowler", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:18:59.008005Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "bliki: Host Leadership", + "extracted_word_count": 201, + "extracted_success": true, + "expected_selector": "main" +} diff --git a/tests/fixtures/fullpage_to_article_html/martinfowler.com/host-leadership/raw.html b/tests/fixtures/fullpage_to_article_html/martinfowler.com/host-leadership/raw.html new file mode 100644 index 0000000..b87b20f --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/martinfowler.com/host-leadership/raw.html @@ -0,0 +1,528 @@ + + + + + + Host Leadership + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+

Host Leadership

+ +

19 February 2026

+ +
+
+

+ +

+ +

+
+ +
+ + +
+ +
+ +
+

+ If you've hung around agile circles for long, you've probably heard + about the concept of servant leadership, that managers should + think of themselves as supporting the team, removing blocks, + protecting them from the vagaries of corporate life. That's never + sounded quite right to me, and a recent conversation with Kent Beck + nailed why - it's gaslighting. The manager claims to be a servant, + but everyone knows who really has the power. +

+ +

+ My colleague Giles Edwards-Alexander told me about an alternative + way of thinking about leadership, one that he came across working + with mental-health professionals. This casts the leader as a host: + preparing a suitable space, inviting the team in, providing ideas + and problems, and then stepping back to let them work. The host + looks after the team, rather as the ideal servant leader does, but + still has the power to intervene should things go awry. +

+ +
+

Further Reading

+ +
    +
  • + Dr Mark McKergow and Helen Bailey + wrote a book + in 2014. +
  • + +
  • + The website + hostleadership.com has + ongoing information including a blog. +
  • + +
  • + McKergow and Bailey have a short article in HR Review that + outlines the + six roles of engagement + of a host leader. +
  • +
+
+
+
+ +
+
+ + + + + + + + + diff --git a/tests/fixtures/fullpage_to_article_html/martinfowler.com/humans-and-agents/expected.html b/tests/fixtures/fullpage_to_article_html/martinfowler.com/humans-and-agents/expected.html new file mode 100644 index 0000000..b2ac45e --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/martinfowler.com/humans-and-agents/expected.html @@ -0,0 +1,347 @@ + + + +
+
+
+

Humans and Agents in Software Engineering Loops

+
+
+
+
+ + Photo of Kief Morris + +
+
+ +
+
+

+ Kief Morris lives in London and works as a global cloud + technology specialist for Thoughtworks. +

+
+
+
+
+
+
+
+

+ + + +

+

+ This article is part of + “Exploring Gen AI”. A + series capturing Thoughtworks technologists' explorations of using + gen ai technology for software development. +

+
+

04 March 2026

+
+
+
+

+ Should humans stay out of the software development process and vibe + code, or do we need developers in the loop inspecting every line of + code? I believe the answer is to focus on the goal of turning ideas + into outcomes. The right place for us humans is to build and manage + the working loop rather than either leaving the agents to it or + micromanaging what they produce. Let’s call this “on the loop.” +

+

+ As software creators we build an outcome by turning our ideas into + working software and iterating as we learn and evolve our ideas. This + is the “why loop”. Until the AI uprising comes humans will run this + loop because we’re the ones who want what it produces. +

+

+ The process of building the software is the “how loop.” The how loop + involves creating, selecting, and using intermediate artefacts like + code, tests, tools, and infrastructure. It may also involve + documentation like technical designs and ADRs. We’re used to seeing + many of these as deliverables, but intermediate artefacts are really + just a means to an end. +

+

+ The software delivery feedback loops: An upper "why" loop connected to a lower “how” loop. The why loop iterates over an idea and working software. The how loop iterates over interim artefacts like specs, code, and tests. +

+

+ Figure 1: The why loop iterates over ideas and software, the how + loop iterates on building the software +

+

+ In reality the how loop contains multiple loops. The outermost how + loop specifies and delivers the working software for the why loop. The + innermost loop generates and tests code. Loops in between break down + higher levels of work into smaller tasks for the lower loops to + implement, then validate the results. +

+

+ Multiple levels of “how” loops supporting the “why” loop. An outer loop iterates on a feature. A middle loop iterates on stories. An inner loop iterates on code. +

+

+ Figure 2: The how loop has multiple levels of inner loops that work + on smaller increments of the full implementation +

+

+ These loops may follow practices like design reviews and test stages. + They might build systems by applying architectural approaches and + design patterns like microservices or CUPID. Like the intermediate + artefacts that pop out of these practices and patterns, they are all a + means of achieving the outcome we actually care about. +

+

+ But maybe we don’t care about the means that are used to achieve our + goals? Maybe we can just let the LLMs run the how loop however they + like? +

+

Humans outside the loop

+

+ Plenty of people have discovered the joy of letting humans stick to + the why loop, and leaving the how loop for the agents to deal with. + This is the common definition of “vibe coding”. Some interpretations + of Spec Driven Development (SDD) are much the same, with humans + investing effort in writing the outcome we want, but not dictating how + the LLM should achieve it. +

+

+ Humans outside the loop: An upper "why" loop with a human on top. The loop iterates over an idea and working software. This is connected to a lower "how" loop by a robot, which iterates over interim artefacts like code. +

+

+ Figure 3: Human runs the why loop, agent runs the how loop. +

+

+ The appeal of humans staying out of the how loop is that the why loop + is the one we really care about. Software development is a messy + domain that inevitably bogs down into over-engineered processes and + coping with technical debt. And every new LLM model so far has gotten + better at taking a user prompt and spitting out working software. If + you’re not satisfied with what it spits out, tell the LLM and it’ll + give you another iteration. +

+

+ If the LLMs can write and change code without us, do we care whether + the code is “clean”? It doesn’t matter whether a variable name clearly + expresses its purpose as long as an LLM can figure it out. Maybe we + don’t even need to care what language the software is written in? +

+

+ We care about external quality, not internal quality for its own sake. + External quality is what we experience as a user or other stakeholder + of the software. Functional quality is a must, the system needs to + work correctly. And for production software we also care about + non-functional, operational quality. Our system shouldn’t crash, it + should run quickly, and we don’t want it posting confidential data to + social media sites. We don’t want to run up massive cloud hosting + bills, and in many domains we need to pass compliance audits. +

+

+ We care about internal quality when it affects external outcomes. When + human coders were crawling through the codebase, adding features and + fixing bugs, they could do it more quickly and reliably in a clean + codebase. But LLMs don’t care about developer experience, do they? +

+

+ In theory our LLM agents can extrude a massively overcomplicated + spaghetti codebase, test and fix it by running ad-hoc shell commands, + and eventually produce a correct, compliant, high-performing system. + We just get our swarms Ralph Wiggumming on it, running in data centers + that draw energy from the boiling oceans they float on, and eventually + we’ll get there. +

+

+ In practice, a cleanly-designed, well-structured codebase has + externally important benefits over a messy codebase. When LLMs can + more quickly understand and modify the code they work faster and + spiral less. We do care about the time and cost of building the + systems we need. +

+

Humans in the loop

+

+ Some developers believe that the only way to maintain internal quality + is to stay closely involved in the lowest levels of the how loop. + Often, when an agent spirals over some broken bit of code a human + developer can understand and fix it in seconds. Human experience and + judgement still exceeds LLMs in many situations. +

+

+ Humans in the loop: A single “why+how” loop with a human at the top and a robot at the bottom. The loop iterates over idea, interim artefacts like code and tests, and the working software. +

+

+ Figure 4: Human runs the why loop and the how loop +

+

+ When people talk about “humans in the loop”, they often mean humans as + a gatekeeper within the innermost loop where code is generated, such + as manually inspecting each line of code created by an LLM. +

+

+ The challenge when we insist on being too closely involved in the + process is that we become a bottleneck. Agents can generate code + faster than humans can manually inspect it. Reports on developer + productivity with AI show mixed results, which may be at least partly + because of humans spending more time specifying and reviewing code + than they save by getting LLMs to generate it. +

+

+ We need to adopt classic “shift left” thinking. Once upon a time we + wrote all of our code, passed it to a QA team to test, and then tried + to fix enough bugs to ship a release. Then we discovered that when + developers write and run tests as we work we find and fix issues right + away, which makes the whole process faster and more reliable. +

+

+ What works for humans can work for agents as well. Agents produce + better code when they can gauge the quality of the code they produce + themselves rather than relying on us to check it for them. We need to + instruct them on what we’re looking for, and give them guidance on the + best ways to achieve it. +

+

Humans on the loop

+

+ Rather than personally inspecting what the agents produce, we can make + them better at producing it. The collection of specifications, quality + checks, and workflow guidance that control different levels of loops + inside the how loop is the agent’s harness. The emerging practice of + building and maintaining these harnesses, + Harness Engineering, is how humans work on the loop. +

+

+ Humans on the loop: An upper "why" loop connected to a lower “how” loop by a human. The why loop iterates over an idea and working software.A robot sits at the bottom of the lower “how” loop, which iterates over interim artefacts like specs and code. +

+

+ Figure 5: Human defines the how loop and the agent runs it +

+

+ Something like the on the loop concept has also been described as the + “middle loop,” including by participants of + The Future of Software Development Retreat. The middle loop refers to moving human attention to a higher-level + loop than the coding loop. +

+

+ The difference between in the loop and on the loop is most visible in + what we do when we’re not satisfied with what the agent produces, + including an intermediate artefact. The “in the loop” way is to fix + the artefact, whether by directly editing it, or by telling the agent + to make the correction we want. The “on the loop” way is to change the + harness that produced the artefact so it produces the results we want. +

+

+ We continuously improve the quality of the outcomes we get by + continuously improving the harness. And then we can take it to another + level. +

+

The agentic flywheel

+

+ The next level is humans directing agents to manage and improve the + harness rather than doing it by hand. +

+

+ Flywheel: An upper "why" loop connected to a lower “how” loop by a human and a robot. The why loop iterates over an idea and working software. The how loop iterates over interim artefacts like specs. +

+

+ Figure 6: Human directs agent to build and improve the how loop +

+

+ We build the flywheel by giving the agents the information they need + to evaluate the performance of the loop. A good starting point is the + tests and evaluations already included in the harness. The flywheel + becomes more powerful as we feed it richer signals. Add pipeline + stages that measure performance and validate failure scenarios. Feed + operational data from production, user journey logs, and commercial + results to broaden the scope and depth of what the agents can analyze. +

+

+ For each step of the workflow we have the agent review the results and + recommend improvements to the harness. The scope includes improvements + to any of the upstream parts of the workflow that could improve those + results. What we have now is an agent harness that generates + recommendations for improving itself. +

+

+ We start by considering the recommendations interactively, prompting + the agents to implement specific changes. We can also have the agents + add their recommendations to the product backlog, so we can prioritize + and schedule them for the agents to pick up, apply, and test as part + of the automated flow. +

+

+ As we gain confidence, the agents can assign scores to their + recommendations, including the risks, costs, and benefits. We might + then decide that recommendations with certain scores should be + automatically approved and applied. +

+

+ At some point this might look a lot like humans out of the loop, + old-school vibe coding. I suspect that will be true for standard types + of work that are done often as the improvement loops reach diminishing + returns. But by engineering the harness we won’t just get one-off, + “good enough” solutions, we’ll get robust, maybe even anti-fragile + systems that continuously improve themselves. +

+
+
+
+

latest article (Mar 04):

+

+ Humans and Agents in Software Engineering Loops +

+
+ +
+

+ + + +

+
+
+
+ + diff --git a/tests/fixtures/fullpage_to_article_html/martinfowler.com/humans-and-agents/meta.json b/tests/fixtures/fullpage_to_article_html/martinfowler.com/humans-and-agents/meta.json new file mode 100644 index 0000000..1300972 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/martinfowler.com/humans-and-agents/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://martinfowler.com/articles/exploring-gen-ai/humans-and-agents.html", + "host": "martinfowler.com", + "feed_source": "martinfowler", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:18:55.867024Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "Humans and Agents in Software Engineering Loops", + "extracted_word_count": 1655, + "extracted_success": true, + "expected_selector": "main" +} diff --git a/tests/fixtures/fullpage_to_article_html/martinfowler.com/humans-and-agents/raw.html b/tests/fixtures/fullpage_to_article_html/martinfowler.com/humans-and-agents/raw.html new file mode 100644 index 0000000..1c1e1c2 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/martinfowler.com/humans-and-agents/raw.html @@ -0,0 +1,827 @@ + + + + + + Humans and Agents in Software Engineering Loops + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+

Humans and Agents in Software Engineering Loops

+ +
+
+
+
+ Photo of Kief Morris +
+ +
+ +
+ +
+

+ Kief Morris lives in London and works as a global cloud + technology specialist for Thoughtworks. +

+
+
+
+
+
+ +
+
+

+ +

+ +

+ This article is part of + “Exploring Gen AI”. A + series capturing Thoughtworks technologists' explorations of using + gen ai technology for software development. +

+
+ +

04 March 2026

+
+
+ +
+

+ Should humans stay out of the software development process and vibe + code, or do we need developers in the loop inspecting every line of + code? I believe the answer is to focus on the goal of turning ideas + into outcomes. The right place for us humans is to build and manage + the working loop rather than either leaving the agents to it or + micromanaging what they produce. Let’s call this “on the loop.” +

+ +

+ As software creators we build an outcome by turning our ideas into + working software and iterating as we learn and evolve our ideas. This + is the “why loop”. Until the AI uprising comes humans will run this + loop because we’re the ones who want what it produces. +

+ +

+ The process of building the software is the “how loop.” The how loop + involves creating, selecting, and using intermediate artefacts like + code, tests, tools, and infrastructure. It may also involve + documentation like technical designs and ADRs. We’re used to seeing + many of these as deliverables, but intermediate artefacts are really + just a means to an end. +

+ +

+ The software delivery feedback loops: An upper "why" loop connected to a lower “how” loop. The why loop iterates over an idea and working software. The how loop iterates over interim artefacts like specs, code, and tests. +

+ +

+ Figure 1: The why loop iterates over ideas and software, the how + loop iterates on building the software +

+ +

+ In reality the how loop contains multiple loops. The outermost how + loop specifies and delivers the working software for the why loop. The + innermost loop generates and tests code. Loops in between break down + higher levels of work into smaller tasks for the lower loops to + implement, then validate the results. +

+ +

+ Multiple levels of “how” loops supporting the “why” loop. An outer loop iterates on a feature. A middle loop iterates on stories. An inner loop iterates on code. +

+ +

+ Figure 2: The how loop has multiple levels of inner loops that work + on smaller increments of the full implementation +

+ +

+ These loops may follow practices like design reviews and test stages. + They might build systems by applying architectural approaches and + design patterns like microservices or CUPID. Like the intermediate + artefacts that pop out of these practices and patterns, they are all a + means of achieving the outcome we actually care about. +

+ +

+ But maybe we don’t care about the means that are used to achieve our + goals? Maybe we can just let the LLMs run the how loop however they + like? +

+ +

Humans outside the loop

+ +

+ Plenty of people have discovered the joy of letting humans stick to + the why loop, and leaving the how loop for the agents to deal with. + This is the common definition of “vibe coding”. Some interpretations + of Spec Driven Development (SDD) are much the same, with humans + investing effort in writing the outcome we want, but not dictating how + the LLM should achieve it. +

+ +

+ Humans outside the loop: An upper "why" loop with a human on top. The loop iterates over an idea and working software. This is connected to a lower "how" loop by a robot, which iterates over interim artefacts like code. +

+ +

+ Figure 3: Human runs the why loop, agent runs the how loop. +

+ +

+ The appeal of humans staying out of the how loop is that the why loop + is the one we really care about. Software development is a messy + domain that inevitably bogs down into over-engineered processes and + coping with technical debt. And every new LLM model so far has gotten + better at taking a user prompt and spitting out working software. If + you’re not satisfied with what it spits out, tell the LLM and it’ll + give you another iteration. +

+ +

+ If the LLMs can write and change code without us, do we care whether + the code is “clean”? It doesn’t matter whether a variable name clearly + expresses its purpose as long as an LLM can figure it out. Maybe we + don’t even need to care what language the software is written in? +

+ +

+ We care about external quality, not internal quality for its own sake. + External quality is what we experience as a user or other stakeholder + of the software. Functional quality is a must, the system needs to + work correctly. And for production software we also care about + non-functional, operational quality. Our system shouldn’t crash, it + should run quickly, and we don’t want it posting confidential data to + social media sites. We don’t want to run up massive cloud hosting + bills, and in many domains we need to pass compliance audits. +

+ +

+ We care about internal quality when it affects external outcomes. When + human coders were crawling through the codebase, adding features and + fixing bugs, they could do it more quickly and reliably in a clean + codebase. But LLMs don’t care about developer experience, do they? +

+ +

+ In theory our LLM agents can extrude a massively overcomplicated + spaghetti codebase, test and fix it by running ad-hoc shell commands, + and eventually produce a correct, compliant, high-performing system. + We just get our swarms Ralph Wiggumming on it, running in data centers + that draw energy from the boiling oceans they float on, and eventually + we’ll get there. +

+ +

+ In practice, a cleanly-designed, well-structured codebase has + externally important benefits over a messy codebase. When LLMs can + more quickly understand and modify the code they work faster and + spiral less. We do care about the time and cost of building the + systems we need. +

+ +

Humans in the loop

+ +

+ Some developers believe that the only way to maintain internal quality + is to stay closely involved in the lowest levels of the how loop. + Often, when an agent spirals over some broken bit of code a human + developer can understand and fix it in seconds. Human experience and + judgement still exceeds LLMs in many situations. +

+ +

+ Humans in the loop: A single “why+how” loop with a human at the top and a robot at the bottom. The loop iterates over idea, interim artefacts like code and tests, and the working software. +

+ +

Figure 4: Human runs the why loop and the how loop

+ +

+ When people talk about “humans in the loop”, they often mean humans as + a gatekeeper within the innermost loop where code is generated, such + as manually inspecting each line of code created by an LLM. +

+ +

+ The challenge when we insist on being too closely involved in the + process is that we become a bottleneck. Agents can generate code + faster than humans can manually inspect it. Reports on developer + productivity with AI show mixed results, which may be at least partly + because of humans spending more time specifying and reviewing code + than they save by getting LLMs to generate it. +

+ +

+ We need to adopt classic “shift left” thinking. Once upon a time we + wrote all of our code, passed it to a QA team to test, and then tried + to fix enough bugs to ship a release. Then we discovered that when + developers write and run tests as we work we find and fix issues right + away, which makes the whole process faster and more reliable. +

+ +

+ What works for humans can work for agents as well. Agents produce + better code when they can gauge the quality of the code they produce + themselves rather than relying on us to check it for them. We need to + instruct them on what we’re looking for, and give them guidance on the + best ways to achieve it. +

+ +

Humans on the loop

+ +

+ Rather than personally inspecting what the agents produce, we can make + them better at producing it. The collection of specifications, quality + checks, and workflow guidance that control different levels of loops + inside the how loop is the agent’s harness. The emerging practice of + building and maintaining these harnesses, + Harness Engineering, is how humans work on the loop. +

+ +

+ Humans on the loop: An upper "why" loop connected to a lower “how” loop by a human. The why loop iterates over an idea and working software.A robot sits at the bottom of the lower “how” loop, which iterates over interim artefacts like specs and code. +

+ +

+ Figure 5: Human defines the how loop and the agent runs it +

+ +

+ Something like the on the loop concept has also been described as the + “middle loop,” including by participants of + The Future of Software Development Retreat. The middle loop refers to moving human attention to a higher-level + loop than the coding loop. +

+ +

+ The difference between in the loop and on the loop is most visible in + what we do when we’re not satisfied with what the agent produces, + including an intermediate artefact. The “in the loop” way is to fix + the artefact, whether by directly editing it, or by telling the agent + to make the correction we want. The “on the loop” way is to change the + harness that produced the artefact so it produces the results we want. +

+ +

+ We continuously improve the quality of the outcomes we get by + continuously improving the harness. And then we can take it to another + level. +

+ +

The agentic flywheel

+ +

+ The next level is humans directing agents to manage and improve the + harness rather than doing it by hand. +

+ +

+ Flywheel: An upper "why" loop connected to a lower “how” loop by a human and a robot. The why loop iterates over an idea and working software. The how loop iterates over interim artefacts like specs. +

+ +

+ Figure 6: Human directs agent to build and improve the how loop +

+ +

+ We build the flywheel by giving the agents the information they need + to evaluate the performance of the loop. A good starting point is the + tests and evaluations already included in the harness. The flywheel + becomes more powerful as we feed it richer signals. Add pipeline + stages that measure performance and validate failure scenarios. Feed + operational data from production, user journey logs, and commercial + results to broaden the scope and depth of what the agents can analyze. +

+ +

+ For each step of the workflow we have the agent review the results and + recommend improvements to the harness. The scope includes improvements + to any of the upstream parts of the workflow that could improve those + results. What we have now is an agent harness that generates + recommendations for improving itself. +

+ +

+ We start by considering the recommendations interactively, prompting + the agents to implement specific changes. We can also have the agents + add their recommendations to the product backlog, so we can prioritize + and schedule them for the agents to pick up, apply, and test as part + of the automated flow. +

+ +

+ As we gain confidence, the agents can assign scores to their + recommendations, including the risks, costs, and benefits. We might + then decide that recommendations with certain scores should be + automatically approved and applied. +

+ +

+ At some point this might look a lot like humans out of the loop, + old-school vibe coding. I suspect that will be true for standard types + of work that are done often as the improvement loops reach diminishing + returns. But by engineering the harness we won’t just get one-off, + “good enough” solutions, we’ll get robust, maybe even anti-fragile + systems that continuously improve themselves. +

+
+ +
+
+

latest article (Mar 04):

+ +

+ Humans and Agents in Software Engineering Loops +

+
+ + + +
+

+ +

+
+
+
+ + + + + diff --git a/tests/fixtures/fullpage_to_article_html/simonwillison.net/agentic-engineering-patterns-anti-patterns/expected.html b/tests/fixtures/fullpage_to_article_html/simonwillison.net/agentic-engineering-patterns-anti-patterns/expected.html new file mode 100644 index 0000000..5eca509 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/simonwillison.net/agentic-engineering-patterns-anti-patterns/expected.html @@ -0,0 +1,84 @@ + + + +
+

+ There are some behaviors that are anti-patterns in our weird new world + of agentic engineering. +

+

+ Inflicting unreviewed code on collaborators # +

+

This anti-pattern is common and deeply frustrating.

+

+ Don't file pull requests with code you haven't reviewed + yourself. +

+

+ If you open a PR with hundreds (or thousands) of lines of code that an + agent produced for you, and you haven't done the work to ensure that + code is functional yourself, you are delegating the actual work to other + people. +

+

+ They could have prompted an agent themselves. What value are you even + providing? +

+

+ If you put code up for review you need to be confident that it's ready + for other people to spend their time on it. The initial review pass is + your responsibility, not something you should farm out to others. +

+

+ A good agentic engineering pull request has the following + characteristics: +

+
    +
  • + The code works, and you are confident that it works. + Your job is to deliver code that works. +
  • +
  • + The change is small enough to be reviewed efficiently without + inflicting too much additional cognitive load on the reviewer. Several + small PRs beats one big one, and splitting code into separate commits + is easy with a coding agent to do the Git finagling for you. +
  • +
  • + The PR includes additional context to help explain the change. What's + the higher level goal that the change serves? Linking to relevant + issues or specifications is useful here. +
  • +
  • + Agents write convincing looking pull request descriptions. You need to + review these too! It's rude to expect someone else to read text that + you haven't read and validated yourself. +
  • +
+

+ Given how easy it is to dump unreviewed code on other people, I + recommend including some form of evidence that you've put that extra + work in yourself. Notes on how you manually tested it, comments on + specific implementation choices or even screenshots and video of the + feature working go a long way to demonstrating that a + reviewer's time will not be wasted digging into the details. +

+
+ + diff --git a/tests/fixtures/fullpage_to_article_html/simonwillison.net/agentic-engineering-patterns-anti-patterns/meta.json b/tests/fixtures/fullpage_to_article_html/simonwillison.net/agentic-engineering-patterns-anti-patterns/meta.json new file mode 100644 index 0000000..b71706b --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/simonwillison.net/agentic-engineering-patterns-anti-patterns/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://simonwillison.net/guides/agentic-engineering-patterns/anti-patterns/", + "host": "simonwillison.net", + "feed_source": "simonwillison", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:19:00.754094Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "Anti-patterns: things to avoid - Agentic Engineering Patterns", + "extracted_word_count": 349, + "extracted_success": true, + "expected_selector": "div.note" +} diff --git a/tests/fixtures/fullpage_to_article_html/simonwillison.net/agentic-engineering-patterns-anti-patterns/raw.html b/tests/fixtures/fullpage_to_article_html/simonwillison.net/agentic-engineering-patterns-anti-patterns/raw.html new file mode 100644 index 0000000..a0cda0f --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/simonwillison.net/agentic-engineering-patterns-anti-patterns/raw.html @@ -0,0 +1,623 @@ + + + + + + + + Anti-patterns: things to avoid - Agentic Engineering Patterns - Simon + Willison's Weblog + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+

+ Guides > + Agentic Engineering Patterns +

+ +

Anti-patterns: things to avoid

+ +
+

+ There are some behaviors that are anti-patterns in our weird new + world of agentic engineering. +

+

+ Inflicting unreviewed code on collaborators # +

+

This anti-pattern is common and deeply frustrating.

+

+ Don't file pull requests with code you haven't reviewed + yourself. +

+

+ If you open a PR with hundreds (or thousands) of lines of code that + an agent produced for you, and you haven't done the work to ensure + that code is functional yourself, you are delegating the actual work + to other people. +

+

+ They could have prompted an agent themselves. What value are you + even providing? +

+

+ If you put code up for review you need to be confident that it's + ready for other people to spend their time on it. The initial review + pass is your responsibility, not something you should farm out to + others. +

+

+ A good agentic engineering pull request has the following + characteristics: +

+
    +
  • + The code works, and you are confident that it works. + Your job is to deliver code that works. +
  • +
  • + The change is small enough to be reviewed efficiently without + inflicting too much additional cognitive load on the reviewer. + Several small PRs beats one big one, and splitting code into + separate commits is easy with a coding agent to do the Git + finagling for you. +
  • +
  • + The PR includes additional context to help explain the change. + What's the higher level goal that the change serves? Linking to + relevant issues or specifications is useful here. +
  • +
  • + Agents write convincing looking pull request descriptions. You + need to review these too! It's rude to expect someone else to read + text that you haven't read and validated yourself. +
  • +
+

+ Given how easy it is to dump unreviewed code on other people, I + recommend including some form of evidence that you've put that extra + work in yourself. Notes on how you manually tested it, comments on + specific implementation choices or even screenshots and video of the + feature working go a long way to demonstrating that a + reviewer's time will not be wasted digging into the details. +

+
+ + + + +
+ + +
+
+

+ This is a chapter from the guide + Agentic Engineering Patterns. +

+
+

Chapters in this guide

+
    +
  1. + Principles +
      +
    1. + Writing code is cheap now +
    2. + +
    3. + Hoard things you know how to do +
    4. + +
    5. Anti-patterns: things to avoid
    6. +
    +
  2. + +
  3. + Testing and QA +
      +
    1. + Red/green TDD +
    2. + +
    3. + First run the tests +
    4. +
    +
  4. + +
  5. + Understanding code +
      +
    1. + Linear walkthroughs +
    2. + +
    3. + Interactive explanations +
    4. +
    +
  6. + +
  7. + Annotated prompts +
      +
    1. + GIF optimization tool using WebAssembly and Gifsicle +
    2. +
    +
  8. + +
  9. + Appendix +
      +
    1. + Prompts I use +
    2. +
    +
  10. +
+
+ + + + + + + + + + + + + + + + + +

+ Created: 4th March 2026
+ Last modified: 4th March 2026
+ 2 changes +

+ +

+ Previous: + Hoard things you know how to do +

+ +

+ Next: + Red/green TDD +

+
+ + +
+ +
+ + +
+ +
+ + + + + + + + + diff --git a/tests/fixtures/fullpage_to_article_html/simonwillison.net/donald-knuth/expected.html b/tests/fixtures/fullpage_to_article_html/simonwillison.net/donald-knuth/expected.html new file mode 100644 index 0000000..195c79a --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/simonwillison.net/donald-knuth/expected.html @@ -0,0 +1,31 @@ + + + +
+

3rd March 2026

+
+
+

+ Shock! Shock! I learned yesterday that an open problem I'd been + working on for several weeks had just been solved by Claude Opus 4.6 + - Anthropic's hybrid reasoning model that had been released three + weeks earlier! It seems that I'll have to revise my opinions about + "generative AI" one of these days. What a joy it is to learn not + only that my conjecture has a nice solution but also to celebrate + this dramatic advance in automatic deduction and creative problem + solving. +

+
+

+ — + Donald Knuth, Claude's Cycles +

+
+
+ + diff --git a/tests/fixtures/fullpage_to_article_html/simonwillison.net/donald-knuth/meta.json b/tests/fixtures/fullpage_to_article_html/simonwillison.net/donald-knuth/meta.json new file mode 100644 index 0000000..e6c5f1a --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/simonwillison.net/donald-knuth/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://simonwillison.net/2026/Mar/3/donald-knuth/", + "host": "simonwillison.net", + "feed_source": "simonwillison", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:19:03.878682Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "A quote from Donald Knuth", + "extracted_word_count": 98, + "extracted_success": true, + "expected_selector": "div.entry.entryPage" +} diff --git a/tests/fixtures/fullpage_to_article_html/simonwillison.net/donald-knuth/raw.html b/tests/fixtures/fullpage_to_article_html/simonwillison.net/donald-knuth/raw.html new file mode 100644 index 0000000..172b92b --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/simonwillison.net/donald-knuth/raw.html @@ -0,0 +1,449 @@ + + + + + + + A quote from Donald Knuth + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+

3rd March 2026

+
+
+

+ Shock! Shock! I learned yesterday that an open problem I'd been + working on for several weeks had just been solved by Claude Opus + 4.6 - Anthropic's hybrid reasoning model that had been released + three weeks earlier! It seems that I'll have to revise my + opinions about "generative AI" one of these days. What a joy it + is to learn not only that my conjecture has a nice solution but + also to celebrate this dramatic advance in automatic deduction + and creative problem solving. +

+
+

+ — + Donald Knuth, Claude's Cycles +

+
+ + + +
+ +
+

Recent articles

+ +
+
+ + + + +
+ + +
+ +
+ + + + + + + + + diff --git a/tests/fixtures/fullpage_to_article_html/simonwillison.net/qwen/expected.html b/tests/fixtures/fullpage_to_article_html/simonwillison.net/qwen/expected.html new file mode 100644 index 0000000..0770f5a --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/simonwillison.net/qwen/expected.html @@ -0,0 +1,155 @@ + + + +
+

Something is afoot in the land of Qwen

+

4th March 2026

+

+ I’m behind on writing about Qwen 3.5, a truly remarkable family of open + weight models released by Alibaba’s Qwen team over the past few weeks. + I’m hoping that the 3.5 family doesn’t turn out to be Qwen’s swan song, + seeing as that team has had some very high profile departures in the + past 24 hours. +

+

+ It all started with + this tweetfrom Junyang Lin (@JustinLin610): +

+
+

me stepping down. bye my beloved qwen.

+
+

+ Junyang Lin was the lead researcher building Qwen, and was key to + releasing their open weight models from 2024 onwards. +

+

+ As far as I can tell a trigger for this resignation was a re-org within + Alibaba where a new researcher hired from Google’s Gemini team was put + in charge of Qwen, but I’ve not confirmed that detail. +

+

+ More information is available in + this article from 36kr.com. Here’s + Wikipedia on 36Krconfirming that it’s a credible media source established in 2010 with a + good track record reporting on the Chinese technology industry. +

+

+ The article is in Chinese—here are some quotes translated via Google + Translate: +

+
+

+ At approximately 1:00 PM Beijing time on March 4th, Tongyi Lab held an + emergency All Hands meeting, where Alibaba Group CEO Wu Yongming + frankly told Qianwen employees. +

+

+ Twelve hours ago (at 0:11 AM Beijing time on March 4th), Lin Junyang, + the technical lead for Alibaba’s Qwen Big Data Model, suddenly + announced his resignation on X. Lin Junyang was a key figure in + promoting Alibaba’s open-source AI models and one of Alibaba’s + youngest P10 employees. Amidst the industry uproar, many members of + Qwen were also unable to accept the sudden departure of their team’s + key figure. +

+

+ “Given far fewer resources than competitors, Junyang’s leadership is + one of the core factors in achieving today’s results,” multiple + Qianwen members told 36Kr. [...] +

+

+ Regarding Lin Junyang’s whereabouts, no new conclusions were reached + at the meeting. However, around 2 PM, Lin Junyang posted again on his + WeChat Moments, stating, “Brothers of Qwen, continue as originally + planned, no problem,” without explicitly confirming whether he would + return. [...] +

+
+

+ That piece also lists several other key members who have apparently + resigned: +

+
+

+ With Lin Junyang’s departure, several other Qwen members also + announced their departure, including core leaders responsible for + various sub-areas of Qwen models, such as: +

+

+ Binyuan Hui: Lead Qwen code development, principal of the Qwen-Coder + series models, responsible for the entire agent training process from + pre-training to post-training, and recently involved in robotics + research. +

+

+ Bowen Yu: Lead Qwen post-training research, graduated from the + University of Chinese Academy of Sciences, leading the development of + the Qwen-Instruct series models. +

+

+ Kaixin Li: Core contributor to Qwen 3.5/VL/Coder, PhD from the + National University of Singapore. +

+

+ Besides the aforementioned individuals, many young researchers also + resigned on the same day. +

+
+

+ Based on the above it looks to me like everything is still very much up + in the air. The presence of Alibaba’s CEO at the “emergency All Hands + meeting” suggests that the company understands the significance of these + resignations and may yet retain some of the departing talent. +

+

+ Qwen 3.5 is exceptional # +

+

+ This story hits particularly hard right now because the Qwen 3.5 models + appear to be exceptionally good. +

+

+ I’ve not spent enough time with them yet but the scale of the new model + family is impressive. They started with + Qwen3.5-397B-A17B on February 17th—an 807GB model—and then followed with + a flurry of smaller siblingsin 122B, 35B, 27B, 9B, 4B, 2B, 0.8B sizes. +

+

+ I’m hearing positive noises about the 27B and 35B models for coding + tasks that still fit on a 32GB/64GB Mac, and I’ve tried the 9B, 4B and + 2B models and found them to be notably effective considering their tiny + sizes. That 2B model is just 4.57GB—or as small as 1.27GB quantized—and + is a full reasoning and multi-modal (vision) model. +

+

+ It would be a real tragedy if the Qwen team were to disband now, given + their proven track record in continuing to find new ways to get high + quality results out of smaller and smaller models. +

+

+ If those core Qwen team members either start something new or join + another research lab I’m excited to see what they do next. +

+
+ + diff --git a/tests/fixtures/fullpage_to_article_html/simonwillison.net/qwen/meta.json b/tests/fixtures/fullpage_to_article_html/simonwillison.net/qwen/meta.json new file mode 100644 index 0000000..3365be5 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/simonwillison.net/qwen/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://simonwillison.net/2026/Mar/4/qwen/", + "host": "simonwillison.net", + "feed_source": "simonwillison", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:19:02.204007Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "Something is afoot in the land of Qwen", + "extracted_word_count": 719, + "extracted_success": true, + "expected_selector": "div.entry.entryPage div[data-permalink-context]" +} diff --git a/tests/fixtures/fullpage_to_article_html/simonwillison.net/qwen/raw.html b/tests/fixtures/fullpage_to_article_html/simonwillison.net/qwen/raw.html new file mode 100644 index 0000000..17a91e7 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/simonwillison.net/qwen/raw.html @@ -0,0 +1,609 @@ + + + + + + + Something is afoot in the land of Qwen + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+

Something is afoot in the land of Qwen

+

4th March 2026

+ +

+ I’m behind on writing about Qwen 3.5, a truly remarkable family of + open weight models released by Alibaba’s Qwen team over the past + few weeks. I’m hoping that the 3.5 family doesn’t turn out to be + Qwen’s swan song, seeing as that team has had some very high + profile departures in the past 24 hours. +

+

+ It all started with + this tweet + from Junyang Lin (@JustinLin610): +

+
+

me stepping down. bye my beloved qwen.

+
+

+ Junyang Lin was the lead researcher building Qwen, and was key to + releasing their open weight models from 2024 onwards. +

+

+ As far as I can tell a trigger for this resignation was a re-org + within Alibaba where a new researcher hired from Google’s Gemini + team was put in charge of Qwen, but I’ve not confirmed that + detail. +

+

+ More information is available in + this article from 36kr.com. Here’s + Wikipedia on 36Kr + confirming that it’s a credible media source established in 2010 + with a good track record reporting on the Chinese technology + industry. +

+

+ The article is in Chinese—here are some quotes translated via + Google Translate: +

+
+

+ At approximately 1:00 PM Beijing time on March 4th, Tongyi Lab + held an emergency All Hands meeting, where Alibaba Group CEO Wu + Yongming frankly told Qianwen employees. +

+

+ Twelve hours ago (at 0:11 AM Beijing time on March 4th), Lin + Junyang, the technical lead for Alibaba’s Qwen Big Data Model, + suddenly announced his resignation on X. Lin Junyang was a key + figure in promoting Alibaba’s open-source AI models and one of + Alibaba’s youngest P10 employees. Amidst the industry uproar, + many members of Qwen were also unable to accept the sudden + departure of their team’s key figure. +

+

+ “Given far fewer resources than competitors, Junyang’s + leadership is one of the core factors in achieving today’s + results,” multiple Qianwen members told 36Kr. [...] +

+

+ Regarding Lin Junyang’s whereabouts, no new conclusions were + reached at the meeting. However, around 2 PM, Lin Junyang posted + again on his WeChat Moments, stating, “Brothers of Qwen, + continue as originally planned, no problem,” without explicitly + confirming whether he would return. [...] +

+
+

+ That piece also lists several other key members who have + apparently resigned: +

+
+

+ With Lin Junyang’s departure, several other Qwen members also + announced their departure, including core leaders responsible + for various sub-areas of Qwen models, such as: +

+

+ Binyuan Hui: Lead Qwen code development, principal of the + Qwen-Coder series models, responsible for the entire agent + training process from pre-training to post-training, and + recently involved in robotics research. +

+

+ Bowen Yu: Lead Qwen post-training research, graduated from the + University of Chinese Academy of Sciences, leading the + development of the Qwen-Instruct series models. +

+

+ Kaixin Li: Core contributor to Qwen 3.5/VL/Coder, PhD from the + National University of Singapore. +

+

+ Besides the aforementioned individuals, many young researchers + also resigned on the same day. +

+
+

+ Based on the above it looks to me like everything is still very + much up in the air. The presence of Alibaba’s CEO at the + “emergency All Hands meeting” suggests that the company + understands the significance of these resignations and may yet + retain some of the departing talent. +

+

+ Qwen 3.5 is exceptional # +

+

+ This story hits particularly hard right now because the Qwen 3.5 + models appear to be exceptionally good. +

+

+ I’ve not spent enough time with them yet but the scale of the new + model family is impressive. They started with + Qwen3.5-397B-A17B on February 17th—an 807GB model—and then followed with + a flurry of smaller siblings + in 122B, 35B, 27B, 9B, 4B, 2B, 0.8B sizes. +

+

+ I’m hearing positive noises about the 27B and 35B models for + coding tasks that still fit on a 32GB/64GB Mac, and I’ve tried the + 9B, 4B and 2B models and found them to be notably effective + considering their tiny sizes. That 2B model is just 4.57GB—or as + small as 1.27GB quantized—and is a full reasoning and multi-modal + (vision) model. +

+

+ It would be a real tragedy if the Qwen team were to disband now, + given their proven track record in continuing to find new ways to + get high quality results out of smaller and smaller models. +

+

+ If those core Qwen team members either start something new or join + another research lab I’m excited to see what they do next. +

+
+ + +
+ +
+

More recent articles

+ +
+
+ + +
+
+

+ This is Something is afoot in the land of Qwen by + Simon Willison, posted on 4th March 2026. +

+ + + + + + + + + + + +

+ Previous: + I vibe coded my dream macOS presentation app +

+ +
+

+ Monthly briefing +

+

+ Sponsor me for $10/month and get a curated email + digest of the month's most important LLM developments. +

+

+ Pay me to send you less! +

+ + Sponsor & subscribe + +
+
+
+ +
+ + +
+ +
+ + + + + + + + + diff --git a/tests/fixtures/fullpage_to_article_html/world.hey.com/clankers-with-claws/expected.html b/tests/fixtures/fullpage_to_article_html/world.hey.com/clankers-with-claws/expected.html new file mode 100644 index 0000000..708960b --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/world.hey.com/clankers-with-claws/expected.html @@ -0,0 +1,108 @@ + + + +
+
+
+ With OpenClaw you're giving AI its + own machine, long-term memory, reminders, and persistent execution. + The model is no longer confined to a prompt-response cycle, but able + to check its own email, Basecamp notifications, and whatever else you + give it access to on a running basis. It's a sneak peek at a future + where everyone has a personal agent assistant, and it's + fascinating.

+
+
+ I set up mine on a + Proxmox virtual machine to + be fully isolated from my personal data and logins. (But there are + people out there running wild and giving OpenClaw access to everything + on their own machine, despite the repeated warnings that this is more + than a little risky!). +
+
+
Then I tried to see just how little help it would need + navigating our human-centric digital world. I didn't install any + skills, any + MCPs, or give it access to any APIs. Zero machine accommodations. I just + started off with a simple prompt: "Sign up for Fizzy, so we have a + place to collaborate. Here's the invite link."

+
+
+ Kef, as I named my new agent, dutifully went to + Fizzy to sign up, but was immediately + stumped by needing an email address. It asked me what to do, and I + replied: "Just go to hey.com and + sign up for a new account." So it did. In a single try. No errors, no + steering, no accommodations. +
+
+
After it had procured its own email address, it continued on + with the task of signing up for Fizzy. And again, it completed the + mission without any complications. Now we had a shared space to + collaborate. +
+
+
So, as a test, I asked it to create a new board for business + ideas, and add five cards with short suggestions, including providing + a background image sourced from the web to describe the idea. And it + did. Again, zero corrections. Perfect execution.

+
+
+ I then invited it to Basecamp by + just adding it as I would any other user. That sent off an email to + Kef's new HEY account, which it quickly received, then followed the + instructions, got signed up, and greeted everyone in the chat room of + the AI Labs project it was invited to.

+
+ + image.png + +
+
+
+
I'm thoroughly impressed. All the agent accommodations, like + MCPs/CLIs/APIs, probably still have a place for a bit longer, as doing + all this work cold is both a bit slow and token-intensive. But I bet + this is just a temporary crutch.

+
+
+ And while I ran this initial experiment on Claude's Opus 4.5, I later + reran most of it on the Chinese open-weight model + Kimi K2.5, + and it too was able to get it all right (though it was a fair bit + slower when provisioned through OpenRouter).

+
+
+ Everything is changing so fast in the world of AI right now, but if I + was going to skate to where the puck is going to be, it'd be a world + where agents, like self-driving cars, don't need special equipment, + like + LIDAR or MCPs, to interact with the environment. The human affordances will + be more than adequate. +
+

What a time to be alive.
+
+
+ + diff --git a/tests/fixtures/fullpage_to_article_html/world.hey.com/clankers-with-claws/meta.json b/tests/fixtures/fullpage_to_article_html/world.hey.com/clankers-with-claws/meta.json new file mode 100644 index 0000000..ebddaeb --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/world.hey.com/clankers-with-claws/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://world.hey.com/dhh/clankers-with-claws-9f86fa71", + "host": "world.hey.com", + "feed_source": "worldhey_dhh", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:19:48.782069Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "Clankers with claws", + "extracted_word_count": 584, + "extracted_success": true, + "expected_selector": "section article" +} diff --git a/tests/fixtures/fullpage_to_article_html/world.hey.com/clankers-with-claws/raw.html b/tests/fixtures/fullpage_to_article_html/world.hey.com/clankers-with-claws/raw.html new file mode 100644 index 0000000..0991eaa --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/world.hey.com/clankers-with-claws/raw.html @@ -0,0 +1,301 @@ + + + + + + Clankers with claws + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ + + +

+ David Heinemeier Hansson +

+
+
+ +

+ February 5, 2026 +

+ +

+ Clankers with claws +

+ +
+
+
+
+ With OpenClaw you're giving + AI its own machine, long-term memory, reminders, and persistent + execution. The model is no longer confined to a prompt-response + cycle, but able to check its own email, Basecamp notifications, + and whatever else you give it access to on a running basis. It's + a sneak peek at a future where everyone has a personal agent + assistant, and it's fascinating.

+
+
+ I set up mine on a + Proxmox virtual + machine to be fully isolated from my personal data and logins. + (But there are people out there running wild and giving OpenClaw + access to everything on their own machine, despite the repeated + warnings that this is more than a little risky!). +
+
+
Then I tried to see just how little help it would need + navigating our human-centric digital world. I didn't install any + skills, any + MCPs, or give it access to any APIs. Zero machine accommodations. I + just started off with a simple prompt: "Sign up for Fizzy, so we + have a place to collaborate. Here's the invite link."

+
+
+ Kef, as I named my new agent, dutifully went to + Fizzy to sign up, but was + immediately stumped by needing an email address. It asked me + what to do, and I replied: "Just go to + hey.com and sign up for a new + account." So it did. In a single try. No errors, no steering, no + accommodations. +
+
+
After it had procured its own email address, it continued + on with the task of signing up for Fizzy. And again, it + completed the mission without any complications. Now we had a + shared space to collaborate. +
+
+
So, as a test, I asked it to create a new board for + business ideas, and add five cards with short suggestions, + including providing a background image sourced from the web to + describe the idea. And it did. Again, zero corrections. Perfect + execution.

+
+
+ I then invited it to + Basecamp by just adding it + as I would any other user. That sent off an email to Kef's new + HEY account, which it quickly received, then followed the + instructions, got signed up, and greeted everyone in the chat + room of the AI Labs project it was invited to.

+
+ + image.png + +
+
+
+
I'm thoroughly impressed. All the agent accommodations, + like MCPs/CLIs/APIs, probably still have a place for a bit + longer, as doing all this work cold is both a bit slow and + token-intensive. But I bet this is just a temporary crutch.

+
+
+ And while I ran this initial experiment on Claude's Opus 4.5, I + later reran most of it on the Chinese open-weight model + Kimi K2.5, and it too was able to get it all right (though it was a fair + bit slower when provisioned through OpenRouter).

+
+
+ Everything is changing so fast in the world of AI right now, but + if I was going to skate to where the puck is going to be, it'd + be a world where agents, like self-driving cars, don't need + special equipment, like + LIDAR + or MCPs, to interact with the environment. The human affordances + will be more than adequate. +
+

What a time to be alive.
+
+
+
+ +
+

+ About David Heinemeier Hansson +

+
+
+
+ Made Basecamp and + HEY for the underdogs as + co-owner and CTO of + 37signals. Created + Ruby on Rails, + Hotwire, + Kamal, + Omarchy. Wrote + REWORK, + It Doesn't Have to Be Crazy at Work, and + REMOTE. Won at Le Mans as a + racing driver. Invested in + Danish startups. +
+
+
+ +
+
+ + + +
+
+
+ + +
+
+ + diff --git a/tests/fixtures/fullpage_to_article_html/world.hey.com/cloud-gaming-is-kinda-amazing/expected.html b/tests/fixtures/fullpage_to_article_html/world.hey.com/cloud-gaming-is-kinda-amazing/expected.html new file mode 100644 index 0000000..b657f5b --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/world.hey.com/cloud-gaming-is-kinda-amazing/expected.html @@ -0,0 +1,145 @@ + + + +
+
+
+ I fully understand the nostalgia for real ownership of physical-media + games. I grew up on cassette tapes (C64 + Amstrad 464!), floppy disks + (C64 5-1/4" then Amiga 3-1/2"), cartridges, and CDs. I occasionally + envy the retro gamers on YouTube with an entire wall full of such + physical media. But do you know what I like more than collecting? + Playing! Anywhere. Anything. Anytime. +
+
+
We went through the same coping phases with movies and music. + Yes, vinyl had a resurgence, but it's still a tiny sliver of hours + listened. Same too with 4K Blue-rays. Almost everyone just listens to + Spotify or watches on Netflix these days. It's simply cheaper, faster, + and, thus, better. +
+
+
Not "better" in some abstract philosophical way (ownership vs + rent) or even in a concrete technical way (bit rates), but in a + practical way. Paying $20/month for unlimited music and the same again + for a broad selection of shows and movies is clearly a deal most + consumers are happy to make.

+
+
+ So why not video games? Well, because it just wasn't good enough! + Netflix tried for casual gaming, but I didn't hear much of that after + the announcement. + Google Stadia appears to + have been just a few years ahead of reality (eerie how often that + happens for big G, like with both AI and + AR!) as they + shut down their service already. +
+
+
NVIDIA, though, kept working, and its + GeForce NOW service is actually, finally kinda amazing! I had tried it back in the late + 2010s, and just didn't see anything worth using back then. Maybe my + internet was too slow, maybe the service just wasn't good enough yet. + But then I tried it again a few days ago, just after + NVIDIA shipped the native GFN client for Linux, and holy smokes!!

+
+
+ You can legitimately play Fortnite in 2880x1800 at 120 fps through a + remote 4080, and it looks incredible. Yes, there's a little input lag, + but it's shockingly, surprisingly playable on a good internet + connection. And that's with the hardest possible genre: competitive + shooters! If you play racing games like Forza Horizon or story-mode + games like Warhammer 40K: Space Marine 2, you can barely tell!

+
+
+ This is obviously a great option for anyone with a modest computer + that can't run the latest triple-A titles, but also for Linux gamers + who don't have access to run the cheat-protection software required + for Fortnite and a few other games. 

+
+
+ And, like Spotify and Netflix, it's pretty competitively priced. It's + $20/month for access to that 4080-tier. You'd quickly spend $2,000+ on + a gaming rig with a 4080, so this isn't a half bad deal: it's a + payback of 100 months, and by then you'd probably want a 6080 anyway. + Funny how NVIDIA is better at offering the promise of cheap cloud + costs than the likes of AWS!

+
+
+ Anyway, I've been very impressed with NVIDIA GeForce NOW. We're going + to bake the Linux installer straight into the next version of Omarchy, + so you can just go to + Install > Gaming > NVIDIA GeForce NOW to get going + (just like we have such options for Steam and Minecraft). +
+
+
But of course seeing Fortnite running in full graphics on that + remote 4080 made me hungry for even more. I've been playing Fortnite + every week for the last five years or so with the kids, but the + majority of my gameplay has actually been on tablet. A high-end + tablet, like an iPad M5, can play the game with good-for-mobile + graphics at 120 Hz. It's smooth, it's easy, and the kids and I can + lounge on the couch and play together. + Good Family Fun! Not peak visual fidelity, though. +
+
+
So after the NVIDIA GeForce NOW experience, I found a way to use + the same amazing game streaming technology at home through a + local-server solution called + Apollo and a + client called Moonlight. + This allowed me to turn my racing-sim PC that's stuck downstairs into + a cloud-like remote gaming service that I can access anywhere on the + local network, so I can borrow its 4090 to play 120-fps, + ultra-settings Fortnite with zero perceivable input lag on any + computer in the house. +
+
+
The NVIDIA cloud streaming is very impressive, but the + local-server version of the same is mind-blowing. I'm mostly using the + Asus G14 laptop as a client, so Fortnite looks incredible with those + ultra, high-resolution settings on its OLED, but unlike when you use + that laptop's built-in graphics card, the machine stays perfectly cool + and silent pulling a meager 18 watts. And the graphics are of course a + lot nicer.

+
+
+ The Moonlight client is available for virtually every platform: Mac, + iOS, Android, and of course Linux. That means no need to dual boot to + enjoy the best games at the highest fidelity. No need for a honking + big PC on my primary desk. I did not know this was an option!!

+
+
+ Whether you give NVIDIA's cloud gaming setup a try or repurpose a + local gaming PC for the same, you're in for a real treat of what's + possible with streaming Fortnite on ultra settings at 120 fps on Linux + (or even Mac!). GG, NVIDIA!

+
+ + fortnite-apollo-4090.jpg + +
+

+
+
+
+ + diff --git a/tests/fixtures/fullpage_to_article_html/world.hey.com/cloud-gaming-is-kinda-amazing/meta.json b/tests/fixtures/fullpage_to_article_html/world.hey.com/cloud-gaming-is-kinda-amazing/meta.json new file mode 100644 index 0000000..4ad5559 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/world.hey.com/cloud-gaming-is-kinda-amazing/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://world.hey.com/dhh/cloud-gaming-is-kinda-amazing-b8a19c57", + "host": "world.hey.com", + "feed_source": "worldhey_dhh", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:19:50.436828Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "Cloud gaming is kinda amazing", + "extracted_word_count": 926, + "extracted_success": true, + "expected_selector": "section article" +} diff --git a/tests/fixtures/fullpage_to_article_html/world.hey.com/cloud-gaming-is-kinda-amazing/raw.html b/tests/fixtures/fullpage_to_article_html/world.hey.com/cloud-gaming-is-kinda-amazing/raw.html new file mode 100644 index 0000000..d9840a0 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/world.hey.com/cloud-gaming-is-kinda-amazing/raw.html @@ -0,0 +1,340 @@ + + + + + + Cloud gaming is kinda amazing + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ + + +

+ David Heinemeier Hansson +

+
+
+ +

+ February 3, 2026 +

+ +

+ Cloud gaming is kinda amazing +

+ +
+
+
+
+ I fully understand the nostalgia for real ownership of + physical-media games. I grew up on cassette tapes (C64 + Amstrad + 464!), floppy disks (C64 5-1/4" then Amiga 3-1/2"), cartridges, + and CDs. I occasionally envy the retro gamers on YouTube with an + entire wall full of such physical media. But do you know what I + like more than collecting? Playing! Anywhere. Anything. Anytime. +
+
+
We went through the same coping phases with movies and + music. Yes, vinyl had a resurgence, but it's still a tiny sliver + of hours listened. Same too with 4K Blue-rays. Almost everyone + just listens to Spotify or watches on Netflix these days. It's + simply cheaper, faster, and, thus, better. +
+
+
Not "better" in some abstract philosophical way (ownership + vs rent) or even in a concrete technical way (bit rates), but in + a practical way. Paying $20/month for unlimited music and the + same again for a broad selection of shows and movies is clearly + a deal most consumers are happy to make.

+
+
+ So why not video games? Well, because it just wasn't good + enough! Netflix tried for casual gaming, but I didn't hear much + of that after the announcement. + Google Stadia + appears to have been just a few years ahead of reality (eerie + how often that happens for big G, like with both AI and + AR!) as + they shut down their service already. +
+
+
NVIDIA, though, kept working, and its + GeForce NOW service + is actually, finally kinda amazing! I had tried it back in the + late 2010s, and just didn't see anything worth using back then. + Maybe my internet was too slow, maybe the service just wasn't + good enough yet. But then I tried it again a few days ago, just + after + NVIDIA shipped the native GFN client for Linux, and holy smokes!!

+
+
+ You can legitimately play Fortnite in 2880x1800 at 120 fps + through a remote 4080, and it looks incredible. Yes, there's a + little input lag, but it's shockingly, surprisingly playable on + a good internet connection. And that's with the hardest possible + genre: competitive shooters! If you play racing games like Forza + Horizon or story-mode games like Warhammer 40K: Space Marine 2, + you can barely tell!

+
+
+ This is obviously a great option for anyone with a modest + computer that can't run the latest triple-A titles, but also for + Linux gamers who don't have access to run the cheat-protection + software required for Fortnite and a few other games. 

+
+
+ And, like Spotify and Netflix, it's pretty competitively priced. + It's $20/month for access to that 4080-tier. You'd quickly spend + $2,000+ on a gaming rig with a 4080, so this isn't a half bad + deal: it's a payback of 100 months, and by then you'd probably + want a 6080 anyway. Funny how NVIDIA is better at offering the + promise of cheap cloud costs than the likes of AWS!

+
+
+ Anyway, I've been very impressed with NVIDIA GeForce NOW. We're + going to bake the Linux installer straight into the next version + of Omarchy, so you can just go to + Install > Gaming > NVIDIA GeForce NOW to get + going (just like we have such options for Steam and Minecraft). +
+
+
But of course seeing Fortnite running in full graphics on + that remote 4080 made me hungry for even more. I've been playing + Fortnite every week for the last five years or so with the kids, + but the majority of my gameplay has actually been on tablet. A + high-end tablet, like an iPad M5, can play the game with + good-for-mobile graphics at 120 Hz. It's smooth, it's easy, and + the kids and I can lounge on the couch and play together. + Good Family Fun! + Not peak visual fidelity, though. +
+
+
So after the NVIDIA GeForce NOW experience, I found a way + to use the same amazing game streaming technology at home + through a local-server solution called + Apollo + and a client called + Moonlight. This + allowed me to turn my racing-sim PC that's stuck downstairs into + a cloud-like remote gaming service that I can access anywhere on + the local network, so I can borrow its 4090 to play 120-fps, + ultra-settings Fortnite with zero perceivable input lag on any + computer in the house. +
+
+
The NVIDIA cloud streaming is very impressive, but the + local-server version of the same is mind-blowing. I'm mostly + using the Asus G14 laptop as a client, so Fortnite looks + incredible with those ultra, high-resolution settings on its + OLED, but unlike when you use that laptop's built-in graphics + card, the machine stays perfectly cool and silent pulling a + meager 18 watts. And the graphics are of course a lot nicer.

+
+
+ The Moonlight client is available for virtually every platform: + Mac, iOS, Android, and of course Linux. That means no need to + dual boot to enjoy the best games at the highest fidelity. No + need for a honking big PC on my primary desk. I did not know + this was an option!!

+
+
+ Whether you give NVIDIA's cloud gaming setup a try or repurpose + a local gaming PC for the same, you're in for a real treat of + what's possible with streaming Fortnite on ultra settings at 120 + fps on Linux (or even Mac!). GG, NVIDIA!

+
+ + fortnite-apollo-4090.jpg + +
+

+
+
+
+
+ +
+

+ About David Heinemeier Hansson +

+
+
+
+ Made Basecamp and + HEY for the underdogs as + co-owner and CTO of + 37signals. Created + Ruby on Rails, + Hotwire, + Kamal, + Omarchy. Wrote + REWORK, + It Doesn't Have to Be Crazy at Work, and + REMOTE. Won at Le Mans as a + racing driver. Invested in + Danish startups. +
+
+
+ +
+
+ + + +
+
+
+ + +
+
+ + diff --git a/tests/fixtures/fullpage_to_article_html/world.hey.com/omacon-comes-to-new-york/expected.html b/tests/fixtures/fullpage_to_article_html/world.hey.com/omacon-comes-to-new-york/expected.html new file mode 100644 index 0000000..f7067b3 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/world.hey.com/omacon-comes-to-new-york/expected.html @@ -0,0 +1,80 @@ + + + +
+
+
+ The vibes around Linux are changing fast. Companies of all shapes and + sizes are paying fresh attention. The hardware game on + x86 is rapidly improving. And thanks to OpenCode and + Claude Code, terminal user interfaces (TUIs) are suddenly everywhere. + It's all this and Omarchy that we'll be celebrating in New York City + on April 10 at the Shopify SoHo Space for the first + OMACON!

+
+
+ We've got an incredible lineup of speakers coming. The creator of + Hyprland, Vaxry, will be there. Along with ThePrimeagen and TJ + DeVries. You'll see OpenCode creator Dax Raad. Omarchy power + contributors Ryan Hughes and Bjarne Øverli. As well as Chris Powers + (Typecraft) and myself as Linux superfans. All packed into a single + day of short sessions, plenty of mingle time, and some good food. +
+
+
Tickets go on sale tomorrow (February 19) at 10am EST. We only + have room for 130 attendees total, so I imagine the offered-at-cost + $299 tickets will go quickly. But if you can't manage to snatch a + ticket in time, we'll also be recording everything, so you won't be + left out entirely. +
+
+
But there is just something special about being together in + person about a shared passion. I've felt the intensity of that three + years in a row now with + Rails World. + There's an endless amount of information and instruction available + online, but a sense of community and connection is far more scarce. We + nerds need this. +
+
+
We also need people to JUST DO THINGS. Like kick off a fresh + Linux distribution together with over three hundred contributors so + far all leaning boldly into aesthetics, ergonomics, and that omakase + spirit.  +
+
+
Omarchy only came about last + summer, now we're seeing 50,000 ISO downloads a week, 30,000 people on + the Discord, and now our very first exclusive gathering in New York + City. This is open source at its best. People from all over, coming + together, making cool shit.

+
+
+ (Oh, and thanks to Shopify and Tobi for hosting. You gotta love when a + hundred-plus billion dollar company like this is run by an uber nerd + who can just sign off on doing something fun and cool for the + community without any direct plausible payback.)

+
+ + opengraph.png + +
+
+
+
+ + diff --git a/tests/fixtures/fullpage_to_article_html/world.hey.com/omacon-comes-to-new-york/meta.json b/tests/fixtures/fullpage_to_article_html/world.hey.com/omacon-comes-to-new-york/meta.json new file mode 100644 index 0000000..f2721a0 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/world.hey.com/omacon-comes-to-new-york/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://world.hey.com/dhh/omacon-comes-to-new-york-e6ee93cb", + "host": "world.hey.com", + "feed_source": "worldhey_dhh", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:19:47.342903Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "Omacon comes to New York", + "extracted_word_count": 440, + "extracted_success": true, + "expected_selector": "section article" +} diff --git a/tests/fixtures/fullpage_to_article_html/world.hey.com/omacon-comes-to-new-york/raw.html b/tests/fixtures/fullpage_to_article_html/world.hey.com/omacon-comes-to-new-york/raw.html new file mode 100644 index 0000000..947f134 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/world.hey.com/omacon-comes-to-new-york/raw.html @@ -0,0 +1,270 @@ + + + + + + Omacon comes to New York + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ + + +

+ David Heinemeier Hansson +

+
+
+ +

+ February 18, 2026 +

+ +

+ Omacon comes to New York +

+ +
+
+
+
+ The vibes around Linux are changing fast. Companies of all + shapes and sizes are paying fresh attention. The hardware game + on + x86 is rapidly improving. And thanks to OpenCode and + Claude Code, terminal user interfaces (TUIs) are suddenly + everywhere. It's all this and Omarchy that we'll be celebrating + in New York City on April 10 at the Shopify SoHo Space for the + first OMACON!

+
+
+ We've got an incredible lineup of speakers coming. The creator + of Hyprland, Vaxry, will be there. Along with ThePrimeagen and + TJ DeVries. You'll see OpenCode creator Dax Raad. Omarchy power + contributors Ryan Hughes and Bjarne Øverli. As well as Chris + Powers (Typecraft) and myself as Linux superfans. All packed + into a single day of short sessions, plenty of mingle time, and + some good food. +
+
+
Tickets go on sale tomorrow (February 19) at 10am EST. We + only have room for 130 attendees total, so I imagine the + offered-at-cost $299 tickets will go quickly. But if you can't + manage to snatch a ticket in time, we'll also be recording + everything, so you won't be left out entirely. +
+
+
But there is just something special about being together + in person about a shared passion. I've felt the intensity of + that three years in a row now with + Rails World. There's an endless amount of information and instruction + available online, but a sense of community and connection is far + more scarce. We nerds need this. +
+
+
We also need people to JUST DO THINGS. Like kick off a + fresh Linux distribution together with over three hundred + contributors so far all leaning boldly into aesthetics, + ergonomics, and that omakase spirit.  +
+
+
Omarchy only came about + last summer, now we're seeing 50,000 ISO downloads a week, + 30,000 people on the Discord, and now our very first exclusive + gathering in New York City. This is open source at its best. + People from all over, coming together, making cool shit.

+
+
+ (Oh, and thanks to Shopify and Tobi for hosting. You gotta love + when a hundred-plus billion dollar company like this is run by + an uber nerd who can just sign off on doing something fun and + cool for the community without any direct plausible payback.)

+
+ + opengraph.png + +
+
+
+
+
+ +
+

+ About David Heinemeier Hansson +

+
+
+
+ Made Basecamp and + HEY for the underdogs as + co-owner and CTO of + 37signals. Created + Ruby on Rails, + Hotwire, + Kamal, + Omarchy. Wrote + REWORK, + It Doesn't Have to Be Crazy at Work, and + REMOTE. Won at Le Mans as a + racing driver. Invested in + Danish startups. +
+
+
+ +
+
+ + + +
+
+
+ + +
+
+ + diff --git a/tests/fixtures/fullpage_to_article_html/www.experimental-history.com/the-one-science-reform-we-can-all/expected.html b/tests/fixtures/fullpage_to_article_html/www.experimental-history.com/the-one-science-reform-we-can-all/expected.html new file mode 100644 index 0000000..986021f --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.experimental-history.com/the-one-science-reform-we-can-all/expected.html @@ -0,0 +1,1662 @@ + + + +
+ +

+ If you ever want a good laugh, ask an academic to explain what they get + paid to do, and who pays them to do it. +

+

+ In STEM fields, it works like this: the university pays you to teach, + but unless you’re at a liberal arts college, you don’t actually get + promoted or recognized for your teaching. Instead, you get promoted + and recognized for your research, which the university does + not + generally pay you for. You have to ask someone else to provide that + part of your salary, and in the US, that someone else is usually the + federal government. If you’re lucky—and these days, + very + lucky—you get a chunk of money to grow your bacteria or smash your + electrons together or whatever, you write up your results for + publication, and this is where the monkey business really + begins. +

+

+ In most disciplines, the next step is sending your paper to a + peer-reviewed journal, where it gets evaluated by an editor and (if + the editor sees some promise in it) a few reviewers. These people are + academics just like you, and they generally + do not + get paid for their time. Editors maybe get a small stipend and a bit + of professional cred, while reviewers get nothing but the warm fuzzies + of doing “service to the field”, or the cold thrill of tanking other + people’s papers. +

+

+ If you’re lucky again, your paper gets accepted by the journal, which + now owns the copyright to your work. They + do not + pay you for this! If anything, + you pay them + an “article processing charge” for the privilege of no longer owning + the rights to your paper. This is considered a great honor. +

+

+ The journals then paywall your work, sell the access back to you and + your colleagues, and pocket the profit. Universities cover these + subscriptions and fees by charging the government “indirect costs” on + every grant—money that doesn’t go to the research itself, but to all the + things that support the research, like keeping the lights on, cleaning + the toilets, and accessing the journals that the researchers need to + read. +

+

+ Nothing about this system makes sense, which is why I think + we should build a new one + . In the meantime, though, we should also fix the old one. But that’s + hard, for two reasons. First, many people are invested in things + working exactly the way they do now, so every stupid idea has a + constituency behind it. Second, our current administration seems to + believe in policy by bloodletting: if something isn’t working, just + slice it open at random. Thanks to these + haphazard cuts and cancellations + , we now have a system that is both dysfunctional + and + anemic. +

+

+ I see a way to solve both problems at once. We can satisfy both the + scientists and the scalpel-wielding politicians by ridding ourselves + of the one constituency that + should not exist + . Of all the crazy parts of our crazy system, the craziest part is + where taxpayers pay for the research, then pay private companies to + publish it, and then + pay again + so scientists can read it. We may not agree on much, but we can all + agree on this: it is time, finally and forever, to get rid of + for-profit scientific publishers. +

+

+ MOMMY, WHERE DO SCAMS COME FROM? +
+
+
+ +
+
+

+

+ The writer G.K. Chesterton once + said + that before you knock anything down, you ought to know how it got + there in the first place. So before we show for-profit publishers the + pointy end of a pitchfork, we ought to know where they came from and + why they persist. +

+

+ It used to be a huge pain to produce a physical journal—someone had + to operate the printing presses, lick the stamps, and mail the copies + all over the world. Unsurprisingly, academics didn’t care much about + doing those things. When government money started flowing into + universities post-World War II and the number of articles exploded, + private companies were + like + , “Hey, why don’t we take these journals off your hands—you keep + doing the scientific stuff and we’ll handle all the boring stuff.” And + the academics were like “Sounds good, we’re sure this won’t have any + unforeseen consequences.” +

+

+ Those companies knew they had a captive audience, so they bought up + as many journals as they could. Journal articles aren’t + interchangeable commodities like corn or soybeans—if your science + supplier starts gouging you, you can’t just switch to a new one. + Adding to this lock-in effect, publishing in “high-impact” journals + became the key to success in science, which meant if you wanted + to + move + up, your university had to + pay + up. So, even as the internet made it much cheaper to produce a + journal, publishers made it much more expensive to subscribe to + one. +

+
+
+ +
+ + + + + +
+
+
+ Robert Maxwell, one of the architects of the for-profit + scientific publishing scheme. When he later went into debt, + he + plundered hundreds of millions of pounds from his employees’ + pension funds + . You may be familiar with his daughter and lieutenant + Ghislaine Maxwell + , who went on to have a successful career in child trafficking. + ( + source + ) +
+
+
+

+ The people running this scam had no illusions about it, even if they + hoped that other people did. Here’s how one CEO + described it + : +

+
+

+ You have no idea how profitable these journals are once you stop doing + anything. When you’re building a journal, you spend time getting good + editorial boards, you treat them well, you give them dinners. [...] + [and then] we stop doing all that stuff and then the cash just pours + out and you wouldn’t believe how wonderful it is. +

+
+

+ So here’s the report we can make to Mr. Chesterton: for-profit + scientific publishers arose to solve the problem of producing physical + journals. The internet mostly solved that problem. Now the + publishers + are + the problem. These days, Springer Nature, Elsevier, Wiley, and the + like are basically giant operations that proofread, format, and store + PDFs. That’s not nothing, but it’s pretty close to nothing. +

+

+ No one knows how much publishers make in return for providing these + modest services, but we can guess. In 2017, the Association of + Research Libraries surveyed its 123 member institutions and found they + were paying a collective + $1 billion + in journal subscriptions every year. The ARL covers some of the + biggest universities, but not nearly all of them, so let’s guess that + number accounts for half of all university subscription spending. In + 2023, the federal government estimated it paid nearly + $380 million in article processing charges + alone, and those are separate from subscriptions. So it wouldn’t be + crazy if American universities were paying something like $2.5 billion + to publishers every year, with the majority of that ultimately coming + from taxpayers. +

+

+ (By the way, the estimated profit margins for commercial scientific + publishers are around + 40% + , which is + higher than Microsoft + .) +

+

+ To put those costs in perspective: if the federal government cut out + the publishers, it would probably save more money every year than it + has “saved” in its recent attempts to cut off scientific funding to + universities. It’s unclear how much money will ultimately be clawed + back, as grants continue to get frozen, unfrozen, litigated, and + negotiated. But right now, it seems like + ~$1.4 billion + in promised science funding is simply not going to be paid out. We + could save more than that + every year + if we just stopped writing checks to John Wiley & Sons. +

+

+ PUNK ROCK SCIENCE +
+
+
+ +
+
+

+

+ How can such a scam continue to exist? In large part, it’s because of a + computer hacker from Kazakhstan. +

+

+ The political scientist James C. Scott once wrote that many systems + only “work” because people disobey them. For instance, the Soviet + Union attempted to impose agricultural regulations so strict that + people would have starved if they followed the letter of the law. + Instead, citizens grew and traded food in secret. This made it + look + like the regulations were successful, when in fact they were a + sham. + 1 +

+

+ Something similar is happening right now in science, except Russia is + on the opposite side of the story this time. In the early 2010s, a + Kazakhstani computer programmer named + Alexandra Elbakyan + started downloading articles en masse and posting them publicly on a + website called SciHub. The publishers sued her, so she’s hiding out in + Russia, which protects her from extradition. As you can see in the map + below, millions of people now use SciHub to access scientific + articles, including lots of people who seem to work at + universities: +

+
+
+ +
+ + + + + +
+
+
+ This data is ten years old, so I would expect these numbers to be + higher today. ( + source + ) +
+
+
+

+ Why would researchers resort to piracy when they have legitimate + access themselves? Maybe because journals’ interfaces are so clunky + and annoying that it’s faster to go straight to SciHub. Or maybe it’s + because those researchers + don’t + actually have access. Universities are always trying to save money + by + canceling journal subscriptions + , so academics often have to rely on bootleg copies. Either way, + SciHub seems to be our modern-day version of those Soviet secret + gardens: for-profit publishing only “works” because people find ways + to circumvent it. +

+
+
+ +
+ + + + + +
+
+
+ Alexandra Elbakyan, “Pirate Queen of Science” ( + source + ) +
+
+
+

+ In a punk rock kind of way, it’s kinda cool that so many American + scientists can only do their work thanks to a database maintained by a + Russia-backed fugitive. But it ought to be a huge embarrassment to the + US government. + 2 +

+

+ Instead, for some reason, the government insists on siding with + publishers against citizens. Sixteen years ago, the US had its own + Elbakyan. His name was + Aaron Swartz + . He downloaded millions of paywalled journal articles using a + connection at MIT, possibly intending to share them publicly. + Government agents arrested him, charged him with wire fraud, and + intended to + fine him $1 million and imprison him for 35 years + . Instead, he killed himself. He was 26. +

+
+
+ +
+ + + Swartz with glasses, smiling with Jason Scott (cut off from the picture from the left) + + +
+
+
+ Swartz in 2011, two years before his death ( + source + ) +
+
+
+

+ THE FOREST FIRE IS OVERDUE +
+
+
+ +
+
+

+

+ Scientists have tried to take on the middlemen themselves. They’ve + founded open-access journals. They’ve published preprints. They’ve + tried + alternative ways of evaluating research + . A few high-profile professors have + publicly + and + dramatically + sworn off all “luxury” outlets, and less-famous folks have followed + suit: in 2012, over 10,000 researchers signed a + pledge + not to publish in any journals owned by Elsevier. +

+

+ None of this has worked. The biggest for-profit publishers continue + making more money + year after year + . “Diamond” open access journals—that is, publications that don’t + charge authors or readers—only account for + ~10% + of all articles. + 3 + Four years after that massive pledge, + 38% of signers had broken their promise and published in an Elsevier + journal + . + 4 +

+

+ These efforts have fizzled because this isn’t a problem that can be + solved by any individual, or even + many + individuals. Academia is so cutthroat that anyone who righteously + gives up an advantage will be outcompeted by someone who has fewer + scruples. What we have here is a collective action problem. +

+

+ Fortunately, we have an organization that exists for the express + purpose of solving collective action problems. It’s called + the government + . And as luck would have it, they’re also the one paying most of the + bills! +

+

+ So the solution here is straightforward: every government grant should + stipulate that the research it supports can’t be published in a + for-profit journal. That’s it! If the public paid for it, it shouldn’t + be paywalled. +

+

+ The Biden administration tried to do this, but they did it in a + stupid way. They + mandated + that NIH-funded research papers have to be “open access”, which + sounds like a solution, but it’s actually a psyop. By replacing + subscription fees with “article processing charges”, publishers can + simply make + authors + pay for + writing + instead of making + readers + pay for + reading + . The companies can keep skimming money off the system, and best of + all, they get to call the result “open access”. +

+

+ These fees can be wild. When my PhD advisor and I published one of + our papers together, the journal charged us an “open access” fee of + $12,000. This arrangement is a tiny bit better than the alternative, + because at least everybody can read our paper now, including people + who aren’t affiliated with a university. But those fees still have to + come from somewhere, and whether you charge writers or readers, you’re + ultimately charging the same account—namely, the US government. + 5 +

+

+ The Trump administration somehow found a way to make a stupid policy + even stupider. They sped up the timeline while also + firing a bunch of NIH staffers + —exactly the people who would make sure that government-sponsored + publications are, in fact, publicly accessible. And you need someone + to check on that, because researchers are notoriously bad about this + kind of stuff. They’re already required to upload the results of + clinical trials to a public database, but more than half the time they + just... + don’t + . +

+

+ To do this right, you cannot allow the rent-seekers to rebrand. You + have to cut them out entirely. I don’t think this will fix everything + that’s wrong with science; it will merely fix the + wrongest + thing. Nonprofit journals still charge fees, but at least the money + goes to organizations that ostensibly care about science, rather than + going to CEOs who make + $17 million a year + . And almost every journal, for-profit or not, uses the + same failed system of peer review + . The biggest benefit of shaking things up, then, would be allowing + different approaches to have a chance at life, the same way an + occasional forest fire clears away the dead wood, opens up the + pinecones, and gives seedlings a shot at the sunlight. +

+

+ Science philanthropies should adopt the same policy, and some of them + already have. The Navigation Fund, which oversees billions of dollars + in scientific funding, + no longer bankrolls journal publications at all + . + Seemay Chou + , its director, reports that the experiment has been a great + success: +

+
+

+ Our researchers began designing experiments differently from the + start. They became more creative and collaborative. The goal shifted + from telling polished stories to uncovering useful truths. All + results had value, such as failed attempts, abandoned inquiries, or + untested ideas, which we frequently release through Arcadia’s + Icebox + . The bar for utility went up, as proxies like impact factors + disappeared. +

+
+

Sounds good to me!

+

+ CATCH THE TIGER +
+
+
+ +
+
+

+

+ Fifteen years ago, the open science movement was all about abolishing + for-profit journals—that’s what open science + meant + . It seemed like every speech would end with “ELSEVIER + DELENDA EST + ”. +

+

+ Now people barely bring it up at all. + 6 + It’s like a lion has escaped the zoo and it’s gulping down + schoolchildren, but when people suggest zoo improvements, all the + agenda items are like, “We should add another Dippin’ Dots kiosk”. If + you bring up the loose tiger, everyone gets annoyed at you, like “Of + course, no one likes the + tiger + ”. +

+

+ I think two things happened. First, we got cynical about cyberspace. + In the 1990s and 2000s, we really thought the internet would solve + most of our problems. When those problems persisted despite all of us + getting broadband, we shifted to thinking that the internet was, in + fact, + causing + the problems. And so it became cringe to think the internet could + ever be a force for good. In 1995, for-profit publishers were going to + be “ + the internet’s first victim + ”; in 2015, they were “ + the business the internet could not kill + ”. +

+

+ Second, when the + replication crisis + hit in the early 2010s, the open science movement got a new + villain—namely, naughty researchers. The fakers, the fraudsters, the + over-claimers: those are the real bad boys of science. It’s no longer + cool to hate international publishing conglomerates. Now it’s cool to + hate your + colleagues + . +

+

+ Both of these shifts were a shame. The internet utopians were right + that the web would eliminate the need for journals, but they were + wrong to think that would be + enough + . The replication police were right to call out scientific + malfeasance, but they were wrong to forget our old foes. The + for-profit publishers are just as bad as they ever were, and while the + internet has made them more vulnerable then ever, now we know they + won’t go unless they’re pushed. +

+

+ If we want better science, we should catch the tiger. Not only + because it’s bad for the tiger to be loose, but because it’s bad for + us to look the other way. If you allow an outrageous scam to go + unchecked, if you participate in it, normalize it—then what + won’t + you do? Why not also goose your stats a bit? Why not publish some + junk research? Look around: no one cares! +

+

+ There are so many problems with our current way of doing things, and + most of those problems are complicated and difficult to solve. This one + isn’t. Let’s heave this succubus off our scientific system and end this + scam once and for all. After that, Dippin’ Dots all around. +

+ +
+
+ 1 +
+

+ Seeing Like a State + , 203-204, 310 +

+
+
+
+ 2 +
+

+ For anyone who is all-in on “America First”: may I also mention that + three of the largest publishers—Springer Nature, Elsevier, and + Taylor and Francis—are all British-owned. A curious choice of + companies to subsidize! +

+
+
+
+ 3 +
+

+ Don’t get me started on this “diamond open access” designation. If + it costs money to publish or to read, it’s not open access, period. + “Oh, you’d like your car to come with a steering wheel and brakes? + You’ll need our ‘diamond’ package.” +

+
+
+
+ 4 +
+

+ I assume this number is much higher now. At the time, Elsevier + controlled + 16% + of the market, so most people could continuing publish in their + usual journals without breaking their pledge. I started graduate + school in 2016, and I never heard anyone mention avoiding Elsevier + journals at all. +

+
+
+
+ 5 +
+

+ The NIH has + announced vague plans + to cap these charges, which is kind of like saying, “I’ll let you + scam me, but just don’t go crazy about it.” +

+
+
+
+ 6 +
+

+ For example, the current + strategic plan of the Center for Open Science + doesn’t mention for-profit journals at all. +

+
+
+
+ + diff --git a/tests/fixtures/fullpage_to_article_html/www.experimental-history.com/the-one-science-reform-we-can-all/meta.json b/tests/fixtures/fullpage_to_article_html/www.experimental-history.com/the-one-science-reform-we-can-all/meta.json new file mode 100644 index 0000000..98e4629 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.experimental-history.com/the-one-science-reform-we-can-all/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://www.experimental-history.com/p/the-one-science-reform-we-can-all", + "host": "www.experimental-history.com", + "feed_source": "hnrss", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:18:47.217836Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "The one science reform we can all agree on, but we're too cowardly to do", + "extracted_word_count": 3061, + "extracted_success": true, + "expected_selector": "div.body.markup" +} diff --git a/tests/fixtures/fullpage_to_article_html/www.experimental-history.com/the-one-science-reform-we-can-all/raw.html b/tests/fixtures/fullpage_to_article_html/www.experimental-history.com/the-one-science-reform-we-can-all/raw.html new file mode 100644 index 0000000..7542c8f --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.experimental-history.com/the-one-science-reform-we-can-all/raw.html @@ -0,0 +1,7154 @@ + + + + + + + + + + The one science reform we can all agree on, but we're too cowardly to do + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+
+ +
+
+
+
+ +
+
+
+
+
+
+
+

+ Discussion about this post +

+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ + User's avatar +
+
+
+ +
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+ + 1d +
+
+
+
+
+ +
+
+
+

+ Loophole alert! How do you define + "non-profit"? How about we set up Elsevier + Non-Profit Home for Orphans and Kittens, + which signs an exclusive license agreement + with ... publications that make + "philanthropic" donations. +

+
+ +
+
+
+
+ 3 replies +
+
+
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+

+ Just a lil' proofreading note: you started + with a loose lion in your zoo analogy, then + moved to a tiger for the rest of the piece. + Love your work, and your dad's photos! +

+
+ +
+
+
+
+
+
+ 49 more comments... +
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+ + +
+   Adam Mastroianni +
+
+
+ + +
+
+
+
+ + +
+
+
+
+
+
+
+ + +
+   Adam Mastroianni +
+
+
+ + +
+
+
+
+ + +
+
+
+
+
+
+
+ + +
+   Adam Mastroianni +
+
+
+ + +
+
+
+
+ + +
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+

+ Ready for more? +

+
+
+ +
+
+
+ +
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+

+ Cookie Policy +

+
+ We use cookies to improve your experience, for analytics, and + for marketing. You can accept, reject, or manage your + preferences. See our + privacy policy. +
+
+ +
+
+
+
+
+
+
+
    +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/fixtures/fullpage_to_article_html/www.infoworld.com/an-ode-to-craftsmanship-in-software-development/expected.html b/tests/fixtures/fullpage_to_article_html/www.infoworld.com/an-ode-to-craftsmanship-in-software-development/expected.html new file mode 100644 index 0000000..057be7b --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.infoworld.com/an-ode-to-craftsmanship-in-software-development/expected.html @@ -0,0 +1,563 @@ + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    + Neither an art nor a science, writing software is more + akin to carpentry or watchmaking. We’re going to miss + it. +

    +

    +
    +
    +
    +
    + shutterstock 77260183 rusty old woodworking tools on the wall of an old workshop +
    +
    + Credit: Mihai Simonia / Shutterstock +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    + I was talking about the + astonishing rise of AI-assisted codingwith my friend Josh recently, and he said he was + going to miss the craftsmanship aspect of actually + writing code.  +

    +

    + Now, I’m a big believer that software is a craft, not + an engineering endeavor. The term “engineering” + implies a certain amount of certainty and precision + that can never be replicated in software. I’ve never + been a big fan of the term “computer science” either, + because again “science” implies the scientific method + and a certain amount of repeatability. Part of what + makes software development so hard is that no two + projects are even remotely alike, and if you tried to + repeat a project, you’d get a completely different + result. +

    +

    + Some like to argue that writing software is like + painting, but I’ve never followed that route either. + Artists are usually free-flowing and unbound, + restrained only by convention—and many artists feel + utterly unrestrained even by convention. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + Software always seems to be somewhere in between. The + consensus among many developers—Uncle Bob Martin among + them—is that writing software is a craft, more akin to + carpentry or watchmaking. All three practices are + somewhat limited by the physical properties of the + materials. They require precision to get good results, + and this precision requires care, commitment, and + expertise. +

    + +

    + So I get Josh’s feelings of loss about no longer being + able to wield the craft of software development. +

    +

    + The conversation got a touch more interesting, though, + when I said, “Well, think of it this way: You are now + a senior craftsman with a tireless, eager, and + constantly learning apprentice who is completely + willing to do all the work in the shop without a + single complaint.” +

    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + And that is quite a thought. Sure, we senior + craftspeople celebrate writing elegant code, + constructing beautiful class hierarchies, and + designing working software. But I will admit that a + lot of the work is tedious, and having an unflagging + coder grinding out the “dirty work” is a really nice + thing. +

    +

    + But it can become more than that. Your coding + apprentice can build, at your direction, pretty much + anything now. The task becomes more like conducting an + orchestra than playing in it. Not all members of the + orchestra want to conduct, but given that is where + things are headed, I think we all need to consider it + at least. The results are the same. You can dabble as + much in code as you want. You can check every line, + merely review the overall architecture, or, if you are + like me, you can be quite content with moving past the + grind of actually writing code to orchestrating the + process and ensuring the proper final result. +

    +

    + Nevertheless, I feel Josh’s angst. I will miss the + satisfaction of writing the lovely procedure that does + one thing cleanly and quickly, of creating the single + object that does everything you need it to do and + nothing more, of getting things working just right. + All of that is gone, as are the conductor’s days of + playing a spotlight solo. It’s hard, but it’s where we + are. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + It’s not unlike choosing to become a manager—you leave + behind your coding days for a different role. Sure, + you miss the good old days of programming every day, + but the new challenges are valuable and satisfying. +

    +

    + There are folks out there who write carefully crafted + assembly. And in a few years, there will be folks + doing the same thing with Java and C# and Pascal. + Coding will soon become a quirky pastime, written only + by eccentric old developers who relish the craft of + software development. +

    +

    + It’s only been a few months, but I already view Claude + Code as nothing more than an elaborate compiler, and + the code it produces (in whatever language) as + assembly code. +

    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    + + + +
    +
    +
    + + diff --git a/tests/fixtures/fullpage_to_article_html/www.infoworld.com/an-ode-to-craftsmanship-in-software-development/meta.json b/tests/fixtures/fullpage_to_article_html/www.infoworld.com/an-ode-to-craftsmanship-in-software-development/meta.json new file mode 100644 index 0000000..c2a5e93 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.infoworld.com/an-ode-to-craftsmanship-in-software-development/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://www.infoworld.com/article/4140156/an-ode-to-craftsmanship-in-software-development.html", + "host": "www.infoworld.com", + "feed_source": "infoworld", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:19:11.455141Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "An ode to craftsmanship in software development", + "extracted_word_count": 647, + "extracted_success": true, + "expected_selector": "article[id^=\"post-\"]" +} diff --git a/tests/fixtures/fullpage_to_article_html/www.infoworld.com/an-ode-to-craftsmanship-in-software-development/raw.html b/tests/fixtures/fullpage_to_article_html/www.infoworld.com/an-ode-to-craftsmanship-in-software-development/raw.html new file mode 100644 index 0000000..edb4f3f --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.infoworld.com/an-ode-to-craftsmanship-in-software-development/raw.html @@ -0,0 +1,7700 @@ + + + + + + + + + + + + + + + + + An ode to craftsmanship in software development | InfoWorld + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + + +
    +
    +
    +
    +
    + +
    + + +
    +
    +
    +
    +
    +
    + +
    +
    +
    + +
    +
    +
    +
    + +
    +
    + +
    +
    +
      +
    • + +
    • +
    +
    +
    + +
    +
    +
    +
    + +
    +
    +
    + +
    + + +
    +
    + +
    +
    + + + +
    +
    +
    +
    + + + +
    +
    +
    +
    + + +
    + +
    +

    Our Network

    + +
    +
    +
    +
    +
    +
    + + +
    + +
    +

    More

    + +
    +
    +
    +
    +
    + + +
    +
    +
    + +
    +
    +
    +
    +
    + Nick Hodges +
    +
    +
    +
    + by + +
    +
    +
    +
    +
    + +
    +
    +
    +
    +

    + An ode to craftsmanship in software development +

    +
    +
    + opinion +
    +
    + Mar 4, 20264 mins +
    +
    + +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    + Neither an art nor a science, writing software is + more akin to carpentry or watchmaking. We’re going + to miss it. +

    +

    +
    +
    +
    +
    + shutterstock 77260183 rusty old woodworking tools on the wall of an old workshop +
    + +
    + Credit: Mihai Simonia / Shutterstock +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    + I was talking about the + astonishing rise of AI-assisted coding + with my friend Josh recently, and he said he was + going to miss the craftsmanship aspect of actually + writing code.  +

    + +

    + Now, I’m a big believer that software is a craft, + not an engineering endeavor. The term + “engineering” implies a certain amount of + certainty and precision that can never be + replicated in software. I’ve never been a big + fan of the term “computer science” either, because + again “science” implies the scientific method and + a certain amount of repeatability. Part of what + makes software development so hard is that no two + projects are even remotely alike, and if you tried + to repeat a project, you’d get a completely + different result.  +

    + +

    + Some like to argue that writing software is like + painting, but I’ve never followed that route + either. Artists are usually free-flowing and + unbound, restrained only by convention—and many + artists feel utterly unrestrained even by + convention. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + Software always seems to be somewhere in + between. The consensus among many + developers—Uncle Bob Martin among them—is that + writing software is a craft, more akin to + carpentry or watchmaking. All three practices + are somewhat limited by the physical properties of + the materials. They require precision to get good + results, and this precision requires care, + commitment, and expertise. +

    + + + +

    + So I get Josh’s feelings of loss about no longer + being able to wield the craft of software + development. +

    + +

    + The conversation got a touch more interesting, + though, when I said, “Well, think of it this + way: You are now a senior craftsman with a + tireless, eager, and constantly learning + apprentice who is completely willing to do all the + work in the shop without a single + complaint.”  +

    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + And that is quite a thought. Sure, we senior + craftspeople celebrate writing elegant code, + constructing beautiful class hierarchies, and + designing working software. But I will admit + that a lot of the work is tedious, and having an + unflagging coder grinding out the “dirty work” is + a really nice thing. +

    + +

    + But it can become more than that. Your coding + apprentice can build, at your direction, pretty + much anything now. The task becomes more like + conducting an orchestra than playing in + it. Not all members of the orchestra want to + conduct, but given that is where things are + headed, I think we all need to consider it at + least. The results are the same. You can + dabble as much in code as you want. You can + check every line, merely review the overall + architecture, or, if you are like me, you can be + quite content with moving past the grind of + actually writing code to orchestrating the process + and ensuring the proper final result.  +

    + +

    + Nevertheless, I feel Josh’s angst. I will miss the + satisfaction of writing the lovely procedure that + does one thing cleanly and quickly, of creating + the single object that does everything you need it + to do and nothing more, of getting things working + just right. All of that is gone, as are the + conductor’s days of playing a spotlight + solo. It’s hard, but it’s where we are.  +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + It’s not unlike choosing to become a manager—you + leave behind your coding days for a different + role. Sure, you miss the good old days of + programming every day, but the new challenges are + valuable and satisfying. +

    + +

    + There are folks out there who write carefully + crafted assembly. And in a few years, there + will be folks doing the same thing with Java and + C# and Pascal. Coding will soon become a quirky + pastime, written only by eccentric old developers + who relish the craft of software development. +

    + +

    + It’s only been a few months, but I already view + Claude Code as nothing more than an elaborate + compiler, and the code it produces (in whatever + language) as assembly code. +

    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + Nick Hodges +
    +
    +
    + +
    +
    + + +
    +

    + Nick has a BA in classical languages from Carleton + College and an MS in information technology management + from the Naval Postgraduate School. In his career, he + has been a busboy, a cook, a caddie, a telemarketer (for + which he apologizes), an office manager, a high school + teacher, a naval intelligence officer, a software + developer, a product manager, and a software development + manager. In addition, he is a former Delphi Product + Manager and Delphi R&D Team Manager and the author + of + Coding in Delphi. He is a passionate Minnesota sports fan, especially + the Timberwolves, as he grew up and went to college in + the Land of 10,000 Lakes. He currently lives in West + Chester, PA, and can be found on the Internet at + https://nickhodges.com. +

    +
    + +
    + +
    +

    + More from this author +

    +
    +
    + + + +
    +
    + +
    +
    +
    +
    +
    +
    +
    + + +
    + +
    +
    + + + + + + +
    + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/fixtures/fullpage_to_article_html/www.infoworld.com/angular-releases-patches-for-ssr-security-issues/expected.html b/tests/fixtures/fullpage_to_article_html/www.infoworld.com/angular-releases-patches-for-ssr-security-issues/expected.html new file mode 100644 index 0000000..a961992 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.infoworld.com/angular-releases-patches-for-ssr-security-issues/expected.html @@ -0,0 +1,614 @@ + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    + Server-side rendering vulnerabilities could allow + attackers to steal authorization headers or perpetrate + phishing and SEO hacking. +

    +

    +
    +
    +
    +
    + shutterstock 2416896949 The Black Sheep In The Herd Of White Sheep insider threat security cybersecurity vulnerability +
    +
    + Credit: Bastian Herrmann / Shutterstock +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    + The Angular team from Google has announced the release + of two security updates to the Angular web framework, + both pertaining to SSR (server-side rendering) + vulnerabilities. Developers are advised to update SSR + applications as soon as possible. Patching can help + users avoid the theft of authorization headers as well + as phishing scams. +

    +

    + A bulletin on the issues was published + February 28. One of the vulnerabilities, labeled as critical, + pertains to SSRF (server-side request forgery) and + header injection. The patched version can be found + here. The second vulnerability, labeled as moderate, + pertains to an open redirect via the + X-Forwarded-Prefix header. That patch can + be found + here. +

    +

    + The SSRF vulnerability found in the Angular SSR + request handling pipeline exists because Angular’s + internal URL reconstruction logic directly trusts and + consumes user-controlled HTTP headers, specifically + the host and X-Forwarded-* family, to + determine the application’s base origin without + validation of the destination domain. This + vulnerability manifests through implicit relative URL + resolution, explicit manual construction, and + confidentiality breach, the Angular team said. When + exploited successfully, this SSRF vulnerability allows + for arbitrary internal request steering. This can lead + to the stealing sensitive + Authorizationheaders or session cookies + by redirecting them to an attacker’s server. Attackers + also can access and transmit data from internal + services, databases, or cloud metadata endpoints not + exposed to the public internet. Also, attackers could + access sensitive information processed within the + application’s server-side context. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + The open redirect vulnerability, meanwhile, exists in + the internal URL processing logic in Angular SSR. This + vulnerability allows attackers to conduct large-scale + phishing and SEO hijacking, the Angular team said. +

    + +

    + The team recommends updating SSR applications to the + latest patch version as soon as possible. If an app + does not deploy SSR to production, there is no + immediate need to update, they said. Developers on an + unsupported version of Angular or unable to update + quickly are advised to avoid using req.headers for URL + construction. Instead, they should use trusted + variables for base API paths. Another workaround is + implementing a middleware in the server.ts to enforce + numeric ports and validated hostnames.











    @font-face&amp;lt;br> + {font-family:"Cambria Math";&amp;lt;br> + panose-1:2 4 5 3 5 4 6 3 2 4;&amp;lt;br> + mso-font-charset:0;&amp;lt;br> + mso-generic-font-family:roman;&amp;lt;br> + mso-font-pitch:variable;&amp;lt;br> + mso-font-signature:-536870145 1107305727 0 0 415 + 0;}@font-face&amp;lt;br> + {font-family:Aptos;&amp;lt;br> panose-1:2 11 0 + 4 2 2 2 2 2 4;&amp;lt;br> + mso-font-charset:0;&amp;lt;br> + mso-generic-font-family:swiss;&amp;lt;br> + mso-font-pitch:variable;&amp;lt;br> + mso-font-signature:536871559 3 0 0 415 0;}p.MsoNormal, + li.MsoNormal, div.MsoNormal&amp;lt;br> + {mso-style-unhide:no;&amp;lt;br> + mso-style-qformat:yes;&amp;lt;br> + mso-style-parent:"";&amp;lt;br> + margin-top:0in;&amp;lt;br> + margin-right:0in;&amp;lt;br> + margin-bottom:8.0pt;&amp;lt;br> + margin-left:0in;&amp;lt;br> + line-height:115%;&amp;lt;br> + mso-pagination:widow-orphan;&amp;lt;br> + font-size:12.0pt;&amp;lt;br> + font-family:"Aptos",sans-serif;&amp;lt;br> + mso-ascii-font-family:Aptos;&amp;lt;br> + mso-ascii-theme-font:minor-latin;&amp;lt;br> + mso-fareast-font-family:Aptos;&amp;lt;br> + mso-fareast-theme-font:minor-latin;&amp;lt;br> + mso-hansi-font-family:Aptos;&amp;lt;br> + mso-hansi-theme-font:minor-latin;&amp;lt;br> + mso-bidi-font-family:"Times New + Roman";&amp;lt;br> + mso-bidi-theme-font:minor-bidi;&amp;lt;br> + mso-font-kerning:1.0pt;&amp;lt;br> + mso-ligatures:standardcontextual;}a:link, + span.MsoHyperlink&amp;lt;br> + {mso-style-priority:99;&amp;lt;br> + color:#467886;&amp;lt;br> + mso-themecolor:hyperlink;&amp;lt;br> + text-decoration:underline;&amp;lt;br> + text-underline:single;}a:visited, + span.MsoHyperlinkFollowed&amp;lt;br> + {mso-style-noshow:yes;&amp;lt;br> + mso-style-priority:99;&amp;lt;br> + color:#96607D;&amp;lt;br> + mso-themecolor:followedhyperlink;&amp;lt;br> + text-decoration:underline;&amp;lt;br> + text-underline:single;}p&amp;lt;br> + {mso-style-priority:99;&amp;lt;br> + mso-margin-top-alt:auto;&amp;lt;br> + 
margin-right:0in;&amp;lt;br> + mso-margin-bottom-alt:auto;&amp;lt;br> + margin-left:0in;&amp;lt;br> + mso-pagination:widow-orphan;&amp;lt;br> + font-size:12.0pt;&amp;lt;br> font-family:"Times + New Roman",serif;&amp;lt;br> + mso-fareast-font-family:"Times New + Roman";}code&amp;lt;br> + {mso-style-noshow:yes;&amp;lt;br> + mso-style-priority:99;&amp;lt;br> + font-family:"Courier New";&amp;lt;br> + mso-ascii-font-family:"Courier New";&amp;lt;br> + mso-fareast-font-family:"Times New + Roman";&amp;lt;br> + mso-hansi-font-family:"Courier New";&amp;lt;br> + mso-bidi-font-family:"Courier + New";}.MsoChpDefault&amp;lt;br> + {mso-style-type:export-only;&amp;lt;br> + mso-default-props:yes;&amp;lt;br> + font-family:"Aptos",sans-serif;&amp;lt;br> + mso-ascii-font-family:Aptos;&amp;lt;br> + mso-ascii-theme-font:minor-latin;&amp;lt;br> + mso-fareast-font-family:Aptos;&amp;lt;br> + mso-fareast-theme-font:minor-latin;&amp;lt;br> + mso-hansi-font-family:Aptos;&amp;lt;br> + mso-hansi-theme-font:minor-latin;&amp;lt;br> + mso-bidi-font-family:"Times New + Roman";&amp;lt;br> + mso-bidi-theme-font:minor-bidi;}.MsoPapDefault&amp;lt;br> + {mso-style-type:export-only;&amp;lt;br> + margin-bottom:8.0pt;&amp;lt;br> + line-height:115%;}div.WordSection1&amp;lt;br> + {page:WordSection1;}







    +

    +

    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + +
    +
    +
    + + diff --git a/tests/fixtures/fullpage_to_article_html/www.infoworld.com/angular-releases-patches-for-ssr-security-issues/meta.json b/tests/fixtures/fullpage_to_article_html/www.infoworld.com/angular-releases-patches-for-ssr-security-issues/meta.json new file mode 100644 index 0000000..75e9383 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.infoworld.com/angular-releases-patches-for-ssr-security-issues/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://www.infoworld.com/article/4140166/angular-releases-patches-for-ssr-security-issues.html", + "host": "www.infoworld.com", + "feed_source": "infoworld", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:19:15.547888Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "Angular releases patches for SSR security issues", + "extracted_word_count": 376, + "extracted_success": true, + "expected_selector": "article[id^=\"post-\"]" +} diff --git a/tests/fixtures/fullpage_to_article_html/www.infoworld.com/angular-releases-patches-for-ssr-security-issues/raw.html b/tests/fixtures/fullpage_to_article_html/www.infoworld.com/angular-releases-patches-for-ssr-security-issues/raw.html new file mode 100644 index 0000000..8dbe64e --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.infoworld.com/angular-releases-patches-for-ssr-security-issues/raw.html @@ -0,0 +1,7660 @@ + + + + + + + + + + + + + + + + + Angular releases patches for SSR security issues | InfoWorld + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + + +
    +
    +
    +
    +
    + +
    + + +
    +
    +
    +
    +
    +
    + +
    +
    +
    + +
    +
    +
    +
    + +
    +
    + +
    +
    +
      +
    • + +
    • +
    +
    +
    + +
    +
    +
    +
    + +
    +
    +
    + +
    + + +
    +
    + +
    +
    + + + +
    +
    +
    +
    + + + +
    +
    +
    +
    + + +
    + +
    +

    Our Network

    + +
    +
    +
    +
    +
    +
    + + +
    + +
    +

    More

    + +
    +
    +
    +
    +
    + + +
    +
    +
    + +
    +
    +
    +
    +
    + Paul Krill +
    +
    +
    +
    + by + +
    + +
    + Editor at Large +
    +
    +
    +
    +
    + +
    +
    +
    +
    +

    + Angular releases patches for SSR security issues +

    +
    +
    + news +
    +
    + Mar 3, 20263 mins +
    +
    + +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    + Server-side rendering vulnerabilities could allow + attackers to steal authorization headers or + perpetrate phishing and SEO hacking. +

    +

    +
    +
    +
    +
    + shutterstock 2416896949 The Black Sheep In The Herd Of White Sheep insider threat security cybersecurity vulnerability +
    + +
    + Credit: Bastian Herrmann / Shutterstock +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +

    + The Angular team from Google has announced the + release of two security updates to + the Angular web framework, both pertaining to + SSR (server-side rendering) vulnerabilities. + Developers are advised to update SSR applications + as soon as possible. Patching can help users avoid + the theft of authorization headers as well as + phishing scams. +

    + +

    + A bulletin on the issues was published + February 28. One of the vulnerabilities, labeled as + critical, pertains to SSRF (server-side request + forgery) and header injection. The patched version + can be found + here. The second vulnerability, labeled as moderate, + pertains to an open redirect via the + X-Forwarded-Prefix header. That patch + can be found + here. +

    + +

    + The SSRF vulnerability found in the Angular SSR + request handling pipeline exists because Angular’s + internal URL reconstruction logic directly trusts + and consumes user-controlled HTTP headers, + specifically the host and + X-Forwarded-* family, to determine + the application’s base origin without validation + of the destination domain. This vulnerability + manifests through implicit relative URL + resolution, explicit manual construction, and + confidentiality breach, the Angular team said. + When exploited successfully, this SSRF + vulnerability allows for arbitrary internal + request steering. This can lead to the stealing + sensitive Authorizationheaders or + session cookies by redirecting them to an + attacker’s server. Attackers also can access and + transmit data from internal services, databases, + or cloud metadata endpoints not exposed to the + public internet. Also, attackers could access + sensitive information processed within the + application’s server-side context. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + The open redirect vulnerability, meanwhile, exists + in the internal URL processing logic in Angular + SSR. This vulnerability allows attackers to + conduct large-scale phishing and SEO hijacking, + the Angular team said. +

    + + + +

    + The team recommends updating SSR applications to + the latest patch version as soon as possible. If + an app does not deploy SSR to production, there is + no immediate need to update, they said. Developers + on an unsupported version of Angular or unable to + update quickly are advised to avoid using + req.headers for URL construction. Instead, they + should use trusted variables for base API paths. + Another workaround is implementing a middleware in + the server.ts to enforce numeric ports and + validated hostnames.



















    +

    + +

    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + Paul Krill +
    +
    +
    + +
    +
    +
    +
    +
    +
    + + +
    + Editor at Large +
    +
    +
    +
    + + +
    + +
    +

    + Paul Krill is editor at large at InfoWorld. Paul has + been covering computer technology as a news and feature + reporter for more than 35 years, including 30 years at + InfoWorld. He has specialized in coverage of software + development tools and technologies since the 1990s, and + he continues to lead InfoWorld’s news coverage of + software development platforms including Java and .NET + and programming languages including JavaScript, + TypeScript, PHP, Python, Ruby, Rust, and Go. Long + trusted as a reporter who prioritizes accuracy, + integrity, and the best interests of readers, Paul is + sought out by technology companies and industry + organizations who want to reach InfoWorld’s audience of + software developers and other information technology + professionals. Paul has won a “Best Technology News + Coverage” award from IDG. +

    +
    + +
    + +
    +

    + More from this author +

    +
    +
    + + + +
    +
    + +
    +
    +
    +
    +
    +
    +
    + + +
    + +
    +
    + + + + + + +
    + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/fixtures/fullpage_to_article_html/www.infoworld.com/the-right-way-to-architect-modern-web-applications/expected.html b/tests/fixtures/fullpage_to_article_html/www.infoworld.com/the-right-way-to-architect-modern-web-applications/expected.html new file mode 100644 index 0000000..7da68a6 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.infoworld.com/the-right-way-to-architect-modern-web-applications/expected.html @@ -0,0 +1,1044 @@ + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    + Should you double down on server-side rendering, or + embrace a client-side approach? Yes and more. +

    +

    +
    +
    +
    +
    + Illustration of man going own direction, trailblazing, exploring, thinking outside the box. Experimentation choice freedom concept. +
    +
    + Credit: fran_kie / Shutterstock +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    + For decades, web architecture has followed a familiar + and frankly exhausting pattern. A dominant approach + emerges, gains near-universal adoption, reveals its + cracks under real-world scale, and is eventually + replaced by a new “best practice” that promises to fix + everything the last one broke. +

    +

    + We saw it in the early 2000s, when server-rendered, + monolithic applications were the default. We saw it + again in the late 2000s and early 2010s, when the + industry pushed aggressively toward rich client-side + applications. And we saw it most clearly during the + rise of single-page applications, which promised + desktop-like interactivity in the browser but often + delivered something else entirely: multi-megabyte + JavaScript bundles, blank loading screens, and years + of SEO workarounds just to make pages discoverable. +

    +

    + Today, server-side rendering is once again in vogue. + Are teams turning back to the server because + client-side architectures have hit a wall? Not + exactly. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + Both server-side rendering and client-side approaches + are as compelling today as they ever were. What’s + different now is not the tools, or their viability, + but the systems we’re building and the expectations we + place on them. +

    + +

    + The upshot? There is no single “right” model for + building web applications anymore. Let me explain why. +

    +

    + From websites to distributed systems +

    +

    + Modern web applications are no longer just “sites.” + They are long-lived, highly interactive systems that + span multiple runtimes, global content delivery + networks, edge caches, background workers, and + increasingly complex data pipelines. They are expected + to load instantly, remain responsive under poor + network conditions, and degrade gracefully when + something goes wrong. +

    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + In that environment, architectural dogmatism quickly + becomes a liability. Absolutes like “everything should + be server-rendered” or “all state belongs in the + browser” sound decisive, but they rarely survive + contact with production systems. +

    +

    + The reality is messier. And that’s not a failure—it’s + a reflection of how much the web has grown up. +

    +

    + The problem with architectural absolutes +

    +

    + Strong opinions are appealing, especially at scale. + They reduce decision fatigue. They make onboarding + easier. Declaring “we only build + SPAs” or “we are an + SSR-first organization” feels like a strategy because it + removes ambiguity. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + The problem is that real applications don’t cooperate. +

    +

    + A single modern SaaS platform often contains wildly + different workloads. Public-facing landing pages and + documentation demand fast + first contentful paint, predictable SEO behavior, and aggressive caching. + Authenticated dashboards, on the other hand, may + involve real-time data, complex client-side + interactions, and long-lived state where a server + round trip for every UI change would be unacceptable. +

    +

    + Trying to force a single rendering strategy across all + of that introduces what many teams eventually + recognize as architectural friction. Exceptions creep + in. “Just this once” logic appears. Over time, the + architecture becomes harder to understand than if + those trade-offs had been acknowledged explicitly from + the start. +

    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + Not a return to the past, but an expansion +

    +

    + It’s tempting to describe the current interest in + server-side rendering as a return to fundamentals. In + practice, that comparison breaks down quickly. +

    +

    + Classic server-rendered applications operated on short + request life cycles. The server generated HTML, sent + it to the browser, and largely forgot about the user + until the next request arrived. Interactivity meant + full page reloads, and state lived almost entirely on + the server. +

    +

    + Modern server-rendered applications behave very + differently. The initial HTML is often just a starting + point. It is “hydrated,” enhanced, and kept alive by + client-side logic that takes over after the first + render. The server no longer owns the full interaction + loop, but it hasn’t disappeared either. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + Even ecosystems that never abandoned server rendering, + PHP being the most obvious example, continued to + thrive because they solved certain problems well; they + provided predictable execution models, straightforward + deployments, and proximity to data. What changed was + not their relevance, but the expectation that they now + coexist with richer client-side behavior rather than + compete with it. +

    +

    + This isn’t a rollback. It’s an expansion of the + architectural map. +

    +

    + Constraint-driven architecture +

    +

    + Once teams step away from ideology, the conversation + becomes more productive. The question shifts from + “What is the best model?” to “What are we optimizing + for right now?” +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + Data volatility matters. Content that changes once a + week behaves very differently from real-time, + personalized data streams. Performance budgets matter + too. In an e-commerce flow, a 100-millisecond delay + can translate directly into lost revenue. In an + internal admin tool, the same delay may be irrelevant. +

    +

    + Operational reality plays a role as well. Some teams + can comfortably run and observe a fleet of SSR + servers. Others are better served by static-first or + serverless approaches simply because that’s what their + headcount and expertise can support. +

    +

    + These pressures rarely apply uniformly across an + application. Systems with strict uptime requirements + may even choose to duplicate logic across layers to + reduce coupling and failure impact, for example, + enforcing critical validation rules both at the API + boundary and again in the client, so that a single + back-end failure doesn’t completely block user + workflows. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + Hybrid architectures stop being a compromise in this + context. They become a way to make trade-offs explicit + rather than accidental. +

    +
    +

    + When the server takes on more UI responsibility +

    +

    + One of the more subtle shifts in recent years is how + much responsibility the server takes on before the + browser becomes interactive. +

    +

    + This goes well beyond SEO or faster + first paint. Servers live in predictable environments. They have + stable CPU resources and direct access to databases + and internal services. Browsers, by contrast, run on + everything from high-end desktops to underpowered + mobile devices on unreliable networks. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + Increasingly, teams are using the server to do the + heavy lifting. Instead of sending fragmented data to + the client and asking the browser to assemble it, the + server prepares UI-ready view models. It aggregates + data, resolves permissions, and shapes state in a way + that would be expensive or duplicative to do + repeatedly on the client. +

    +

    + By the time the payload reaches the browser, the + client’s job is narrower: activate and enhance. This + reduces the + time to interactiveand shrinks the amount of transformation logic + shipped to users. +

    +

    + This naturally leads to incremental and selective + hydration. Hydration is no longer an all-or-nothing step. + Critical, above-the-fold elements become interactive + first. Less frequently used components may not hydrate + until the user engages with them. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + Performance optimization, in this model, becomes + localized rather than global. Teams improve specific + views or workflows without restructuring the entire + application. Rendering becomes a staged process, not a + binary choice. +

    +

    + Debuggability changes the architecture + conversation +

    +

    + As applications grow more distributed, performance is + no longer the only concern that shapes architecture. + Debuggability increasingly matters just as much. +

    +

    + In simpler systems, failures were easier to trace. + Rendering happened in one place. Logs told a clear + story. In modern applications, rendering can be split + across build pipelines, edge runtimes, and long-lived + client sessions. Data can be fetched, cached, + transformed, and rehydrated at different moments in + time. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + When something breaks, the hardest part is often + figuring out where it broke. +

    +

    + This is where staged architectures show a real + advantage. When rendering responsibilities are + explicit, failures tend to be more localized. A + malformed initial render points to the server layer. A + UI that looks fine but fails on interaction suggests a + hydration or client-side state issue. At an + architectural level, this mirrors the + single responsibility principleapplied beyond individual classes: Each stage has a + clear reason to change and a clear place to + investigate when something goes wrong. +

    +
    +

    + Architectures that try to hide this complexity behind + “automatic” abstractions often make debugging harder, + not easier. Engineers end up reverse engineering + framework behavior instead of reasoning about system + design. It’s no surprise that many senior teams now + prefer systems that are explicit, even boring, over + ones that feel magical but opaque. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + Frameworks as enablers, not answers +

    +

    + This shift is visible across the ecosystem. + Angularis a good example. Once seen as the archetype of + heavy client-side development, it has steadily + embraced server-side rendering, fine-grained + hydration, and + signals. Importantly, it doesn’t prescribe a single way to + use them. +

    +

    + That pattern repeats elsewhere. Modern frameworks are + no longer trying to win an ideological war. They are + providing knobs and dials, ways to control when work + happens, where state lives, and how rendering unfolds + over time. +

    +

    + The competition is no longer about purity. It’s about + flexibility under real-world constraints. Pure + architectures tend to look great in greenfield + projects. They age less gracefully. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + As requirements evolve, and they always do, strict + models accumulate exceptions. What began as a clean + set of rules turns into a collection of caveats. + Architectures that acknowledge complexity early tend + to be more resilient. Clear boundaries make it + possible to evolve one part of the system without + destabilizing everything else. +

    +
    +

    + Rigor in 2026 is not about enforcing sameness. It’s + about enforcing clarity: knowing where code runs, why + it runs there, and how failures propagate. +

    +

    + Embracing the spectrum +

    +

    + The idea of a single “right” way to build for the web + is finally losing its grip. And that’s a good thing. +

    +

    + Server-side rendering and client-side applications + were never enemies. They were tools that solved + different problems at different moments in time. The + web has matured enough to admit that most + architectural questions don’t have universal answers. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + The most successful teams today aren’t chasing trends. + They understand their constraints, respect their + performance budgets, and treat rendering as a spectrum + rather than a switch. The web didn’t grow up by + picking a side. It grew up by embracing nuance, and + the architectures that will last are the ones that do + the same. +

    +
    +
    +
    +
    +
    + +
    +
    +
    +
    + + + +
    +
    +
    + + diff --git a/tests/fixtures/fullpage_to_article_html/www.infoworld.com/the-right-way-to-architect-modern-web-applications/meta.json b/tests/fixtures/fullpage_to_article_html/www.infoworld.com/the-right-way-to-architect-modern-web-applications/meta.json new file mode 100644 index 0000000..cb51889 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.infoworld.com/the-right-way-to-architect-modern-web-applications/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://www.infoworld.com/article/4138765/the-right-way-to-architect-modern-web-applications.html", + "host": "www.infoworld.com", + "feed_source": "infoworld", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:19:08.396167Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "The right way to architect modern web applications", + "extracted_word_count": 1587, + "extracted_success": true, + "expected_selector": "article[id^=\"post-\"]" +} diff --git a/tests/fixtures/fullpage_to_article_html/www.infoworld.com/the-right-way-to-architect-modern-web-applications/raw.html b/tests/fixtures/fullpage_to_article_html/www.infoworld.com/the-right-way-to-architect-modern-web-applications/raw.html new file mode 100644 index 0000000..551a71e --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.infoworld.com/the-right-way-to-architect-modern-web-applications/raw.html @@ -0,0 +1,8302 @@ + + + + + + + + + + + + + + + + + + + + The right way to architect modern web applications | InfoWorld + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    + + +
    +
    +
    +
    +
    + +
    + + +
    +
    +
    +
    +
    +
    + +
    +
    +
    + +
    +
    +
    +
    + +
    +
    + +
    +
    +
      +
    • + +
    • +
    +
    +
    + +
    +
    +
    +
    + +
    +
    +
    + +
    + + +
    +
    + +
    +
    + + + +
    +
    +
    +
    + + + +
    +
    +
    +
    + + +
    + +
    +

    Our Network

    + +
    +
    +
    +
    +
    +
    + + +
    + +
    +

    More

    + +
    +
    +
    +
    +
    + + +
    +
    +
    + +
    +
    +
    +
    +
    + Sonu Kapoor +
    +
    +
    +
    + by + +
    + +
    + Contributor +
    +
    +
    +
    +
    + +
    +
    +
    +
    +

    + The right way to architect modern web applications +

    +
    +
    + feature +
    +
    + Mar 4, 20269 mins +
    +
    + +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    + Should you double down on server-side rendering, + or embrace a client-side approach? Yes and more. +

    +

    +
    +
    +
    +
    + Illustration of man going own direction, trailblazing, exploring, thinking outside the box. Experimentation choice freedom concept. +
    + +
    + Credit: fran_kie / Shutterstock +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +
    + +

    + For decades, web architecture has followed a + familiar and frankly exhausting pattern. A + dominant approach emerges, gains near-universal + adoption, reveals its cracks under real-world + scale, and is eventually replaced by a new “best + practice” that promises to fix everything the last + one broke. +

    + +

    + We saw it in the early 2000s, when + server-rendered, monolithic applications were the + default. We saw it again in the late 2000s and + early 2010s, when the industry pushed aggressively + toward rich client-side applications. And we saw + it most clearly during the rise of single-page + applications, which promised desktop-like + interactivity in the browser but often delivered + something else entirely: multi-megabyte JavaScript + bundles, blank loading screens, and years of SEO + workarounds just to make pages discoverable. +

    + +

    + Today, server-side rendering is once again in + vogue. Are teams turning back to the server + because client-side architectures have hit a wall? + Not exactly. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + Both server-side rendering and client-side + approaches are as compelling today as they ever + were. What’s different now is not the tools, or + their viability, but the systems we’re building + and the expectations we place on them. +

    + + + +

    + The upshot? There is no single “right” model for + building web applications anymore. Let me explain + why. +

    + +

    + From websites to distributed systems +

    + +

    + Modern web applications are no longer just + “sites.” They are long-lived, highly interactive + systems that span multiple runtimes, global + content delivery networks, edge caches, background + workers, and increasingly complex data pipelines. + They are expected to load instantly, remain + responsive under poor network conditions, and + degrade gracefully when something goes wrong. +

    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + In that environment, architectural dogmatism + quickly becomes a liability. Absolutes like + “everything should be server-rendered” or “all + state belongs in the browser” sound decisive, but + they rarely survive contact with production + systems. +

    + +

    + The reality is messier. And that’s not a + failure—it’s a reflection of how much the web has + grown up. +

    + +

    + The problem with architectural absolutes +

    + +

    + Strong opinions are appealing, especially at + scale. They reduce decision fatigue. They make + onboarding easier. Declaring “we only build + SPAs” or “we are an + SSR-first organization” feels like a strategy + because it removes ambiguity. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + The problem is that real applications don’t + cooperate. +

    + +

    + A single modern SaaS platform often contains + wildly different workloads. Public-facing landing + pages and documentation demand fast + first contentful paint, predictable SEO behavior, and aggressive + caching. Authenticated dashboards, on the other + hand, may involve real-time data, complex + client-side interactions, and long-lived state + where a server round trip for every UI change + would be unacceptable. +

    + +

    + Trying to force a single rendering strategy across + all of that introduces what many teams eventually + recognize as architectural friction. Exceptions + creep in. “Just this once” logic appears. Over + time, the architecture becomes harder to + understand than if those trade-offs had been + acknowledged explicitly from the start. +

    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + Not a return to the past, but an expansion +

    + +

    + It’s tempting to describe the current interest in + server-side rendering as a return to fundamentals. + In practice, that comparison breaks down quickly. +

    + +

    + Classic server-rendered applications operated on + short request life cycles. The server generated + HTML, sent it to the browser, and largely forgot + about the user until the next request arrived. + Interactivity meant full page reloads, and state + lived almost entirely on the server. +

    + +

    + Modern server-rendered applications behave very + differently. The initial HTML is often just a + starting point. It is “hydrated,” enhanced, and + kept alive by client-side logic that takes over + after the first render. The server no longer owns + the full interaction loop, but it hasn’t + disappeared either. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + Even ecosystems that never abandoned server + rendering, PHP being the most obvious example, + continued to thrive because they solved certain + problems well; they provided predictable execution + models, straightforward deployments, and proximity + to data. What changed was not their relevance, but + the expectation that they now coexist with richer + client-side behavior rather than compete with it. +

    + +

    + This isn’t a rollback. It’s an expansion of the + architectural map. +

    + +

    + Constraint-driven architecture +

    + +

    + Once teams step away from ideology, the + conversation becomes more productive. The question + shifts from “What is the best model?” to “What are + we optimizing for right now?” +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + Data volatility matters. Content that changes once + a week behaves very differently from real-time, + personalized data streams. Performance budgets + matter too. In an e-commerce flow, a + 100-millisecond delay can translate directly into + lost revenue. In an internal admin tool, the same + delay may be irrelevant. +

    + +

    + Operational reality plays a role as well. Some + teams can comfortably run and observe a fleet of + SSR servers. Others are better served by + static-first or serverless approaches simply + because that’s what their headcount and expertise + can support. +

    + +

    + These pressures rarely apply uniformly across an + application. Systems with strict uptime + requirements may even choose to duplicate logic + across layers to reduce coupling and failure + impact, for example, enforcing critical validation + rules both at the API boundary and again in the + client, so that a single back-end failure doesn’t + completely block user workflows. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + Hybrid architectures stop being a compromise in + this context. They become a way to make trade-offs + explicit rather than accidental. +

    +
    + +

    + When the server takes on more UI + responsibility +

    + +

    + One of the more subtle shifts in recent years is + how much responsibility the server takes on before + the browser becomes interactive. +

    + +

    + This goes well beyond SEO or faster + first paint. Servers live in predictable environments. They + have stable CPU resources and direct access to + databases and internal services. Browsers, by + contrast, run on everything from high-end desktops + to underpowered mobile devices on unreliable + networks. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + Increasingly, teams are using the server to do the + heavy lifting. Instead of sending fragmented data + to the client and asking the browser to assemble + it, the server prepares UI-ready view models. It + aggregates data, resolves permissions, and shapes + state in a way that would be expensive or + duplicative to do repeatedly on the client. +

    + +

    + By the time the payload reaches the browser, the + client’s job is narrower: activate and enhance. + This reduces the + time to interactive + and shrinks the amount of transformation logic + shipped to users. +

    + +

    + This naturally leads to incremental and selective + hydration. Hydration is no longer an all-or-nothing step. + Critical, above-the-fold elements become + interactive first. Less frequently used components + may not hydrate until the user engages with them. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + Performance optimization, in this model, becomes + localized rather than global. Teams improve + specific views or workflows without restructuring + the entire application. Rendering becomes a staged + process, not a binary choice. +

    + +

    + Debuggability changes the architecture + conversation +

    + +

    + As applications grow more distributed, performance + is no longer the only concern that shapes + architecture. Debuggability increasingly matters + just as much. +

    + +

    + In simpler systems, failures were easier to trace. + Rendering happened in one place. Logs told a clear + story. In modern applications, rendering can be + split across build pipelines, edge runtimes, and + long-lived client sessions. Data can be fetched, + cached, transformed, and rehydrated at different + moments in time. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + When something breaks, the hardest part is often + figuring out where it broke. +

    + +

    + This is where staged architectures show a real + advantage. When rendering responsibilities are + explicit, failures tend to be more localized. A + malformed initial render points to the server + layer. A UI that looks fine but fails on + interaction suggests a hydration or client-side + state issue. At an architectural level, this + mirrors the + single responsibility principle + applied beyond individual classes: Each stage has + a clear reason to change and a clear place to + investigate when something goes wrong. +

    +
    + +

    + Architectures that try to hide this complexity + behind “automatic” abstractions often make + debugging harder, not easier. Engineers end up + reverse engineering framework behavior instead of + reasoning about system design. It’s no surprise + that many senior teams now prefer systems that are + explicit, even boring, over ones that feel magical + but opaque. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + Frameworks as enablers, not answers +

    + +

    + This shift is visible across the ecosystem. + Angular + is a good example. Once seen as the archetype of + heavy client-side development, it has steadily + embraced server-side rendering, fine-grained + hydration, and + signals. Importantly, it doesn’t prescribe a single way + to use them. +

    + +

    + That pattern repeats elsewhere. Modern frameworks + are no longer trying to win an ideological war. + They are providing knobs and dials, ways to + control when work happens, where state lives, and + how rendering unfolds over time. +

    + +

    + The competition is no longer about purity. It’s + about flexibility under real-world constraints. + Pure architectures tend to look great in + greenfield projects. They age less gracefully. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + As requirements evolve, and they always do, strict + models accumulate exceptions. What began as a + clean set of rules turns into a collection of + caveats. Architectures that acknowledge complexity + early tend to be more resilient. Clear boundaries + make it possible to evolve one part of the system + without destabilizing everything else. +

    +
    + +

    + Rigor in 2026 is not about enforcing sameness. + It’s about enforcing clarity: knowing where code + runs, why it runs there, and how failures + propagate. +

    + +

    + Embracing the spectrum +

    + +

    + The idea of a single “right” way to build for the + web is finally losing its grip. And that’s a good + thing. +

    + +

    + Server-side rendering and client-side applications + were never enemies. They were tools that solved + different problems at different moments in time. + The web has matured enough to admit that most + architectural questions don’t have universal + answers. +

    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +

    + The most successful teams today aren’t chasing + trends. They understand their constraints, respect + their performance budgets, and treat rendering as + a spectrum rather than a switch. The web didn’t + grow up by picking a side. It grew up by embracing + nuance, and the architectures that will last are + the ones that do the same. +

    +
    +
    +
    +
    +
    + +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + Sonu Kapoor +
    +
    +
    + +
    +
    +
    +
    +
    +
    + + +
    + Contributor +
    +
    +
    +
    + + +
    + +
    +

    + Sonu Kapoor is a Google Developer Expert (GDE) in + Angular and a multi-year Microsoft MVP in Developer + Technologies. He has over 20 years of experience + building large-scale systems across finance, retail, and + enterprise software. Sonu is the co-author of Angular + Typed Forms and a frequent contributor to the Angular + ecosystem. He writes and speaks internationally on + modern web architecture, Angular Signals, and + AI-augmented software development. His work has been + featured in publications including CODE Magazine and + LeadDev. He is currently focused on advancing + signal-based architectures and helping teams design more + maintainable, performance-driven front ends. +

    +
    + +
    + +
    +

    + More from this author +

    +
    +
    + + + +
    +
    + +
    +
    +
    +
    +
    +
    +
    + + +
    + +
    +
    + + + + + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/bridging-the-operational-ai-gap/expected.html b/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/bridging-the-operational-ai-gap/expected.html new file mode 100644 index 0000000..d35af59 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/bridging-the-operational-ai-gap/expected.html @@ -0,0 +1,234 @@ + + + +
    +
    +
    +

    + In partnership with + Celigo +

    + +
    +
    +
    +
    +

    + The transformational potential of AI is already well + established. Enterprise use cases are building momentum and + organizations are transitioning from pilot projects to AI in + production. Companies are no longer just talking about AI; they + are redirecting budgets and resources to make it happen. Many + are already experimenting with agentic AI, which promises new + levels of automation. Yet, the road to full operational success + is still uncertain for many. And, while AI experimentation is + everywhere, enterprise-wide adoption remains elusive. +

    +

    + Without integrated data and systems, stable automated workflows, + and governance models, AI initiatives can get stuck in pilots + and struggle to move into production. The rise of agentic AI and + increasing model autonomy make a holistic approach to + integrating data, applications, and systems more important than + ever. Without it, enterprise AI initiatives may fail. Gartner + predicts over 40% of agentic AI projects will be cancelled by + 2027 due to cost, inaccuracy, and governance challenges. The + real issue is not the AI itself, but the missing operational + foundation. +

    +
    +
    +
    +
    +
    + + + +
    + +

    + To understand how organizations are structuring their AI + operations and how they are deploying successful AI projects, + MIT Technology Review Insights surveyed 500 senior IT leaders at + mid- to large-size companies in the US, all of which are + pursuing AI in some way. +

    +

    + The results of the survey, along with a series of expert + interviews, all conducted in December 2025, show that a strong + integration foundation aligns with more advanced AI + implementations, conducive to enterprise-wide initiatives. As AI + technologies and applications evolve and proliferate, an + integration platform can help organizations avoid duplication + and silos, and have clear oversight as they navigate the growing + autonomy of workflows. +

    +
    +
    + + + +
    +

    Key findings from the report include the following:

    +

    + Some organizations are making progress with AI.In recent years, study after study has exposed a lack of + tangible AI success. Yet, our research finds three in four (76%) + surveyed companies have at least one department with an AI + workflow fully in production. +

    +
    +

    + AI succeeds most frequently with well-defined, established + processes.Nearly half (43%) of organizations are finding success with AI + implementations applied to well-defined and automated processes. + A quarter are succeeding with new processes. And one-third (32%) + are applying AI to various processes. +

    +

    + Two-thirds of organizations lack dedicated AI teams.Only one in three (34%) organizations have a team specifically + for maintaining AI workflows. One in five (21%) say central IT + is responsible for ongoing AI maintenance, and 25% say the + responsibility lies with departmental operations. For 19% of + organizations, the responsibility is spread out. +

    +

    + Enterprise-wide integration platforms lead to more robust + implementation of AI.Companies with enterprise-wide integration platforms are five + times more likely to use more diverse data sources in AI + workflows. Six in 10 (59%) employ five or more data sources, + compared to only 11% of organizations using integration for + specific workflows, or 0% of those not using an integration + platform. Organizations using integration platforms also have + more multi-departmental implementation of AI, more autonomy in + AI workflows, and more confidence in assigning autonomy in the + future. +

    +

    + Download the report. +

    +

    + This content was produced by Insights, the custom content arm + of MIT Technology Review. It was not written by MIT Technology + Review’s editorial staff. It was researched, designed, and + written by human writers, editors, analysts, and illustrators. + This includes the writing of surveys and collection of data + for surveys. AI tools that may have been used were limited to + secondary production processes that passed thorough human + review. +

    +

    + +

    +
    +
    +
    +
    +
    + + diff --git a/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/bridging-the-operational-ai-gap/meta.json b/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/bridging-the-operational-ai-gap/meta.json new file mode 100644 index 0000000..0949fa6 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/bridging-the-operational-ai-gap/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://www.technologyreview.com/2026/03/04/1133642/bridging-the-operational-ai-gap/", + "host": "www.technologyreview.com", + "feed_source": "technologyreview", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:19:19.907622Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "Bridging the operational AI gap", + "extracted_word_count": 809, + "extracted_success": true, + "expected_selector": "div#content--body" +} diff --git a/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/bridging-the-operational-ai-gap/raw.html b/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/bridging-the-operational-ai-gap/raw.html new file mode 100644 index 0000000..f055494 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/bridging-the-operational-ai-gap/raw.html @@ -0,0 +1,41487 @@ + + + + + + Bridging the operational AI gap | MIT Technology Review + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Skip to Content +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    +

    + Sponsored +

    +
    +
    +
    +
    + Artificial intelligence +
    +

    + Bridging the operational AI gap +

    +
    +

    + Enterprise-wide integration is being leveraged to extend today’s process automations into tomorrow’s agentic workflows. +

    +
    +
    + +
    + March 4, 2026 +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    + In partnership withCeligo +

    + +
    +
    +
    +
    +

    + The transformational potential of AI is already + well established. Enterprise use cases are + building momentum and organizations are + transitioning from pilot projects to AI in + production. Companies are no longer just talking + about AI; they are redirecting budgets and + resources to make it happen. Many are already + experimenting with agentic AI, which promises + new levels of automation. Yet, the road to full + operational success is still uncertain for many. + And, while AI experimentation is everywhere, + enterprise-wide adoption remains elusive. +

    +

    + Without integrated data and systems, stable + automated workflows, and governance models, AI + initiatives can get stuck in pilots and struggle + to move into production. The rise of agentic AI + and increasing model autonomy make a holistic + approach to integrating data, applications, and + systems more important than ever. Without it, + enterprise AI initiatives may fail. Gartner + predicts over 40% of agentic AI projects will be + cancelled by 2027 due to cost, inaccuracy, and + governance challenges. The real issue is not the + AI itself, but the missing operational + foundation. +

    +
    +
    +
    +
    +
    + +
    + +

    + To understand how organizations are structuring + their AI operations and how they are deploying + successful AI projects, MIT Technology Review + Insights surveyed 500 senior IT leaders at mid- + to large-size companies in the US, all of which + are pursuing AI in some way. +

    +

    + The results of the survey, along with a series + of expert interviews, all conducted in December + 2025, show that a strong integration foundation + aligns with more advanced AI implementations, + conducive to enterprise-wide initiatives. As AI + technologies and applications evolve and + proliferate, an integration platform can help + organizations avoid duplication and silos, and + have clear oversight as they navigate the + growing autonomy of workflows. +

    +
    +
    + +
    +

    + Key findings from the report include the + following: +

    +

    + Some organizations are making progress with + AI. In recent years, study after study has exposed + a lack of tangible AI success. Yet, our research + finds three in four (76%) surveyed companies + have at least one department with an AI workflow + fully in production. +

    +
    +

    + AI succeeds most frequently with + well-defined, established processes. Nearly half (43%) of organizations are finding + success with AI implementations applied to + well-defined and automated processes. A quarter + are succeeding with new processes. And one-third + (32%) are applying AI to various processes. +

    +

    + Two-thirds of organizations lack dedicated AI + teams. Only one in three (34%) organizations have a + team specifically for maintaining AI workflows. + One in five (21%) say central IT is responsible + for ongoing AI maintenance, and 25% say the + responsibility lies with departmental + operations. For 19% of organizations, the + responsibility is spread out. +

    +

    + Enterprise-wide integration platforms lead to + more robust implementation of AI. Companies with enterprise-wide integration + platforms are five times more likely to use more + diverse data sources in AI workflows. Six in 10 + (59%) employ five or more data sources, compared + to only 11% of organizations using integration + for specific workflows, or 0% of those not using + an integration platform. Organizations using + integration platforms also have more + multi-departmental implementation of AI, more + autonomy in AI workflows, and more confidence in + assigning autonomy in the future. +

    +

    + Download the report. +

    +

    + This content was produced by Insights, the + custom content arm of MIT Technology Review. + It was not written by MIT Technology Review’s + editorial staff. It was researched, designed, + and written by human writers, editors, + analysts, and illustrators. This includes the + writing of surveys and collection of data for + surveys. AI tools that may have been used were + limited to secondary production processes that + passed thorough human review. +

    +

    + +

    +
    +
    +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    +

    + Deep Dive +

    +
    +
    +
    +

    + Artificial intelligence +

    +
    +
    +
    +
    +
    +
    + +
    +
    +

    + A “QuitGPT” campaign is urging people to cancel their + ChatGPT subscriptions +

    +
    +

    + Backlash against ICE is fueling a broader movement + against AI companies’ ties to President Trump. +

    +
    + +
    +
    +
    +
    +
    +
    + +
    +
    +

    + Moltbook was peak AI theater +

    +
    +

    + The viral social network for bots reveals more about our + own current mania for AI as it does about the future of + agents. +

    +
    + +
    +
    +
    +
    +
    +
    + +
    +
    +

    + Meet the new biologists treating LLMs like aliens +

    +
    +

    + By studying large language models as if they were living + things instead of computer programs, scientists are + discovering some of their secrets for the first time. +

    +
    + +
    +
    +
    +
    +
    +
    + +
    +
    +

    + What’s next for AI in 2026 +

    +
    +

    + Our AI writers make their big bets for the coming + year—here are five hot trends to watch. +

    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +

    + Stay connected +

    +
    + +
    + Illustration by Rose Wong +
    +

    + Get the latest updates from
    MIT Technology Review +

    +

    + Discover special offers, top stories, upcoming events, and + more. +

    +
    + +
    +
    +

    + Thank you for submitting your email! +

    +
    + Explore more newsletters +
    +
    +
    +

    + It looks like something went wrong. +

    +
    +

    + We’re having trouble saving your preferences. Try + refreshing this page and updating them one more time. If + you continue to get this message, reach out to us at + customer-service@technologyreview.com + with a list of newsletters you’d like to receive. +

    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    + +
    +
    +
    + +
    +
    + + diff --git a/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/openais-compromise-with-the-pentagon/expected.html b/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/openais-compromise-with-the-pentagon/expected.html new file mode 100644 index 0000000..0ab2010 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/openais-compromise-with-the-pentagon/expected.html @@ -0,0 +1,469 @@ + + + +
    +
    +
    +
    +
    +

    + On February 28, OpenAI announced it had reached a deal that will + allow the US military to use its technologies in classified + settings. CEO Sam Altman said the negotiations, which the + company began pursuing only + afterthe Pentagon’s public reprimand of Anthropic, were “definitely rushed.” +

    +

    + In its announcements, OpenAI took great pains to say that it had + not caved to allow the Pentagon to do whatever it wanted with + its technology. The company published a + blog postexplaining that its agreement protected against use for + autonomous weapons and mass domestic surveillance, and Altman + saidthe company did not simply accept the same terms that Anthropic + refused.  +

    +
    +
    +
    +
    +

    + You could read this to say that OpenAI won both the contract and + the moral high ground, but reading between the lines and the + legalese makes something else clear: Anthropic pursued a moral + approach that won it many supporters but failed, while OpenAI + pursued a pragmatic and legal approach that is ultimately softer + on the Pentagon. +

    +

    + It’s not yet clear if OpenAI can build in the safety precautions + it promises as the military rushes out a politicized AI strategy + during strikes on Iran, or if the deal will be seen as good + enough by employees who wanted the company to take a harder + line. Walking that tightrope will be tricky. (OpenAI did not + immediately respond to requests for additional information about + its agreement.) +

    +
    +
    + +
    +
    +
    +
    + +
    +
    +

    + But the devil is also in the details. The reason OpenAI was able + to make a deal when Anthropic could not was less about + boundaries, Altman said, but about approach. “Anthropic seemed + more focused on specific prohibitions in the contract, rather + than citing applicable laws, which we felt comfortable with,” he + wrote.  +

    +

    + OpenAI says one + basisfor its willingness to work with the Pentagon is simply an + assumption that the government won’t break the law. The company, + which has shared a + limited excerptof its contract, cites a number of laws and policies related to + autonomous weapons and surveillance. They are as specific as a + 2023 + directivefrom the Pentagon on autonomous weapons (which does not + prohibit them but issues guidelines for their design and + testing) and as broad as the Fourth Amendment, which has + supported protections for Americans against mass surveillance.  +

    +
    +

    + However, the published excerpt “does not give OpenAI an + Anthropic-style, free-standing right to prohibit + otherwise-lawful government use,” + wroteJessica Tillipman, associate dean for government procurement + law studies at George Washington University’s law school. It + simply states that the Pentagon can’t use OpenAI’s tech to break + any of those laws and policies as they’re stated today. +

    +

    + The whole reason Anthropic earned so many supporters in its + fight—including some of OpenAI’s + own employees—is that they + don’t believe these rules are good enough to prevent the + creation of AI-enabled autonomous weapons or mass surveillance. + And an assumption that federal agencies won’t break the law is + little assurance to anyone who remembers that the surveillance + practices exposed by Edward Snowden had been deemed legal by + internal agencies and were ruled unlawful only after drawn-out + battles (not to mention the many surveillance tactics + allowed under current lawthat AI could expand). On this front, we’ve essentially ended + up back where we started: allowing the Pentagon to use its AI + for any lawful use.  +

    +

    + OpenAI could say, as its head of national security partnerships + wroteyesterday, that if you believe the government won’t follow the + law, then you should also not be confident it would honor the + red lines that Anthropic was proposing. But that’s not an + argument against setting them. Imperfect enforcement doesn’t + make constraints meaningless, and contract terms still shape + behavior, oversight, and political consequences. +

    +

    + OpenAI claims a second line of defense. The company says it + maintains control over the safety rules governing its models and + will not give the military a version of its AI stripped of those + safety controls. “We can embed our red lines—no mass + surveillance and no directing weapons systems without human + involvement—directly into model behavior,” + wroteBoaz Barak, an OpenAI employee Altman deputized to speak on the + issue about X.  +

    +
    +
    +
    +
    +

    + But the company doesn’t specify how its safety rules for the + military differ from its rules for normal users. Enforcement is + also never perfect, and it is especially unlikely to be when + OpenAI is rolling out these protections in a classified setting + for the first time and is expected to do so in just six months. +

    +
    +
    + +
    +
    +

    + There’s another question beneath all this: Should it be down to + tech companies to prohibit things that are legal but that they + find morally objectionable? The government certainly viewed + Anthropic’s willingness to play this role as + unacceptable. On Friday evening, eight hours before the US launched strikes + in Tehran, Defense Secretary Pete Hegseth issued harsh remarks + on X. “Anthropic delivered a master class in arrogance and + betrayal,” he wrote, and echoed President Trump’s order for the + government to cease working with the AI company after Anthropic + sought to keep its model Claude from being used for autonomous + weapons or mass domestic surveillance. “The Department of War + must have full, unrestricted access to Anthropic’s models for + every LAWFUL purpose,” Hegseth wrote. +

    +

    + But unless OpenAI’s full contract will reveal more, it’s hard + not to see the company as sitting on an ideological seesaw, + promising that it does have leverage it will proudly + use to do what it sees as the right thing while deferring to the + law as the main backstop for what the Pentagon can do with its + tech. +

    +

    + There are three things to be watching here. One is whether this + position will be good enough for OpenAI’s most critical + employees. With AI companies spending so heavily on talent, it’s + possible that some at OpenAI see in Altman’s justification an + unforgivable compromise. +

    +
    +
    +
    + +
    +
    +

    + Second, there is the scorched-earth campaign that Hegseth has + promised to wage against Anthropic. Going far beyond simply + canceling the government’s contract with the company, he + announced that it would be classified as a supply chain risk, + and that “no contractor, supplier, or partner that does business + with the United States military may conduct any commercial + activity with Anthropic.” There is significant debate about + whether this death blow is + legally possible, and Anthropic has + saidit will sue if the threat is pursued. OpenAI has also come out + againstthe move. +

    +

    + Lastly, how will the Pentagon swap out Claude—the only AI model + it actively uses in classified operations, + includingsome in Venezuela—while it + escalatesstrikes against Iran? Hegseth granted the agency six months to + do so, during which the military will phase in OpenAI’s models + as well as those from Elon Musk’s + xAI. +

    +

    + But Claude was reportedly + usedin the strikes on Iran hours after the ban was issued, + suggesting that a phase-out will be anything but simple. Even if + the months-long feud between Anthropic and the Pentagon is over + (which I doubt it is), we are now seeing the Pentagon’s AI + acceleration planput pressure on companies to relinquish lines in the sand they + had + once drawn, with new tensions in the Middle East as the primary testing + ground. +

    +

    + If you have information to share about how this is unfolding, + reach out to me via Signal (username: + jamesodonnell.22). +

    +

    + +

    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + diff --git a/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/openais-compromise-with-the-pentagon/meta.json b/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/openais-compromise-with-the-pentagon/meta.json new file mode 100644 index 0000000..985aa9f --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/openais-compromise-with-the-pentagon/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://www.technologyreview.com/2026/03/02/1133850/openais-compromise-with-the-pentagon-is-what-anthropic-feared/", + "host": "www.technologyreview.com", + "feed_source": "technologyreview", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:19:45.937926Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "OpenAI’s “compromise” with the Pentagon is what Anthropic feared", + "extracted_word_count": 1432, + "extracted_success": true, + "expected_selector": "div#content--body" +} diff --git a/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/openais-compromise-with-the-pentagon/raw.html b/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/openais-compromise-with-the-pentagon/raw.html new file mode 100644 index 0000000..5ef7724 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/openais-compromise-with-the-pentagon/raw.html @@ -0,0 +1,42177 @@ + + + + + + + OpenAI’s ‘compromise’ with the Pentagon is what Anthropic feared | MIT + Technology Review + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Skip to Content +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    + Artificial intelligence +
    +

    + OpenAI’s “compromise” with the Pentagon is what Anthropic + feared +

    +
    +

    + Anthropic pushed for moral boundaries. OpenAI settled + for softer legal ones, and now it stands to benefit as + the Pentagon rushes out a politicized AI strategy during + strikes on Iran. +

    +
    +
    + +
    + March 2, 2026 +
    +
    +
    +
    + collage elements with Pete Hegseth, Daren Amodei and Sam Altman +
    + Stephanie Arnett/MIT Technology Review | Getty + Images +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    + On February 28, OpenAI announced it had reached + a deal that will allow the US military to use + its technologies in classified settings. CEO Sam + Altman said the negotiations, which the company + began pursuing only + after + the Pentagon’s public reprimand of Anthropic, + were “definitely rushed.” +

    +

    + In its announcements, OpenAI took great pains to + say that it had not caved to allow the Pentagon + to do whatever it wanted with its technology. + The company published a + blog post + explaining that its agreement protected against + use for autonomous weapons and mass domestic + surveillance, and Altman + said + the company did not simply accept the same terms + that Anthropic refused.  +

    +
    +
    +
    +
    +

    + You could read this to say that OpenAI won both + the contract and the moral high ground, but + reading between the lines and the legalese makes + something else clear: Anthropic pursued a moral + approach that won it many supporters but failed, + while OpenAI pursued a pragmatic and legal + approach that is ultimately softer on the + Pentagon.  +

    +

    + It’s not yet clear if OpenAI can build in the + safety precautions it promises as the military + rushes out a politicized AI strategy during + strikes on Iran, or if the deal will be seen as + good enough by employees who wanted the company + to take a harder line. Walking that tightrope + will be tricky. (OpenAI did not immediately + respond to requests for additional information + about its agreement.) +

    +
    +
    + +
    +
    +
    +
    + +
    +
    +

    + But the devil is also in the details. The reason + OpenAI was able to make a deal when Anthropic + could not was less about boundaries, Altman + said, but about approach. “Anthropic seemed more + focused on specific prohibitions in the + contract, rather than citing applicable laws, + which we felt comfortable with,” he + wrote.  +

    +

    + OpenAI says one + basis + for its willingness to work with the Pentagon is + simply an assumption that the government won’t + break the law. The company, which has shared a + limited excerpt + of its contract, cites a number of laws and + policies related to autonomous weapons and + surveillance. They are as specific as a 2023 + directive + from the Pentagon on autonomous weapons (which + does not prohibit them but issues guidelines for + their design and testing) and as broad as the + Fourth Amendment, which has supported + protections for Americans against mass + surveillance.  +

    +
    +

    + However, the published excerpt “does not give + OpenAI an Anthropic-style, free-standing right + to prohibit otherwise-lawful government use,” + wrote + Jessica Tillipman, associate dean for government + procurement law studies at George Washington + University’s law school. It simply states that + the Pentagon can’t use OpenAI’s tech to break + any of those laws and policies as they’re stated + today. +

    +

    + The whole reason Anthropic earned so many + supporters in its fight—including some of + OpenAI’s + own employees—is that they don’t believe these rules are + good enough to prevent the creation of + AI-enabled autonomous weapons or mass + surveillance. And an assumption that federal + agencies won’t break the law is little assurance + to anyone who remembers that the surveillance + practices exposed by Edward Snowden had been + deemed legal by internal agencies and were ruled + unlawful only after drawn-out battles (not to + mention the many surveillance tactics + allowed under current law + that AI could expand). On this front, we’ve + essentially ended up back where we started: + allowing the Pentagon to use its AI for any + lawful use.  +

    +

    + OpenAI could say, as its head of national + security partnerships + wrote + yesterday, that if you believe the government + won’t follow the law, then you should also not + be confident it would honor the red lines that + Anthropic was proposing. But that’s not an + argument against setting them. Imperfect + enforcement doesn’t make constraints + meaningless, and contract terms still shape + behavior, oversight, and political consequences. +

    +

    + OpenAI claims a second line of defense. The + company says it maintains control over the + safety rules governing its models and will not + give the military a version of its AI stripped + of those safety controls. “We can embed our red + lines—no mass surveillance and no directing + weapons systems without human + involvement—directly into model behavior,” + wrote + Boaz Barak, an OpenAI employee Altman deputized + to speak on the issue about X.  +

    +
    +
    +
    +
    +

    + But the company doesn’t specify how its safety + rules for the military differ from its rules for + normal users. Enforcement is also never perfect, + and it is especially unlikely to be when OpenAI + is rolling out these protections in a classified + setting for the first time and is expected to do + so in just six months. +

    +
    +
    + +
    +
    +

    + There’s another question beneath all this: + Should it be down to tech companies to prohibit + things that are legal but that they find morally + objectionable? The government certainly viewed + Anthropic’s willingness to play this role as + unacceptable. On Friday evening, eight hours before the US + launched strikes in Tehran, Defense Secretary + Pete Hegseth issued harsh remarks on X. + “Anthropic delivered a master class in arrogance + and betrayal,” he wrote, and echoed President + Trump’s order for the government to cease + working with the AI company after Anthropic + sought to keep its model Claude from being used + for autonomous weapons or mass domestic + surveillance. “The Department of War must have + full, unrestricted access to Anthropic’s models + for every LAWFUL purpose,” Hegseth wrote. +

    +

    + But unless OpenAI’s full contract will reveal + more, it’s hard not to see the company as + sitting on an ideological seesaw, promising that + it does have leverage it will proudly + use to do what it sees as the right thing while + deferring to the law as the main backstop for + what the Pentagon can do with its tech. +

    +

    + There are three things to be watching here. One + is whether this position will be good enough for + OpenAI’s most critical employees. With AI + companies spending so heavily on talent, it’s + possible that some at OpenAI see in Altman’s + justification an unforgivable compromise. +

    +
    +
    +
    + +
    +
    +

    + Second, there is the scorched-earth campaign + that Hegseth has promised to wage against + Anthropic. Going far beyond simply canceling the + government’s contract with the company, he + announced that it would be classified as a + supply chain risk, and that “no contractor, + supplier, or partner that does business with the + United States military may conduct any + commercial activity with Anthropic.” There is + significant debate about whether this death blow + is + legally possible, and Anthropic has + said + it will sue if the threat is pursued. OpenAI has + also come out + against + the move. +

    +

    + Lastly, how will the Pentagon swap out + Claude—the only AI model it actively uses in + classified operations, + including + some in Venezuela—while it + escalates + strikes against Iran? Hegseth granted the agency + six months to do so, during which the military + will phase in OpenAI’s models as well as those + from Elon Musk’s + xAI. +

    +

    + But Claude was reportedly + used + in the strikes on Iran hours after the ban was + issued, suggesting that a phase-out will be + anything but simple. Even if the months-long + feud between Anthropic and the Pentagon is over + (which I doubt it is), we are now seeing the + Pentagon’s AI + acceleration plan + put pressure on companies to relinquish lines in + the sand they had + once drawn, with new tensions in the Middle East as the + primary testing ground. +

    +

    + If you have information to share about how + this is unfolding, reach out to me via Signal + (username: jamesodonnell.22). +

    +

    + +

    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    +

    + Deep Dive +

    +
    +
    +
    +

    + Artificial intelligence +

    +
    +
    +
    +
    +
    +
    + +
    +
    +

    + A “QuitGPT” campaign is urging people to cancel their + ChatGPT subscriptions +

    +
    +

    + Backlash against ICE is fueling a broader movement + against AI companies’ ties to President Trump. +

    +
    + +
    +
    +
    +
    +
    +
    + +
    +
    +

    + Moltbook was peak AI theater +

    +
    +

    + The viral social network for bots reveals more about our + own current mania for AI as it does about the future of + agents. +

    +
    + +
    +
    +
    +
    +
    +
    + +
    +
    +

    + Meet the new biologists treating LLMs like aliens +

    +
    +

    + By studying large language models as if they were living + things instead of computer programs, scientists are + discovering some of their secrets for the first time. +

    +
    + +
    +
    +
    +
    +
    +
    + +
    +
    +

    + What’s next for AI in 2026 +

    +
    +

    + Our AI writers make their big bets for the coming + year—here are five hot trends to watch. +

    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +

    + Stay connected +

    +
    + +
    + Illustration by Rose Wong +
    +

    + Get the latest updates from
    MIT Technology Review +

    +

    + Discover special offers, top stories, upcoming events, and + more. +

    +
    + +
    +
    +

    + Thank you for submitting your email! +

    +
    + Explore more newsletters +
    +
    +
    +

    + It looks like something went wrong. +

    +
    +

    + We’re having trouble saving your preferences. Try + refreshing this page and updating them one more time. If + you continue to get this message, reach out to us at + customer-service@technologyreview.com + with a list of newsletters you’d like to receive. +

    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    + +
    +
    + +
    + +
    + +
    + +
    +
    + + diff --git a/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/the-download-lightning-openai-pentagon/expected.html b/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/the-download-lightning-openai-pentagon/expected.html new file mode 100644 index 0000000..b58748e --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/the-download-lightning-openai-pentagon/expected.html @@ -0,0 +1,545 @@ + + + +
    +
    +
    +
    +
    +

    + This is today's edition ofThe Download,our weekday newsletter that provides a daily dose of what's + going on in the world of technology. +

    +

    + This startup claims it can stop lightning and prevent + catastrophic wildfires +

    +
    +
    +
    +
    +

    + Startup Skyward Wildfire says it can prevent catastrophic fires by stopping the + lightning strikes that ignite them. So far, it hasn’t publicly + revealed how it does so, but online documents suggest the + company is relying on an approach the US government began + evaluating in the early 1960s: seeding clouds with metallic + chaff, or narrow fiberglass strands coated with aluminum.  +

    +

    + It just raised millions of dollars to accelerate its product + development and expand its operations. But researchers and + environmental observers say uncertainties remain, including how + well the seeding may work under varying conditions, how much + material would need to be released, how frequently it would have + to be done, and what sorts of secondary environmental impacts + might result. Read the full story. +

    +
    +

    + —James Temple +

    +

    + OpenAI’s “compromise” with the Pentagon is what Anthropic + feared + +

    +
    +

    + OpenAI has reached a deal that will allow the US military to use + its technologies in classified settings. CEO Sam Altman said the + negotiations, which the company began pursuing only after the Pentagon’s public reprimand of Anthropic, were “definitely rushed.” +

    +

    + OpenAI has taken great pains to say that it has not caved to + allow the Pentagon to do whatever it wants with its technology. + The company published a blog post explaining that its agreement protected against use for + autonomous weapons and mass domestic surveillance, and Altman said the company did not simply accept the same terms that + Anthropic refused.  +

    +

    + But it’s not yet clear if OpenAI can build in the safety + precautions it promises as the military rushes out a politicized + AI strategy during strikes on Iran, or if the deal will be seen + as good enough by employees who wanted the company to take a + harder line. Walking that tightrope will be tricky. Read the full story. +

    +

    + —James O’Donnell +

    +
    +
    +
    +
    +

    + The story is from The Algorithm,our weekly newsletter on AI. To get stories like this in your + inbox first,sign up here. +

    +

    + The must-reads +

    +

    + I’ve combed the internet to find you today’s most + fun/important/scary/fascinating stories about technology. +

    +

    + 1 Gulf states are racing against time to intercept Iran’s + drone attacks +

    +
    +
    +

    + They could run out of interceptors very soon. (WSJ $) +

    +
      +
    • + Amazon says it lost three data centers in the strikes. (Business Insider $) +
    • +
    • + There has been a spike in GPS attacks too, affecting nearby + shipping. (Wired) +
    • +
    • + Crypto stocks are tumbling in response.(Bloomberg) +
    • +
    +

    + 2 Apple is considering using Google’s Gemini AI to power + Siri +

    +

    + It’s also set to deepen its reliance on Google’s cloud + infrastructure. (The Information $) +

    +

    + 3 A database shows which topics fall foul of the Trump + administration +

    +
    +
    +
    +
    +

    + National parks are being forced to erase any exhibits that + display “partisan ideology”. (WP $) +

    +
      +
    • + The transatlantic battle over free speech is coming.(FT $) +
    • +
    • + What it’s like to be banned in the US for fighting online + hate.(MIT Technology Review) +
    • +
    +

    + 4 Can AI actually enhance jobs, not just destroy + them? +

    +

    + Three economists take the optimistic view (New Yorker) +

    + +

    + 5 Are “bossware” apps tracking you? +

    +
    +

    + Tools to watch what workers are doing are getting more and more + sophisticated. (NYT) +

    + +

    + 6 RFK Jr says he is about to unleash 14 banned + peptides +

    +
    +

    + By reversing a Biden-era FDA ban on their production. (Gizmodo) +

    + +

    + 7 Meta is testing an AI shopping research tool +

    +
    +
    +
    +
    +

    + It hopes to rival Gemini and ChatGPT. (Bloomberg) +

    +

    + 8 Maybe data centers in space aren’t as crazy as they + sound? +

    +

    + They could be cheaper, with the right tech. (Economist)  +

    + +

    + 9 Why climate change is making turbulence worse +

    +
    +

    + Buckle up, people. (New Yorker) +

    +

    + 10 6G is on its way! +

    +

    + And the hype cycle is doing its thing again. (The Verge $) +

    +

    + Quote of the day +

    +
    +
    +
    +
    +
    +

    + “We don’t list markets directly tied to death. When there are + markets where potential outcomes involve death, we design the + rules to prevent people from profiting from death.” +

    +

    + —Tarek Mansour, CEO and founder of prediction market company + Kalshi, tries to justify the $54 million bet on “Ali Khamenei + out as Supreme Leader?” on his platform, 404 Media reports. +

    +

    + One More Thing +

    +
    +
    + surveillance and control concept +
    EDEL RODRIGUEZ
    +
    +
    +

    + +

    +

    + South Africa’s private surveillance machine is fueling a + digital apartheid +

    +

    + Johannesburg is birthing a uniquely South African surveillance + model. Over the past decade, the city has become host to a + centralized, coordinated, entirely privatized mass surveillance + operation. These tools have been enthusiastically adopted by the + local security industry, grappling with the pressures of a + high-crime environment. +

    +

    + Civil rights activists worry the new surveillance is fueling a + digital apartheid and unraveling people’s democratic liberties, + but a growing chorus of experts say the stakes are even higher. +

    +

    + They argue that the impact of artificial intelligence is + repeating the patterns of colonial history, and here in South + Africa, where colonial legacies abound, the unfettered + deployment of AI surveillance offers just one case study in how + a technology that promised to bring societies into the future is + threatening to send them back to the past. Read the full story. +

    +
    +
    +
    +
    +

    + —Karen Hao and Heidi Swart +

    +

    + We can still have nice things +

    +

    + A place for comfort, fun and distraction to brighten up your + day. (Got any ideas? + Drop me a line + or + skeet 'em at me + .) +

    +

    + + These influencers are on a mission to save the UK’s pubs.  +

    +

    + + Here’s what a map of America solely made up of its rivers would look like. +

    +

    + + The winner of the Underwater Photographer of the Year awards is + incredibly cute.
    + Pokémon may have turned 30 years old, but the franchise is more popular than ever. +

    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + diff --git a/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/the-download-lightning-openai-pentagon/meta.json b/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/the-download-lightning-openai-pentagon/meta.json new file mode 100644 index 0000000..7d9474f --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/the-download-lightning-openai-pentagon/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://www.technologyreview.com/2026/03/03/1133900/the-download-the-startup-that-says-it-can-stop-lightning-and-inside-openais-pentagon-deal/", + "host": "www.technologyreview.com", + "feed_source": "technologyreview", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:19:23.587528Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "The Download: The startup that says it can stop lightning, and inside OpenAI’s Pentagon deal", + "extracted_word_count": 1208, + "extracted_success": true, + "expected_selector": "div#content--body" +} diff --git a/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/the-download-lightning-openai-pentagon/raw.html b/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/the-download-lightning-openai-pentagon/raw.html new file mode 100644 index 0000000..e357882 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.technologyreview.com/the-download-lightning-openai-pentagon/raw.html @@ -0,0 +1,41983 @@ + + + + + + + The Download: The startup that says it can stop lightning, and inside + OpenAI's Pentagon deal | MIT Technology Review + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + Skip to Content +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    + The Download +
    +

    + The Download: The startup that says it can stop lightning, + and inside OpenAI’s Pentagon deal +

    +
    +

    + This is today's edition of The Download, our weekday + newsletter that provides a daily dose of what's going on + in the world of technology. +

    +
    +
    + +
    + March 3, 2026 +
    +
    +
    +
    +
    + lightning strike in wooded area of Canada +
    + Lightning strikes a wooded area in British Columbia's + Nahatlatch Valley.Getty Images +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +

    + This is today's edition of The Download, + our weekday newsletter that provides a daily + dose of what's going on in the world of + technology. +

    +

    + This startup claims it can stop lightning and + prevent catastrophic wildfires +

    +
    +
    +
    +
    +

    + Startup Skyward Wildfire says it can prevent catastrophic fires by + stopping the lightning strikes that ignite them. + So far, it hasn’t publicly revealed how it does + so, but online documents suggest the company is + relying on an approach the US government began + evaluating in the early 1960s: seeding clouds + with metallic chaff, or narrow fiberglass + strands coated with aluminum.  +

    +

    + It just raised millions of dollars to accelerate + its product development and expand its + operations. But researchers and environmental + observers say uncertainties remain, including + how well the seeding may work under varying + conditions, how much material would need to be + released, how frequently it would have to be + done, and what sorts of secondary environmental + impacts might result. Read the full story.  +

    +
    +

    —James Temple

    +

    + OpenAI’s “compromise” with the Pentagon is + what Anthropic feared +

    +
    +

    + OpenAI has reached a deal that will allow the US + military to use its technologies in classified + settings. CEO Sam Altman said the negotiations, + which the company began pursuing only after the Pentagon’s public reprimand of + Anthropic, were “definitely rushed.” +

    +

    + OpenAI has taken great pains to say that it has + not caved to allow the Pentagon to do whatever + it wants with its technology. The company + published a blog post explaining that its agreement protected + against use for autonomous weapons and mass + domestic surveillance, and Altman said the company did not simply accept the + same terms that Anthropic refused.  +

    +

    + But it’s not yet clear if OpenAI can build in + the safety precautions it promises as the + military rushes out a politicized AI strategy + during strikes on Iran, or if the deal will be + seen as good enough by employees who wanted the + company to take a harder line. Walking that + tightrope will be tricky. Read the full story. +

    +

    —James O’Donnell

    +
    +
    +
    +
    +

    + The story is from The + Algorithm, our weekly newsletter on AI. To get stories + like this in your inbox first, sign up here. +

    +

    The must-reads

    +

    + I’ve combed the internet to find you today’s + most fun/important/scary/fascinating stories + about technology. +

    +

    + 1 Gulf states are racing against time to + intercept Iran’s drone attacks +

    +
    +
    +

    + They could run out of interceptors very soon. + (WSJ $) +

    +
      +
    • + Amazon says it lost three data centers in + the strikes. (Business Insider $) +
    • +
    • + There has been a spike in GPS attacks too, + affecting nearby shipping. (Wired) +
    • +
    • + Crypto stocks are tumbling in + response. (Bloomberg) +
    • +
    +

    + 2 Apple is considering using Google’s Gemini + AI to power Siri +

    +

    + It’s also set to deepen its reliance on Google’s + cloud infrastructure. (The Information $) +

    +

    + 3 A database shows which topics fall foul of + the Trump administration +

    +
    +
    +
    +
    +

    + National parks are being forced to erase any + exhibits that display “partisan ideology”. (WP $) +

    +
      +
    • + The transatlantic battle over free speech + is coming. (FT $) +
    • +
    • + What it’s like to be banned in the US for + fighting online hate. (MIT Technology Review) +
    • +
    +

    + 4 Can AI actually enhance jobs, not just + destroy them? +

    +

    + Three economists take the optimistic view (New Yorker) +

    + +

    + 5 Are “bossware” apps tracking + you?  +

    +
    +

    + Tools to watch what workers are doing are + getting more and more sophisticated. (NYT) +

    + +

    + 6 RFK Jr says he is about to unleash 14 + banned peptides +

    +
    +

    + By reversing a Biden-era FDA ban on their + production. (Gizmodo) +

    + +

    + 7 Meta is testing an AI shopping research + tool +

    +
    +
    +
    +
    +

    + It hopes to rival Gemini and ChatGPT. (Bloomberg) +

    +

    + 8 Maybe data centers in space aren’t as crazy + as they sound?  +

    +

    + They could be cheaper, with the right tech. (Economist)  +

    + +

    + 9 Why climate change is making turbulence + worse +

    +
    +

    + Buckle up, people. (New Yorker) +

    +

    10 6G is on its way!

    +

    + And the hype cycle is doing its thing again. (The Verge $) +

    +

    Quote of the day

    +
    +
    +
    +
    +
    +

    + “We don’t list markets directly tied to death. + When there are markets where potential outcomes + involve death, we design the rules to prevent + people from profiting from death.” +

    +

    + —Tarek Mansour, CEO and founder of prediction + market company Kalshi, tries to justify the $54 + million bet on “Ali Khamenei out as Supreme + Leader?” on his platform, 404 Media reports. +

    +

    One More Thing

    +
    +
    + surveillance and control concept +
    EDEL RODRIGUEZ
    +
    +
    +

    + +

    +

    + South Africa’s private surveillance machine + is fueling a digital apartheid +

    +

    + Johannesburg is birthing a uniquely South + African surveillance model. Over the past + decade, the city has become host to a + centralized, coordinated, entirely privatized + mass surveillance operation. These tools have + been enthusiastically adopted by the local + security industry, grappling with the pressures + of a high-crime environment. +

    +

    + Civil rights activists worry the new + surveillance is fueling a digital apartheid and + unraveling people’s democratic liberties, but a + growing chorus of experts say the stakes are + even higher.  +

    +

    + They argue that the impact of artificial + intelligence is repeating the patterns of + colonial history, and here in South Africa, + where colonial legacies abound, the unfettered + deployment of AI surveillance offers just one + case study in how a technology that promised to + bring societies into the future is threatening + to send them back to the past. Read the full story. +

    +
    +
    +
    +
    +

    —Karen Hao and Heidi Swart

    +

    + We can still have nice things +

    +

    + A place for comfort, fun and distraction to + brighten up your day. (Got any + ideas? Drop me a line or skeet 'em at me.) +

    +

    + + These influencers are on a mission to save the UK’s pubs.  +

    +

    + + Here’s what a map of America solely made up of + its rivers would look like. +

    +

    + + The winner of the Underwater Photographer of the + Year awards is incredibly cute.
    + Pokémon + may have turned 30 years old, but the franchise is more popular than + ever. +

    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    + +
    +
    +
    +
    +

    + Deep Dive +

    +
    +
    +
    +

    + The Download +

    +
    +
    +
    +
    +
    +
    + +
    +
    +

    + The Download: AI-enhanced cybercrime, and secure AI + assistants +

    +
    +

    + Plus: Instagram's CEO Adam Mosseri has denied claims + that social media is “clinically addictive” +

    +
    + +
    +
    +
    +
    +
    +
    + +
    +
    +

    + The Download: sodium-ion batteries and China’s bright + tech future +

    +
    +

    + Plus: This company is developing gene therapies for + muscle growth, erectile dysfunction, and “radical + longevity” +

    +
    + +
    +
    +
    +
    +
    +
    + +
    +
    +

    + The Download: the future of nuclear power plants, and + social media-fueled AI hype +

    +
    +

    + Plus: more European countries are considering banning + social media for under-16s +

    +
    + +
    +
    +
    +
    +
    +
    + +
    +
    +

    + The Download: cut through AI coding hype, and biotech + trends to watch +

    +
    +

    + Plus: read our predictions for the five hottest AI + trends to watch +

    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +

    + Stay connected +

    +
    + +
    + Illustration by Rose Wong +
    +

    + Get the latest updates from
    MIT Technology Review +

    +

    + Discover special offers, top stories, upcoming events, and + more. +

    +
    + +
    +
    +

    + Thank you for submitting your email! +

    +
    + Explore more newsletters +
    +
    +
    +

    + It looks like something went wrong. +

    +
    +

    + We’re having trouble saving your preferences. Try + refreshing this page and updating them one more time. If + you continue to get this message, reach out to us at + customer-service@technologyreview.com + with a list of newsletters you’d like to receive. +

    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    + +
    +
    +
    + +
    +
    + + diff --git a/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/danish-pm-candidate-rejects-far-right-call-for-fewer-muslims/expected.html b/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/danish-pm-candidate-rejects-far-right-call-for-fewer-muslims/expected.html new file mode 100644 index 0000000..129af6a --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/danish-pm-candidate-rejects-far-right-call-for-fewer-muslims/expected.html @@ -0,0 +1,131 @@ + + + + + + diff --git a/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/danish-pm-candidate-rejects-far-right-call-for-fewer-muslims/meta.json b/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/danish-pm-candidate-rejects-far-right-call-for-fewer-muslims/meta.json new file mode 100644 index 0000000..abafe36 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/danish-pm-candidate-rejects-far-right-call-for-fewer-muslims/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://www.thelocal.dk/20260304/danish-pm-candidate-rejects-far-right-call-for-fewer-muslims", + "host": "www.thelocal.dk", + "feed_source": "thelocal", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:16:35.558683Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "Danish PM candidate rejects far-right call for fewer Muslims", + "extracted_word_count": 507, + "extracted_success": true, + "expected_selector": "div#articleBody" +} diff --git a/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/danish-pm-candidate-rejects-far-right-call-for-fewer-muslims/raw.html b/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/danish-pm-candidate-rejects-far-right-call-for-fewer-muslims/raw.html new file mode 100644 index 0000000..9e53818 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/danish-pm-candidate-rejects-far-right-call-for-fewer-muslims/raw.html @@ -0,0 +1,7030 @@ + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Danish PM candidate rejects far-right call for fewer Muslims + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + +
    + +
    +

    Advertisement

    + +
    + + + +
    +
    + +
    +
    +
    +
    +
    + + +

    + Danish PM candidate rejects far-right call for fewer Muslims +

    +
    +
    + Michael Barrett +
    + + Michael Barrett + - michael@thelocal.dk + + + +
    +
    + +
    +
    + Danish PM candidate rejects far-right call for fewer Muslims +
    + Liberal party leader Troels Lund Poulsen has rejected + calls from the far right to reduce the number of Muslims + in Denmark. File photo: Mads Claus Rasmussen/Ritzau + Scanpix +
    +
    +

    + Liberal (Venstre) party leader Troels Lund Poulsen, who + wants to lead a conservative government after this month’s + election, has rejected calls from the far right to reduce + the number of Muslims in Denmark. +

    +
    +
    +

    Advertisement

    + +
    +
    + +

    + Please sign up or log in to continue reading +

    + +
    + +
    +

    More

    + +
    + +
    +
    +

    Comments

    + +
    + +
    +
    +

    + Join the conversation in our comments section + below. + Share your own views and experience and if you have a + question or suggestion for our journalists then email + us at + news@thelocal.dk.
    + Please keep comments civil, constructive and on topic + – and make sure to read our + terms of use + before getting involved. +

    +
    +
    +

    + Please log in + + to leave a comment. +

    +
    +
    +
    +
    + +
    +

    See Also

    +
    +
    +
    +
    +
    + +
    +
    +
    + +
    + + +
    + + + +
    + + + + + + + + + + + + + + + + + + + diff --git a/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/food-delivery-service-just-eat-confirms-denmark-exit/expected.html b/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/food-delivery-service-just-eat-confirms-denmark-exit/expected.html new file mode 100644 index 0000000..e9c37b7 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/food-delivery-service-just-eat-confirms-denmark-exit/expected.html @@ -0,0 +1,101 @@ + + + +
    +

    + Orange-clad delivery drivers from Just Eat will no longer deliver + takeaway food in Denmark after its Dutch owner on Wednesday confirmed + the closure of its Danish business. +

    +

    + Some 280 Just Eat employees in Denmark are set to lose their jobs as + a result of the decision, Just Eat said in a short statement reported + by news wire Ritzau. +

    +

    + The decision is due to “new strategic priorities and challenging + market conditions” and the company will now prioritise its presence + elsewhere, it said. +

    +
    +

    + The company currently has 160 delivery drivers and 120 other staff in + the Nordic country. +

    +

    + The exact date of the closure is yet to be specified but it is + expected to happen soon. +

    +

    + Employees will be supported “as best as possible during this + process,” Ann-Sophie Adamsen, Country Manager at Just Eat Denmark, + said in the statement without giving further detail. +

    +

    + “I am incredibly proud of what we have achieved over the past three + years, and especially of my colleagues, who have made an enormous + effort and helped the business take some significant steps forward,” + Adamsen said. +

    +

    + “It is a difficult strategic decision for the global leadership, + which has had to make priorities,” she added. +

    +

    + The Danish country manager declined an interview request, Ritzau + writes. +

    +

    + Just Eat was founded in Denmark in 2001 before moving to the UK in + 2006 and later extending its global expansion. It most recently + changed ownership in August 2025 when it was acquired by Dutch + investment company Prosus. +

    +

    + The dominant position in Denmark's food delivery industry currently + belongs to Finnish-founded Wolt, which has been owned by US company + DoorDash since 2022. +

    +

    + Uber’s food delivery service Uber Eats + last month announced its expansion + to seven new European countries including Denmark. +

    +
    + + diff --git a/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/food-delivery-service-just-eat-confirms-denmark-exit/meta.json b/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/food-delivery-service-just-eat-confirms-denmark-exit/meta.json new file mode 100644 index 0000000..f3a3fd1 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/food-delivery-service-just-eat-confirms-denmark-exit/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://www.thelocal.dk/20260304/food-delivery-service-just-eat-confirms-denmark-exit", + "host": "www.thelocal.dk", + "feed_source": "thelocal", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:16:40.380736Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "Food delivery service Just Eat confirms Denmark exit", + "extracted_word_count": 363, + "extracted_success": true, + "expected_selector": "div#articleBody" +} diff --git a/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/food-delivery-service-just-eat-confirms-denmark-exit/raw.html b/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/food-delivery-service-just-eat-confirms-denmark-exit/raw.html new file mode 100644 index 0000000..b412395 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/food-delivery-service-just-eat-confirms-denmark-exit/raw.html @@ -0,0 +1,6995 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Food delivery service Just Eat confirms Denmark exit + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + +
    + +
    +

    Advertisement

    + +
    + + + +
    +
    + +
    +
    +
    +
    +
    +
    + Business +
    + +

    Food delivery service Just Eat confirms Denmark exit

    +
    +
    + Michael Barrett +
    + + Michael Barrett + - michael@thelocal.dk + + + +
    +
    + +
    +
    + Food delivery service Just Eat confirms Denmark exit +
    + The food delivery app Just Eat is set to vanish from the + Danish market after the company announced it will + discontinue operations in the country. File photo: Phil + Noble/Reuters/Ritzau Scanpix +
    +
    +

    + The food delivery app Just Eat is set to vanish from the + Danish market after the company announced it will + discontinue operations in the country. +

    +
    +
    +

    Advertisement

    + +
    +
    +
    +

    + Orange-clad delivery drivers from Just Eat will no + longer deliver takeaway food in Denmark after its Dutch + owner on Wednesday confirmed the closure of its Danish + business. +

    +

    + Some 280 Just Eat employees in Denmark are set to lose + their jobs as a result of the decision, Just Eat said in + a short statement reported by news wire Ritzau. +

    +

    + The decision is due to “new strategic priorities and + challenging market conditions” and the company will now + prioritise its presence elsewhere, it said. +

    +
    +

    + The company currently has 160 delivery drivers and 120 + other staff in the Nordic country. +

    +

    + The exact date of the closure is yet to be specified + but it is expected to happen soon. +

    +

    + Employees will be supported “as best as possible during + this process,” Ann-Sophie Adamsen, Country Manager at + Just Eat Denmark, said in the statement without giving + further detail. +

    + +
    +

    Advertisement

    + +
    +

    + “I am incredibly proud of what we have achieved over + the past three years, and especially of my colleagues, + who have made an enormous effort and helped the business + take some significant steps forward,” Adamsen + said. +

    +

    + “It is a difficult strategic decision for the global + leadership, which has had to make priorities,” she + added. +

    +

    + The Danish country manager declined an interview + request, Ritzau writes. +

    +

    + Just Eat was founded in Denmark in 2001 before moving + to the UK in 2006 and later extending its global + expansion. It most recently changed ownership in August + 2025 when it was acquired by Dutch investment company + Prosus. +

    +

    + The dominant position in Denmark's food delivery + industry currently belongs to Finnish-founded Wolt, + which has been owned by US company DoorDash since + 2022. +

    +

    + Uber’s food delivery service Uber Eats + last month announced its expansion + to seven new European countries including Denmark. +

    +
    +

    + Please sign up or log in to continue reading +

    + +
    + +
    +

    More

    +
    + #Business +
    +
    + +
    +
    +

    Comments

    + +
    + +
    +
    +

    + Join the conversation in our comments section + below. + Share your own views and experience and if you have a + question or suggestion for our journalists then email + us at + news@thelocal.dk.
    + Please keep comments civil, constructive and on topic + – and make sure to read our + terms of use + before getting involved. +

    +
    +
    +

    + Please log in + + to leave a comment. +

    +
    +
    +
    +
    + +
    +

    See Also

    +
    +
    +
    +
    +
    + +
    +
    +
    + +
    + + +
    + + + +
    + + + + + + + + + + + + + + + + + + diff --git a/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/the-history-of-the-danish-letter-o/expected.html b/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/the-history-of-the-danish-letter-o/expected.html new file mode 100644 index 0000000..e2ed18a --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/the-history-of-the-danish-letter-o/expected.html @@ -0,0 +1,148 @@ + + + + + + diff --git a/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/the-history-of-the-danish-letter-o/meta.json b/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/the-history-of-the-danish-letter-o/meta.json new file mode 100644 index 0000000..e2ce470 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/the-history-of-the-danish-letter-o/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://www.thelocal.dk/20260304/the-history-of-the-danish-letter-o", + "host": "www.thelocal.dk", + "feed_source": "thelocal", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:16:44.116716Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "The history of the Danish letter Ø", + "extracted_word_count": 694, + "extracted_success": true, + "expected_selector": "div#articleBody" +} diff --git a/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/the-history-of-the-danish-letter-o/raw.html b/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/the-history-of-the-danish-letter-o/raw.html new file mode 100644 index 0000000..5fc7c97 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.thelocal.dk/the-history-of-the-danish-letter-o/raw.html @@ -0,0 +1,7045 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The history of the Danish letter Ø + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + +
    + +
    +

    Advertisement

    + +
    + + + +
    +
    + +
    +
    +
    +
    +
    +
    + danish language + + For Members +
    + +

    The history of the Danish letter Ø

    +
    +
    + Michael Barrett +
    + + Michael Barrett + - michael@thelocal.dk + + + +
    +
    + +
    +
    + The history of the Danish letter Ø +
    + Denmark's Eastern High Court or Østre Landsret. File + photo: Philip Davali/Ritzau Scanpix +
    +
    +

    + It might look quintessentially Danish, but the letter ø is + far from unique to the national language and has a longer + history than you might imagine. +

    +
    +
    +

    Advertisement

    + +
    +
    + +

    + Please sign up or log in to continue reading +

    + +
    + +
    +

    More

    + +
    + +
    +
    +

    Comments

    + +
    + +
    +
    +

    + Join the conversation in our comments section + below. + Share your own views and experience and if you have a + question or suggestion for our journalists then email + us at + news@thelocal.dk.
    + Please keep comments civil, constructive and on topic + – and make sure to read our + terms of use + before getting involved. +

    +
    +
    +

    + Please log in + + to leave a comment. +

    +
    +
    +
    +
    + +
    +

    See Also

    +
    +
    +
    +
    +
    + +
    +
    +
    + +
    + + +
    + + + +
    + + + + + + + + + + + + + + + + + + + diff --git a/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/annie-dillard-weasel/expected.html b/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/annie-dillard-weasel/expected.html new file mode 100644 index 0000000..f03d889 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/annie-dillard-weasel/expected.html @@ -0,0 +1,331 @@ + + + +
    +

    + + What a Weasel Knows That We Forget: Annie Dillard on How to Live + +

    +

    + Suppose we answer + the most important question of existencein the affirmative. There is then only one question remaining: + How shall we live this life? +

    +

    + Despite all the technologies of thought and feeling we have invented to + divine an answer — philosophy and poetry, scripture and self-help — life + stares mutely back at us, immense and indifferent, having abled us with + opposable thumbs and handicapped us with a consciousness capable of + self-reference that renders us dissatisfied with the banality of mere + survival. Beneath the overstory of one hundred trillion synapses, the + overthinking animal keeps losing its way in the wilderness of want. +

    +

    + Not so the other animals. “They do not sweat and whine about their + condition,” Walt Whitman wrote in Leaves of Grass (which is + philosophy and poetry and scripture and self-help in one), “they do not + lie awake in the dark and weep for their sins, they do not make me sick + discussing their duty to God, not one is dissatisfied, not one is + demented with the mania of owning things.” +

    +

    + A century and a half after Whitman, + Annie Dillardlooks to another animal for a model of how to live these human lives. + Having let a muskrat be her + teacher in unselfconsciousness, she recounts her lens-clearing encounter with a weasel in an essay + originally published in her 1982 packet of revelations + Teaching a Stone to Talk, later included in + The Abundance: Narrative Essays Old and New(public library) — one of my all-time favorite books. +

    +
    + + + +
    Annie Dillard
    +
    +

    She writes:

    +
    +

    + I startled a weasel who startled me, and we exchanged a long glance. +

    +

    + Twenty minutes from my house, through the woods by the quarry and + across the highway, is Hollins Pond, a remarkable piece of + shallowness, where I like to go at sunset and sit on a tree trunk. + Hollins Pond is also called Murray’s Pond; it covers two acres of + bottomland near Tinker Creek with six inches of water and six thousand + lily pads. In winter, brown-and-white steers stand in the middle of + it, merely dampening their hooves; from the distant shore they look + like miracle itself, complete with miracle’s nonchalance. Now, in + summer, the steers are gone. The water lilies have blossomed and + spread to a green horizontal plane that is terra firma to plodding + blackbirds, and tremulous ceiling to black leeches, crayfish, and + carp. +

    +

    + This is, mind you, suburbia. It is a five-minute walk in three + directions to rows of houses, though none is visible here. There’s a + 55-mph highway at one end of the pond, and a nesting pair of wood + ducks at the other. Under every bush is a muskrat hole or a beer can. + The far end is an alternating series of fields and woods, fields and + woods, threaded everywhere with motorcycle tracks — in whose bare clay + wild turtles lay eggs. +

    +

    + So, I had crossed the highway, stepped over two low barbed-wire + fences, and traced the motorcycle path in all gratitude through the + wild rose and poison ivy of the pond’s shoreline up into high grassy + fields. Then I cut down through the woods to the mossy fallen tree + where I sit. This tree is excellent. It makes a dry, upholstered bench + at the upper, marshy end of the pond, a plush jetty raised from the + thorny shore between a shallow blue body of water and a deep blue body + of sky. +

    +

    + The sun had just set. I was relaxed on the tree trunk, ensconced in + the lap of lichen, watching the lily pads at my feet tremble and part + dreamily over the thrusting path of a carp. A yellow bird appeared to + my right and flew behind me. It caught my eye; I swiveled around — and + the next instant, inexplicably, I was looking down at a weasel, who + was looking up at me. +

    +

    + Weasel! I’d never seen one wild before. He was ten inches long, thin + as a curve, a muscled ribbon, brown as fruitwood, soft-furred, alert. + His face was fierce, small and pointed as a lizard’s; he would have + made a good arrowhead. There was just a dot of chin, maybe two brown + hairs’ worth, and then the pure white fur began that spread down his + underside. He had two black eyes I didn’t see, any more than you see a + window. +

    +
    +
    + +
    + Weasel from from + Natural History and Illustrations of Mammals by Heinrich + Rudolf Schinz, 1824. +
    +
    +

    + Encounters are events, they touch things in us, change things in us, + bend probability in the shape of the possible, tie time and chance into + a knot of meaning between two creatures. Dillard recounts: +

    +
    +

    + The weasel was stunned into stillness as he was emerging from beneath + an enormous shaggy wild rose bush four feet away. I was stunned into + stillness twisted backward on the tree trunk. Our eyes locked, and + someone threw away the key. +

    +

    + Our look was as if two lovers, or deadly enemies, met unexpectedly on + an overgrown path when each had been thinking of something else: a + clearing blow to the gut. It was also a bright blow to the brain, or a + sudden beating of brains, with all the charge and intimate grate of + rubbed balloons. It emptied our lungs. It felled the forest, moved the + fields, and drained the pond; the world dismantled and tumbled into + that black hole of eyes. If you and I looked at each other that way, + our skulls would split and drop to our shoulders. But we don’t. We + keep our skulls. So. +

    +
    +

    + Every meaningful encounter is a kind of enchantment — it comes unbidden + and breaks without warning, leaving us transformed. As the weasel + vanishes under the wild rose, Dillard finds herself wondering what life + is like for a creature whose “journal is tracks in clay, a spray of + feathers, mouse blood and bone: uncollected, unconnected, loose leaf, + and blown,” and what clues that life might give her about how to live + her own. Reflecting on the memory of the encounter, on the revelation of + it, she writes: +

    +
    +

    + I would like to learn, or remember, how to live. I come to Hollins + Pond not so much to learn how to live as, frankly, to forget about it. + That is, I don’t think I can learn from a wild animal how to live in + particular — shall I suck warm blood, hold my tail high, walk with my + footprints precisely over the prints of my hands? — but I might learn + something of mindlessness, something of the purity of living in the + physical sense and the dignity of living without bias or motive. The + weasel lives in necessity and we live in choice, hating necessity and + dying at the last ignobly in its talons. I would like to live as I + should, as the weasel lives as he should. And I suspect that for me + the way is like the weasel’s: open to time and death painlessly, + noticing everything, remembering nothing, choosing the given with a + fierce and pointed will. +

    +
    +
    + +
    + Art by + Jackie Morrisfrom + The Wild Cards +
    +
    +

    + Because we are creatures made of time, to change our way of being is to + change our experience of time. She considers the chronometry of + wildness: +

    +
    +

    + Time and events are merely poured, unremarked, and ingested directly, + like blood pulsed into my gut through a jugular vein. +

    +
    +

    + It is hard enough for a human being to attain such purity of being, + harder still to share it with another. In a passage that to me is the + purest, most exalted measure of love — love of another, love of life — + she writes: +

    +
    +

    + Could two live that way? Could two live under the wild rose, and + explore by the pond, so that the smooth mind of each is as everywhere + present to the other, and as received and as unchallenged, as falling + snow? +

    +

    + We could, you know. We can live any way we want. People take vows of + poverty, chastity, and obedience — even of silence — by choice. The + thing is to stalk your calling in a certain skilled and supple way, to + locate the most tender and live spot and plug into that pulse. This is + yielding, not fighting. A weasel doesn’t “attack” anything; a weasel + lives as he’s meant to, yielding at every moment to the perfect + freedom of single necessity. +

    +

    + I think it would be well, and proper, and obedient, and pure, to grasp + your one necessity and not let it go, to dangle from it limp wherever + it takes you. Then even death, where you’re going no matter how you + live, cannot you part. Seize it and let it seize you up aloft even, + till your eyes burn out and drop; let your musky flesh fall off in + shreds, and let your very bones unhinge and scatter, loosened over + fields, over fields and woods, lightly, thoughtless, from any height + at all, from as high as eagles. +

    +
    +

    + For more lessons on how to be human drawn from the lives of other + animals, learn about + time and tenderness from a donkey, about + love and loss from an orca, and about + living with a plasticity of being from a caracara. +

    +
    + + diff --git a/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/annie-dillard-weasel/meta.json b/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/annie-dillard-weasel/meta.json new file mode 100644 index 0000000..dd1e345 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/annie-dillard-weasel/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://www.themarginalian.org/2026/03/04/annie-dillard-weasel/", + "host": "www.themarginalian.org", + "feed_source": "themarginalian", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:16:27.813303Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "What a Weasel Knows That We Forget: Annie Dillard on How to Live", + "extracted_word_count": 1522, + "extracted_success": true, + "expected_selector": "div.entry_content" +} diff --git a/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/annie-dillard-weasel/raw.html b/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/annie-dillard-weasel/raw.html new file mode 100644 index 0000000..0486929 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/annie-dillard-weasel/raw.html @@ -0,0 +1,4125 @@ + + + + + + + + + + + + + + + + + + + + + + + + What a Weasel Knows That We Forget: Annie Dillard on How to Live – The + Marginalian + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    + The Marginalian +
    + + + + + +
    +
    + + + +
    + The Marginalian +
    +
    + +
    +
    + + + +
    +
    +
    +

    + What a Weasel Knows That We Forget: Annie Dillard on How to + Live +

    +

    + + + +
    +

    + What a Weasel Knows That We Forget: Annie Dillard on How to Live +

    +

    + Suppose we answer + the most important question of existence + in the affirmative. There is then only one question remaining: + How shall we live this life? +

    +

    + Despite all the technologies of thought and feeling we have + invented to divine an answer — philosophy and poetry, + scripture and self-help — life stares mutely back at us, + immense and indifferent, having abled us with opposable thumbs + and handicapped us with a consciousness capable of + self-reference that renders us dissatisfied with the banality + of mere survival. Beneath the overstory of one hundred + trillion synapses, the overthinking animal keeps losing its + way in the wilderness of want. +

    +

    + Not so the other animals. “They do not sweat and whine about + their condition,” Walt Whitman wrote in + Leaves of Grass (which is philosophy and poetry and + scripture and self-help in one), “they do not lie awake in the + dark and weep for their sins, they do not make me sick + discussing their duty to God, not one is dissatisfied, not one + is demented with the mania of owning things.” +

    +

    + A century and a half after Whitman, + Annie Dillard + looks to another animal for a model of how to live these human + lives. Having let a muskrat be her + teacher in unselfconsciousness, she recounts her lens-clearing encounter with a weasel in + an essay originally published in her 1982 packet of + revelations Teaching a Stone to Talk, later included + in + The Abundance: Narrative Essays Old and New + (public library) — one of my all-time favorite books. +

    +
    + +
    Annie Dillard
    +
    +

    She writes:

    +
    +

    + I startled a weasel who startled me, and we exchanged a long + glance. +

    +

    + Twenty minutes from my house, through the woods by the + quarry and across the highway, is Hollins Pond, a remarkable + piece of shallowness, where I like to go at sunset and sit + on a tree trunk. Hollins Pond is also called Murray’s Pond; + it covers two acres of bottomland near Tinker Creek with six + inches of water and six thousand lily pads. In winter, + brown-and-white steers stand in the middle of it, merely + dampening their hooves; from the distant shore they look + like miracle itself, complete with miracle’s nonchalance. + Now, in summer, the steers are gone. The water lilies have + blossomed and spread to a green horizontal plane that is + terra firma to plodding blackbirds, and tremulous ceiling to + black leeches, crayfish, and carp. +

    +

    + This is, mind you, suburbia. It is a five-minute walk in + three directions to rows of houses, though none is visible + here. There’s a 55-mph highway at one end of the pond, and a + nesting pair of wood ducks at the other. Under every bush is + a muskrat hole or a beer can. The far end is an alternating + series of fields and woods, fields and woods, threaded + everywhere with motorcycle tracks — in whose bare clay wild + turtles lay eggs. +

    +

    + So, I had crossed the highway, stepped over two low + barbed-wire fences, and traced the motorcycle path in all + gratitude through the wild rose and poison ivy of the pond’s + shoreline up into high grassy fields. Then I cut down + through the woods to the mossy fallen tree where I sit. This + tree is excellent. It makes a dry, upholstered bench at the + upper, marshy end of the pond, a plush jetty raised from the + thorny shore between a shallow blue body of water and a deep + blue body of sky. +

    +

    + The sun had just set. I was relaxed on the tree trunk, + ensconced in the lap of lichen, watching the lily pads at my + feet tremble and part dreamily over the thrusting path of a + carp. A yellow bird appeared to my right and flew behind me. + It caught my eye; I swiveled around — and the next instant, + inexplicably, I was looking down at a weasel, who was + looking up at me. +

    +

    + Weasel! I’d never seen one wild before. He was ten inches + long, thin as a curve, a muscled ribbon, brown as fruitwood, + soft-furred, alert. His face was fierce, small and pointed + as a lizard’s; he would have made a good arrowhead. There + was just a dot of chin, maybe two brown hairs’ worth, and + then the pure white fur began that spread down his + underside. He had two black eyes I didn’t see, any more than + you see a window. +

    +
    +
    + +
    + Weasel from from + Natural History and Illustrations of Mammals by + Heinrich Rudolf Schinz, 1824. +
    +
    +

    + Encounters are events, they touch things in us, change things + in us, bend probability in the shape of the possible, tie time + and chance into a knot of meaning between two creatures. + Dillard recounts: +

    +
    +

    + The weasel was stunned into stillness as he was emerging + from beneath an enormous shaggy wild rose bush four feet + away. I was stunned into stillness twisted backward on the + tree trunk. Our eyes locked, and someone threw away the key. +

    +

    + Our look was as if two lovers, or deadly enemies, met + unexpectedly on an overgrown path when each had been + thinking of something else: a clearing blow to the gut. It + was also a bright blow to the brain, or a sudden beating of + brains, with all the charge and intimate grate of rubbed + balloons. It emptied our lungs. It felled the forest, moved + the fields, and drained the pond; the world dismantled and + tumbled into that black hole of eyes. If you and I looked at + each other that way, our skulls would split and drop to our + shoulders. But we don’t. We keep our skulls. So. +

    +
    +

    + Every meaningful encounter is a kind of enchantment — it comes + unbidden and breaks without warning, leaving us transformed. + As the weasel vanishes under the wild rose, Dillard finds + herself wondering what life is like for a creature whose + “journal is tracks in clay, a spray of feathers, mouse blood + and bone: uncollected, unconnected, loose leaf, and blown,” + and what clues that life might give her about how to live her + own. Reflecting on the memory of the encounter, on the + revelation of it, she writes: +

    +
    +

    + I would like to learn, or remember, how to live. I come to + Hollins Pond not so much to learn how to live as, frankly, + to forget about it. That is, I don’t think I can learn from + a wild animal how to live in particular — shall I suck warm + blood, hold my tail high, walk with my footprints precisely + over the prints of my hands? — but I might learn something + of mindlessness, something of the purity of living in the + physical sense and the dignity of living without bias or + motive. The weasel lives in necessity and we live in choice, + hating necessity and dying at the last ignobly in its + talons. I would like to live as I should, as the weasel + lives as he should. And I suspect that for me the way is + like the weasel’s: open to time and death painlessly, + noticing everything, remembering nothing, choosing the given + with a fierce and pointed will. +

    +
    +
    + +
    + Art by + Jackie Morris + from + The Wild Cards +
    +
    +

    + Because we are creatures made of time, to change our way of + being is to change our experience of time. She considers the + chronometry of wildness: +

    +
    +

    + Time and events are merely poured, unremarked, and ingested + directly, like blood pulsed into my gut through a jugular + vein. +

    +
    +

    + It is hard enough for a human being to attain such purity of + being, harder still to share it with another. In a passage + that to me is the purest, most exalted measure of love — love + of another, love of life — she writes: +

    +
    +

    + Could two live that way? Could two live under the wild rose, + and explore by the pond, so that the smooth mind of each is + as everywhere present to the other, and as received and as + unchallenged, as falling snow? +

    +

    + We could, you know. We can live any way we want. People take + vows of poverty, chastity, and obedience — even of silence — + by choice. The thing is to stalk your calling in a certain + skilled and supple way, to locate the most tender and live + spot and plug into that pulse. This is yielding, not + fighting. A weasel doesn’t “attack” anything; a weasel lives + as he’s meant to, yielding at every moment to the perfect + freedom of single necessity. +

    +

    + I think it would be well, and proper, and obedient, and + pure, to grasp your one necessity and not let it go, to + dangle from it limp wherever it takes you. Then even death, + where you’re going no matter how you live, cannot you part. + Seize it and let it seize you up aloft even, till your eyes + burn out and drop; let your musky flesh fall off in shreds, + and let your very bones unhinge and scatter, loosened over + fields, over fields and woods, lightly, thoughtless, from + any height at all, from as high as eagles. +

    +
    +

    + For more lessons on how to be human drawn from the lives of + other animals, learn about + time and tenderness from a donkey, about + love and loss from an orca, and about + living with a plasticity of being from a caracara. +

    +
    +
    + + +
    + + + + + +
    +

    + —
    + Published March 4, 2026
    + —
    + https://www.themarginalian.org/2026/03/04/annie-dillard-weasel/
    + — +

    + BP +

    www.themarginalian.org

    +
    + +
    + BP + +

    + PRINT ARTICLE +

    + +

    + +

    + +
    + + +
    +

    Filed Under

    + +

    + +

    +
    +
    + +
    +

    + View Full Site +

    +
    +
    + + +
    +
    + + +
    +

    + The Marginalian participates in the Bookshop.org and + Amazon.com affiliate programs, designed to provide a means for sites + to earn commissions by linking to books. In more human terms, this + means that whenever you buy a book from a link here, I receive a small + percentage of its price, which goes straight back into my own colossal + biblioexpenses. + Privacy policy. (TLDR: You're safe — there are no nefarious "third parties" lurking + on my watch or shedding crumbs of the "cookies" the rest of the + internet uses.) +

    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/carl-jung-neurosis-creativity/expected.html b/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/carl-jung-neurosis-creativity/expected.html new file mode 100644 index 0000000..60e67df --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/carl-jung-neurosis-creativity/expected.html @@ -0,0 +1,242 @@ + + + +
    +

    + + The Pain in You and the God in You: Carl Jung on the Relationship Between Psychological Suffering and Creativity + +

    +

    + When AI first began colonizing language — which is still our best + instrument for bridging the abyss between us, a container for thought + and feeling that shapes the contents — I asked chatGPT to compose a poem + about a solar eclipse in the style of Walt Whitman. It returned a ledger + of cliches in rhymed couplets. Getting the form wrong — Whitman did not + rhyme — seemed like an easy correction by a line of code. Getting poetry + itself wrong was the interesting question, the question that gets at the + heart of why we make poems (or paintings or novels or songs) — a + question fundamentally about what it means to be human. +

    +

    + I asked an elder poet friend why she thought chatGPT rang hollow where + Whitman could compact infinities of feeling in a single image, could + unseat the soul in a word. +

    +

    She paused, then said: “Because AI hasn’t suffered.”

    +

    + On the one hand, this echoes + a dangerous myth: the archetype of the tortured genius handed down to us by the + Romantics, who, cornered in their time and place, in a century of bloody + revolutions, deadly epidemics, and punitive Puritanical norms, must have + needed to believe that their suffering — those lives of poverty and + privation, those ill-fated exercises in projection mistaken for love, + all those premature deaths — was a fair price to pay for such creative + volcanicity. +

    +

    + On the other hand, this is reality: Art is the music we make from the + bewildered cry of being alive — sometimes a cry of exultant + astonishment, but often a cry of devastation at the collision between + our wishes and the will of the world. Every artist’s art is their coping + mechanism for what they are living through — the longings, the + heartbreaks, the triumphs, the wars within and without. It is these + painful convolutions of the psyche — which used to be termed + neurosis at the dawn of modern psychotherapy, and which we may + simply call suffering — that reveal us to ourselves, and it is out of + these revelations that we create anything capable of touching other + lives, that contact we call art. +

    +

    + Our power and our freedom lie in learning to neither negate our + suffering nor romanticize it but to harness its catalytic power as a + current passing through us to jolt us alive, then passing on and down + into the ground of being. +

    +
    + + + +
    Carl Jung
    +
    +

    + No one has refuted the myth of the tortured genius without negating the + fact and fertility of suffering more pointedly than + Carl Jung(July 26, 1875–June 6, 1961), who + thought deeply about the nature of creativity. +

    +

    + In 1943, a scholar of Kierkegaard asked Jung’s opinion of the + relationship between “psychological problems” and creative genius. With + an eye to Kierkegaard’s gift for + letting his anxiety fuel rather than hinder his creativity, Jung declares him a “whole” person and not “a jangling hither and + dither of displeasing fragmentary souls,” and writes: +

    +
    +

    + True creative genius does not let itself be spoilt by analysis, but is + freed from the impediments and distortions of a neurosis. Neurosis + does not produce art. It is uncreative and inimical to life. It is + failure and bungling. But the moderns mistake morbidity for creative + birth — part of the general lunacy of our time. +

    +

    + It is, of course, an unanswerable question what an artist would have + created if he had not been neurotic. Nietzsche’s syphilitic infection + undoubtedly exerted a strongly neuroticizing influence on his life. + But one could imagine a sound Nietzsche possessed of creative + power without hypertension — something like Goethe. He would have + written much the same as he did, but less strident, less shrill — + i.e., less German — more restrained, more responsible, more reasonable + and reverent. +

    +
    +
    + + + +
    + Art from + An Almanac of Birds: 100 Divinations for Uncertain Days, also available as a + stand-alone printand as + stationery cards. +
    +
    +

    + A century before Alain de Botton offered his + assuring perspective on the importance of breakdowns, Jung weighs what makes suffering generative or degenerative: +

    +
    +

    + Neurosis is a justified doubt in oneself and continually poses the + ultimate question of trust in + manand in God. + Doubt is creative if it is answered by deeds, and so is neurosis if it + exonerates itself as having been a phase — a crisis which is + pathological only when chronic. Neurosis is a protracted crisis + degenerated into a habit, the daily catastrophe ready for use. +

    +
    +

    + Jung considers the advice he would have given Kierkegaard about how to + orient to his suffering, which was the raw material of his philosophical + writings: +

    +
    +

    + It doesn’t matter what you say, but what it says in + you. To it you must address your answers. God is straightaway + with you and is the voice within you. You have to have it out with + that voice. +

    +
    +

    + Couple with a forgotten young poet’s + extraordinary letter to Emily Dickinson about how to bear your + suffering, then revisit Kierkegaard himself on + the value of despair. +

    +
    + + + +
    + Art from + An Almanac of Birds: 100 Divinations for Uncertain Days, also available as a + stand-alone printand as + stationery cards. +
    +
    +
    + + diff --git a/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/carl-jung-neurosis-creativity/meta.json b/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/carl-jung-neurosis-creativity/meta.json new file mode 100644 index 0000000..1665197 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/carl-jung-neurosis-creativity/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://www.themarginalian.org/2026/03/04/carl-jung-neurosis-creativity/", + "host": "www.themarginalian.org", + "feed_source": "themarginalian", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:16:30.210688Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "The Pain in You and the God in You: Carl Jung on the Relationship Between Psychological Suffering and Creativity", + "extracted_word_count": 873, + "extracted_success": true, + "expected_selector": "div.entry_content" +} diff --git a/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/carl-jung-neurosis-creativity/raw.html b/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/carl-jung-neurosis-creativity/raw.html new file mode 100644 index 0000000..5c00a64 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/carl-jung-neurosis-creativity/raw.html @@ -0,0 +1,4046 @@ + + + + + + + + + + + + + + + + + + + + + + + + The Pain in You and the God in You: Carl Jung on the Relationship Between + Psychological Suffering and Creativity – The Marginalian + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    + The Marginalian +
    + + + + + +
    +
    + + + +
    + The Marginalian +
    +
    + +
    +
    + + + +
    +
    +
    +

    + The Pain in You and the God in You: Carl Jung on the + Relationship Between Psychological Suffering and + Creativity +

    +

    + + + +
    +

    + The Pain in You and the God in You: Carl Jung on the Relationship Between Psychological Suffering and Creativity +

    +

    + When AI first began colonizing language — which is still our + best instrument for bridging the abyss between us, a container + for thought and feeling that shapes the contents — I asked + chatGPT to compose a poem about a solar eclipse in the style + of Walt Whitman. It returned a ledger of cliches in rhymed + couplets. Getting the form wrong — Whitman did not rhyme — + seemed like an easy correction by a line of code. Getting + poetry itself wrong was the interesting question, the question + that gets at the heart of why we make poems (or paintings or + novels or songs) — a question fundamentally about what it + means to be human. +

    +

    + I asked an elder poet friend why she thought chatGPT rang + hollow where Whitman could compact infinities of feeling in a + single image, could unseat the soul in a word. +

    +

    She paused, then said: “Because AI hasn’t suffered.”

    +

    + On the one hand, this echoes + a dangerous myth: the archetype of the tortured genius handed down to us by + the Romantics, who, cornered in their time and place, in a + century of bloody revolutions, deadly epidemics, and punitive + Puritanical norms, must have needed to believe that their + suffering — those lives of poverty and privation, those + ill-fated exercises in projection mistaken for love, all those + premature deaths — was a fair price to pay for such creative + volcanicity. +

    +

    + On the other hand, this is reality: Art is the music we make + from the bewildered cry of being alive — sometimes a cry of + exultant astonishment, but often a cry of devastation at the + collision between our wishes and the will of the world. Every + artist’s art is their coping mechanism for what they are + living through — the longings, the heartbreaks, the triumphs, + the wars within and without. It is these painful convolutions + of the psyche — which used to be termed neurosis at + the dawn of modern psychotherapy, and which we may simply call + suffering — that reveal us to ourselves, and it is out of + these revelations that we create anything capable of touching + other lives, that contact we call art. +

    +

    + Our power and our freedom lie in learning to neither negate + our suffering nor romanticize it but to harness its catalytic + power as a current passing through us to jolt us alive, then + passing on and down into the ground of being. +

    +
    + +
    Carl Jung
    +
    +

    + No one has refuted the myth of the tortured genius without + negating the fact and fertility of suffering more pointedly + than + Carl Jung + (July 26, 1875–June 6, 1961), who + thought deeply about the nature of creativity. +

    +

    + In 1943, a scholar of Kierkegaard asked Jung’s opinion of the + relationship between “psychological problems” and creative + genius. With an eye to Kierkegaard’s gift for + letting his anxiety fuel rather than hinder his + creativity, Jung declares him a “whole” person and not “a jangling + hither and dither of displeasing fragmentary souls,” and + writes: +

    +
    +

    + True creative genius does not let itself be spoilt by + analysis, but is freed from the impediments and distortions + of a neurosis. Neurosis does not produce art. It is + uncreative and inimical to life. It is failure and bungling. + But the moderns mistake morbidity for creative birth — part + of the general lunacy of our time. +

    +

    + It is, of course, an unanswerable question what an artist + would have created if he had not been neurotic. Nietzsche’s + syphilitic infection undoubtedly exerted a strongly + neuroticizing influence on his life. But one could imagine a + sound Nietzsche possessed of creative power without + hypertension — something like Goethe. He would have written + much the same as he did, but less strident, less shrill — + i.e., less German — more restrained, more responsible, more + reasonable and reverent. +

    +
    +
    + +
    + Art from + An Almanac of Birds: 100 Divinations for Uncertain + Days, also available as a + stand-alone print + and as + stationery cards. +
    +
    +

    + A century before Alain de Botton offered his + assuring perspective on the importance of breakdowns, Jung weighs what makes suffering generative or + degenerative: +

    +
    +

    + Neurosis is a justified doubt in oneself and continually + poses the ultimate question of trust in + man + and in God. Doubt is creative if it is answered by deeds, + and so is neurosis if it exonerates itself as having been a + phase — a crisis which is pathological only when chronic. + Neurosis is a protracted crisis degenerated into a habit, + the daily catastrophe ready for use. +

    +
    +

    + Jung considers the advice he would have given Kierkegaard + about how to orient to his suffering, which was the raw + material of his philosophical writings: +

    +
    +

    + It doesn’t matter what you say, but what + it says in you. To it you must address + your answers. God is straightaway with you and is the voice + within you. You have to have it out with that voice. +

    +
    +

    + Couple with a forgotten young poet’s + extraordinary letter to Emily Dickinson about how to bear + your suffering, then revisit Kierkegaard himself on + the value of despair. +

    +
    + +
    + Art from + An Almanac of Birds: 100 Divinations for Uncertain + Days, also available as a + stand-alone print + and as + stationery cards. +
    +
    +
    +
    + + +
    + + + + + +
    +

    + —
    + Published March 4, 2026
    + —
    + https://www.themarginalian.org/2026/03/04/carl-jung-neurosis-creativity/
    + — +

    + BP +

    www.themarginalian.org

    +
    + +
    + BP + +

    + PRINT ARTICLE +

    + +

    + +

    + +
    + + +
    +

    Filed Under

    + +

    + +

    +
    +
    + +
    +

    + View Full Site +

    +
    +
    + + +
    +
    + + +
    +

    + The Marginalian participates in the Bookshop.org and + Amazon.com affiliate programs, designed to provide a means for sites + to earn commissions by linking to books. In more human terms, this + means that whenever you buy a book from a link here, I receive a small + percentage of its price, which goes straight back into my own colossal + biblioexpenses. + Privacy policy. (TLDR: You're safe — there are no nefarious "third parties" lurking + on my watch or shedding crumbs of the "cookies" the rest of the + internet uses.) +

    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/neruda-time/expected.html b/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/neruda-time/expected.html new file mode 100644 index 0000000..f3612dc --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/neruda-time/expected.html @@ -0,0 +1,226 @@ + + + +
    +

    + + Pablo Neruda on How to Hold Time + +

    +

    + “Time is a river that sweeps me along, but I am a river,” Borges + wrote. “Time is a fire that consumes me, but I am the fire.” +

    +

    + Most of us are not Borges. Most of us are drowning in bewilderment at + where the time goes, burning with the urgency of being alive while + waiting to start living, wandering the labyrinth of life with wayward presence, wishing that + time ran differentlyas the cult of productivity turns each minute into a blade pressed + against the vein of our transience. +

    +

    + And all the while, our time is nested within our times — the epoch we + are living through together, born into it with no more choice in the + matter than the body and brain and family we have been born into. In his + magnificent essay on Shakespeare, James Baldwin countered the commonplace lament of every epoch: “It is + said that his time was easier than ours, but I doubt it — no time can be + easy if one is living through it.” A century before him — a century of + unrest and transformation — Emerson issued the ultimate antilamentation: + “This time, like all times, is a very good one, if we but know what to + do with it.” +

    +
    + + + +
    + Discus chronologicus — a German depiction of time from the + early 1720s. (Available as + a printand as + a wall clock.) +
    +
    +

    + Not knowing what to do with the time we have been given, not knowing how + to hold time in our personal and political lives, is at bottom an act of + forgetting how time hold us. + Pablo Neruda(July 12, 1904–September 23, 1973) casts a spell against forgetting in + the fourth canto of his long poem “Morning,”: +

    +
    +

    + You will remember that whimsical ravine
    where the vibrant aromas + rose,
    and from time to time a bird dressed
    in water and + languor: winter’s garment. +

    +

    + You will remember those gifts from the earth:
    piquing fragrance, + gold clay,
    thickets of herbs, wild roots,
    bewitching thorns + like swords. +

    +

    + You will remember the bouquet you brought,
    a bouquet of shadow + and silent water,
    a bouquet like foam-covered stone. +

    +

    + And that time was like never and always:
    We go where nothing is + expected
    and find everything waiting there. +

    +
    +
    + + + +
    Pablo Neruda
    +
    +

    + If time is the fundamental problem of human life and poetry is our most + precise technology for parsing the aching astonishment of being alive, + then time is the prime subject of poetry. Neruda knew this — time is the + subterranean current coursing beneath his vast and varied body of work, + the substrate upon which all of his + stunning love poemsand his + meditations on the inner lifegrow. He reverenced the stones for how they have “touched time,” + reverenced the minute for how it is “bound to join the river of time + that bears us,” reverenced “the inexhaustible springs of time,” longed + for “a time complete as an ocean,” then made that ocean with his poetry. +

    +

    In his poem “The Enigmas,” composed during WWII, he writes:

    +
    +

    + You’ve asked me what the crustacean spins
    between its gold + claws
    and I reply: the sea knows. +

    +

    + You wonder what the sea squirt waits for in its transparent bell?
         What + does it wait for? +

    +

    I’ll tell you: it’s waiting for time like you.

    +
    +

    + A decade later, in one of his “Elemental Odes,” Neruda laid out his most + explicit instruction for how to hold time: +

    +
    +

    + Listen and learn.
    Time
    is divided
    into two rivers:
    one
    flows + backward, devouring
    life already lived;
    the other
    moves + forward with you
    exposing
    your life.
    For a single + second
    they may be joined.
    Now.
    This is that moment,
    the + drop of an instant
    that washes away the past.
    It is the + present.
    It is in your hands.
    Racing, slipping,
    tumbling + like a waterfall.
    But it is yours.
    Help it grow
    with + love, with firmness,
    with stone and flight,
    with + resounding
    rectitude,
    with purest grains,
    the most + brilliant metal
    from your heart,
    walking
    in the full + light of day
    without fear
    of truth, goodness, justice,
    companions + of song,
    time that flows
    will have the shape
    and + sound
    of a guitar,
    and when you want
    to bow to the + past,
    the singing spring of
    transparent time
    will + reveal your wholeness.
    Time is joy. +

    +
    +

    + Couple with + three poems for trusting time, then revisit Kahlil Gibran on + how to befriend time. +

    +
    + + diff --git a/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/neruda-time/meta.json b/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/neruda-time/meta.json new file mode 100644 index 0000000..7e03d50 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/neruda-time/meta.json @@ -0,0 +1,13 @@ +{ + "url": "https://www.themarginalian.org/2026/03/03/neruda-time/", + "host": "www.themarginalian.org", + "feed_source": "themarginalian", + "status_code": 200, + "proxy": "http://10.20.30.1:18085", + "fetched_at_utc": "2026-03-04T23:16:32.943141Z", + "expected_strategy": "manually curated selector + node cleanup", + "extracted_title": "Pablo Neruda on How to Hold Time", + "extracted_word_count": 733, + "extracted_success": true, + "expected_selector": "div.entry_content" +} diff --git a/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/neruda-time/raw.html b/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/neruda-time/raw.html new file mode 100644 index 0000000..3de1882 --- /dev/null +++ b/tests/fixtures/fullpage_to_article_html/www.themarginalian.org/neruda-time/raw.html @@ -0,0 +1,4044 @@ + + + + + + + + + + + + + + + + + + + + + + + Pablo Neruda on How to Hold Time – The Marginalian + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + + +
    +
    + The Marginalian +
    + + + + + +
    +
    + + + +
    + The Marginalian +
    +
    + +
    +
    + + + +
    +
    +
    +

    + Pablo Neruda on How to Hold Time +

    +

    + + + +
    +

    + Pablo Neruda on How to Hold Time +

    +

    + “Time is a river that sweeps me along, but I am a river,” + Borges + wrote. “Time is a fire that consumes me, but I am the fire.” +

    +

    + Most of us are not Borges. Most of us are drowning in + bewilderment at + where the time goes, burning with the urgency of being alive while + waiting to start living, wandering the labyrinth of life with wayward presence, + wishing that + time ran differently + as the cult of productivity turns each minute into a blade + pressed against the vein of our transience. +

    +

    + And all the while, our time is nested within our times — the + epoch we are living through together, born into it with no + more choice in the matter than the body and brain and family + we have been born into. In his + magnificent essay on Shakespeare, James Baldwin countered the commonplace lament of every + epoch: “It is said that his time was easier than ours, but I + doubt it — no time can be easy if one is living through it.” A + century before him — a century of unrest and transformation — + Emerson issued the ultimate antilamentation: “This time, like + all times, is a very good one, if we but know what to do with + it.” +

    +
    + +
    + Discus chronologicus — a German depiction of time + from the early 1720s. (Available as + a print + and as + a wall clock.) +
    +
    +

    + Not knowing what to do with the time we have been given, not + knowing how to hold time in our personal and political lives, + is at bottom an act of forgetting how time hold us. + Pablo Neruda + (July 12, 1904–September 23, 1973) casts a spell against + forgetting in the fourth canto of his long poem “Morning,”: +

    +
    +

    + You will remember that whimsical ravine
    + where the vibrant aromas rose,
    + and from time to time a bird dressed
    + in water and languor: winter’s garment. +

    +

    + You will remember those gifts from the earth:
    + piquing fragrance, gold clay,
    + thickets of herbs, wild roots,
    + bewitching thorns like swords. +

    +

    + You will remember the bouquet you brought,
    + a bouquet of shadow and silent water,
    + a bouquet like foam-covered stone. +

    +

    + And that time was like never and always:
    + We go where nothing is expected
    + and find everything waiting there. +

    +
    +
    + +
    Pablo Neruda
    +
    +

    + If time is the fundamental problem of human life and poetry is + our most precise technology for parsing the aching + astonishment of being alive, then time is the prime subject of + poetry. Neruda knew this — time is the subterranean current + coursing beneath his vast and varied body of work, the + substrate upon which all of his + stunning love poems + and his + meditations on the inner life + grow. He reverenced the stones for how they have “touched + time,” reverenced the minute for how it is “bound to join the + river of time that bears us,” reverenced “the inexhaustible + springs of time,” longed for “a time complete as an ocean,” + then made that ocean with his poetry. +

    +

    + In his poem “The Enigmas,” composed during WWII, he writes: +

    +
    +

    + You’ve asked me what the crustacean spins
    + between its gold claws
    + and I reply: the sea knows. +

    +

    + You wonder what the sea squirt waits for in its transparent + bell?
    +      What does it wait for? +

    +

    I’ll tell you: it’s waiting for time like you.

    +
    +

    + A decade later, in one of his “Elemental Odes,” Neruda laid + out his most explicit instruction for how to hold time: +

    +
    +

    + Listen and learn.
    + Time
    + is divided
    + into two rivers:
    + one
    + flows backward, devouring
    + life already lived;
    + the other
    + moves forward with you
    + exposing
    + your life.
    + For a single second
    + they may be joined.
    + Now.
    + This is that moment,
    + the drop of an instant
    + that washes away the past.
    + It is the present.
    + It is in your hands.
    + Racing, slipping,
    + tumbling like a waterfall.
    + But it is yours.
    + Help it grow
    + with love, with firmness,
    + with stone and flight,
    + with resounding
    + rectitude,
    + with purest grains,
    + the most brilliant metal
    + from your heart,
    + walking
    + in the full light of day
    + without fear
    + of truth, goodness, justice,
    + companions of song,
    + time that flows
    + will have the shape
    + and sound
    + of a guitar,
    + and when you want
    + to bow to the past,
    + the singing spring of
    + transparent time
    + will reveal your wholeness.
    + Time is joy. +

    +
    +

    + Couple with + three poems for trusting time, then revisit Kahlil Gibran on + how to befriend time. +

    +
    +
    + + +
    + + + + + +
    +

    + —
    + Published March 3, 2026
    + —
    + https://www.themarginalian.org/2026/03/03/neruda-time/
    + — +

    + BP +

    www.themarginalian.org

    +
    + +
    + BP + +

    + PRINT ARTICLE +

    + +

    + +

    + +
    + + +
    +

    Filed Under

    + +

    + +

    +
    +
    + +
    +

    + View Full Site +

    +
    +
    + + +
    +
    + + +
    +

    + The Marginalian participates in the Bookshop.org and + Amazon.com affiliate programs, designed to provide a means for sites + to earn commissions by linking to books. In more human terms, this + means that whenever you buy a book from a link here, I receive a small + percentage of its price, which goes straight back into my own colossal + biblioexpenses. + Privacy policy. (TLDR: You're safe — there are no nefarious "third parties" lurking + on my watch or shedding crumbs of the "cookies" the rest of the + internet uses.) +

    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/test_fixture_corpus.py b/tests/test_fixture_corpus.py new file mode 100644 index 0000000..ae4604f --- /dev/null +++ b/tests/test_fixture_corpus.py @@ -0,0 +1,97 @@ +"""Fixture-driven extraction tests for full page HTML samples.""" + +from __future__ import annotations + +import html +import json +import re +from dataclasses import dataclass +from pathlib import Path + +import pytest +from justhtml import JustHTML + +from article_extractor import extract_article + +FIXTURE_ROOT = Path(__file__).resolve().parent / "fixtures" / "fullpage_to_article_html" + + +@dataclass(frozen=True) +class FixtureCase: + host: str + case: str + url: str + raw_path: Path + expected_path: Path + + +def _normalized_inner_text(fragment: str) -> str: + wrapped = f"
    {fragment}
    " + doc = JustHTML(wrapped, safe=False) + containers = doc.query("div") + text = containers[0].to_text(separator=" ", strip=True) if containers else fragment + text = html.unescape(text) + text = re.sub(r"\s+", " ", text).strip() + return re.sub(r"\s+([,.;:!?])", r"\1", text) + + +def _load_fixture_cases() -> list[FixtureCase]: + cases: list[FixtureCase] = [] + for meta_path in sorted(FIXTURE_ROOT.glob("*/*/meta.json")): + case_dir = meta_path.parent + raw_path = case_dir / "raw.html" + expected_path = case_dir / "expected.html" + if not raw_path.exists() or not expected_path.exists(): + continue + + meta = json.loads(meta_path.read_text(encoding="utf-8")) + url = str(meta.get("url", "")).strip() + if not url: + continue + + cases.append( + FixtureCase( + host=case_dir.parent.name, + case=case_dir.name, + url=url, + raw_path=raw_path, + expected_path=expected_path, + ) + ) + return cases + + +FIXTURE_CASES = _load_fixture_cases() + + +@pytest.mark.integration +def test_fixture_corpus_has_expected_minimum_size(): + """Fixture corpus should include broad host coverage for regression safety.""" + assert FIXTURE_ROOT.exists() + assert len(FIXTURE_CASES) >= 27 + + +@pytest.mark.integration +@pytest.mark.parametrize( + "fixture_case", + FIXTURE_CASES, + ids=lambda case: f"{case.host}/{case.case}", +) +def test_fullpage_to_article_fixture_corpus(fixture_case: FixtureCase): + raw_html = fixture_case.raw_path.read_text(encoding="utf-8") + expected_html = fixture_case.expected_path.read_text(encoding="utf-8") + + result = extract_article(raw_html, url=fixture_case.url) + + assert result.success, ( + f"Extraction failed for fixture {fixture_case.host}/{fixture_case.case} " + f"({fixture_case.url})" + ) + + actual_text = _normalized_inner_text(result.content) + expected_text = _normalized_inner_text(expected_html) + + assert actual_text == expected_text, ( + "Extracted text mismatch for fixture " + f"{fixture_case.host}/{fixture_case.case} ({fixture_case.url})" + ) diff 
--git a/uv.lock b/uv.lock index d68f68b..5b73e80 100644 --- a/uv.lock +++ b/uv.lock @@ -35,7 +35,7 @@ wheels = [ [[package]] name = "article-extractor" -version = "0.5.5" +version = "0.5.6" source = { editable = "." } dependencies = [ { name = "defusedxml" },