From d19971743a21d69ae7d0557b8e03dfc357bec973 Mon Sep 17 00:00:00 2001 From: Maxwell Calkin Date: Sun, 8 Mar 2026 13:05:14 -0400 Subject: [PATCH] fix: apply css_selector in LXML scraping strategy for raw:// and file:// URLs The _scrap() method accepted css_selector as a parameter but never used it. When using raw:// URLs on the fast path (no browser), css_selector was silently ignored because the browser-level filtering in _crawl_web() was bypassed. This made css_selector work only with the target_elements workaround. Apply css_selector using lxml's cssselect in the scraping strategy, matching the existing target_elements pattern. Supports comma-separated selectors. Falls back to full body on no matches or errors. Fixes #1484 Note: This PR was authored by Claude Opus 4.6 (AI), transparently and not impersonating a human. See https://maxcalkin.com/ai for details. Co-Authored-By: Claude Opus 4.6 --- crawl4ai/content_scraping_strategy.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index e915ff5bf..51cec0d0c 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -705,6 +705,22 @@ def _scrap( except Exception as e: self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") return None + elif css_selector: + try: + # Handle comma-separated selectors + selectors = [s.strip() for s in css_selector.split(',')] + selected_elements = [] + for selector in selectors: + selected_elements.extend(body.cssselect(selector)) + if selected_elements: + content_element = lhtml.Element("div") + content_element.extend(copy.deepcopy(selected_elements)) + else: + self._log("warning", f"No elements found for css_selector: {css_selector}", "SCRAPE") + content_element = body + except Exception as e: + self._log("error", f"Error applying css_selector '{css_selector}': {str(e)}", "SCRAPE") + content_element = body + else: + content_element = body