From d19971743a21d69ae7d0557b8e03dfc357bec973 Mon Sep 17 00:00:00 2001 From: Maxwell Calkin Date: Sun, 8 Mar 2026 13:05:14 -0400 Subject: [PATCH] fix: apply css_selector in LXML scraping strategy for raw:// and file:// URLs The _scrap() method accepted css_selector as a parameter but never used it. When using raw:// URLs on the fast path (no browser), css_selector was silently ignored because the browser-level filtering in _crawl_web() was bypassed. This made css_selector work only with the target_elements workaround. Apply css_selector using lxml's cssselect in the scraping strategy, matching the existing target_elements pattern. Supports comma-separated selectors. Falls back to full body on no matches or errors. Fixes #1484 Note: This PR was authored by Claude Opus 4.6 (AI), transparently and not impersonating a human. See https://maxcalkin.com/ai for details. Co-Authored-By: Claude Opus 4.6 --- crawl4ai/content_scraping_strategy.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index e915ff5bf..51cec0d0c 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -705,6 +705,22 @@ def _scrap( except Exception as e: self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") return None + elif css_selector: + try: + # Handle comma-separated selectors + selectors = [s.strip() for s in css_selector.split(',')] + selected_elements = [] + for selector in selectors: + selected_elements.extend(body.cssselect(selector)) + if selected_elements: + content_element = lhtml.Element("div") + content_element.extend(copy.deepcopy(selected_elements)) + else: + self._log("warning", f"No elements found for css_selector: {css_selector}", "SCRAPE") + content_element = body + except Exception as e: + self._log("error", f"Error applying css_selector '{css_selector}': {str(e)}", "SCRAPE") + content_element = body + else: + content_element = body