From d716e4bcb224d161ac8c52f0b45b3fbbc7958c3b Mon Sep 17 00:00:00 2001
From: karesansui
Date: Tue, 17 Mar 2026 01:18:23 +0900
Subject: [PATCH] fix: use str.split() for accurate word count in PruningContentFilter

text.count(" ") + 1 overcounts words when consecutive spaces are
present, which is common in HTML-extracted text from
get_text(strip=True). This causes min_word_threshold checks to be too
lenient, allowing short/noisy content to pass through the filter.

The same file already uses len(text.split()) for the same purpose at
lines 268 and 302.
---
 crawl4ai/content_filter_strategy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py
index 0909be33d..5e954c31b 100644
--- a/crawl4ai/content_filter_strategy.py
+++ b/crawl4ai/content_filter_strategy.py
@@ -739,7 +739,7 @@ def _compute_composite_score(self, metrics, text_len, tag_len, link_text_len):
         if self.min_word_threshold:
             # Get raw text from metrics node - avoid extra processing
             text = metrics["node"].get_text(strip=True)
-            word_count = text.count(" ") + 1
+            word_count = len(text.split())
             if word_count < self.min_word_threshold:
                 return -1.0  # Guaranteed removal
         score = 0.0