From d716e4bcb224d161ac8c52f0b45b3fbbc7958c3b Mon Sep 17 00:00:00 2001
From: karesansui <karesansui@ugentropy.com>
Date: Tue, 17 Mar 2026 01:18:23 +0900
Subject: [PATCH] fix: use str.split() for accurate word count in
 PruningContentFilter

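text.count(" ") + 1 overcounts words when consecutive spaces are
present (every extra space in a run is counted as another word), which
is common in HTML-extracted text from get_text(strip=True). It also
reports one word for empty text. The inflated count causes
min_word_threshold checks to be too lenient, allowing short/noisy
content to pass through the filter.

str.split() with no arguments splits on runs of whitespace and returns
an empty list for empty text, so len(text.split()) avoids both problems.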

The same file already uses len(text.split()) for the same purpose
at lines 268 and 302.
---
 crawl4ai/content_filter_strategy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py
index 0909be33d..5e954c31b 100644
--- a/crawl4ai/content_filter_strategy.py
+++ b/crawl4ai/content_filter_strategy.py
@@ -739,7 +739,7 @@ def _compute_composite_score(self, metrics, text_len, tag_len, link_text_len):
         if self.min_word_threshold:
             # Get raw text from metrics node - avoid extra processing
             text = metrics["node"].get_text(strip=True)
-            word_count = text.count(" ") + 1
+            word_count = len(text.split())
             if word_count < self.min_word_threshold:
                 return -1.0  # Guaranteed removal
         score = 0.0