Skip to content

Commit 9e4ea8c

Browse files
authored
feat(fetch): add content quality signals (word_count, redirect_chain, is_paywall) (#82)
## What

Add content quality signals to help agents decide whether fetched content is worth processing.

## Why

Agents waste tokens processing low-quality or paywalled content. These signals let agents make informed decisions before committing to full processing.

## How

- **`word_count`**: Counted from the final text content (after conversion).
- **`redirect_chain`**: Records, in order, every URL that responded with a redirect while redirects were being followed. Empty if no redirects occurred.
- **`is_paywall`**: Heuristic detection against 13 common paywall indicators (e.g., "subscribe to read", "paywall", "premium content", "unlock this article"). Only set to `true` when detected; omitted otherwise.
- Paywall detection runs on raw HTML before conversion (catches class names, hidden text).
- Redirect chain tracking is integrated into `send_request_following_redirects`.

## Risk

- Low — additive fields, backward-compatible.
- `is_paywall` is a soft signal (false positives are possible for pages that discuss paywalls).

### Checklist

- [x] Unit tests (word count, paywall detection, redirect chain tracking, direct response)
- [x] Clippy clean
- [x] Docs build clean

Closes #76
1 parent 17a78dc commit 9e4ea8c

File tree

2 files changed

+214
-4
lines changed

2 files changed

+214
-4
lines changed

crates/fetchkit/src/fetchers/default.rs

Lines changed: 202 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ impl Fetcher for DefaultFetcher {
220220
};
221221

222222
// THREAT[TM-SSRF-010]: Follow redirects manually so every hop is re-validated.
223-
let response =
223+
let (response, redirect_chain) =
224224
send_request_following_redirects(parsed_url, reqwest_method, headers, options).await?;
225225

226226
let status_code = response.status().as_u16();
@@ -250,6 +250,7 @@ impl Fetcher for DefaultFetcher {
250250
etag: meta.etag,
251251
filename: meta.filename,
252252
method: Some("HEAD".to_string()),
253+
redirect_chain,
253254
..Default::default()
254255
});
255256
}
@@ -265,6 +266,7 @@ impl Fetcher for DefaultFetcher {
265266
last_modified: meta.last_modified,
266267
etag: meta.etag,
267268
filename: meta.filename,
269+
redirect_chain,
268270
error: Some(
269271
"Binary content is not supported. Only textual content (HTML, text, JSON, etc.) can be fetched."
270272
.to_string(),
@@ -282,6 +284,9 @@ impl Fetcher for DefaultFetcher {
282284
// Convert to string
283285
let content = String::from_utf8_lossy(&body).to_string();
284286

287+
// Detect paywall before content is moved by conversion
288+
let is_paywall = detect_paywall(&content);
289+
285290
// Determine format and convert if needed
286291
// THREAT[TM-DOS-006]: Conversion input is bounded by max_body_size
287292
let is_html_content = is_html(&meta.content_type, &content);
@@ -335,6 +340,9 @@ impl Fetcher for DefaultFetcher {
335340
final_content.push_str(TRUNCATION_MESSAGE);
336341
}
337342

343+
// Compute quality signals
344+
let word_count = count_words(&final_content);
345+
338346
Ok(FetchResponse {
339347
url: final_url,
340348
status_code,
@@ -347,6 +355,9 @@ impl Fetcher for DefaultFetcher {
347355
content: Some(final_content),
348356
truncated: if truncated { Some(true) } else { None },
349357
metadata: page_metadata,
358+
word_count: Some(word_count),
359+
redirect_chain,
360+
is_paywall: if is_paywall { Some(true) } else { None },
350361
..Default::default()
351362
})
352363
}
@@ -383,7 +394,7 @@ impl Fetcher for DefaultFetcher {
383394
};
384395

385396
// THREAT[TM-SSRF-010]: Follow redirects manually with IP validation at each hop
386-
let response =
397+
let (response, redirect_chain) =
387398
send_request_following_redirects(parsed_url, reqwest_method, headers, options).await?;
388399

389400
let status_code = response.status().as_u16();
@@ -401,6 +412,7 @@ impl Fetcher for DefaultFetcher {
401412
etag: meta.etag,
402413
filename: meta.filename,
403414
method: Some("HEAD".to_string()),
415+
redirect_chain,
404416
..Default::default()
405417
});
406418
}
@@ -426,19 +438,22 @@ impl Fetcher for DefaultFetcher {
426438
truncated: if truncated { Some(true) } else { None },
427439
saved_path: Some(save_result.path),
428440
bytes_written: Some(save_result.bytes_written),
441+
redirect_chain,
429442
// No inline content when saving to file
430443
..Default::default()
431444
})
432445
}
433446
}
434447

448+
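/// Returns `(response, redirect_chain)`, where `redirect_chain` lists, in order, every URL
/// that answered with a followed redirect, starting with the initial URL. The URL that
/// produced the returned response is not included, so the chain is empty when no redirect
/// occurred.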
435449
async fn send_request_following_redirects(
436450
initial_url: Url,
437451
method: reqwest::Method,
438452
headers: HeaderMap,
439453
options: &FetchOptions,
440-
) -> Result<reqwest::Response, FetchError> {
454+
) -> Result<(reqwest::Response, Vec<String>), FetchError> {
441455
let mut current_url = initial_url;
456+
let mut redirect_chain = Vec::new();
442457

443458
for redirect_count in 0..=MAX_REDIRECTS {
444459
let client = build_client_for_url(&current_url, headers.clone(), options)?;
@@ -449,7 +464,7 @@ async fn send_request_following_redirects(
449464
.map_err(FetchError::from_reqwest)?;
450465

451466
let Some(next_url) = redirect_target(&current_url, &response, options)? else {
452-
return Ok(response);
467+
return Ok((response, redirect_chain));
453468
};
454469

455470
if redirect_count == MAX_REDIRECTS {
@@ -463,6 +478,7 @@ async fn send_request_following_redirects(
463478
"Following redirect with IP validation"
464479
);
465480

481+
redirect_chain.push(current_url.to_string());
466482
current_url = next_url;
467483
}
468484

@@ -650,6 +666,36 @@ async fn read_body_with_timeout(
650666
}
651667
}
652668

669+
/// Returns how many whitespace-separated words `text` contains.
///
/// Any run of Unicode whitespace acts as a single separator, and leading or
/// trailing whitespace is ignored, so empty or all-blank input yields `0`.
fn count_words(text: &str) -> u64 {
    text.split_whitespace().fold(0, |seen, _word| seen + 1)
}
673+
674+
/// Common paywall indicators in raw HTML content.
///
/// Every entry must be lowercase with single spaces between words, because
/// `detect_paywall` compares them against lowercased, whitespace-collapsed input.
const PAYWALL_INDICATORS: &[&str] = &[
    "paywall",
    "subscribe to read",
    "subscribe to continue",
    "subscription required",
    "premium content",
    "members only",
    "sign in to read",
    "log in to read",
    "create a free account",
    "already a subscriber",
    "unlock this article",
    "get unlimited access",
    "start your free trial",
];

/// Heuristic paywall detection from raw HTML.
///
/// Returns `true` when `html` contains any entry of `PAYWALL_INDICATORS`. The
/// comparison is case-insensitive and treats every run of whitespace (spaces,
/// tabs, newlines) as a single space: raw markup is frequently wrapped or
/// indented in the middle of a sentence, and matching it verbatim would miss
/// phrases such as "subscribe to\n    read".
///
/// This is a soft signal only: a page that merely mentions one of the phrases
/// (or uses one in a class name) is reported as paywalled as well.
fn detect_paywall(html: &str) -> bool {
    let lower = html.to_lowercase();

    // Collapse whitespace runs so multi-word indicators survive line wrapping
    // and indentation in the source markup.
    let mut normalized = String::with_capacity(lower.len());
    for token in lower.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(token);
    }

    PAYWALL_INDICATORS
        .iter()
        .any(|indicator| normalized.contains(*indicator))
}
698+
653699
#[cfg(test)]
654700
mod tests {
655701
use super::*;
@@ -1048,4 +1094,156 @@ mod tests {
10481094
assert_eq!(response.status_code, 304);
10491095
assert!(response.content.is_none());
10501096
}
1097+
1098+
#[test]
fn test_count_words() {
    // Each pair is (input, expected number of whitespace-separated words).
    let cases: [(&str, u64); 4] = [
        ("hello world", 2),
        ("", 0),
        (" one two three ", 3),
        ("word", 1),
    ];

    for (input, expected) in cases {
        assert_eq!(count_words(input), expected);
    }
}
1105+
1106+
#[test]
fn test_detect_paywall() {
    // Markup containing a known indicator (class name or visible phrase,
    // in any letter case) must be flagged.
    let paywalled = [
        "<div class=\"paywall\">Subscribe</div>",
        "<p>Subscribe to read the full article</p>",
        "<span>Already a subscriber? Log in</span>",
        "<div>Unlock this article</div>",
    ];
    for html in paywalled {
        assert!(detect_paywall(html));
    }

    // Ordinary markup without any indicator must not be flagged.
    let open = [
        "<p>This is a normal article</p>",
        "<h1>Hello World</h1><p>Free content</p>",
    ];
    for html in open {
        assert!(!detect_paywall(html));
    }
}
1115+
1116+
#[tokio::test]
1117+
async fn test_word_count_in_response() {
1118+
let server = MockServer::start().await;
1119+
Mock::given(method("GET"))
1120+
.and(path("/article"))
1121+
.respond_with(
1122+
ResponseTemplate::new(200)
1123+
.set_body_string("Hello world this is a test")
1124+
.insert_header("content-type", "text/plain"),
1125+
)
1126+
.mount(&server)
1127+
.await;
1128+
1129+
let fetcher = DefaultFetcher::new();
1130+
let options = FetchOptions {
1131+
dns_policy: DnsPolicy::allow_all(),
1132+
..Default::default()
1133+
};
1134+
let request = FetchRequest::new(format!("{}/article", server.uri()));
1135+
let response = fetcher.fetch(&request, &options).await.unwrap();
1136+
1137+
assert_eq!(response.word_count, Some(6));
1138+
}
1139+
1140+
#[tokio::test]
1141+
async fn test_redirect_chain_tracked() {
1142+
let destination = MockServer::start().await;
1143+
Mock::given(method("GET"))
1144+
.and(path("/final"))
1145+
.respond_with(
1146+
ResponseTemplate::new(200)
1147+
.set_body_string("arrived")
1148+
.insert_header("content-type", "text/plain"),
1149+
)
1150+
.mount(&destination)
1151+
.await;
1152+
1153+
let origin = MockServer::start().await;
1154+
Mock::given(method("GET"))
1155+
.and(path("/start"))
1156+
.respond_with(
1157+
ResponseTemplate::new(302)
1158+
.insert_header("location", format!("{}/final", destination.uri())),
1159+
)
1160+
.mount(&origin)
1161+
.await;
1162+
1163+
let fetcher = DefaultFetcher::new();
1164+
let options = FetchOptions {
1165+
dns_policy: DnsPolicy::allow_all(),
1166+
..Default::default()
1167+
};
1168+
let request = FetchRequest::new(format!("{}/start", origin.uri()));
1169+
let response = fetcher.fetch(&request, &options).await.unwrap();
1170+
1171+
assert_eq!(response.status_code, 200);
1172+
assert_eq!(response.redirect_chain.len(), 1);
1173+
assert!(response.redirect_chain[0].contains("/start"));
1174+
}
1175+
1176+
#[tokio::test]
1177+
async fn test_no_redirect_chain_for_direct_response() {
1178+
let server = MockServer::start().await;
1179+
Mock::given(method("GET"))
1180+
.and(path("/direct"))
1181+
.respond_with(
1182+
ResponseTemplate::new(200)
1183+
.set_body_string("direct")
1184+
.insert_header("content-type", "text/plain"),
1185+
)
1186+
.mount(&server)
1187+
.await;
1188+
1189+
let fetcher = DefaultFetcher::new();
1190+
let options = FetchOptions {
1191+
dns_policy: DnsPolicy::allow_all(),
1192+
..Default::default()
1193+
};
1194+
let request = FetchRequest::new(format!("{}/direct", server.uri()));
1195+
let response = fetcher.fetch(&request, &options).await.unwrap();
1196+
1197+
assert!(response.redirect_chain.is_empty());
1198+
}
1199+
1200+
#[tokio::test]
1201+
async fn test_paywall_detection() {
1202+
let server = MockServer::start().await;
1203+
Mock::given(method("GET"))
1204+
.and(path("/paywalled"))
1205+
.respond_with(
1206+
ResponseTemplate::new(200)
1207+
.set_body_string("<html><body><div class='paywall'>Subscribe to read the full article</div><p>Preview...</p></body></html>")
1208+
.insert_header("content-type", "text/html"),
1209+
)
1210+
.mount(&server)
1211+
.await;
1212+
1213+
let fetcher = DefaultFetcher::new();
1214+
let options = FetchOptions {
1215+
enable_markdown: true,
1216+
dns_policy: DnsPolicy::allow_all(),
1217+
..Default::default()
1218+
};
1219+
let request = FetchRequest::new(format!("{}/paywalled", server.uri())).as_markdown();
1220+
let response = fetcher.fetch(&request, &options).await.unwrap();
1221+
1222+
assert_eq!(response.is_paywall, Some(true));
1223+
}
1224+
1225+
#[tokio::test]
1226+
async fn test_no_paywall_for_normal_content() {
1227+
let server = MockServer::start().await;
1228+
Mock::given(method("GET"))
1229+
.and(path("/free"))
1230+
.respond_with(
1231+
ResponseTemplate::new(200)
1232+
.set_body_string("<html><body><p>This is free content</p></body></html>")
1233+
.insert_header("content-type", "text/html"),
1234+
)
1235+
.mount(&server)
1236+
.await;
1237+
1238+
let fetcher = DefaultFetcher::new();
1239+
let options = FetchOptions {
1240+
enable_markdown: true,
1241+
dns_policy: DnsPolicy::allow_all(),
1242+
..Default::default()
1243+
};
1244+
let request = FetchRequest::new(format!("{}/free", server.uri())).as_markdown();
1245+
let response = fetcher.fetch(&request, &options).await.unwrap();
1246+
1247+
assert!(response.is_paywall.is_none());
1248+
}
10511249
}

crates/fetchkit/src/types.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,18 @@ pub struct FetchResponse {
340340
/// Structured page metadata extracted from HTML
341341
#[serde(skip_serializing_if = "Option::is_none")]
342342
pub metadata: Option<PageMetadata>,
343+
344+
/// Word count of the final content
345+
#[serde(skip_serializing_if = "Option::is_none")]
346+
pub word_count: Option<u64>,
347+
348+
/// URLs that responded with a followed redirect, in request order, starting with the
/// originally requested URL (empty if no redirects occurred)
349+
#[serde(skip_serializing_if = "Vec::is_empty", default)]
350+
pub redirect_chain: Vec<String>,
351+
352+
/// Heuristic paywall detection (soft signal, not guaranteed)
353+
#[serde(skip_serializing_if = "Option::is_none")]
354+
pub is_paywall: Option<bool>,
343355
}
344356

345357
#[cfg(test)]

0 commit comments

Comments
 (0)