Skip to content

Commit 4c90252

Browse files
committed
feat(fetch): add content quality signals (word_count, redirect_chain, is_paywall)
Add word_count, redirect_chain, and is_paywall fields to FetchResponse. Word count computed from final content. Redirect chain tracks all intermediate URLs during redirect following. Paywall detection uses heuristic matching against common paywall indicators in raw HTML. Closes #76
1 parent 17a78dc commit 4c90252

File tree

2 files changed

+214
-4
lines changed

2 files changed

+214
-4
lines changed

crates/fetchkit/src/fetchers/default.rs

Lines changed: 202 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ impl Fetcher for DefaultFetcher {
220220
};
221221

222222
// THREAT[TM-SSRF-010]: Follow redirects manually so every hop is re-validated.
223-
let response =
223+
let (response, redirect_chain) =
224224
send_request_following_redirects(parsed_url, reqwest_method, headers, options).await?;
225225

226226
let status_code = response.status().as_u16();
@@ -250,6 +250,7 @@ impl Fetcher for DefaultFetcher {
250250
etag: meta.etag,
251251
filename: meta.filename,
252252
method: Some("HEAD".to_string()),
253+
redirect_chain,
253254
..Default::default()
254255
});
255256
}
@@ -265,6 +266,7 @@ impl Fetcher for DefaultFetcher {
265266
last_modified: meta.last_modified,
266267
etag: meta.etag,
267268
filename: meta.filename,
269+
redirect_chain,
268270
error: Some(
269271
"Binary content is not supported. Only textual content (HTML, text, JSON, etc.) can be fetched."
270272
.to_string(),
@@ -282,6 +284,9 @@ impl Fetcher for DefaultFetcher {
282284
// Convert to string
283285
let content = String::from_utf8_lossy(&body).to_string();
284286

287+
// Detect paywall before content is moved by conversion
288+
let is_paywall = detect_paywall(&content);
289+
285290
// Determine format and convert if needed
286291
// THREAT[TM-DOS-006]: Conversion input is bounded by max_body_size
287292
let is_html_content = is_html(&meta.content_type, &content);
@@ -335,6 +340,9 @@ impl Fetcher for DefaultFetcher {
335340
final_content.push_str(TRUNCATION_MESSAGE);
336341
}
337342

343+
// Compute quality signals
344+
let word_count = count_words(&final_content);
345+
338346
Ok(FetchResponse {
339347
url: final_url,
340348
status_code,
@@ -347,6 +355,9 @@ impl Fetcher for DefaultFetcher {
347355
content: Some(final_content),
348356
truncated: if truncated { Some(true) } else { None },
349357
metadata: page_metadata,
358+
word_count: Some(word_count),
359+
redirect_chain,
360+
is_paywall: if is_paywall { Some(true) } else { None },
350361
..Default::default()
351362
})
352363
}
@@ -383,7 +394,7 @@ impl Fetcher for DefaultFetcher {
383394
};
384395

385396
// THREAT[TM-SSRF-010]: Follow redirects manually with IP validation at each hop
386-
let response =
397+
let (response, redirect_chain) =
387398
send_request_following_redirects(parsed_url, reqwest_method, headers, options).await?;
388399

389400
let status_code = response.status().as_u16();
@@ -401,6 +412,7 @@ impl Fetcher for DefaultFetcher {
401412
etag: meta.etag,
402413
filename: meta.filename,
403414
method: Some("HEAD".to_string()),
415+
redirect_chain,
404416
..Default::default()
405417
});
406418
}
@@ -426,19 +438,22 @@ impl Fetcher for DefaultFetcher {
426438
truncated: if truncated { Some(true) } else { None },
427439
saved_path: Some(save_result.path),
428440
bytes_written: Some(save_result.bytes_written),
441+
redirect_chain,
429442
// No inline content when saving to file
430443
..Default::default()
431444
})
432445
}
433446
}
434447

448+
/// Returns `(response, redirect_chain)`, where `redirect_chain` lists, in order, every URL that answered with a redirect (starting with the initial URL); the final URL is not included.
435449
async fn send_request_following_redirects(
436450
initial_url: Url,
437451
method: reqwest::Method,
438452
headers: HeaderMap,
439453
options: &FetchOptions,
440-
) -> Result<reqwest::Response, FetchError> {
454+
) -> Result<(reqwest::Response, Vec<String>), FetchError> {
441455
let mut current_url = initial_url;
456+
let mut redirect_chain = Vec::new();
442457

443458
for redirect_count in 0..=MAX_REDIRECTS {
444459
let client = build_client_for_url(&current_url, headers.clone(), options)?;
@@ -449,7 +464,7 @@ async fn send_request_following_redirects(
449464
.map_err(FetchError::from_reqwest)?;
450465

451466
let Some(next_url) = redirect_target(&current_url, &response, options)? else {
452-
return Ok(response);
467+
return Ok((response, redirect_chain));
453468
};
454469

455470
if redirect_count == MAX_REDIRECTS {
@@ -463,6 +478,7 @@ async fn send_request_following_redirects(
463478
"Following redirect with IP validation"
464479
);
465480

481+
redirect_chain.push(current_url.to_string());
466482
current_url = next_url;
467483
}
468484

@@ -650,6 +666,36 @@ async fn read_body_with_timeout(
650666
}
651667
}
652668

669+
/// Counts the words in `text`.
///
/// A "word" is any maximal run of non-whitespace characters, as produced by
/// [`str::split_whitespace`]; leading, trailing, and repeated whitespace is
/// ignored, so an empty or all-whitespace string yields `0`.
fn count_words(text: &str) -> u64 {
    // `count()` returns `usize`; the response field is a `u64`.
    text.split_whitespace().count() as u64
}
673+
674+
/// Common paywall indicators looked for in raw HTML content.
///
/// Every entry is lowercase because [`detect_paywall`] lowercases its input
/// before searching.
const PAYWALL_INDICATORS: &[&str] = &[
    "paywall",
    "subscribe to read",
    "subscribe to continue",
    "subscription required",
    "premium content",
    "members only",
    "sign in to read",
    "log in to read",
    "create a free account",
    "already a subscriber",
    "unlock this article",
    "get unlimited access",
    "start your free trial",
];

/// Heuristic paywall detection from raw HTML.
///
/// Returns `true` when the lowercased input contains any entry of
/// [`PAYWALL_INDICATORS`] as a substring. The search covers the whole input
/// (markup, attributes, and text alike), so this is a soft signal that can
/// produce false positives.
fn detect_paywall(html: &str) -> bool {
    // Lowercase once so the substring search is case-insensitive.
    let lower = html.to_lowercase();
    PAYWALL_INDICATORS
        .iter()
        .any(|indicator| lower.contains(indicator))
}
698+
653699
#[cfg(test)]
654700
mod tests {
655701
use super::*;
@@ -1048,4 +1094,156 @@ mod tests {
10481094
assert_eq!(response.status_code, 304);
10491095
assert!(response.content.is_none());
10501096
}
1097+
1098+
/// `count_words` counts whitespace-separated tokens and ignores surrounding
/// or repeated whitespace.
#[test]
fn test_count_words() {
    assert_eq!(count_words("hello world"), 2);
    assert_eq!(count_words(""), 0);
    assert_eq!(count_words(" one two three "), 3);
    assert_eq!(count_words("word"), 1);
}
1105+
1106+
/// `detect_paywall` matches known indicators case-insensitively and leaves
/// ordinary markup unflagged.
#[test]
fn test_detect_paywall() {
    // Positive cases: each snippet contains one of the indicator phrases.
    assert!(detect_paywall("<div class=\"paywall\">Subscribe</div>"));
    assert!(detect_paywall("<p>Subscribe to read the full article</p>"));
    assert!(detect_paywall("<span>Already a subscriber? Log in</span>"));
    assert!(detect_paywall("<div>Unlock this article</div>"));
    // Negative cases: no indicator phrase present.
    assert!(!detect_paywall("<p>This is a normal article</p>"));
    assert!(!detect_paywall("<h1>Hello World</h1><p>Free content</p>"));
}
1115+
1116+
#[tokio::test]
1117+
async fn test_word_count_in_response() {
1118+
let server = MockServer::start().await;
1119+
Mock::given(method("GET"))
1120+
.and(path("/article"))
1121+
.respond_with(
1122+
ResponseTemplate::new(200)
1123+
.set_body_string("Hello world this is a test")
1124+
.insert_header("content-type", "text/plain"),
1125+
)
1126+
.mount(&server)
1127+
.await;
1128+
1129+
let fetcher = DefaultFetcher::new();
1130+
let options = FetchOptions {
1131+
dns_policy: DnsPolicy::allow_all(),
1132+
..Default::default()
1133+
};
1134+
let request = FetchRequest::new(format!("{}/article", server.uri()));
1135+
let response = fetcher.fetch(&request, &options).await.unwrap();
1136+
1137+
assert_eq!(response.word_count, Some(6));
1138+
}
1139+
1140+
#[tokio::test]
1141+
async fn test_redirect_chain_tracked() {
1142+
let destination = MockServer::start().await;
1143+
Mock::given(method("GET"))
1144+
.and(path("/final"))
1145+
.respond_with(
1146+
ResponseTemplate::new(200)
1147+
.set_body_string("arrived")
1148+
.insert_header("content-type", "text/plain"),
1149+
)
1150+
.mount(&destination)
1151+
.await;
1152+
1153+
let origin = MockServer::start().await;
1154+
Mock::given(method("GET"))
1155+
.and(path("/start"))
1156+
.respond_with(
1157+
ResponseTemplate::new(302)
1158+
.insert_header("location", format!("{}/final", destination.uri())),
1159+
)
1160+
.mount(&origin)
1161+
.await;
1162+
1163+
let fetcher = DefaultFetcher::new();
1164+
let options = FetchOptions {
1165+
dns_policy: DnsPolicy::allow_all(),
1166+
..Default::default()
1167+
};
1168+
let request = FetchRequest::new(format!("{}/start", origin.uri()));
1169+
let response = fetcher.fetch(&request, &options).await.unwrap();
1170+
1171+
assert_eq!(response.status_code, 200);
1172+
assert_eq!(response.redirect_chain.len(), 1);
1173+
assert!(response.redirect_chain[0].contains("/start"));
1174+
}
1175+
1176+
#[tokio::test]
1177+
async fn test_no_redirect_chain_for_direct_response() {
1178+
let server = MockServer::start().await;
1179+
Mock::given(method("GET"))
1180+
.and(path("/direct"))
1181+
.respond_with(
1182+
ResponseTemplate::new(200)
1183+
.set_body_string("direct")
1184+
.insert_header("content-type", "text/plain"),
1185+
)
1186+
.mount(&server)
1187+
.await;
1188+
1189+
let fetcher = DefaultFetcher::new();
1190+
let options = FetchOptions {
1191+
dns_policy: DnsPolicy::allow_all(),
1192+
..Default::default()
1193+
};
1194+
let request = FetchRequest::new(format!("{}/direct", server.uri()));
1195+
let response = fetcher.fetch(&request, &options).await.unwrap();
1196+
1197+
assert!(response.redirect_chain.is_empty());
1198+
}
1199+
1200+
#[tokio::test]
1201+
async fn test_paywall_detection() {
1202+
let server = MockServer::start().await;
1203+
Mock::given(method("GET"))
1204+
.and(path("/paywalled"))
1205+
.respond_with(
1206+
ResponseTemplate::new(200)
1207+
.set_body_string("<html><body><div class='paywall'>Subscribe to read the full article</div><p>Preview...</p></body></html>")
1208+
.insert_header("content-type", "text/html"),
1209+
)
1210+
.mount(&server)
1211+
.await;
1212+
1213+
let fetcher = DefaultFetcher::new();
1214+
let options = FetchOptions {
1215+
enable_markdown: true,
1216+
dns_policy: DnsPolicy::allow_all(),
1217+
..Default::default()
1218+
};
1219+
let request = FetchRequest::new(format!("{}/paywalled", server.uri())).as_markdown();
1220+
let response = fetcher.fetch(&request, &options).await.unwrap();
1221+
1222+
assert_eq!(response.is_paywall, Some(true));
1223+
}
1224+
1225+
#[tokio::test]
1226+
async fn test_no_paywall_for_normal_content() {
1227+
let server = MockServer::start().await;
1228+
Mock::given(method("GET"))
1229+
.and(path("/free"))
1230+
.respond_with(
1231+
ResponseTemplate::new(200)
1232+
.set_body_string("<html><body><p>This is free content</p></body></html>")
1233+
.insert_header("content-type", "text/html"),
1234+
)
1235+
.mount(&server)
1236+
.await;
1237+
1238+
let fetcher = DefaultFetcher::new();
1239+
let options = FetchOptions {
1240+
enable_markdown: true,
1241+
dns_policy: DnsPolicy::allow_all(),
1242+
..Default::default()
1243+
};
1244+
let request = FetchRequest::new(format!("{}/free", server.uri())).as_markdown();
1245+
let response = fetcher.fetch(&request, &options).await.unwrap();
1246+
1247+
assert!(response.is_paywall.is_none());
1248+
}
10511249
}

crates/fetchkit/src/types.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,18 @@ pub struct FetchResponse {
340340
/// Structured page metadata extracted from HTML
341341
#[serde(skip_serializing_if = "Option::is_none")]
342342
pub metadata: Option<PageMetadata>,
343+
344+
/// Word count of the final content
345+
#[serde(skip_serializing_if = "Option::is_none")]
346+
pub word_count: Option<u64>,
347+
348+
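/// URLs that answered with a redirect, in the order they were requested, starting with the originally requested URL; the final URL is not included (empty if no redirects)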
349+
#[serde(skip_serializing_if = "Vec::is_empty", default)]
350+
pub redirect_chain: Vec<String>,
351+
352+
/// Heuristic paywall detection (soft signal, not guaranteed)
353+
#[serde(skip_serializing_if = "Option::is_none")]
354+
pub is_paywall: Option<bool>,
343355
}
344356

345357
#[cfg(test)]

0 commit comments

Comments
 (0)