Skip to content

Commit 0d13405

Browse files
committed
feat(fetch): add content quality signals (word_count, redirect_chain, is_paywall)
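Add word_count, redirect_chain, and is_paywall fields to FetchResponse. The word count is computed from the final content, i.e. after format conversion and after any truncation notice has been appended. The redirect chain records every URL that answered with a redirect, in order and starting with the originally requested URL; the final URL is not part of the chain, so the chain is empty when no redirect occurred. Paywall detection uses heuristic matching against common paywall indicators in the raw HTML. Closes #76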
1 parent b9d7c42 commit 0d13405

File tree

2 files changed

+214
-4
lines changed

2 files changed

+214
-4
lines changed

crates/fetchkit/src/fetchers/default.rs

Lines changed: 202 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ impl Fetcher for DefaultFetcher {
202202
};
203203

204204
// THREAT[TM-SSRF-010]: Follow redirects manually so every hop is re-validated.
205-
let response =
205+
let (response, redirect_chain) =
206206
send_request_following_redirects(parsed_url, reqwest_method, headers, options).await?;
207207

208208
let status_code = response.status().as_u16();
@@ -219,6 +219,7 @@ impl Fetcher for DefaultFetcher {
219219
last_modified: meta.last_modified,
220220
filename: meta.filename,
221221
method: Some("HEAD".to_string()),
222+
redirect_chain,
222223
..Default::default()
223224
});
224225
}
@@ -233,6 +234,7 @@ impl Fetcher for DefaultFetcher {
233234
size: meta.content_length,
234235
last_modified: meta.last_modified,
235236
filename: meta.filename,
237+
redirect_chain,
236238
error: Some(
237239
"Binary content is not supported. Only textual content (HTML, text, JSON, etc.) can be fetched."
238240
.to_string(),
@@ -250,6 +252,9 @@ impl Fetcher for DefaultFetcher {
250252
// Convert to string
251253
let content = String::from_utf8_lossy(&body).to_string();
252254

255+
// Detect paywall before content is moved by conversion
256+
let is_paywall = detect_paywall(&content);
257+
253258
// Determine format and convert if needed
254259
// THREAT[TM-DOS-006]: Conversion input is bounded by max_body_size
255260
let (format, final_content) =
@@ -281,6 +286,9 @@ impl Fetcher for DefaultFetcher {
281286
final_content.push_str(TRUNCATION_MESSAGE);
282287
}
283288

289+
// Compute quality signals
290+
let word_count = count_words(&final_content);
291+
284292
Ok(FetchResponse {
285293
url: final_url,
286294
status_code,
@@ -291,6 +299,9 @@ impl Fetcher for DefaultFetcher {
291299
format: Some(format),
292300
content: Some(final_content),
293301
truncated: if truncated { Some(true) } else { None },
302+
word_count: Some(word_count),
303+
redirect_chain,
304+
is_paywall: if is_paywall { Some(true) } else { None },
294305
..Default::default()
295306
})
296307
}
@@ -327,7 +338,7 @@ impl Fetcher for DefaultFetcher {
327338
};
328339

329340
// THREAT[TM-SSRF-010]: Follow redirects manually with IP validation at each hop
330-
let response =
341+
let (response, redirect_chain) =
331342
send_request_following_redirects(parsed_url, reqwest_method, headers, options).await?;
332343

333344
let status_code = response.status().as_u16();
@@ -344,6 +355,7 @@ impl Fetcher for DefaultFetcher {
344355
last_modified: meta.last_modified,
345356
filename: meta.filename,
346357
method: Some("HEAD".to_string()),
358+
redirect_chain,
347359
..Default::default()
348360
});
349361
}
@@ -368,19 +380,22 @@ impl Fetcher for DefaultFetcher {
368380
truncated: if truncated { Some(true) } else { None },
369381
saved_path: Some(save_result.path),
370382
bytes_written: Some(save_result.bytes_written),
383+
redirect_chain,
371384
// No inline content when saving to file
372385
..Default::default()
373386
})
374387
}
375388
}
376389

390+
/// Returns `(response, redirect_chain)` where redirect_chain lists intermediate URLs.
377391
async fn send_request_following_redirects(
378392
initial_url: Url,
379393
method: reqwest::Method,
380394
headers: HeaderMap,
381395
options: &FetchOptions,
382-
) -> Result<reqwest::Response, FetchError> {
396+
) -> Result<(reqwest::Response, Vec<String>), FetchError> {
383397
let mut current_url = initial_url;
398+
let mut redirect_chain = Vec::new();
384399

385400
for redirect_count in 0..=MAX_REDIRECTS {
386401
let client = build_client_for_url(&current_url, headers.clone(), options)?;
@@ -391,7 +406,7 @@ async fn send_request_following_redirects(
391406
.map_err(FetchError::from_reqwest)?;
392407

393408
let Some(next_url) = redirect_target(&current_url, &response, options)? else {
394-
return Ok(response);
409+
return Ok((response, redirect_chain));
395410
};
396411

397412
if redirect_count == MAX_REDIRECTS {
@@ -405,6 +420,7 @@ async fn send_request_following_redirects(
405420
"Following redirect with IP validation"
406421
);
407422

423+
redirect_chain.push(current_url.to_string());
408424
current_url = next_url;
409425
}
410426

@@ -591,6 +607,36 @@ async fn read_body_with_timeout(
591607
}
592608
}
593609

610+
/// Number of whitespace-separated tokens in `text`.
///
/// Tokens are delimited by Unicode whitespace, as defined by
/// [`str::split_whitespace`]. Leading, trailing, and repeated whitespace never
/// produce empty tokens, so empty or all-whitespace input yields `0`.
fn count_words(text: &str) -> u64 {
    // Tally directly in `u64` so no integer cast is needed afterwards.
    text.split_whitespace().fold(0, |words, _| words + 1)
}
614+
615+
/// Common paywall indicators in raw HTML content.
///
/// Every entry must be lowercase ASCII: [`detect_paywall`] folds only the ASCII
/// letters of the document to lowercase before searching for these phrases, so
/// an entry containing uppercase or non-ASCII letters would never match.
const PAYWALL_INDICATORS: &[&str] = &[
    "paywall",
    "subscribe to read",
    "subscribe to continue",
    "subscription required",
    "premium content",
    "members only",
    "sign in to read",
    "log in to read",
    "create a free account",
    "already a subscriber",
    "unlock this article",
    "get unlimited access",
    "start your free trial",
];

/// Heuristic paywall detection from raw HTML.
///
/// Returns `true` when any phrase from [`PAYWALL_INDICATORS`] occurs anywhere in
/// `html` (markup, attributes, and text alike), compared without regard to ASCII
/// letter case. Returns `false` otherwise, including for empty input.
///
/// This is a plain substring search, so it is a soft signal only: a page that
/// merely mentions one of the phrases is reported as paywalled.
fn detect_paywall(html: &str) -> bool {
    // All indicators are plain ASCII, so ASCII-only folding is sufficient and
    // skips the per-character Unicode case mapping that `to_lowercase` would
    // run over the whole document.
    let lower = html.to_ascii_lowercase();
    PAYWALL_INDICATORS
        .iter()
        .any(|&indicator| lower.contains(indicator))
}
639+
594640
#[cfg(test)]
595641
mod tests {
596642
use super::*;
@@ -911,4 +957,156 @@ mod tests {
911957

912958
assert_eq!(response.status_code, 200);
913959
}
960+
961+
#[test]
fn test_count_words() {
    // (input, expected word count): two words, empty input, padded input, single word.
    let cases: [(&str, u64); 4] = [
        ("hello world", 2),
        ("", 0),
        (" one two three ", 3),
        ("word", 1),
    ];

    for (input, expected) in cases {
        assert_eq!(count_words(input), expected);
    }
}
968+
969+
#[test]
fn test_detect_paywall() {
    // Snippets that each contain at least one paywall indicator.
    let paywalled = [
        "<div class=\"paywall\">Subscribe</div>",
        "<p>Subscribe to read the full article</p>",
        "<span>Already a subscriber? Log in</span>",
        "<div>Unlock this article</div>",
    ];
    // Snippets that contain none of the indicators.
    let open = [
        "<p>This is a normal article</p>",
        "<h1>Hello World</h1><p>Free content</p>",
    ];

    for html in paywalled {
        assert!(detect_paywall(html));
    }
    for html in open {
        assert!(!detect_paywall(html));
    }
}
978+
979+
#[tokio::test]
980+
async fn test_word_count_in_response() {
981+
let server = MockServer::start().await;
982+
Mock::given(method("GET"))
983+
.and(path("/article"))
984+
.respond_with(
985+
ResponseTemplate::new(200)
986+
.set_body_string("Hello world this is a test")
987+
.insert_header("content-type", "text/plain"),
988+
)
989+
.mount(&server)
990+
.await;
991+
992+
let fetcher = DefaultFetcher::new();
993+
let options = FetchOptions {
994+
dns_policy: DnsPolicy::allow_all(),
995+
..Default::default()
996+
};
997+
let request = FetchRequest::new(format!("{}/article", server.uri()));
998+
let response = fetcher.fetch(&request, &options).await.unwrap();
999+
1000+
assert_eq!(response.word_count, Some(6));
1001+
}
1002+
1003+
#[tokio::test]
1004+
async fn test_redirect_chain_tracked() {
1005+
let destination = MockServer::start().await;
1006+
Mock::given(method("GET"))
1007+
.and(path("/final"))
1008+
.respond_with(
1009+
ResponseTemplate::new(200)
1010+
.set_body_string("arrived")
1011+
.insert_header("content-type", "text/plain"),
1012+
)
1013+
.mount(&destination)
1014+
.await;
1015+
1016+
let origin = MockServer::start().await;
1017+
Mock::given(method("GET"))
1018+
.and(path("/start"))
1019+
.respond_with(
1020+
ResponseTemplate::new(302)
1021+
.insert_header("location", format!("{}/final", destination.uri())),
1022+
)
1023+
.mount(&origin)
1024+
.await;
1025+
1026+
let fetcher = DefaultFetcher::new();
1027+
let options = FetchOptions {
1028+
dns_policy: DnsPolicy::allow_all(),
1029+
..Default::default()
1030+
};
1031+
let request = FetchRequest::new(format!("{}/start", origin.uri()));
1032+
let response = fetcher.fetch(&request, &options).await.unwrap();
1033+
1034+
assert_eq!(response.status_code, 200);
1035+
assert_eq!(response.redirect_chain.len(), 1);
1036+
assert!(response.redirect_chain[0].contains("/start"));
1037+
}
1038+
1039+
#[tokio::test]
1040+
async fn test_no_redirect_chain_for_direct_response() {
1041+
let server = MockServer::start().await;
1042+
Mock::given(method("GET"))
1043+
.and(path("/direct"))
1044+
.respond_with(
1045+
ResponseTemplate::new(200)
1046+
.set_body_string("direct")
1047+
.insert_header("content-type", "text/plain"),
1048+
)
1049+
.mount(&server)
1050+
.await;
1051+
1052+
let fetcher = DefaultFetcher::new();
1053+
let options = FetchOptions {
1054+
dns_policy: DnsPolicy::allow_all(),
1055+
..Default::default()
1056+
};
1057+
let request = FetchRequest::new(format!("{}/direct", server.uri()));
1058+
let response = fetcher.fetch(&request, &options).await.unwrap();
1059+
1060+
assert!(response.redirect_chain.is_empty());
1061+
}
1062+
1063+
#[tokio::test]
1064+
async fn test_paywall_detection() {
1065+
let server = MockServer::start().await;
1066+
Mock::given(method("GET"))
1067+
.and(path("/paywalled"))
1068+
.respond_with(
1069+
ResponseTemplate::new(200)
1070+
.set_body_string("<html><body><div class='paywall'>Subscribe to read the full article</div><p>Preview...</p></body></html>")
1071+
.insert_header("content-type", "text/html"),
1072+
)
1073+
.mount(&server)
1074+
.await;
1075+
1076+
let fetcher = DefaultFetcher::new();
1077+
let options = FetchOptions {
1078+
enable_markdown: true,
1079+
dns_policy: DnsPolicy::allow_all(),
1080+
..Default::default()
1081+
};
1082+
let request = FetchRequest::new(format!("{}/paywalled", server.uri())).as_markdown();
1083+
let response = fetcher.fetch(&request, &options).await.unwrap();
1084+
1085+
assert_eq!(response.is_paywall, Some(true));
1086+
}
1087+
1088+
#[tokio::test]
1089+
async fn test_no_paywall_for_normal_content() {
1090+
let server = MockServer::start().await;
1091+
Mock::given(method("GET"))
1092+
.and(path("/free"))
1093+
.respond_with(
1094+
ResponseTemplate::new(200)
1095+
.set_body_string("<html><body><p>This is free content</p></body></html>")
1096+
.insert_header("content-type", "text/html"),
1097+
)
1098+
.mount(&server)
1099+
.await;
1100+
1101+
let fetcher = DefaultFetcher::new();
1102+
let options = FetchOptions {
1103+
enable_markdown: true,
1104+
dns_policy: DnsPolicy::allow_all(),
1105+
..Default::default()
1106+
};
1107+
let request = FetchRequest::new(format!("{}/free", server.uri())).as_markdown();
1108+
let response = fetcher.fetch(&request, &options).await.unwrap();
1109+
1110+
assert!(response.is_paywall.is_none());
1111+
}
9141112
}

crates/fetchkit/src/types.rs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,18 @@ pub struct FetchResponse {
213213
/// Bytes written to file
214214
#[serde(skip_serializing_if = "Option::is_none")]
215215
pub bytes_written: Option<u64>,
216+
217+
/// Word count of the final content
218+
#[serde(skip_serializing_if = "Option::is_none")]
219+
pub word_count: Option<u64>,
220+
221+
/// Chain of URLs followed during redirects (empty if no redirects)
222+
#[serde(skip_serializing_if = "Vec::is_empty", default)]
223+
pub redirect_chain: Vec<String>,
224+
225+
/// Heuristic paywall detection (soft signal, not guaranteed)
226+
#[serde(skip_serializing_if = "Option::is_none")]
227+
pub is_paywall: Option<bool>,
216228
}
217229

218230
#[cfg(test)]

0 commit comments

Comments
 (0)