@@ -220,7 +220,7 @@ impl Fetcher for DefaultFetcher {
220220 } ;
221221
222222 // THREAT[TM-SSRF-010]: Follow redirects manually so every hop is re-validated.
223- let response =
223+ let ( response, redirect_chain ) =
224224 send_request_following_redirects ( parsed_url, reqwest_method, headers, options) . await ?;
225225
226226 let status_code = response. status ( ) . as_u16 ( ) ;
@@ -250,6 +250,7 @@ impl Fetcher for DefaultFetcher {
250250 etag : meta. etag ,
251251 filename : meta. filename ,
252252 method : Some ( "HEAD" . to_string ( ) ) ,
253+ redirect_chain,
253254 ..Default :: default ( )
254255 } ) ;
255256 }
@@ -265,6 +266,7 @@ impl Fetcher for DefaultFetcher {
265266 last_modified : meta. last_modified ,
266267 etag : meta. etag ,
267268 filename : meta. filename ,
269+ redirect_chain,
268270 error : Some (
269271 "Binary content is not supported. Only textual content (HTML, text, JSON, etc.) can be fetched."
270272 . to_string ( ) ,
@@ -282,6 +284,9 @@ impl Fetcher for DefaultFetcher {
282284 // Convert to string
283285 let content = String :: from_utf8_lossy ( & body) . to_string ( ) ;
284286
287+ // Detect paywall before content is moved by conversion
288+ let is_paywall = detect_paywall ( & content) ;
289+
285290 // Determine format and convert if needed
286291 // THREAT[TM-DOS-006]: Conversion input is bounded by max_body_size
287292 let is_html_content = is_html ( & meta. content_type , & content) ;
@@ -335,6 +340,9 @@ impl Fetcher for DefaultFetcher {
335340 final_content. push_str ( TRUNCATION_MESSAGE ) ;
336341 }
337342
343+ // Compute quality signals
344+ let word_count = count_words ( & final_content) ;
345+
338346 Ok ( FetchResponse {
339347 url : final_url,
340348 status_code,
@@ -347,6 +355,9 @@ impl Fetcher for DefaultFetcher {
347355 content : Some ( final_content) ,
348356 truncated : if truncated { Some ( true ) } else { None } ,
349357 metadata : page_metadata,
358+ word_count : Some ( word_count) ,
359+ redirect_chain,
360+ is_paywall : if is_paywall { Some ( true ) } else { None } ,
350361 ..Default :: default ( )
351362 } )
352363 }
@@ -383,7 +394,7 @@ impl Fetcher for DefaultFetcher {
383394 } ;
384395
385396 // THREAT[TM-SSRF-010]: Follow redirects manually with IP validation at each hop
386- let response =
397+ let ( response, redirect_chain ) =
387398 send_request_following_redirects ( parsed_url, reqwest_method, headers, options) . await ?;
388399
389400 let status_code = response. status ( ) . as_u16 ( ) ;
@@ -401,6 +412,7 @@ impl Fetcher for DefaultFetcher {
401412 etag : meta. etag ,
402413 filename : meta. filename ,
403414 method : Some ( "HEAD" . to_string ( ) ) ,
415+ redirect_chain,
404416 ..Default :: default ( )
405417 } ) ;
406418 }
@@ -426,19 +438,22 @@ impl Fetcher for DefaultFetcher {
426438 truncated : if truncated { Some ( true ) } else { None } ,
427439 saved_path : Some ( save_result. path ) ,
428440 bytes_written : Some ( save_result. bytes_written ) ,
441+ redirect_chain,
429442 // No inline content when saving to file
430443 ..Default :: default ( )
431444 } )
432445 }
433446}
434447
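448+ /// Returns `(response, redirect_chain)`, where `redirect_chain` lists, in order, every URL that answered with a followed redirect (starting with the initial URL); the final URL is not included, so the chain is empty when no redirect was followed.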
435449async fn send_request_following_redirects (
436450 initial_url : Url ,
437451 method : reqwest:: Method ,
438452 headers : HeaderMap ,
439453 options : & FetchOptions ,
440- ) -> Result < reqwest:: Response , FetchError > {
454+ ) -> Result < ( reqwest:: Response , Vec < String > ) , FetchError > {
441455 let mut current_url = initial_url;
456+ let mut redirect_chain = Vec :: new ( ) ;
442457
443458 for redirect_count in 0 ..=MAX_REDIRECTS {
444459 let client = build_client_for_url ( & current_url, headers. clone ( ) , options) ?;
@@ -449,7 +464,7 @@ async fn send_request_following_redirects(
449464 . map_err ( FetchError :: from_reqwest) ?;
450465
451466 let Some ( next_url) = redirect_target ( & current_url, & response, options) ? else {
452- return Ok ( response) ;
467+ return Ok ( ( response, redirect_chain ) ) ;
453468 } ;
454469
455470 if redirect_count == MAX_REDIRECTS {
@@ -463,6 +478,7 @@ async fn send_request_following_redirects(
463478 "Following redirect with IP validation"
464479 ) ;
465480
481+ redirect_chain. push ( current_url. to_string ( ) ) ;
466482 current_url = next_url;
467483 }
468484
@@ -650,6 +666,36 @@ async fn read_body_with_timeout(
650666 }
651667}
652668
/// Returns the number of whitespace-separated words in `text`.
///
/// Consecutive whitespace acts as a single separator, and leading or
/// trailing whitespace never yields an empty word, so an empty or
/// all-whitespace input counts as `0`.
fn count_words(text: &str) -> u64 {
    text.split_whitespace().fold(0u64, |total, _word| total + 1)
}
673+
/// Lowercase phrases whose presence in a page's raw HTML is treated as a
/// sign that the content sits behind a paywall.
const PAYWALL_INDICATORS: &[&str] = &[
    "paywall",
    "subscribe to read",
    "subscribe to continue",
    "subscription required",
    "premium content",
    "members only",
    "sign in to read",
    "log in to read",
    "create a free account",
    "already a subscriber",
    "unlock this article",
    "get unlimited access",
    "start your free trial",
];

/// Heuristically decides whether `html` looks like a paywalled page.
///
/// The input is lowercased once and then searched for each entry of
/// [`PAYWALL_INDICATORS`]; the first hit returns `true`. This is a plain
/// substring match, so it also fires on markup (for example a CSS class
/// named `paywall`), not only on visible text.
fn detect_paywall(html: &str) -> bool {
    let haystack = html.to_lowercase();
    for needle in PAYWALL_INDICATORS {
        if haystack.contains(*needle) {
            return true;
        }
    }
    false
}
698+
653699#[ cfg( test) ]
654700mod tests {
655701 use super :: * ;
@@ -1048,4 +1094,156 @@ mod tests {
10481094 assert_eq ! ( response. status_code, 304 ) ;
10491095 assert ! ( response. content. is_none( ) ) ;
10501096 }
1097+
/// Checks `count_words` on ordinary, empty, padded, and single-word input.
#[test]
fn test_count_words() {
    // Each entry pairs an input with the word count it must produce.
    let cases: &[(&str, u64)] = &[
        ("hello world", 2),
        ("", 0),
        (" one two three ", 3),
        ("word", 1),
    ];

    for &(input, expected) in cases {
        assert_eq!(count_words(input), expected);
    }
}
1105+
/// Checks `detect_paywall` against snippets that should and should not match.
#[test]
fn test_detect_paywall() {
    // Snippets containing at least one paywall indicator (case-insensitive).
    let paywalled = [
        "<div class=\"paywall\">Subscribe</div>",
        "<p>Subscribe to read the full article</p>",
        "<span>Already a subscriber? Log in</span>",
        "<div>Unlock this article</div>",
    ];
    for snippet in paywalled.iter() {
        assert!(detect_paywall(snippet));
    }

    // Snippets with no indicator at all.
    let open_access = [
        "<p>This is a normal article</p>",
        "<h1>Hello World</h1><p>Free content</p>",
    ];
    for snippet in open_access.iter() {
        assert!(!detect_paywall(snippet));
    }
}
1115+
1116+ #[ tokio:: test]
1117+ async fn test_word_count_in_response ( ) {
1118+ let server = MockServer :: start ( ) . await ;
1119+ Mock :: given ( method ( "GET" ) )
1120+ . and ( path ( "/article" ) )
1121+ . respond_with (
1122+ ResponseTemplate :: new ( 200 )
1123+ . set_body_string ( "Hello world this is a test" )
1124+ . insert_header ( "content-type" , "text/plain" ) ,
1125+ )
1126+ . mount ( & server)
1127+ . await ;
1128+
1129+ let fetcher = DefaultFetcher :: new ( ) ;
1130+ let options = FetchOptions {
1131+ dns_policy : DnsPolicy :: allow_all ( ) ,
1132+ ..Default :: default ( )
1133+ } ;
1134+ let request = FetchRequest :: new ( format ! ( "{}/article" , server. uri( ) ) ) ;
1135+ let response = fetcher. fetch ( & request, & options) . await . unwrap ( ) ;
1136+
1137+ assert_eq ! ( response. word_count, Some ( 6 ) ) ;
1138+ }
1139+
1140+ #[ tokio:: test]
1141+ async fn test_redirect_chain_tracked ( ) {
1142+ let destination = MockServer :: start ( ) . await ;
1143+ Mock :: given ( method ( "GET" ) )
1144+ . and ( path ( "/final" ) )
1145+ . respond_with (
1146+ ResponseTemplate :: new ( 200 )
1147+ . set_body_string ( "arrived" )
1148+ . insert_header ( "content-type" , "text/plain" ) ,
1149+ )
1150+ . mount ( & destination)
1151+ . await ;
1152+
1153+ let origin = MockServer :: start ( ) . await ;
1154+ Mock :: given ( method ( "GET" ) )
1155+ . and ( path ( "/start" ) )
1156+ . respond_with (
1157+ ResponseTemplate :: new ( 302 )
1158+ . insert_header ( "location" , format ! ( "{}/final" , destination. uri( ) ) ) ,
1159+ )
1160+ . mount ( & origin)
1161+ . await ;
1162+
1163+ let fetcher = DefaultFetcher :: new ( ) ;
1164+ let options = FetchOptions {
1165+ dns_policy : DnsPolicy :: allow_all ( ) ,
1166+ ..Default :: default ( )
1167+ } ;
1168+ let request = FetchRequest :: new ( format ! ( "{}/start" , origin. uri( ) ) ) ;
1169+ let response = fetcher. fetch ( & request, & options) . await . unwrap ( ) ;
1170+
1171+ assert_eq ! ( response. status_code, 200 ) ;
1172+ assert_eq ! ( response. redirect_chain. len( ) , 1 ) ;
1173+ assert ! ( response. redirect_chain[ 0 ] . contains( "/start" ) ) ;
1174+ }
1175+
1176+ #[ tokio:: test]
1177+ async fn test_no_redirect_chain_for_direct_response ( ) {
1178+ let server = MockServer :: start ( ) . await ;
1179+ Mock :: given ( method ( "GET" ) )
1180+ . and ( path ( "/direct" ) )
1181+ . respond_with (
1182+ ResponseTemplate :: new ( 200 )
1183+ . set_body_string ( "direct" )
1184+ . insert_header ( "content-type" , "text/plain" ) ,
1185+ )
1186+ . mount ( & server)
1187+ . await ;
1188+
1189+ let fetcher = DefaultFetcher :: new ( ) ;
1190+ let options = FetchOptions {
1191+ dns_policy : DnsPolicy :: allow_all ( ) ,
1192+ ..Default :: default ( )
1193+ } ;
1194+ let request = FetchRequest :: new ( format ! ( "{}/direct" , server. uri( ) ) ) ;
1195+ let response = fetcher. fetch ( & request, & options) . await . unwrap ( ) ;
1196+
1197+ assert ! ( response. redirect_chain. is_empty( ) ) ;
1198+ }
1199+
1200+ #[ tokio:: test]
1201+ async fn test_paywall_detection ( ) {
1202+ let server = MockServer :: start ( ) . await ;
1203+ Mock :: given ( method ( "GET" ) )
1204+ . and ( path ( "/paywalled" ) )
1205+ . respond_with (
1206+ ResponseTemplate :: new ( 200 )
1207+ . set_body_string ( "<html><body><div class='paywall'>Subscribe to read the full article</div><p>Preview...</p></body></html>" )
1208+ . insert_header ( "content-type" , "text/html" ) ,
1209+ )
1210+ . mount ( & server)
1211+ . await ;
1212+
1213+ let fetcher = DefaultFetcher :: new ( ) ;
1214+ let options = FetchOptions {
1215+ enable_markdown : true ,
1216+ dns_policy : DnsPolicy :: allow_all ( ) ,
1217+ ..Default :: default ( )
1218+ } ;
1219+ let request = FetchRequest :: new ( format ! ( "{}/paywalled" , server. uri( ) ) ) . as_markdown ( ) ;
1220+ let response = fetcher. fetch ( & request, & options) . await . unwrap ( ) ;
1221+
1222+ assert_eq ! ( response. is_paywall, Some ( true ) ) ;
1223+ }
1224+
1225+ #[ tokio:: test]
1226+ async fn test_no_paywall_for_normal_content ( ) {
1227+ let server = MockServer :: start ( ) . await ;
1228+ Mock :: given ( method ( "GET" ) )
1229+ . and ( path ( "/free" ) )
1230+ . respond_with (
1231+ ResponseTemplate :: new ( 200 )
1232+ . set_body_string ( "<html><body><p>This is free content</p></body></html>" )
1233+ . insert_header ( "content-type" , "text/html" ) ,
1234+ )
1235+ . mount ( & server)
1236+ . await ;
1237+
1238+ let fetcher = DefaultFetcher :: new ( ) ;
1239+ let options = FetchOptions {
1240+ enable_markdown : true ,
1241+ dns_policy : DnsPolicy :: allow_all ( ) ,
1242+ ..Default :: default ( )
1243+ } ;
1244+ let request = FetchRequest :: new ( format ! ( "{}/free" , server. uri( ) ) ) . as_markdown ( ) ;
1245+ let response = fetcher. fetch ( & request, & options) . await . unwrap ( ) ;
1246+
1247+ assert ! ( response. is_paywall. is_none( ) ) ;
1248+ }
10511249}
0 commit comments