@@ -202,7 +202,7 @@ impl Fetcher for DefaultFetcher {
202202 } ;
203203
204204 // THREAT[TM-SSRF-010]: Follow redirects manually so every hop is re-validated.
205- let response =
205+ let ( response, redirect_chain ) =
206206 send_request_following_redirects ( parsed_url, reqwest_method, headers, options) . await ?;
207207
208208 let status_code = response. status ( ) . as_u16 ( ) ;
@@ -219,6 +219,7 @@ impl Fetcher for DefaultFetcher {
219219 last_modified : meta. last_modified ,
220220 filename : meta. filename ,
221221 method : Some ( "HEAD" . to_string ( ) ) ,
222+ redirect_chain,
222223 ..Default :: default ( )
223224 } ) ;
224225 }
@@ -233,6 +234,7 @@ impl Fetcher for DefaultFetcher {
233234 size : meta. content_length ,
234235 last_modified : meta. last_modified ,
235236 filename : meta. filename ,
237+ redirect_chain,
236238 error : Some (
237239 "Binary content is not supported. Only textual content (HTML, text, JSON, etc.) can be fetched."
238240 . to_string ( ) ,
@@ -250,6 +252,9 @@ impl Fetcher for DefaultFetcher {
250252 // Convert to string
251253 let content = String :: from_utf8_lossy ( & body) . to_string ( ) ;
252254
255+ // Detect paywall before content is moved by conversion
256+ let is_paywall = detect_paywall ( & content) ;
257+
253258 // Determine format and convert if needed
254259 // THREAT[TM-DOS-006]: Conversion input is bounded by max_body_size
255260 let ( format, final_content) =
@@ -281,6 +286,9 @@ impl Fetcher for DefaultFetcher {
281286 final_content. push_str ( TRUNCATION_MESSAGE ) ;
282287 }
283288
289+ // Compute quality signals
290+ let word_count = count_words ( & final_content) ;
291+
284292 Ok ( FetchResponse {
285293 url : final_url,
286294 status_code,
@@ -291,6 +299,9 @@ impl Fetcher for DefaultFetcher {
291299 format : Some ( format) ,
292300 content : Some ( final_content) ,
293301 truncated : if truncated { Some ( true ) } else { None } ,
302+ word_count : Some ( word_count) ,
303+ redirect_chain,
304+ is_paywall : if is_paywall { Some ( true ) } else { None } ,
294305 ..Default :: default ( )
295306 } )
296307 }
@@ -327,7 +338,7 @@ impl Fetcher for DefaultFetcher {
327338 } ;
328339
329340 // THREAT[TM-SSRF-010]: Follow redirects manually with IP validation at each hop
330- let response =
341+ let ( response, redirect_chain ) =
331342 send_request_following_redirects ( parsed_url, reqwest_method, headers, options) . await ?;
332343
333344 let status_code = response. status ( ) . as_u16 ( ) ;
@@ -344,6 +355,7 @@ impl Fetcher for DefaultFetcher {
344355 last_modified : meta. last_modified ,
345356 filename : meta. filename ,
346357 method : Some ( "HEAD" . to_string ( ) ) ,
358+ redirect_chain,
347359 ..Default :: default ( )
348360 } ) ;
349361 }
@@ -368,19 +380,22 @@ impl Fetcher for DefaultFetcher {
368380 truncated : if truncated { Some ( true ) } else { None } ,
369381 saved_path : Some ( save_result. path ) ,
370382 bytes_written : Some ( save_result. bytes_written ) ,
383+ redirect_chain,
371384 // No inline content when saving to file
372385 ..Default :: default ( )
373386 } )
374387 }
375388}
376389
390+ /// Returns `(response, redirect_chain)` where redirect_chain lists intermediate URLs.
377391async fn send_request_following_redirects (
378392 initial_url : Url ,
379393 method : reqwest:: Method ,
380394 headers : HeaderMap ,
381395 options : & FetchOptions ,
382- ) -> Result < reqwest:: Response , FetchError > {
396+ ) -> Result < ( reqwest:: Response , Vec < String > ) , FetchError > {
383397 let mut current_url = initial_url;
398+ let mut redirect_chain = Vec :: new ( ) ;
384399
385400 for redirect_count in 0 ..=MAX_REDIRECTS {
386401 let client = build_client_for_url ( & current_url, headers. clone ( ) , options) ?;
@@ -391,7 +406,7 @@ async fn send_request_following_redirects(
391406 . map_err ( FetchError :: from_reqwest) ?;
392407
393408 let Some ( next_url) = redirect_target ( & current_url, & response, options) ? else {
394- return Ok ( response) ;
409+ return Ok ( ( response, redirect_chain ) ) ;
395410 } ;
396411
397412 if redirect_count == MAX_REDIRECTS {
@@ -405,6 +420,7 @@ async fn send_request_following_redirects(
405420 "Following redirect with IP validation"
406421 ) ;
407422
423+ redirect_chain. push ( current_url. to_string ( ) ) ;
408424 current_url = next_url;
409425 }
410426
@@ -591,6 +607,36 @@ async fn read_body_with_timeout(
591607 }
592608}
593609
/// Counts the whitespace-separated words in `text`.
///
/// A word is any maximal run of non-whitespace characters, as produced by
/// [`str::split_whitespace`]. Leading, trailing and repeated whitespace
/// therefore never yields empty words, and an empty or all-whitespace input
/// counts as zero.
fn count_words(text: &str) -> u64 {
    // `count()` returns `usize`, which is never wider than 64 bits, so the
    // widening cast to `u64` cannot truncate.
    text.split_whitespace().count() as u64
}
614+
/// Phrases whose presence in a page body suggests a paywall.
///
/// Every entry must be written in lower case: [`detect_paywall`] lower-cases
/// the page before searching, so an entry containing upper-case characters
/// could never match.
const PAYWALL_INDICATORS: &[&str] = &[
    "paywall",
    "subscribe to read",
    "subscribe to continue",
    "subscription required",
    "premium content",
    "members only",
    "sign in to read",
    "log in to read",
    "create a free account",
    "already a subscriber",
    "unlock this article",
    "get unlimited access",
    "start your free trial",
];

/// Heuristically reports whether `html` looks like a paywalled page.
///
/// Returns `true` when the lower-cased input contains at least one entry of
/// [`PAYWALL_INDICATORS`] as a plain substring. The whole input is searched,
/// markup included, so an indicator appearing in an attribute value (for
/// example a `class` name) matches just like visible text. This is a cheap
/// signal, not a guarantee: false positives and false negatives are expected.
fn detect_paywall(html: &str) -> bool {
    // Lower-case once up front so every indicator comparison is
    // case-insensitive without re-normalising the page per indicator.
    let lower = html.to_lowercase();
    PAYWALL_INDICATORS
        .iter()
        .any(|&indicator| lower.contains(indicator))
}
639+
594640#[ cfg( test) ]
595641mod tests {
596642 use super :: * ;
@@ -911,4 +957,156 @@ mod tests {
911957
912958 assert_eq ! ( response. status_code, 200 ) ;
913959 }
960+
/// `count_words` counts whitespace-separated tokens and ignores surrounding
/// whitespace; the empty string has no words.
#[test]
fn test_count_words() {
    assert_eq!(count_words("hello world"), 2);
    assert_eq!(count_words(""), 0);
    assert_eq!(count_words(" one two three "), 3);
    assert_eq!(count_words("word"), 1);
}
968+
/// `detect_paywall` matches indicators case-insensitively, both in attribute
/// values and in visible text, and leaves ordinary pages unflagged.
#[test]
fn test_detect_paywall() {
    // Indicator inside markup (a class name).
    assert!(detect_paywall("<div class=\" paywall\" >Subscribe</div>"));
    // Indicators inside visible text, with mixed case.
    assert!(detect_paywall("<p>Subscribe to read the full article</p>"));
    assert!(detect_paywall("<span>Already a subscriber? Log in</span>"));
    assert!(detect_paywall("<div>Unlock this article</div>"));
    // Pages without any indicator.
    assert!(!detect_paywall("<p>This is a normal article</p>"));
    assert!(!detect_paywall("<h1>Hello World</h1><p>Free content</p>"));
}
978+
979+ #[ tokio:: test]
980+ async fn test_word_count_in_response ( ) {
981+ let server = MockServer :: start ( ) . await ;
982+ Mock :: given ( method ( "GET" ) )
983+ . and ( path ( "/article" ) )
984+ . respond_with (
985+ ResponseTemplate :: new ( 200 )
986+ . set_body_string ( "Hello world this is a test" )
987+ . insert_header ( "content-type" , "text/plain" ) ,
988+ )
989+ . mount ( & server)
990+ . await ;
991+
992+ let fetcher = DefaultFetcher :: new ( ) ;
993+ let options = FetchOptions {
994+ dns_policy : DnsPolicy :: allow_all ( ) ,
995+ ..Default :: default ( )
996+ } ;
997+ let request = FetchRequest :: new ( format ! ( "{}/article" , server. uri( ) ) ) ;
998+ let response = fetcher. fetch ( & request, & options) . await . unwrap ( ) ;
999+
1000+ assert_eq ! ( response. word_count, Some ( 6 ) ) ;
1001+ }
1002+
1003+ #[ tokio:: test]
1004+ async fn test_redirect_chain_tracked ( ) {
1005+ let destination = MockServer :: start ( ) . await ;
1006+ Mock :: given ( method ( "GET" ) )
1007+ . and ( path ( "/final" ) )
1008+ . respond_with (
1009+ ResponseTemplate :: new ( 200 )
1010+ . set_body_string ( "arrived" )
1011+ . insert_header ( "content-type" , "text/plain" ) ,
1012+ )
1013+ . mount ( & destination)
1014+ . await ;
1015+
1016+ let origin = MockServer :: start ( ) . await ;
1017+ Mock :: given ( method ( "GET" ) )
1018+ . and ( path ( "/start" ) )
1019+ . respond_with (
1020+ ResponseTemplate :: new ( 302 )
1021+ . insert_header ( "location" , format ! ( "{}/final" , destination. uri( ) ) ) ,
1022+ )
1023+ . mount ( & origin)
1024+ . await ;
1025+
1026+ let fetcher = DefaultFetcher :: new ( ) ;
1027+ let options = FetchOptions {
1028+ dns_policy : DnsPolicy :: allow_all ( ) ,
1029+ ..Default :: default ( )
1030+ } ;
1031+ let request = FetchRequest :: new ( format ! ( "{}/start" , origin. uri( ) ) ) ;
1032+ let response = fetcher. fetch ( & request, & options) . await . unwrap ( ) ;
1033+
1034+ assert_eq ! ( response. status_code, 200 ) ;
1035+ assert_eq ! ( response. redirect_chain. len( ) , 1 ) ;
1036+ assert ! ( response. redirect_chain[ 0 ] . contains( "/start" ) ) ;
1037+ }
1038+
1039+ #[ tokio:: test]
1040+ async fn test_no_redirect_chain_for_direct_response ( ) {
1041+ let server = MockServer :: start ( ) . await ;
1042+ Mock :: given ( method ( "GET" ) )
1043+ . and ( path ( "/direct" ) )
1044+ . respond_with (
1045+ ResponseTemplate :: new ( 200 )
1046+ . set_body_string ( "direct" )
1047+ . insert_header ( "content-type" , "text/plain" ) ,
1048+ )
1049+ . mount ( & server)
1050+ . await ;
1051+
1052+ let fetcher = DefaultFetcher :: new ( ) ;
1053+ let options = FetchOptions {
1054+ dns_policy : DnsPolicy :: allow_all ( ) ,
1055+ ..Default :: default ( )
1056+ } ;
1057+ let request = FetchRequest :: new ( format ! ( "{}/direct" , server. uri( ) ) ) ;
1058+ let response = fetcher. fetch ( & request, & options) . await . unwrap ( ) ;
1059+
1060+ assert ! ( response. redirect_chain. is_empty( ) ) ;
1061+ }
1062+
1063+ #[ tokio:: test]
1064+ async fn test_paywall_detection ( ) {
1065+ let server = MockServer :: start ( ) . await ;
1066+ Mock :: given ( method ( "GET" ) )
1067+ . and ( path ( "/paywalled" ) )
1068+ . respond_with (
1069+ ResponseTemplate :: new ( 200 )
1070+ . set_body_string ( "<html><body><div class='paywall'>Subscribe to read the full article</div><p>Preview...</p></body></html>" )
1071+ . insert_header ( "content-type" , "text/html" ) ,
1072+ )
1073+ . mount ( & server)
1074+ . await ;
1075+
1076+ let fetcher = DefaultFetcher :: new ( ) ;
1077+ let options = FetchOptions {
1078+ enable_markdown : true ,
1079+ dns_policy : DnsPolicy :: allow_all ( ) ,
1080+ ..Default :: default ( )
1081+ } ;
1082+ let request = FetchRequest :: new ( format ! ( "{}/paywalled" , server. uri( ) ) ) . as_markdown ( ) ;
1083+ let response = fetcher. fetch ( & request, & options) . await . unwrap ( ) ;
1084+
1085+ assert_eq ! ( response. is_paywall, Some ( true ) ) ;
1086+ }
1087+
1088+ #[ tokio:: test]
1089+ async fn test_no_paywall_for_normal_content ( ) {
1090+ let server = MockServer :: start ( ) . await ;
1091+ Mock :: given ( method ( "GET" ) )
1092+ . and ( path ( "/free" ) )
1093+ . respond_with (
1094+ ResponseTemplate :: new ( 200 )
1095+ . set_body_string ( "<html><body><p>This is free content</p></body></html>" )
1096+ . insert_header ( "content-type" , "text/html" ) ,
1097+ )
1098+ . mount ( & server)
1099+ . await ;
1100+
1101+ let fetcher = DefaultFetcher :: new ( ) ;
1102+ let options = FetchOptions {
1103+ enable_markdown : true ,
1104+ dns_policy : DnsPolicy :: allow_all ( ) ,
1105+ ..Default :: default ( )
1106+ } ;
1107+ let request = FetchRequest :: new ( format ! ( "{}/free" , server. uri( ) ) ) . as_markdown ( ) ;
1108+ let response = fetcher. fetch ( & request, & options) . await . unwrap ( ) ;
1109+
1110+ assert ! ( response. is_paywall. is_none( ) ) ;
1111+ }
9141112}
0 commit comments