Skip to content

Commit 44346e1

Browse files
authored
feat(fetch): add conditional fetching with ETag and If-Modified-Since (#80)
## What

Support conditional HTTP fetching so agents can efficiently poll pages without re-downloading unchanged content.

## Why

Agents frequently re-fetch the same URLs in loops. Conditional fetching avoids re-downloading and re-processing unchanged pages, saving bandwidth and time.

## How

- New `FetchRequest` fields: `if_none_match` (ETag, sent as `If-None-Match`) and `if_modified_since` (sent as `If-Modified-Since`)
- New `FetchResponse` field: `etag` (returned on all responses when the server provides one)
- 304 Not Modified handling: return a response with status 304, no content, and no conversion
- Fix: 304 is no longer treated as a redirect in the redirect-following logic

## Risk

- Low — additive fields, backward-compatible
- 304 handling is a new code path but well-tested

### Checklist

- [x] Unit tests passed (3 new conditional fetch tests)
- [x] Clippy clean
- [x] Docs build clean

Closes #74
1 parent e74c328 commit 44346e1

2 files changed

Lines changed: 144 additions & 4 deletions

File tree

crates/fetchkit/src/fetchers/default.rs

Lines changed: 118 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ impl Default for DefaultFetcher {
8181
}
8282

8383
/// Build headers for HTTP requests
84-
fn build_headers(options: &FetchOptions, accept: &str) -> HeaderMap {
84+
fn build_headers(options: &FetchOptions, accept: &str, request: &FetchRequest) -> HeaderMap {
8585
let mut headers = HeaderMap::new();
8686
let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT);
8787
headers.insert(
@@ -93,6 +93,19 @@ fn build_headers(options: &FetchOptions, accept: &str) -> HeaderMap {
9393
ACCEPT,
9494
HeaderValue::from_str(accept).unwrap_or_else(|_| HeaderValue::from_static("*/*")),
9595
);
96+
97+
// Conditional request headers
98+
if let Some(ref etag) = request.if_none_match {
99+
if let Ok(v) = HeaderValue::from_str(etag) {
100+
headers.insert(reqwest::header::IF_NONE_MATCH, v);
101+
}
102+
}
103+
if let Some(ref date) = request.if_modified_since {
104+
if let Ok(v) = HeaderValue::from_str(date) {
105+
headers.insert(reqwest::header::IF_MODIFIED_SINCE, v);
106+
}
107+
}
108+
96109
headers
97110
}
98111

@@ -137,6 +150,7 @@ fn apply_bot_auth_if_enabled(headers: HeaderMap, _options: &FetchOptions, _url:
137150
/// Metadata pulled out of a response by `extract_response_meta` and copied
/// into the `FetchResponse` values built by the fetcher.
struct ResponseMeta {
138151
/// Response content type.
/// NOTE(review): extraction is outside this hunk — presumably the
/// `content-type` header; confirm.
content_type: Option<String>,
139152
/// Value of the `last-modified` header, when present and valid as a string.
last_modified: Option<String>,
153+
/// Value of the `etag` header, when present and valid as a string.
/// Callers can feed it back through `FetchRequest::if_none_match`.
etag: Option<String>,
140154
/// Size derived from the `content-length` header, when available.
content_length: Option<u64>,
141155
/// Filename associated with the response.
/// NOTE(review): derivation is not visible in this hunk — confirm source.
filename: Option<String>,
142156
}
@@ -151,6 +165,10 @@ fn extract_response_meta(headers: &HeaderMap, url: &str) -> ResponseMeta {
151165
.get("last-modified")
152166
.and_then(|v| v.to_str().ok())
153167
.map(|s| s.to_string()),
168+
etag: headers
169+
.get("etag")
170+
.and_then(|v| v.to_str().ok())
171+
.map(|s| s.to_string()),
154172
content_length: headers
155173
.get("content-length")
156174
.and_then(|v| v.to_str().ok())
@@ -192,7 +210,7 @@ impl Fetcher for DefaultFetcher {
192210
"*/*"
193211
};
194212

195-
let headers = build_headers(options, accept);
213+
let headers = build_headers(options, accept, request);
196214
let parsed_url = url::Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?;
197215
let headers = apply_bot_auth_if_enabled(headers, options, &parsed_url);
198216

@@ -209,6 +227,18 @@ impl Fetcher for DefaultFetcher {
209227
let final_url = response.url().to_string();
210228
let meta = extract_response_meta(response.headers(), &final_url);
211229

230+
// Handle 304 Not Modified (conditional request response)
231+
if status_code == 304 {
232+
return Ok(FetchResponse {
233+
url: final_url,
234+
status_code,
235+
content_type: meta.content_type,
236+
last_modified: meta.last_modified,
237+
etag: meta.etag,
238+
..Default::default()
239+
});
240+
}
241+
212242
// Handle HEAD request
213243
if method == HttpMethod::Head {
214244
return Ok(FetchResponse {
@@ -217,6 +247,7 @@ impl Fetcher for DefaultFetcher {
217247
content_type: meta.content_type,
218248
size: meta.content_length,
219249
last_modified: meta.last_modified,
250+
etag: meta.etag,
220251
filename: meta.filename,
221252
method: Some("HEAD".to_string()),
222253
..Default::default()
@@ -232,6 +263,7 @@ impl Fetcher for DefaultFetcher {
232263
content_type: meta.content_type,
233264
size: meta.content_length,
234265
last_modified: meta.last_modified,
266+
etag: meta.etag,
235267
filename: meta.filename,
236268
error: Some(
237269
"Binary content is not supported. Only textual content (HTML, text, JSON, etc.) can be fetched."
@@ -309,6 +341,7 @@ impl Fetcher for DefaultFetcher {
309341
content_type: meta.content_type,
310342
size: Some(size),
311343
last_modified: meta.last_modified,
344+
etag: meta.etag,
312345
filename: meta.filename,
313346
format: Some(format),
314347
content: Some(final_content),
@@ -340,7 +373,7 @@ impl Fetcher for DefaultFetcher {
340373
let method = request.effective_method();
341374
let max_body_size = options.max_body_size.unwrap_or(DEFAULT_MAX_BODY_SIZE);
342375

343-
let headers = build_headers(options, "*/*");
376+
let headers = build_headers(options, "*/*", request);
344377
let parsed_url = url::Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?;
345378
let headers = apply_bot_auth_if_enabled(headers, options, &parsed_url);
346379

@@ -365,6 +398,7 @@ impl Fetcher for DefaultFetcher {
365398
content_type: meta.content_type,
366399
size: meta.content_length,
367400
last_modified: meta.last_modified,
401+
etag: meta.etag,
368402
filename: meta.filename,
369403
method: Some("HEAD".to_string()),
370404
..Default::default()
@@ -387,6 +421,7 @@ impl Fetcher for DefaultFetcher {
387421
content_type: meta.content_type,
388422
size: Some(size),
389423
last_modified: meta.last_modified,
424+
etag: meta.etag,
390425
filename: meta.filename,
391426
truncated: if truncated { Some(true) } else { None },
392427
saved_path: Some(save_result.path),
@@ -472,7 +507,8 @@ fn redirect_target(
472507
response: &reqwest::Response,
473508
options: &FetchOptions,
474509
) -> Result<Option<Url>, FetchError> {
475-
if !response.status().is_redirection() {
510+
// 304 Not Modified is in the 3xx range but is not a redirect
511+
if !response.status().is_redirection() || response.status().as_u16() == 304 {
476512
return Ok(None);
477513
}
478514

@@ -934,4 +970,82 @@ mod tests {
934970

935971
assert_eq!(response.status_code, 200);
936972
}
973+
974+
// Verifies that the `etag` header of an ordinary 200 response is surfaced
// unchanged (quotes included) in `FetchResponse::etag`.
#[tokio::test]
975+
async fn test_etag_returned_in_response() {
976+
let server = MockServer::start().await;
977+
// Mock endpoint: GET /page answers 200 with a text body and an ETag header.
Mock::given(method("GET"))
978+
.and(path("/page"))
979+
.respond_with(
980+
ResponseTemplate::new(200)
981+
.set_body_string("content")
982+
.insert_header("content-type", "text/plain")
983+
.insert_header("etag", "\"abc123\""),
984+
)
985+
.mount(&server)
986+
.await;
987+
988+
let fetcher = DefaultFetcher::new();
989+
// NOTE(review): `allow_all` presumably lets the fetcher reach the local
// mock server address — confirm against `DnsPolicy`.
let options = FetchOptions {
990+
dns_policy: DnsPolicy::allow_all(),
991+
..Default::default()
992+
};
993+
// Plain request: no conditional headers are set here.
let request = FetchRequest::new(format!("{}/page", server.uri()));
994+
let response = fetcher.fetch(&request, &options).await.unwrap();
995+
996+
assert_eq!(response.status_code, 200);
997+
// The ETag is expected verbatim, including its surrounding double quotes.
assert_eq!(response.etag.as_deref(), Some("\"abc123\""));
998+
}
999+
1000+
// Verifies the If-None-Match round trip: a request built with
// `if_none_match` reaches the server with that header, and the resulting
// 304 is returned with its ETag but without content or format.
#[tokio::test]
1001+
async fn test_conditional_fetch_304_not_modified() {
1002+
use wiremock::matchers::header;
1003+
1004+
let server = MockServer::start().await;
1005+
// The mock only matches GET /page requests carrying this exact
// `if-none-match` value, and answers them with 304 plus the same ETag.
Mock::given(method("GET"))
1006+
.and(path("/page"))
1007+
.and(header("if-none-match", "\"abc123\""))
1008+
.respond_with(ResponseTemplate::new(304).insert_header("etag", "\"abc123\""))
1009+
.mount(&server)
1010+
.await;
1011+
1012+
let fetcher = DefaultFetcher::new();
1013+
// NOTE(review): `allow_all` presumably lets the fetcher reach the local
// mock server address — confirm against `DnsPolicy`.
let options = FetchOptions {
1014+
dns_policy: DnsPolicy::allow_all(),
1015+
..Default::default()
1016+
};
1017+
// Conditional request: the ETag is supplied with its double quotes.
let request =
1018+
FetchRequest::new(format!("{}/page", server.uri())).if_none_match("\"abc123\"");
1019+
let response = fetcher.fetch(&request, &options).await.unwrap();
1020+
1021+
// A 304 must come back as a normal response, not as an error or redirect.
assert_eq!(response.status_code, 304);
1022+
assert_eq!(response.etag.as_deref(), Some("\"abc123\""));
1023+
// Not Modified carries no body, so nothing is fetched or converted.
assert!(response.content.is_none());
1024+
assert!(response.format.is_none());
1025+
}
1026+
1027+
// Verifies that a request built with `if_modified_since` is sent with an
// `if-modified-since` header and that the resulting 304 has no content.
#[tokio::test]
1028+
async fn test_conditional_fetch_if_modified_since() {
1029+
use wiremock::matchers::header_exists;
1030+
1031+
let server = MockServer::start().await;
1032+
// The mock checks only that the header is present, not what value it has.
Mock::given(method("GET"))
1033+
.and(path("/page"))
1034+
.and(header_exists("if-modified-since"))
1035+
.respond_with(ResponseTemplate::new(304))
1036+
.mount(&server)
1037+
.await;
1038+
1039+
let fetcher = DefaultFetcher::new();
1040+
// NOTE(review): `allow_all` presumably lets the fetcher reach the local
// mock server address — confirm against `DnsPolicy`.
let options = FetchOptions {
1041+
dns_policy: DnsPolicy::allow_all(),
1042+
..Default::default()
1043+
};
1044+
// Conditional request carrying a date string for If-Modified-Since.
let request = FetchRequest::new(format!("{}/page", server.uri()))
1045+
.if_modified_since("Wed, 21 Oct 2015 07:28:00 GMT");
1046+
let response = fetcher.fetch(&request, &options).await.unwrap();
1047+
1048+
assert_eq!(response.status_code, 304);
1049+
// Not Modified carries no body, so no content is expected.
assert!(response.content.is_none());
1050+
}
9371051
}

crates/fetchkit/src/types.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,16 @@ pub struct FetchRequest {
9494
/// "full" (default) returns everything.
9595
#[serde(default, skip_serializing_if = "Option::is_none")]
9696
pub content_focus: Option<String>,
97+
98+
/// ETag value for conditional requests (If-None-Match header).
99+
/// When set, the server may return 304 Not Modified if content unchanged.
100+
#[serde(default, skip_serializing_if = "Option::is_none")]
101+
pub if_none_match: Option<String>,
102+
103+
/// Last-Modified value for conditional requests (If-Modified-Since header).
104+
/// When set, the server may return 304 Not Modified if content unchanged.
105+
#[serde(default, skip_serializing_if = "Option::is_none")]
106+
pub if_modified_since: Option<String>,
97107
}
98108

99109
impl FetchRequest {
@@ -135,6 +145,18 @@ impl FetchRequest {
135145
self
136146
}
137147

148+
/// Set the ETag to send in the `If-None-Match` header, making this a
/// conditional request.
///
/// Stores `etag` in `self.if_none_match` and returns the request so calls
/// can be chained. The value is stored exactly as given; the tests in this
/// change pass it with its surrounding double quotes (e.g. `"\"abc123\""`).
149+
pub fn if_none_match(mut self, etag: impl Into<String>) -> Self {
150+
self.if_none_match = Some(etag.into());
151+
self
152+
}
153+
154+
/// Set the date to send in the `If-Modified-Since` header, making this a
/// conditional request.
///
/// Stores `date` in `self.if_modified_since` and returns the request so
/// calls can be chained. The string is stored as given, without parsing or
/// validation here; the test in this change uses
/// `"Wed, 21 Oct 2015 07:28:00 GMT"`.
155+
pub fn if_modified_since(mut self, date: impl Into<String>) -> Self {
156+
self.if_modified_since = Some(date.into());
157+
self
158+
}
159+
138160
/// Get the effective method (default to GET)
139161
pub fn effective_method(&self) -> HttpMethod {
140162
self.method.unwrap_or_default()
@@ -279,6 +301,10 @@ pub struct FetchResponse {
279301
#[serde(skip_serializing_if = "Option::is_none")]
280302
pub last_modified: Option<String>,
281303

304+
/// ETag header value (for conditional requests)
305+
#[serde(skip_serializing_if = "Option::is_none")]
306+
pub etag: Option<String>,
307+
282308
/// Extracted filename
283309
#[serde(skip_serializing_if = "Option::is_none")]
284310
pub filename: Option<String>,

0 commit comments

Comments
 (0)