Skip to content

Commit 6036b78

Browse files
committed
feat: replace per-file HF downloads with bulk git clone snapshot
- Add snapshot_download_tasks() that clones entire HF dataset once (~6MB)
- Cache persists across evaluations (0 HTTP requests after first run)
- Fallback to parallel per-task download (5 concurrent) if snapshot fails
- Add git-lfs to Docker image
- Report download_errors in /evaluate response
1 parent 62088a9 commit 6036b78

File tree

3 files changed

+176
-21
lines changed

3 files changed

+176
-21
lines changed

Dockerfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ FROM debian:bookworm-slim
1313

1414
# Pre-install all common runtimes and tools at build time (as root)
1515
RUN apt-get update && apt-get install -y --no-install-recommends \
16-
ca-certificates git curl wget unzip libssl3 libssl-dev pkg-config sudo openssh-client \
16+
ca-certificates git git-lfs curl wget unzip libssl3 libssl-dev pkg-config sudo openssh-client \
1717
python3 python3-pip python3-venv python3-dev \
1818
build-essential gcc g++ make cmake autoconf automake libtool \
1919
default-jdk maven gradle \
@@ -24,6 +24,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
2424
imagemagick libmagickwand-dev \
2525
jq \
2626
&& ln -sf /usr/bin/python3 /usr/bin/python \
27+
&& git lfs install \
2728
&& rm -rf /var/lib/apt/lists/*
2829

2930
# Install Go 1.23 (Debian bookworm ships 1.19 which is too old for most projects)

src/handlers.rs

Lines changed: 88 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1352,7 +1352,7 @@ async fn evaluate_with_stored_agent(
13521352
})?
13531353
};
13541354

1355-
// Download task files from HF repo (workspace.yaml, tests/*.sh, etc.)
1355+
// Download all task files from HF repo via snapshot (bulk git clone)
13561356
let hf_client = crate::swe_forge::client::HuggingFaceClient::new().map_err(|e| {
13571357
(
13581358
StatusCode::INTERNAL_SERVER_ERROR,
@@ -1361,31 +1361,98 @@ async fn evaluate_with_stored_agent(
13611361
})?;
13621362

13631363
let dataset_id = "CortexLM/swe-forge";
1364-
let tasks_base = state.config.workspace_base.join("_hf_tasks");
1365-
let _ = tokio::fs::remove_dir_all(&tasks_base).await;
1364+
let snapshot_cache = state.config.workspace_base.join("_hf_dataset_cache");
13661365

1366+
// Bulk download the entire tasks/ folder (cached after first call)
1367+
hf_client
1368+
.snapshot_download_tasks(dataset_id, &snapshot_cache)
1369+
.await
1370+
.map_err(|e| {
1371+
tracing::warn!("Snapshot download failed: {}, falling back to per-task download", e);
1372+
(
1373+
StatusCode::BAD_GATEWAY,
1374+
Json(serde_json::json!({"error": format!("Failed to download HF dataset: {}", e)})),
1375+
)
1376+
})?;
1377+
1378+
// Resolve tasks from the cached snapshot directory.
1379+
// HF stores tasks as tasks/{org}/{repo-number}/ (nested dirs matching the task_id).
1380+
let tasks_dir = snapshot_cache.join("tasks");
13671381
let mut hf_tasks: Vec<crate::task::SweForgeTask> = Vec::new();
13681382
let mut errors: Vec<String> = Vec::new();
13691383

13701384
for task_id in &task_ids {
1371-
let task_dir = tasks_base.join(task_id.replace('/', "__"));
1372-
match hf_client
1373-
.download_task_files(dataset_id, task_id, &task_dir)
1374-
.await
1375-
{
1376-
Ok(()) => match crate::task::parse_task(&task_dir) {
1377-
Ok(mut task) => {
1378-
task.id = task_id.clone();
1379-
hf_tasks.push(task);
1380-
}
1381-
Err(e) => {
1382-
tracing::warn!("Failed to parse task {}: {}", task_id, e);
1383-
errors.push(format!("{}: parse error: {}", task_id, e));
1384-
}
1385-
},
1385+
// task_id format: "org/repo-number" → HF path: tasks/org/repo-number/
1386+
let task_path = tasks_dir.join(task_id);
1387+
1388+
if !task_path.exists() || !task_path.is_dir() {
1389+
errors.push(format!("{}: not found in cached dataset", task_id));
1390+
continue;
1391+
}
1392+
1393+
match crate::task::parse_task(&task_path) {
1394+
Ok(mut task) => {
1395+
task.id = task_id.clone();
1396+
hf_tasks.push(task);
1397+
}
13861398
Err(e) => {
1387-
tracing::warn!("Failed to download task {}: {}", task_id, e);
1388-
errors.push(format!("{}: download error: {}", task_id, e));
1399+
tracing::warn!("Failed to parse task {}: {}", task_id, e);
1400+
errors.push(format!("{}: parse error: {}", task_id, e));
1401+
}
1402+
}
1403+
}
1404+
1405+
// Fallback: if snapshot produced no matches, try per-task download for missing ones
1406+
if hf_tasks.is_empty() && !task_ids.is_empty() {
1407+
tracing::info!("No tasks found in snapshot cache, falling back to per-task download");
1408+
let hf_client = std::sync::Arc::new(
1409+
crate::swe_forge::client::HuggingFaceClient::new().map_err(|e| {
1410+
(
1411+
StatusCode::INTERNAL_SERVER_ERROR,
1412+
Json(serde_json::json!({"error": format!("HF client error: {}", e)})),
1413+
)
1414+
})?,
1415+
);
1416+
let tasks_base = state.config.workspace_base.join("_hf_tasks");
1417+
1418+
use futures::stream::{self, StreamExt};
1419+
1420+
let download_results: Vec<_> = stream::iter(task_ids.clone().into_iter())
1421+
.map(|task_id| {
1422+
let hf = std::sync::Arc::clone(&hf_client);
1423+
let base = tasks_base.clone();
1424+
async move {
1425+
let task_dir = base.join(task_id.replace('/', "__"));
1426+
match hf
1427+
.download_task_files(dataset_id, &task_id, &task_dir)
1428+
.await
1429+
{
1430+
Ok(()) => match crate::task::parse_task(&task_dir) {
1431+
Ok(mut task) => {
1432+
task.id = task_id.clone();
1433+
Ok(task)
1434+
}
1435+
Err(e) => {
1436+
tracing::warn!("Failed to parse task {}: {}", task_id, e);
1437+
Err(format!("{}: parse error: {}", task_id, e))
1438+
}
1439+
},
1440+
Err(e) => {
1441+
tracing::warn!("Failed to download task {}: {}", task_id, e);
1442+
Err(format!("{}: download error: {}", task_id, e))
1443+
}
1444+
}
1445+
}
1446+
})
1447+
.buffer_unordered(5)
1448+
.collect()
1449+
.await;
1450+
1451+
errors.clear();
1452+
for result in download_results {
1453+
match result {
1454+
Ok(task) => hf_tasks.push(task),
1455+
Err(e) => errors.push(e),
13891456
}
13901457
}
13911458
}
@@ -1444,6 +1511,7 @@ async fn evaluate_with_stored_agent(
14441511
"batch_id": batch_id,
14451512
"total_tasks": total_tasks,
14461513
"matched_task_ids": task_ids,
1514+
"download_errors": errors,
14471515
})),
14481516
))
14491517
}

src/swe_forge/client.rs

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ const HF_DATASET_VIEWER_BASE: &str = "https://datasets-server.huggingface.co/row
88
const HF_REPO_BASE: &str = "https://huggingface.co";
99
const DEFAULT_TIMEOUT_SECS: u64 = 30;
1010
const MAX_PAGE_SIZE: usize = 100;
11+
/// Minimum number of task directories to consider the snapshot cache valid.
12+
const SNAPSHOT_MIN_DIRS: usize = 5;
1113

1214
pub struct HuggingFaceClient {
1315
client: reqwest::Client,
@@ -136,6 +138,12 @@ impl HuggingFaceClient {
136138
tokio::fs::create_dir_all(parent).await?;
137139
}
138140

141+
// Skip files that already exist with non-zero size (cached from previous run)
142+
if local_path.exists() && local_path.metadata().map(|m| m.len() > 0).unwrap_or(false) {
143+
debug!("Skipping cached file: {}", local_path.display());
144+
continue;
145+
}
146+
139147
let download_url = format!(
140148
"{}/datasets/{}/resolve/main/{}",
141149
HF_REPO_BASE, dataset_id, file_path
@@ -167,6 +175,84 @@ impl HuggingFaceClient {
167175
Ok(())
168176
}
169177

178+
/// Download the entire `tasks/` folder from the HF dataset repo using a
179+
/// shallow `git clone --depth 1`. This replaces hundreds of individual
180+
/// HTTP API requests with a single bulk git operation.
181+
///
182+
/// After this call, task files live at `<cache_dir>/tasks/{org}/{repo-number}/`.
183+
pub async fn snapshot_download_tasks(
184+
&self,
185+
dataset_id: &str,
186+
cache_dir: &std::path::Path,
187+
) -> Result<()> {
188+
let tasks_dir = cache_dir.join("tasks");
189+
190+
// Check if already cached (has enough task directories)
191+
if tasks_dir.exists() {
192+
let mut entries = tokio::fs::read_dir(&tasks_dir).await?;
193+
let mut count = 0usize;
194+
while entries.next_entry().await?.is_some() {
195+
count += 1;
196+
if count >= SNAPSHOT_MIN_DIRS {
197+
info!(
198+
"HF tasks already cached at {:?} ({} top-level dirs)",
199+
tasks_dir, count
200+
);
201+
return Ok(());
202+
}
203+
}
204+
}
205+
206+
info!(
207+
"Downloading HF dataset via git clone --depth 1 to {:?}",
208+
cache_dir
209+
);
210+
211+
let repo_url = format!("https://huggingface.co/datasets/{}", dataset_id);
212+
213+
// Clean and recreate cache dir for a fresh clone
214+
if cache_dir.exists() {
215+
tokio::fs::remove_dir_all(cache_dir).await.ok();
216+
}
217+
218+
// Shallow clone — the swe-forge dataset is small (~6 MB) so a full
219+
// depth-1 clone is fast and avoids HF git server limitations with
220+
// partial/sparse clones.
221+
let output = tokio::process::Command::new("git")
222+
.args(["clone", "--depth", "1"])
223+
.arg(&repo_url)
224+
.arg(cache_dir.as_os_str())
225+
.stdout(std::process::Stdio::piped())
226+
.stderr(std::process::Stdio::piped())
227+
.output()
228+
.await
229+
.context("Failed to execute git clone")?;
230+
231+
if !output.status.success() {
232+
let stderr = String::from_utf8_lossy(&output.stderr);
233+
anyhow::bail!("git clone failed (exit {}): {}", output.status, stderr);
234+
}
235+
236+
// Verify the download
237+
let mut count = 0usize;
238+
if tasks_dir.exists() {
239+
let mut entries = tokio::fs::read_dir(&tasks_dir).await?;
240+
while entries.next_entry().await?.is_some() {
241+
count += 1;
242+
}
243+
}
244+
info!("Downloaded {} top-level task directories from HF", count);
245+
246+
if count == 0 {
247+
anyhow::bail!(
248+
"Snapshot download produced no task directories at {:?}",
249+
tasks_dir
250+
);
251+
}
252+
253+
Ok(())
254+
}
255+
170256
async fn list_tree_recursive(
171257
&self,
172258
dataset_id: &str,

0 commit comments

Comments
 (0)