8 changes: 8 additions & 0 deletions ace/ace.py
@@ -454,6 +454,11 @@ def _train_single_sample(
token_budget = config_params['token_budget']
use_json_mode = config_params['use_json_mode']
no_ground_truth = config_params['no_ground_truth']
# Use the code prompt style when the data processor requests it (e.g., algorithmic tasks)
if hasattr(data_processor, "get_generator_prompt_style"):
prompt_style = data_processor.get_generator_prompt_style()
else:
prompt_style = "json"

# Extract sample data
question = task_dict.get("question", "")
@@ -468,6 +473,7 @@ def _train_single_sample(
context=context,
reflection="(empty)",
use_json_mode=use_json_mode,
prompt_style=prompt_style,
call_id=f"{step_id}_gen_initial",
log_dir=log_dir
)
@@ -533,6 +539,7 @@ def _train_single_sample(
context=context,
reflection=reflection_content,
use_json_mode=use_json_mode,
prompt_style=prompt_style,
call_id=f"{step_id}_post_reflect_round_{round_num}",
log_dir=log_dir
)
@@ -612,6 +619,7 @@ def _train_single_sample(
context=context,
reflection="(empty)",
use_json_mode=use_json_mode,
prompt_style=prompt_style,
call_id=f"{step_id}_post_curate",
log_dir=log_dir
)
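The `hasattr` check above keeps older data processors that do not define `get_generator_prompt_style` working unchanged. A minimal sketch of that fallback, using a hypothetical legacy processor:

```python
class LegacyProcessor:
    """Hypothetical data processor that predates get_generator_prompt_style."""


data_processor = LegacyProcessor()

# Same duck-typing fallback as in _train_single_sample above.
if hasattr(data_processor, "get_generator_prompt_style"):
    prompt_style = data_processor.get_generator_prompt_style()
else:
    prompt_style = "json"

print(prompt_style)  # "json" -- legacy processors keep the JSON prompt style
```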
12 changes: 9 additions & 3 deletions ace/core/generator.py
@@ -6,7 +6,7 @@
import json
import re
from typing import Dict, List, Tuple, Optional, Any
from ..prompts.generator import GENERATOR_PROMPT
from ..prompts.generator import GENERATOR_PROMPT_JSON, GENERATOR_PROMPT_CODE
from llm import timed_llm_call

class Generator:
@@ -37,6 +37,7 @@ def generate(
context: str = "",
reflection: str = "(empty)",
use_json_mode: bool = False,
prompt_style: str = "json",
call_id: str = "gen",
log_dir: Optional[str] = None
) -> Tuple[str, List[str], Dict[str, Any]]:
@@ -56,8 +57,13 @@
Tuple of (full_response, bullet_ids_used, call_info)
"""
# Format the prompt
prompt = GENERATOR_PROMPT.format(playbook, reflection, question, context)
if prompt_style == "code":
prompt = GENERATOR_PROMPT_CODE.format(playbook, reflection, question, context)
else:
prompt = GENERATOR_PROMPT_JSON.format(playbook, reflection, question, context)

# Code-style prompts return raw source code, so JSON mode is never requested for them
use_json_mode_call = use_json_mode and prompt_style != "code"

response, call_info = timed_llm_call(
self.api_client,
self.api_provider,
@@ -67,7 +73,7 @@
call_id=call_id,
max_tokens=self.max_tokens,
log_dir=log_dir,
use_json_mode=use_json_mode
use_json_mode=use_json_mode_call
)

# Extract bullet IDs if using retrieval and reason mode
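The net effect of this change: a `code` prompt style both switches the template and suppresses JSON mode on the underlying LLM call. A standalone sketch of just that branching (the helper name and placeholder strings are illustrative, not part of the PR):

```python
from ace.prompts.generator import GENERATOR_PROMPT_JSON, GENERATOR_PROMPT_CODE


def build_prompt(prompt_style: str, use_json_mode: bool,
                 playbook: str, reflection: str, question: str, context: str):
    """Illustrative helper mirroring the branching inside Generator.generate."""
    if prompt_style == "code":
        prompt = GENERATOR_PROMPT_CODE.format(playbook, reflection, question, context)
    else:
        prompt = GENERATOR_PROMPT_JSON.format(playbook, reflection, question, context)
    # Code answers are raw source text, so JSON mode is only kept for non-code prompts.
    return prompt, use_json_mode and prompt_style != "code"


prompt, json_mode = build_prompt("code", True, "(empty)", "(empty)", "Q?", "ctx")
assert json_mode is False
```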
34 changes: 32 additions & 2 deletions ace/prompts/generator.py
@@ -3,7 +3,7 @@
"""

# Retrieval and Reason Generator prompt that outputs bullet IDs
GENERATOR_PROMPT = """You are an analysis expert tasked with answering questions using your knowledge, a curated playbook of strategies and insights and a reflection that goes over the diagnosis of all previous mistakes made while answering the question.
GENERATOR_PROMPT_JSON = """You are an analysis expert tasked with answering questions using your knowledge, a curated playbook of strategies and insights and a reflection that goes over the diagnosis of all previous mistakes made while answering the question.

**Instructions:**
- Read the playbook carefully and apply relevant strategies, formulas, and insights
@@ -39,4 +39,34 @@
}}

---
"""
"""

# Code-only generator prompt for programming tasks
GENERATOR_PROMPT_CODE = """You are a coding expert tasked with solving programming problems using your knowledge, a curated playbook of strategies and insights, and a reflection that summarizes previous mistakes.

**Instructions:**
- Use the playbook and reflection when helpful
- Write a complete, runnable solution that follows the problem's input/output format
- Return only the final code (no explanations, no markdown, no JSON)
- Prefer C++17 when the question asks for it

**Playbook:**
{}

**Reflection:**
{}

**Question:**
{}

**Context:**
{}

**Output:**
Return only the code.

---
"""

# Backward-compatible alias
GENERATOR_PROMPT = GENERATOR_PROMPT_JSON
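Since `GENERATOR_PROMPT` now aliases the JSON variant, existing imports keep working, and both templates take the same four positional fields. A quick sketch (the sample strings are placeholders):

```python
from ace.prompts.generator import (
    GENERATOR_PROMPT,
    GENERATOR_PROMPT_CODE,
    GENERATOR_PROMPT_JSON,
)

# The alias is the same object, so old call sites are unaffected.
assert GENERATOR_PROMPT is GENERATOR_PROMPT_JSON

filled = GENERATOR_PROMPT_CODE.format(
    "(empty playbook)",              # playbook
    "(empty)",                       # reflection
    "Print the sum of N integers.",  # question (placeholder)
    "N, then N integers on stdin.",  # context (placeholder)
)
print(filled.splitlines()[0])  # "You are a coding expert tasked with solving programming problems ..."
```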
18 changes: 18 additions & 0 deletions eval/frontier-cs/data/algorithmic_test.jsonl

Large diffs are not rendered by default.

137 changes: 137 additions & 0 deletions eval/frontier-cs/data/algorithmic_train.jsonl

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions eval/frontier-cs/data/algorithmic_val.jsonl

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions eval/frontier-cs/data/sample_config.json
@@ -0,0 +1,12 @@
{
"algorithmic": {
"train_data": "./eval/frontier-cs/data/algorithmic_train.jsonl",
"val_data": "./eval/frontier-cs/data/algorithmic_val.jsonl",
"test_data": "./eval/frontier-cs/data/algorithmic_test.jsonl"
},
"research": {
"train_data": "./eval/frontier-cs/data/research_train.jsonl",
"val_data": "./eval/frontier-cs/data/research_val.jsonl",
"test_data": "./eval/frontier-cs/data/research_test.jsonl"
}
}
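A small sketch of reading this config to pick the data files for one track; how ACE actually consumes the file may differ, so treat the loader below as an assumption:

```python
import json
from pathlib import Path

# Hypothetical loader; the real ACE entry point may resolve these paths itself.
config = json.loads(Path("eval/frontier-cs/data/sample_config.json").read_text())

algorithmic = config["algorithmic"]
train_path = algorithmic["train_data"]  # ./eval/frontier-cs/data/algorithmic_train.jsonl
val_path = algorithmic["val_data"]
test_path = algorithmic["test_data"]
print(train_path, val_path, test_path)
```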
228 changes: 228 additions & 0 deletions eval/frontier-cs/data_processor.py
@@ -0,0 +1,228 @@
import hashlib
import json
import re
import sys
import threading
from pathlib import Path
from typing import List, Dict, Any, Optional


class DataProcessor:
    """
    Processor for Frontier-CS tasks.

    Note: Frontier-CS is open-ended, so the default correctness checks validate
    output format rather than solution quality. For the algorithmic track, the
    processor can optionally call the Frontier-CS judge to score code submissions.
    """

    def __init__(
        self,
        task_name: str,
        frontier_root: Optional[str] = None,
        judge_url: str = "http://localhost:8081",
        backend: Optional[str] = None,
        use_judge: bool = True,
    ):
        """
        Initialize the data processor.

        Args:
            task_name: "algorithmic" or "research"
            frontier_root: Path to the Frontier-CS repo (used to import the evaluator)
            judge_url: Algorithmic judge server URL
            backend: "docker" or "skypilot" (optional)
            use_judge: Enable judge-based scoring for the algorithmic track;
                if False, evaluation falls back to format validity only
        """
        self.task_name = task_name
        self.frontier_root = Path(frontier_root).expanduser() if frontier_root else None
        self.judge_url = judge_url
        self.backend = backend
        self.use_judge = use_judge

        self._evaluator = None
        self._evaluator_error = None
        self._score_cache: Dict[tuple, float] = {}
        self._cache_lock = threading.Lock()

    def process_task_data(self, raw_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Convert raw JSONL items into ACE's standard format.
        """
        processed_data = []

        for item in raw_data:
            context = item.get("context", "")
            target = item.get("target", "")
            metadata = item.get("metadata", {})

            if self.task_name == "algorithmic":
                problem_id = metadata.get("problem_id")
                if problem_id is None and self.use_judge:
                    raise ValueError("Missing problem_id in metadata for algorithmic judging.")
                if problem_id is not None and (not target or (isinstance(target, str) and not target.strip())):
                    target = json.dumps(
                        {"problem_id": int(problem_id)},
                        separators=(",", ":"),
                    )

            processed_item = {
                "context": context,
                "question": self._build_question(metadata),
                "target": target,
                "others": {
                    "original_context": context,
                    "task": self.task_name,
                    "metadata": metadata,
                },
            }
            processed_data.append(processed_item)

        return processed_data

    def _build_question(self, metadata: Dict[str, Any]) -> str:
        if self.task_name == "algorithmic":
            base = (
                "Solve the algorithmic problem in the context. "
                "Write a C++17 program that follows the input/output format. "
                "Return only code, no explanations. Optimize for score when applicable."
            )
            return base
        if self.task_name == "research":
            return (
                "Solve the research problem in the context. "
                "Implement the required Python API (e.g., Solution.solve). "
                "Return only code, no explanations."
            )
        raise ValueError(f"Unknown task: {self.task_name}")

    def get_generator_prompt_style(self) -> str:
        if self.task_name == "algorithmic":
            return "code"
        return "json"

    def _algorithmic_answer_is_valid(self, predicted: str) -> bool:
        if not predicted or not predicted.strip():
            return False
        return bool(re.search(r"\bint\s+main\s*\(", predicted)) and "#include" in predicted

    def _extract_problem_id(self, ground_truth: str) -> Optional[int]:
        if not ground_truth:
            return None
        if isinstance(ground_truth, str):
            try:
                payload = json.loads(ground_truth)
                if isinstance(payload, dict) and "problem_id" in payload:
                    return int(payload["problem_id"])
            except json.JSONDecodeError:
                if ground_truth.isdigit():
                    return int(ground_truth)
        return None

    def _ensure_evaluator(self) -> None:
        if not self.use_judge or self.task_name != "algorithmic":
            return
        if self._evaluator or self._evaluator_error:
            return
        try:
            if self.frontier_root:
                src_path = self.frontier_root / "src"
                if src_path.exists():
                    sys.path.insert(0, str(src_path))
            from frontier_cs import SingleEvaluator  # type: ignore[import-not-found]
        except Exception as exc:  # pragma: no cover - import-time errors
            self._evaluator_error = exc
            return

        self._evaluator = SingleEvaluator(
            backend=self.backend,
            base_dir=self.frontier_root,
            judge_url=self.judge_url,
        )

    def _cache_key(self, problem_id: int, code: str) -> tuple:
        code_hash = hashlib.sha256(code.encode("utf-8")).hexdigest()
        return (str(problem_id), code_hash, self.backend or "")

    def _score_algorithmic(self, predicted: str, problem_id: int) -> float:
        if not self._algorithmic_answer_is_valid(predicted):
            return 0.0

        key = self._cache_key(problem_id, predicted)
        with self._cache_lock:
            if key in self._score_cache:
                return self._score_cache[key]

        self._ensure_evaluator()
        if not self._evaluator:
            raise RuntimeError(
                "Frontier-CS evaluator not available. "
                "Set --frontier_root to the Frontier-CS repo and ensure dependencies are installed."
            )

        result = self._evaluator.evaluate(
            "algorithmic",
            problem_id=problem_id,
            code=predicted,
            backend=self.backend,
        )

        score = 0.0
        if result.success:
            score = result.score if result.score is not None else 0.0

        with self._cache_lock:
            self._score_cache[key] = score

        return score

    def _research_answer_is_valid(
        self,
        predicted: str,
        ground_truth: Optional[str] = None,
    ) -> bool:
        _ = ground_truth
        if not predicted or not predicted.strip():
            return False
        return "class Solution" in predicted and re.search(r"\bdef\s+solve\s*\(", predicted) is not None

    def answer_is_correct(self, predicted: str, ground_truth: str) -> bool:
        """
        Format-based correctness for open-ended tasks.
        """
        if self.task_name == "algorithmic":
            # Even with judge scoring enabled, correctness here only means the output looks like a C++ program.
            return self._algorithmic_answer_is_valid(predicted)
        if self.task_name == "research":
            return self._research_answer_is_valid(predicted, ground_truth)

        raise ValueError(f"Unknown task: {self.task_name}")

    def evaluate_accuracy(self, out: List[str], target: List[str]) -> float:
        """
        Compute format accuracy, or the average judge score when judging is enabled.
        """
        if len(out) != len(target):
            raise ValueError("Input lists 'out' and 'target' must have the same length.")

        if not out:
            return 0.0

        if self.task_name == "algorithmic" and self.use_judge:
            scores = []
            for predicted, ground_truth in zip(out, target):
                problem_id = self._extract_problem_id(ground_truth)
                if problem_id is None:
                    scores.append(0.0)
                    continue
                scores.append(self._score_algorithmic(predicted, problem_id))
            return sum(scores) / len(scores) if scores else 0.0

        correct_count = 0
        for predicted, ground_truth in zip(out, target):
            if self.answer_is_correct(predicted, ground_truth):
                correct_count += 1

        return correct_count / len(out)
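For reference, a usage sketch of the processor with the judge disabled (so no judge server or Frontier-CS checkout is needed); the sample item and candidate program below are made up:

```python
# Hypothetical sample; with use_judge=False only format validity is checked.
processor = DataProcessor(task_name="algorithmic", use_judge=False)

raw = [{
    "context": "Given N integers, print their sum.",
    "target": "",
    "metadata": {"problem_id": 1},
}]
samples = processor.process_task_data(raw)
print(samples[0]["target"])  # {"problem_id":1} -- filled in from metadata

candidate = (
    "#include <iostream>\n"
    "int main() { long long n, x, s = 0; std::cin >> n; "
    "while (n--) { std::cin >> x; s += x; } std::cout << s; }\n"
)
print(processor.answer_is_correct(candidate, samples[0]["target"]))      # True (looks like a C++ program)
print(processor.evaluate_accuracy([candidate], [samples[0]["target"]]))  # 1.0 (format accuracy, not a judge score)
```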
