8 changes: 8 additions & 0 deletions ace/ace.py
@@ -454,6 +454,11 @@ def _train_single_sample(
token_budget = config_params['token_budget']
use_json_mode = config_params['use_json_mode']
no_ground_truth = config_params['no_ground_truth']
# Use the code prompt style when the data processor requests it (e.g., algorithmic tasks)
if hasattr(data_processor, "get_generator_prompt_style"):
prompt_style = data_processor.get_generator_prompt_style()
else:
prompt_style = "json"

# Extract sample data
question = task_dict.get("question", "")
@@ -468,6 +473,7 @@ def _train_single_sample(
context=context,
reflection="(empty)",
use_json_mode=use_json_mode,
prompt_style=prompt_style,
call_id=f"{step_id}_gen_initial",
log_dir=log_dir
)
@@ -533,6 +539,7 @@ def _train_single_sample(
context=context,
reflection=reflection_content,
use_json_mode=use_json_mode,
prompt_style=prompt_style,
call_id=f"{step_id}_post_reflect_round_{round_num}",
log_dir=log_dir
)
@@ -612,6 +619,7 @@ def _train_single_sample(
context=context,
reflection="(empty)",
use_json_mode=use_json_mode,
prompt_style=prompt_style,
call_id=f"{step_id}_post_curate",
log_dir=log_dir
)
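The `hasattr` check above keeps older data processors that do not define `get_generator_prompt_style` working unchanged. A minimal sketch of that fallback, using a hypothetical legacy processor:

```python
class LegacyProcessor:
    """Hypothetical data processor that predates get_generator_prompt_style."""


data_processor = LegacyProcessor()

# Same duck-typing fallback as in _train_single_sample above.
if hasattr(data_processor, "get_generator_prompt_style"):
    prompt_style = data_processor.get_generator_prompt_style()
else:
    prompt_style = "json"

print(prompt_style)  # "json" -- legacy processors keep the JSON prompt style
```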
12 changes: 9 additions & 3 deletions ace/core/generator.py
@@ -6,7 +6,7 @@
import json
import re
from typing import Dict, List, Tuple, Optional, Any
from ..prompts.generator import GENERATOR_PROMPT
from ..prompts.generator import GENERATOR_PROMPT_JSON, GENERATOR_PROMPT_CODE
from llm import timed_llm_call

class Generator:
@@ -37,6 +37,7 @@ def generate(
context: str = "",
reflection: str = "(empty)",
use_json_mode: bool = False,
prompt_style: str = "json",
call_id: str = "gen",
log_dir: Optional[str] = None
) -> Tuple[str, List[str], Dict[str, Any]]:
@@ -56,8 +57,13 @@
Tuple of (full_response, bullet_ids_used, call_info)
"""
# Format the prompt
prompt = GENERATOR_PROMPT.format(playbook, reflection, question, context)
if prompt_style == "code":
prompt = GENERATOR_PROMPT_CODE.format(playbook, reflection, question, context)
else:
prompt = GENERATOR_PROMPT_JSON.format(playbook, reflection, question, context)

# Code-style prompts return raw source code, so JSON mode is never requested for them
use_json_mode_call = use_json_mode and prompt_style != "code"

response, call_info = timed_llm_call(
self.api_client,
self.api_provider,
@@ -67,7 +73,7 @@
call_id=call_id,
max_tokens=self.max_tokens,
log_dir=log_dir,
use_json_mode=use_json_mode
use_json_mode=use_json_mode_call
)

# Extract bullet IDs if using retrieval and reason mode
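The net effect of this change: a `code` prompt style both switches the template and suppresses JSON mode on the underlying LLM call. A standalone sketch of just that branching (the helper name and placeholder strings are illustrative, not part of the PR):

```python
from ace.prompts.generator import GENERATOR_PROMPT_JSON, GENERATOR_PROMPT_CODE


def build_prompt(prompt_style: str, use_json_mode: bool,
                 playbook: str, reflection: str, question: str, context: str):
    """Illustrative helper mirroring the branching inside Generator.generate."""
    if prompt_style == "code":
        prompt = GENERATOR_PROMPT_CODE.format(playbook, reflection, question, context)
    else:
        prompt = GENERATOR_PROMPT_JSON.format(playbook, reflection, question, context)
    # Code answers are raw source text, so JSON mode is only kept for non-code prompts.
    return prompt, use_json_mode and prompt_style != "code"


prompt, json_mode = build_prompt("code", True, "(empty)", "(empty)", "Q?", "ctx")
assert json_mode is False
```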
34 changes: 32 additions & 2 deletions ace/prompts/generator.py
@@ -3,7 +3,7 @@
"""

# Retrieval and Reason Generator prompt that outputs bullet IDs
GENERATOR_PROMPT = """You are an analysis expert tasked with answering questions using your knowledge, a curated playbook of strategies and insights and a reflection that goes over the diagnosis of all previous mistakes made while answering the question.
GENERATOR_PROMPT_JSON = """You are an analysis expert tasked with answering questions using your knowledge, a curated playbook of strategies and insights and a reflection that goes over the diagnosis of all previous mistakes made while answering the question.

**Instructions:**
- Read the playbook carefully and apply relevant strategies, formulas, and insights
@@ -39,4 +39,34 @@
}}

---
"""
"""

# Code-only generator prompt for programming tasks
GENERATOR_PROMPT_CODE = """You are a coding expert tasked with solving programming problems using your knowledge, a curated playbook of strategies and insights, and a reflection that summarizes previous mistakes.

**Instructions:**
- Use the playbook and reflection when helpful
- Write a complete, runnable solution that follows the problem's input/output format
- Return only the final code (no explanations, no markdown, no JSON)
- Prefer C++17 when the question asks for it

**Playbook:**
{}

**Reflection:**
{}

**Question:**
{}

**Context:**
{}

**Output:**
Return only the code.

---
"""

# Backward-compatible alias
GENERATOR_PROMPT = GENERATOR_PROMPT_JSON
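Since `GENERATOR_PROMPT` now aliases the JSON variant, existing imports keep working, and both templates take the same four positional fields. A quick sketch (the sample strings are placeholders):

```python
from ace.prompts.generator import (
    GENERATOR_PROMPT,
    GENERATOR_PROMPT_CODE,
    GENERATOR_PROMPT_JSON,
)

# The alias is the same object, so old call sites are unaffected.
assert GENERATOR_PROMPT is GENERATOR_PROMPT_JSON

filled = GENERATOR_PROMPT_CODE.format(
    "(empty playbook)",              # playbook
    "(empty)",                       # reflection
    "Print the sum of N integers.",  # question (placeholder)
    "N, then N integers on stdin.",  # context (placeholder)
)
print(filled.splitlines()[0])  # "You are a coding expert tasked with solving programming problems ..."
```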
18 changes: 18 additions & 0 deletions eval/frontier-cs/data/algorithmic_test.jsonl

Large diffs are not rendered by default.

137 changes: 137 additions & 0 deletions eval/frontier-cs/data/algorithmic_train.jsonl

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions eval/frontier-cs/data/algorithmic_val.jsonl

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions eval/frontier-cs/data/sample_config.json
@@ -0,0 +1,12 @@
{
"algorithmic": {
"train_data": "./eval/frontier-cs/data/algorithmic_train.jsonl",
"val_data": "./eval/frontier-cs/data/algorithmic_val.jsonl",
"test_data": "./eval/frontier-cs/data/algorithmic_test.jsonl"
},
"research": {
"train_data": "./eval/frontier-cs/data/research_train.jsonl",
"val_data": "./eval/frontier-cs/data/research_val.jsonl",
"test_data": "./eval/frontier-cs/data/research_test.jsonl"
}
}
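A small sketch of reading this config to pick the data files for one track; how ACE actually consumes the file may differ, so treat the loader below as an assumption:

```python
import json
from pathlib import Path

# Hypothetical loader; the real ACE entry point may resolve these paths itself.
config = json.loads(Path("eval/frontier-cs/data/sample_config.json").read_text())

algorithmic = config["algorithmic"]
train_path = algorithmic["train_data"]  # ./eval/frontier-cs/data/algorithmic_train.jsonl
val_path = algorithmic["val_data"]
test_path = algorithmic["test_data"]
print(train_path, val_path, test_path)
```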
228 changes: 228 additions & 0 deletions eval/frontier-cs/data_processor.py
@@ -0,0 +1,228 @@
import hashlib
import json
import re
import sys
import threading
from pathlib import Path
from typing import List, Dict, Any, Optional


class DataProcessor:
    """
    Processor for Frontier-CS tasks.

    Note: Frontier-CS is open-ended, so the default correctness checks validate
    output format rather than solution quality. For the algorithmic track, the
    processor can optionally call the Frontier-CS judge to score code submissions.
    """

    def __init__(
        self,
        task_name: str,
        frontier_root: Optional[str] = None,
        judge_url: str = "http://localhost:8081",
        backend: Optional[str] = None,
        use_judge: bool = True,
    ):
        """
        Initialize the data processor.

        Args:
            task_name: "algorithmic" or "research"
            frontier_root: Path to the Frontier-CS repo (used to import the evaluator)
            judge_url: Algorithmic judge server URL
            backend: "docker" or "skypilot" (optional)
            use_judge: Enable judge-based scoring for the algorithmic track;
                if False, evaluation falls back to format validity only
        """
        self.task_name = task_name
        self.frontier_root = Path(frontier_root).expanduser() if frontier_root else None
        self.judge_url = judge_url
        self.backend = backend
        self.use_judge = use_judge

        self._evaluator = None
        self._evaluator_error = None
        self._score_cache: Dict[tuple, float] = {}
        self._cache_lock = threading.Lock()

    def process_task_data(self, raw_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Convert raw JSONL items into ACE's standard format.
        """
        processed_data = []

        for item in raw_data:
            context = item.get("context", "")
            target = item.get("target", "")
            metadata = item.get("metadata", {})

            if self.task_name == "algorithmic":
                problem_id = metadata.get("problem_id")
                if problem_id is None and self.use_judge:
                    raise ValueError("Missing problem_id in metadata for algorithmic judging.")
                if problem_id is not None and (not target or (isinstance(target, str) and not target.strip())):
                    target = json.dumps(
                        {"problem_id": int(problem_id)},
                        separators=(",", ":"),
                    )

            processed_item = {
                "context": context,
                "question": self._build_question(metadata),
                "target": target,
                "others": {
                    "original_context": context,
                    "task": self.task_name,
                    "metadata": metadata,
                },
            }
            processed_data.append(processed_item)

        return processed_data

    def _build_question(self, metadata: Dict[str, Any]) -> str:
        if self.task_name == "algorithmic":
            base = (
                "Solve the algorithmic problem in the context. "
                "Write a C++17 program that follows the input/output format. "
                "Return only code, no explanations. Optimize for score when applicable."
            )
            return base
        if self.task_name == "research":
            return (
                "Solve the research problem in the context. "
                "Implement the required Python API (e.g., Solution.solve). "
                "Return only code, no explanations."
            )
        raise ValueError(f"Unknown task: {self.task_name}")

    def get_generator_prompt_style(self) -> str:
        if self.task_name == "algorithmic":
            return "code"
        return "json"

    def _algorithmic_answer_is_valid(self, predicted: str) -> bool:
        if not predicted or not predicted.strip():
            return False
        return bool(re.search(r"\bint\s+main\s*\(", predicted)) and "#include" in predicted

    def _extract_problem_id(self, ground_truth: str) -> Optional[int]:
        if not ground_truth:
            return None
        if isinstance(ground_truth, str):
            try:
                payload = json.loads(ground_truth)
                if isinstance(payload, dict) and "problem_id" in payload:
                    return int(payload["problem_id"])
            except json.JSONDecodeError:
                if ground_truth.isdigit():
                    return int(ground_truth)
        return None

    def _ensure_evaluator(self) -> None:
        if not self.use_judge or self.task_name != "algorithmic":
            return
        if self._evaluator or self._evaluator_error:
            return
        try:
            if self.frontier_root:
                src_path = self.frontier_root / "src"
                if src_path.exists():
                    sys.path.insert(0, str(src_path))
            from frontier_cs import SingleEvaluator  # type: ignore[import-not-found]
        except Exception as exc:  # pragma: no cover - import-time errors
            self._evaluator_error = exc
            return

        self._evaluator = SingleEvaluator(
            backend=self.backend,
            base_dir=self.frontier_root,
            judge_url=self.judge_url,
        )

    def _cache_key(self, problem_id: int, code: str) -> tuple:
        code_hash = hashlib.sha256(code.encode("utf-8")).hexdigest()
        return (str(problem_id), code_hash, self.backend or "")

    def _score_algorithmic(self, predicted: str, problem_id: int) -> float:
        if not self._algorithmic_answer_is_valid(predicted):
            return 0.0

        key = self._cache_key(problem_id, predicted)
        with self._cache_lock:
            if key in self._score_cache:
                return self._score_cache[key]

        self._ensure_evaluator()
        if not self._evaluator:
            raise RuntimeError(
                "Frontier-CS evaluator not available. "
                "Set --frontier_root to the Frontier-CS repo and ensure dependencies are installed."
            )

        result = self._evaluator.evaluate(
            "algorithmic",
            problem_id=problem_id,
            code=predicted,
            backend=self.backend,
        )

        score = 0.0
        if result.success:
            score = result.score if result.score is not None else 0.0

        with self._cache_lock:
            self._score_cache[key] = score

        return score

    def _research_answer_is_valid(
        self,
        predicted: str,
        ground_truth: Optional[str] = None,
    ) -> bool:
        _ = ground_truth
        if not predicted or not predicted.strip():
            return False
        return "class Solution" in predicted and re.search(r"\bdef\s+solve\s*\(", predicted) is not None

    def answer_is_correct(self, predicted: str, ground_truth: str) -> bool:
        """
        Format-based correctness for open-ended tasks.
        """
        if self.task_name == "algorithmic":
            # Even with judge scoring enabled, correctness here only means the output looks like a C++ program.
            return self._algorithmic_answer_is_valid(predicted)
        if self.task_name == "research":
            return self._research_answer_is_valid(predicted, ground_truth)

        raise ValueError(f"Unknown task: {self.task_name}")

    def evaluate_accuracy(self, out: List[str], target: List[str]) -> float:
        """
        Compute format accuracy, or the average judge score when judging is enabled.
        """
        if len(out) != len(target):
            raise ValueError("Input lists 'out' and 'target' must have the same length.")

        if not out:
            return 0.0

        if self.task_name == "algorithmic" and self.use_judge:
            scores = []
            for predicted, ground_truth in zip(out, target):
                problem_id = self._extract_problem_id(ground_truth)
                if problem_id is None:
                    scores.append(0.0)
                    continue
                scores.append(self._score_algorithmic(predicted, problem_id))
            return sum(scores) / len(scores) if scores else 0.0

        correct_count = 0
        for predicted, ground_truth in zip(out, target):
            if self.answer_is_correct(predicted, ground_truth):
                correct_count += 1

        return correct_count / len(out)
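For reference, a usage sketch of the processor with the judge disabled (so no judge server or Frontier-CS checkout is needed); the sample item and candidate program below are made up:

```python
# Hypothetical sample; with use_judge=False only format validity is checked.
processor = DataProcessor(task_name="algorithmic", use_judge=False)

raw = [{
    "context": "Given N integers, print their sum.",
    "target": "",
    "metadata": {"problem_id": 1},
}]
samples = processor.process_task_data(raw)
print(samples[0]["target"])  # {"problem_id":1} -- filled in from metadata

candidate = (
    "#include <iostream>\n"
    "int main() { long long n, x, s = 0; std::cin >> n; "
    "while (n--) { std::cin >> x; s += x; } std::cout << s; }\n"
)
print(processor.answer_is_correct(candidate, samples[0]["target"]))      # True (looks like a C++ program)
print(processor.evaluate_accuracy([candidate], [samples[0]["target"]]))  # 1.0 (format accuracy, not a judge score)
```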
