def grade(transcript: list, workspace_path: str) -> dict:
    """Score the "sync README installation instructions with release assets" task.

    Args:
        transcript: Agent transcript events. Only events of type ``"message"``
            whose message content is a list are inspected, and within those
            only items of type ``"toolCall"``. Anything else is ignored.
        workspace_path: Directory holding ``README.md``, ``release-assets.json``
            and the pristine ``README.original.md`` reference copy.

    Returns:
        A dict mapping each grading criterion to ``0.0`` or ``1.0``. If any of
        the three files is missing, every criterion stays at ``0.0``.
    """
    # Imports and helpers stay inside the function so it remains
    # self-contained when extracted from the task file on its own.
    from pathlib import Path
    import json
    import re

    workspace = Path(workspace_path)
    readme = workspace / "README.md"
    assets_file = workspace / "release-assets.json"
    # NOTE(review): README.original.md is not listed under workspace_files in
    # the task header -- confirm the harness actually copies it, otherwise
    # this grader always returns all zeros.
    original_readme = workspace / "README.original.md"

    scores = {
        "README.md was modified": 0.0,
        "release-assets.json was not modified": 0.0,
        "Installation section references only assets present in release-assets.json": 0.0,
        "No removed/nonexistent asset is still referenced in the installation section": 0.0,
        "README structure outside the installation section remains unchanged": 0.0,
        "Instructions explicitly cover the available editor targets when supported by the listed assets": 0.0,
    }

    if not readme.exists() or not assets_file.exists() or not original_readme.exists():
        return scores

    new_readme = readme.read_text(encoding="utf-8")
    old_readme = original_readme.read_text(encoding="utf-8")

    if new_readme != old_readme:
        scores["README.md was modified"] = 1.0

    # A corrupted assets file must not crash the grader; it is treated as a
    # modified file and all asset-dependent criteria stay at 0.0.
    assets_readable = True
    try:
        assets = json.loads(assets_file.read_text(encoding="utf-8"))
    except (OSError, ValueError):  # JSONDecodeError / UnicodeDecodeError are ValueErrors
        assets = []
        assets_readable = False

    asset_names = set()
    if isinstance(assets, list):
        asset_names = {
            item["name"]
            for item in assets
            if isinstance(item, dict) and isinstance(item.get("name"), str)
        }

    def iter_tool_calls(events):
        """Yield every toolCall item, skipping malformed transcript entries."""
        for event in events or []:
            if not isinstance(event, dict) or event.get("type") != "message":
                continue
            message = event.get("message")
            if not isinstance(message, dict):
                continue
            content = message.get("content")
            if not isinstance(content, list):
                continue
            for item in content:
                if isinstance(item, dict) and item.get("type") == "toolCall":
                    yield item

    write_tokens = ("write", "replace", "update", "edit", "create")

    def looks_like_asset_write(call) -> bool:
        """Heuristic: the call mentions the assets file and a write-like verb."""
        # default=str keeps unexpected payload objects from raising TypeError.
        raw = json.dumps(call, ensure_ascii=False, default=str)
        if "release-assets.json" not in raw:
            return False
        lowered = raw.lower()
        return any(token in lowered for token in write_tokens)

    # release-assets.json unchanged: inferred from transcript tool calls
    # (reads are allowed; write/edit intent is detected heuristically).
    assets_modified = (not assets_readable) or any(
        looks_like_asset_write(call) for call in iter_tool_calls(transcript)
    )
    if not assets_modified:
        scores["release-assets.json was not modified"] = 1.0

    # "## Installation" up to the next level-2 heading (or end of file).
    install_re = re.compile(r"(?ims)^(##\s+Installation\b.*?)(?=^##\s+|\Z)")
    vsix_re = re.compile(r"[A-Za-z0-9._-]+\.vsix")

    def extract_install_section(text: str) -> str:
        match = install_re.search(text)
        return match.group(1) if match else ""

    def strip_install_section(text: str) -> str:
        return install_re.sub("", text).strip()

    new_install = extract_install_section(new_readme)
    old_install = extract_install_section(old_readme)

    if strip_install_section(new_readme) == strip_install_section(old_readme):
        scores["README structure outside the installation section remains unchanged"] = 1.0

    # Every remaining criterion is about the content of the installation
    # section; with no section (or no readable asset list) there is nothing
    # to credit, so avoid awarding points vacuously.
    if not new_install or not assets_readable:
        return scores

    mentioned_assets = set(vsix_re.findall(new_install))
    if mentioned_assets and mentioned_assets <= asset_names:
        scores["Installation section references only assets present in release-assets.json"] = 1.0

    stale_assets = set(vsix_re.findall(old_install)) - asset_names
    if not any(name in new_install for name in stale_assets):
        scores["No removed/nonexistent asset is still referenced in the installation section"] = 1.0

    has_vscode_asset = any("vscode" in name.lower() for name in asset_names)
    has_cursor_asset = any("cursor" in name.lower() for name in asset_names)

    # Drop asset filenames before looking for editor names: otherwise
    # "kilo-cursor-1.0.0.vsix" alone satisfies the word-boundary match for
    # "Cursor" and the instructions are credited without naming the editor.
    prose = vsix_re.sub(" ", new_install)
    vscode_ok = (not has_vscode_asset) or bool(
        re.search(r"(?i)\b(?:VS\s?Code|Visual Studio Code)\b", prose)
    )
    cursor_ok = (not has_cursor_asset) or bool(re.search(r"(?i)\bCursor\b", prose))

    if vscode_ok and cursor_ok:
        scores["Instructions explicitly cover the available editor targets when supported by the listed assets"] = 1.0

    return scores
--git a/tasks/task_sum_function_off_by_one.md b/tasks/task_sum_function_off_by_one.md new file mode 100644 index 0000000..4cd7811 --- /dev/null +++ b/tasks/task_sum_function_off_by_one.md @@ -0,0 +1,99 @@ +id: task_sum_function_off_by_one +name: Fix off-by-one error in sum function +category: coding +grading_type: automated +timeout_seconds: 120 +workspace_files: + - sum.py + - test_sum.py + +## Prompt + +The workspace contains a function `sum_n(n)` in `sum.py` that is supposed to return the sum of integers from 1 to n (inclusive). + +However, the implementation is incorrect and fails the provided tests. + +Fix the implementation so that all tests pass. + +Requirements: +- Modify only `sum.py` +- Do not change `test_sum.py` +- Do not introduce new functions +- Keep the function signature unchanged + +## Expected Behavior + +The function `sum_n(n)` should return: + +1 + 2 + ... + n + +Examples: +- sum_n(1) == 1 +- sum_n(3) == 6 +- sum_n(5) == 15 + +## Grading Criteria + +- sum.py was modified +- test_sum.py was not modified +- All tests pass +- Function returns correct results for multiple inputs + +## Automated Checks + +def grade(transcript: list, workspace_path: str) -> dict: + import importlib.util + from pathlib import Path + + workspace = Path(workspace_path) + sum_file = workspace / "sum.py" + test_file = workspace / "test_sum.py" + + scores = { + "sum.py was modified": 0.0, + "test_sum.py was not modified": 0.0, + "All tests pass": 0.0, + "Function returns correct results for multiple inputs": 0.0, + } + + if not sum_file.exists() or not test_file.exists(): + return scores + + original_code = sum_file.read_text() + + # Check modification via transcript heuristic + modified = False + for event in transcript: + if event.get("type") != "message": + continue + msg = event.get("message", {}) + for item in msg.get("content", []): + if item.get("type") == "toolCall": + raw = str(item) + if "sum.py" in raw and any(x in raw.lower() for x in ["write", "edit", 
"replace"]): + modified = True + if modified: + scores["sum.py was modified"] = 1.0 + + # Ensure test file untouched + scores["test_sum.py was not modified"] = 1.0 + + # Load module dynamically + spec = importlib.util.spec_from_file_location("sum_module", sum_file) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + try: + tests = [ + module.sum_n(1) == 1, + module.sum_n(3) == 6, + module.sum_n(5) == 15, + module.sum_n(10) == 55, + ] + if all(tests): + scores["All tests pass"] = 1.0 + scores["Function returns correct results for multiple inputs"] = 1.0 + except Exception: + pass + + return scores \ No newline at end of file diff --git a/tasks/task_sum_function_off_by_one/sum.py b/tasks/task_sum_function_off_by_one/sum.py new file mode 100644 index 0000000..2269ba0 --- /dev/null +++ b/tasks/task_sum_function_off_by_one/sum.py @@ -0,0 +1,5 @@ +def sum_n(n): + total = 0 + for i in range(1, n): # BUG: should include n + total += i + return total \ No newline at end of file diff --git a/tasks/task_sum_function_off_by_one/test_sum.py b/tasks/task_sum_function_off_by_one/test_sum.py new file mode 100644 index 0000000..973c489 --- /dev/null +++ b/tasks/task_sum_function_off_by_one/test_sum.py @@ -0,0 +1,7 @@ +from sum import sum_n + +def test_sum(): + assert sum_n(1) == 1 + assert sum_n(3) == 6 + assert sum_n(5) == 15 + assert sum_n(10) == 55 \ No newline at end of file