Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 126 additions & 0 deletions tasks/task_readme_release_asset_sync.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
id: task_readme_release_asset_sync
name: Sync README installation instructions with actual release assets
category: coding
grading_type: automated
timeout_seconds: 120
workspace_files:
- README.md
- release-assets.json

## Prompt

The workspace contains a README.md and a release-assets.json file.

Update README.md so that its installation instructions match the actual release assets listed in release-assets.json.

Requirements:
- Keep the existing README structure and tone.
- Only edit the installation section.
- Use only filenames that actually exist in release-assets.json.
- Make the instructions explicit for both VS Code and Cursor when applicable.
- Do not invent assets, commands, or package names.
- Do not modify release-assets.json.

## Expected Behavior

The agent should inspect release-assets.json to determine which distributable files actually exist, then update the installation section of README.md so the documented installation flow matches those assets.

## Grading Criteria

- README.md was modified
- release-assets.json was not modified
- Installation section references only assets present in release-assets.json
- No removed/nonexistent asset is still referenced in the installation section
- README structure outside the installation section remains unchanged
- Instructions explicitly cover the available editor targets when supported by the listed assets

## Automated Checks

def grade(transcript: list, workspace_path: str) -> dict:
    """Score an attempt at syncing README.md with release-assets.json.

    Args:
        transcript: Recorded agent events. Only dict events with
            ``type == "message"`` are inspected; their ``message.content``
            items of ``type == "toolCall"`` are scanned for writes to
            release-assets.json. Anything with a different shape is ignored.
        workspace_path: Directory holding README.md, README.original.md and
            release-assets.json.

    Returns:
        A dict mapping each grading criterion to 0.0 or 1.0. Every criterion
        is 0.0 when a required file is missing, unreadable, or (for the asset
        list) not valid JSON.
    """
    from pathlib import Path
    import json
    import re

    workspace = Path(workspace_path)
    readme = workspace / "README.md"
    assets_file = workspace / "release-assets.json"
    original_readme = workspace / "README.original.md"

    scores = {
        "README.md was modified": 0.0,
        "release-assets.json was not modified": 0.0,
        "Installation section references only assets present in release-assets.json": 0.0,
        "No removed/nonexistent asset is still referenced in the installation section": 0.0,
        "README structure outside the installation section remains unchanged": 0.0,
        "Instructions explicitly cover the available editor targets when supported by the listed assets": 0.0,
    }

    if not readme.exists() or not assets_file.exists() or not original_readme.exists():
        return scores

    # A corrupted or undecodable file is scored like a missing one instead of
    # crashing the grader (JSONDecodeError and UnicodeDecodeError are both
    # ValueError subclasses).
    try:
        new_readme = readme.read_text(encoding="utf-8")
        old_readme = original_readme.read_text(encoding="utf-8")
        assets = json.loads(assets_file.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        return scores

    # Only string names from a top-level list of objects count as assets.
    asset_names = set()
    if isinstance(assets, list):
        for entry in assets:
            if isinstance(entry, dict) and isinstance(entry.get("name"), str):
                asset_names.add(entry["name"])

    if new_readme != old_readme:
        scores["README.md was modified"] = 1.0

    # release-assets.json unchanged: inferred heuristically from the transcript.
    # A tool call that mentions the file together with a write-like token is
    # treated as a modification; plain reads are allowed.
    write_tokens = ("write", "replace", "update", "edit", "create")
    assets_modified = False
    for event in transcript or ():
        if not isinstance(event, dict) or event.get("type") != "message":
            continue
        msg = event.get("message")
        content = msg.get("content") if isinstance(msg, dict) else None
        if not isinstance(content, list):
            # e.g. plain-string content: there are no tool calls to inspect
            continue
        for item in content:
            if not isinstance(item, dict) or item.get("type") != "toolCall":
                continue
            raw = json.dumps(item, ensure_ascii=False)
            if "release-assets.json" not in raw:
                continue
            lowered = raw.lower()
            if any(token in lowered for token in write_tokens):
                assets_modified = True
    if not assets_modified:
        scores["release-assets.json was not modified"] = 1.0

    # One pattern serves both extraction and removal of the section: from the
    # "## Installation" heading up to the next "## " heading or end of text.
    install_re = re.compile(r"(?ims)^##\s+Installation\b.*?(?=^##\s+|\Z)")
    vsix_re = re.compile(r"[A-Za-z0-9._-]+\.vsix")

    def extract_install_section(text: str):
        """Return the first installation section of *text*, or "" if absent."""
        m = install_re.search(text)
        return m.group(0) if m else ""

    def strip_install_section(text: str):
        """Return *text* with every installation section removed."""
        return install_re.sub("", text).strip()

    new_install = extract_install_section(new_readme)
    old_install = extract_install_section(old_readme)

    # collect filenames mentioned in install section
    mentioned_assets = set(vsix_re.findall(new_install))

    # At least one asset must be named, and every named asset must exist.
    if mentioned_assets and mentioned_assets <= asset_names:
        scores["Installation section references only assets present in release-assets.json"] = 1.0

    # Assets the old README named that are not in the release list must be gone.
    nonexistent_old_assets = set(vsix_re.findall(old_install)) - asset_names
    if not any(name in new_install for name in nonexistent_old_assets):
        scores["No removed/nonexistent asset is still referenced in the installation section"] = 1.0

    if strip_install_section(new_readme) == strip_install_section(old_readme):
        scores["README structure outside the installation section remains unchanged"] = 1.0

    # An editor target counts as available when some asset name contains it.
    has_vscode_asset = any("vscode" in name.lower() for name in asset_names)
    has_cursor_asset = any("cursor" in name.lower() for name in asset_names)

    vscode_ok = (not has_vscode_asset) or bool(re.search(r"(?i)\bVS Code\b", new_install))
    cursor_ok = (not has_cursor_asset) or bool(re.search(r"(?i)\bCursor\b", new_install))

    if vscode_ok and cursor_ok:
        scores["Instructions explicitly cover the available editor targets when supported by the listed assets"] = 1.0

    return scores
11 changes: 11 additions & 0 deletions tasks/task_readme_release_asset_sync/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Kilo Code Extension

## Installation

1. Download the latest `.vsix` file from releases
2. Run:
code --install-extension kilo-vscode-OLD.vsix

## Usage

Open the command palette and run "Kilo: Start"
11 changes: 11 additions & 0 deletions tasks/task_readme_release_asset_sync/README.original.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Kilo Code Extension

## Installation

1. Download the latest `.vsix` file from releases
2. Run:
code --install-extension kilo-vscode-OLD.vsix

## Usage

Open the command palette and run "Kilo: Start"
8 changes: 8 additions & 0 deletions tasks/task_readme_release_asset_sync/release-assets.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
[
{
"name": "kilo-vscode-1.0.0.vsix"
},
{
"name": "kilo-cursor-1.0.0.vsix"
}
]
99 changes: 99 additions & 0 deletions tasks/task_sum_function_off_by_one.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
id: task_sum_function_off_by_one
name: Fix off-by-one error in sum function
category: coding
grading_type: automated
timeout_seconds: 120
workspace_files:
- sum.py
- test_sum.py

## Prompt

The workspace contains a function `sum_n(n)` in `sum.py` that is supposed to return the sum of integers from 1 to n (inclusive).

However, the implementation is incorrect and fails the provided tests.

Fix the implementation so that all tests pass.

Requirements:
- Modify only `sum.py`
- Do not change `test_sum.py`
- Do not introduce new functions
- Keep the function signature unchanged

## Expected Behavior

The function `sum_n(n)` should return:

1 + 2 + ... + n

Examples:
- sum_n(1) == 1
- sum_n(3) == 6
- sum_n(5) == 15

## Grading Criteria

- sum.py was modified
- test_sum.py was not modified
- All tests pass
- Function returns correct results for multiple inputs

## Automated Checks

def grade(transcript: list, workspace_path: str) -> dict:
    """Score an attempt at fixing the off-by-one bug in ``sum_n``.

    Args:
        transcript: Recorded agent events. Only dict events with
            ``type == "message"`` are inspected; their ``message.content``
            items of ``type == "toolCall"`` are scanned for write-like
            operations on sum.py and test_sum.py. Other shapes are ignored.
        workspace_path: Directory holding sum.py and test_sum.py.

    Returns:
        A dict mapping each grading criterion to 0.0 or 1.0. Every criterion
        is 0.0 when either workspace file is missing.
    """
    import importlib.util
    import re
    from pathlib import Path

    workspace = Path(workspace_path)
    sum_file = workspace / "sum.py"
    test_file = workspace / "test_sum.py"

    scores = {
        "sum.py was modified": 0.0,
        "test_sum.py was not modified": 0.0,
        "All tests pass": 0.0,
        "Function returns correct results for multiple inputs": 0.0,
    }

    if not sum_file.exists() or not test_file.exists():
        return scores

    # Check modification via transcript heuristic: a tool call that names a
    # file together with a write-like token counts as a write to that file.
    write_tokens = ("write", "edit", "replace")
    # "sum.py" is a suffix of "test_sum.py"; the lookbehind keeps a write to
    # the test file from being credited as a write to sum.py.
    sum_ref = re.compile(r"(?<![A-Za-z0-9_])sum\.py")
    sum_written = False
    test_written = False
    for event in transcript or ():
        if not isinstance(event, dict) or event.get("type") != "message":
            continue
        msg = event.get("message")
        content = msg.get("content") if isinstance(msg, dict) else None
        if not isinstance(content, list):
            # e.g. plain-string content: there are no tool calls to inspect
            continue
        for item in content:
            if not isinstance(item, dict) or item.get("type") != "toolCall":
                continue
            raw = str(item)
            lowered = raw.lower()
            if not any(token in lowered for token in write_tokens):
                continue
            if sum_ref.search(raw):
                sum_written = True
            if "test_sum.py" in raw:
                test_written = True

    if sum_written:
        scores["sum.py was modified"] = 1.0

    # Ensure test file untouched (same heuristic, applied to test_sum.py).
    if not test_written:
        scores["test_sum.py was not modified"] = 1.0

    cases = ((1, 1), (3, 6), (5, 15), (10, 55))
    try:
        # Load module dynamically. NOTE: this executes agent-authored code.
        # Import-time failures (syntax errors, exceptions, sys.exit) must not
        # crash the grader, so loading happens inside the try as well.
        spec = importlib.util.spec_from_file_location("sum_module", sum_file)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        passed = all(module.sum_n(n) == expected for n, expected in cases)
    except (Exception, SystemExit):
        # Any failure in the submitted code simply earns no credit.
        passed = False

    if passed:
        scores["All tests pass"] = 1.0
        scores["Function returns correct results for multiple inputs"] = 1.0

    return scores
5 changes: 5 additions & 0 deletions tasks/task_sum_function_off_by_one/sum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
def sum_n(n):
    """Intended contract: return 1 + 2 + ... + n (inclusive).

    NOTE: this is the starting fixture for the off-by-one task; the loop
    below stops short of ``n`` on purpose and is what the task asks to be
    fixed.
    """
    total = 0
    for i in range(1, n):  # BUG: should include n
        total += i
    return total
7 changes: 7 additions & 0 deletions tasks/task_sum_function_off_by_one/test_sum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from sum import sum_n

def test_sum():
    """Check sum_n against the expected inclusive sums for a few inputs."""
    # Maps n to the expected value of 1 + 2 + ... + n; checked in this order.
    expected_by_n = {1: 1, 3: 6, 5: 15, 10: 55}
    for n, expected in expected_by_n.items():
        assert sum_n(n) == expected