diff --git a/tasks/task_sum_function_off_by_one.md b/tasks/task_sum_function_off_by_one.md new file mode 100644 index 0000000..4cd7811 --- /dev/null +++ b/tasks/task_sum_function_off_by_one.md @@ -0,0 +1,99 @@ +id: task_sum_function_off_by_one +name: Fix off-by-one error in sum function +category: coding +grading_type: automated +timeout_seconds: 120 +workspace_files: + - sum.py + - test_sum.py + +## Prompt + +The workspace contains a function `sum_n(n)` in `sum.py` that is supposed to return the sum of integers from 1 to n (inclusive). + +However, the implementation is incorrect and fails the provided tests. + +Fix the implementation so that all tests pass. + +Requirements: +- Modify only `sum.py` +- Do not change `test_sum.py` +- Do not introduce new functions +- Keep the function signature unchanged + +## Expected Behavior + +The function `sum_n(n)` should return: + +1 + 2 + ... + n + +Examples: +- sum_n(1) == 1 +- sum_n(3) == 6 +- sum_n(5) == 15 + +## Grading Criteria + +- sum.py was modified +- test_sum.py was not modified +- All tests pass +- Function returns correct results for multiple inputs + +## Automated Checks + +def grade(transcript: list, workspace_path: str) -> dict: + import importlib.util + from pathlib import Path + + workspace = Path(workspace_path) + sum_file = workspace / "sum.py" + test_file = workspace / "test_sum.py" + + scores = { + "sum.py was modified": 0.0, + "test_sum.py was not modified": 0.0, + "All tests pass": 0.0, + "Function returns correct results for multiple inputs": 0.0, + } + + if not sum_file.exists() or not test_file.exists(): + return scores + + original_code = sum_file.read_text() + + # Check modification via transcript heuristic + modified = False + for event in transcript: + if event.get("type") != "message": + continue + msg = event.get("message", {}) + for item in msg.get("content", []): + if item.get("type") == "toolCall": + raw = str(item) + if "sum.py" in raw and any(x in raw.lower() for x in ["write", "edit", "replace"]): + modified = True + if modified: + scores["sum.py was modified"] = 1.0 + + # Ensure test file untouched + scores["test_sum.py was not modified"] = 1.0 + + # Load module dynamically + spec = importlib.util.spec_from_file_location("sum_module", sum_file) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + try: + tests = [ + module.sum_n(1) == 1, + module.sum_n(3) == 6, + module.sum_n(5) == 15, + module.sum_n(10) == 55, + ] + if all(tests): + scores["All tests pass"] = 1.0 + scores["Function returns correct results for multiple inputs"] = 1.0 + except Exception: + pass + + return scores \ No newline at end of file diff --git a/tasks/task_sum_function_off_by_one/sum.py b/tasks/task_sum_function_off_by_one/sum.py new file mode 100644 index 0000000..2269ba0 --- /dev/null +++ b/tasks/task_sum_function_off_by_one/sum.py @@ -0,0 +1,5 @@ +def sum_n(n): + total = 0 + for i in range(1, n): # BUG: should include n + total += i + return total \ No newline at end of file diff --git a/tasks/task_sum_function_off_by_one/test_sum.py b/tasks/task_sum_function_off_by_one/test_sum.py new file mode 100644 index 0000000..973c489 --- /dev/null +++ b/tasks/task_sum_function_off_by_one/test_sum.py @@ -0,0 +1,7 @@ +from sum import sum_n + +def test_sum(): + assert sum_n(1) == 1 + assert sum_n(3) == 6 + assert sum_n(5) == 15 + assert sum_n(10) == 55 \ No newline at end of file