diff --git a/README.md b/README.md index 8eb0050..f0ebf4a 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,8 @@ reasoning in AI. Following our * *IMO-GradingBench*: A dataset of 1000 human gradings to advance automatic evaluation. + + ## [Aletheia](aletheia/README.md) A math research agent, powered by Gemini Deep Think, that can iteratively generate, verify, and revise solutions. See [paper](aletheia/Aletheia.pdf). diff --git a/imobench-pylib/.gitignore b/imobench-pylib/.gitignore new file mode 100644 index 0000000..26c85b8 --- /dev/null +++ b/imobench-pylib/.gitignore @@ -0,0 +1,137 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +Pipfile.lock + +# PEP 582 +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ 
+ +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# macOS +.DS_Store + +# Project specific +*.csv.gz +*.csv.bz2 diff --git a/imobench-pylib/CONTRIBUTING.md b/imobench-pylib/CONTRIBUTING.md new file mode 100644 index 0000000..eaa3916 --- /dev/null +++ b/imobench-pylib/CONTRIBUTING.md @@ -0,0 +1,178 @@ +# Contributing to IMO Bench Python Library + +Thank you for your interest in contributing to the IMO Bench Python library! + +## Repository Structure + +This library (`imobench-pylib`) is part of the larger [Superhuman Reasoning](https://github.com/google-deepmind/superhuman) repository by Google DeepMind. + +## Types of Contributions + +### Bug Reports + +If you find a bug, please open an issue with: +- Clear description of the problem +- Minimal reproducible example +- Expected vs actual behavior +- Python version and environment details +- Relevant error messages and stack traces + +### Feature Requests + +For new features: +- Describe the use case +- Explain why it would benefit users +- Provide example API usage if possible + +### Code Contributions + +1. **Fork and Clone** + ```bash + git clone https://github.com/YOUR-USERNAME/superhuman.git + cd superhuman/imobench-pylib + ``` + +2. **Set Up Development Environment** + ```bash + python -m venv venv + source venv/bin/activate # On Windows: venv\Scripts\activate + pip install -e ".[dev]" + ``` + +3. **Create a Branch** + ```bash + git checkout -b feature/your-feature-name + ``` + +4. **Make Changes** + - Follow existing code style + - Use type hints + - Add docstrings + - Keep functions focused and testable + +5. **Write Tests** + ```bash + # Add tests in tests/ + pytest tests/ + ``` + +6. **Check Code Quality** + ```bash + # Format code + black src/ tests/ + + # Type checking + mypy src/ + + # Linting + ruff check src/ tests/ + ``` + +7. **Run All Tests** + ```bash + pytest tests/ -v --cov=imobench + ``` + +8. **Commit and Push** + ```bash + git add . 
+ git commit -m "Add: brief description of changes" + git push origin feature/your-feature-name + ``` + +9. **Open a Pull Request** + - Describe your changes clearly + - Link any related issues + - Ensure CI passes + +## Development Guidelines + +### Code Style + +- Follow PEP 8 +- Use type hints for all functions +- Maximum line length: 100 characters +- Use descriptive variable names + +### Testing + +- Write tests for new functionality +- Maintain or improve test coverage +- Test edge cases and error conditions +- Use pytest fixtures for common setup + +### Documentation + +- Add docstrings to all public functions/classes +- Update README if adding new features +- Add examples for new functionality +- Keep docstrings clear and concise + +### Type Hints + +```python +from typing import Optional, List + +def load_data( + category: Optional[str] = None, + validate: bool = True +) -> List[Problem]: + """Load problems with optional filtering. + + Args: + category: Filter by category + validate: Enable validation + + Returns: + List of Problem objects + """ + pass +``` + +## Project Structure + +``` +imobench-pylib/ +├── src/imobench/ # Source code +│ ├── __init__.py # Public API +│ ├── types.py # Type definitions +│ ├── loader.py # Data loading +│ ├── validators.py # Validation logic +│ └── exceptions.py # Custom exceptions +├── tests/ # Test suite +├── examples/ # Usage examples +├── docs/ # Documentation +└── setup.py # Package configuration +``` + +## Commit Message Guidelines + +Use clear, descriptive commit messages: + +- `Add: new feature or functionality` +- `Fix: bug fix` +- `Update: modify existing functionality` +- `Refactor: code restructuring` +- `Docs: documentation changes` +- `Test: add or modify tests` +- `Chore: maintenance tasks` + +Example: +``` +Add: lazy loading support for gradingbench + +- Implement iterator-based loading +- Add lazy parameter to load_gradingbench() +- Update tests and documentation +``` + +## Questions? 
+ +For questions about: +- **Library usage**: Open a GitHub issue +- **Dataset content**: See main repository +- **Research paper**: Check IMO Bench website + +## License + +By contributing, you agree that your contributions will be licensed under the Apache License 2.0. diff --git a/imobench-pylib/LICENSE b/imobench-pylib/LICENSE new file mode 100644 index 0000000..f11af38 --- /dev/null +++ b/imobench-pylib/LICENSE @@ -0,0 +1,170 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character + arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), + even if such Contributor has been advised of the possibility of such damages. 
+ + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS diff --git a/imobench-pylib/MANIFEST.in b/imobench-pylib/MANIFEST.in new file mode 100644 index 0000000..689ecb2 --- /dev/null +++ b/imobench-pylib/MANIFEST.in @@ -0,0 +1,17 @@ +include README.md +include LICENSE +include pyproject.toml +include setup.py + +recursive-include src *.py +recursive-include tests *.py +recursive-include examples *.py +recursive-include examples *.md + +exclude .gitignore +exclude .github + +global-exclude __pycache__ +global-exclude *.py[cod] +global-exclude *.so +global-exclude .DS_Store diff --git a/imobench-pylib/README.md b/imobench-pylib/README.md new file mode 100644 index 0000000..23d8916 --- /dev/null +++ b/imobench-pylib/README.md @@ -0,0 +1,306 @@ +# IMO Bench Python Library + +A Python library for loading and working with the **IMO Bench** mathematical reasoning benchmarks from Google DeepMind. + +## Overview + +IMO Bench is a suite of advanced benchmarks designed to evaluate robust mathematical reasoning in AI systems. This library provides a simple, type-safe interface for loading and working with the benchmark datasets. 
+ +### Datasets + +- **IMO-AnswerBench**: 400 challenging short-answer problems +- **IMO-ProofBench**: 60 expert-vetted proof-based problems +- **IMO-GradingBench**: 1000 human gradings for automatic evaluation development + +## Installation + +### From Source + +```bash +cd imobench-pylib +pip install -e . +``` + +### Development Installation + +```bash +pip install -e ".[dev]" +``` + +## Quick Start + +```python +from imobench import load_answerbench, load_proofbench, load_gradingbench + +# Load all short-answer problems +problems = load_answerbench() +print(f"Loaded {len(problems)} problems") + +# Filter by category +algebra_problems = load_answerbench(category="Algebra") +for problem in algebra_problems[:3]: + print(f"{problem.problem_id}: {problem.subcategory}") + +# Load proof-based problems +proof_problems = load_proofbench(level="IMO-easy") + +# Load grading data (use lazy loading for efficiency) +for grading in load_gradingbench(min_points=8, lazy=True): + print(f"Problem {grading.problem_id}: {grading.points}/10 points") + break # Process one at a time +``` + +## Usage Examples + +### Basic Loading + +```python +from imobench import load_answerbench + +# Load all problems +problems = load_answerbench() + +# Access problem fields +problem = problems[0] +print(problem.problem_id) # "imo-bench-algebra-001" +print(problem.category) # "Algebra" +print(problem.subcategory) # "Operation" +print(problem.problem) # LaTeX problem statement +print(problem.short_answer) # Expected answer +print(problem.source) # "IMO Shortlist 2021" +``` + +### Filtering + +```python +# Filter by category +geometry_problems = load_answerbench(category="Geometry") + +# Filter by source +imo_2021 = load_answerbench(source="IMO Shortlist 2021") + +# Multiple filters +algebra_inequalities = load_answerbench( + category="Algebra", + subcategory="Inequality" +) +``` + +### Working with ProofBench + +```python +from imobench import load_proofbench + +# Load by difficulty level 
+easy_problems = load_proofbench(level="IMO-easy") + +# Access detailed fields +problem = easy_problems[0] +print(problem.solution) # Reference solution +print(problem.grading_guidelines) # Grading criteria +``` + +### Efficient GradingBench Processing + +```python +from imobench import load_gradingbench + +# Load specific problem's gradings +gradings = load_gradingbench(problem_id="PB-Basic-001") + +# Filter by score range +high_quality = load_gradingbench(min_points=8) + +# Lazy loading for memory efficiency (recommended for large datasets) +for grading in load_gradingbench(lazy=True): + # Process one at a time + analyze_response(grading.response, grading.points) +``` + +### Custom Data Directory + +```python +from imobench import IMOBenchLoader +from pathlib import Path + +# Specify custom data location +loader = IMOBenchLoader(data_dir=Path("/path/to/imobench/data")) +problems = loader.load_answerbench() +``` + +### Type-Safe Access + +All data types are immutable dataclasses with full type hints: + +```python +from imobench.types import AnswerBenchProblem + +problem: AnswerBenchProblem = problems[0] +# IDE will provide autocomplete and type checking +``` + +## Data Schema + +### AnswerBenchProblem + +| Field | Type | Description | +|-------|------|-------------| +| `problem_id` | `str` | Unique identifier (e.g., "imo-bench-algebra-001") | +| `problem` | `str` | Problem statement in LaTeX format | +| `short_answer` | `str` | Expected answer | +| `category` | `str` | Main category (Algebra, Combinatorics, Geometry, Number theory) | +| `subcategory` | `str` | Specific subcategory | +| `source` | `str` | Original source of the problem | + +### ProofBenchProblem + +| Field | Type | Description | +|-------|------|-------------| +| `problem_id` | `str` | Unique identifier (e.g., "PB-Basic-001") | +| `problem` | `str` | Problem statement | +| `solution` | `str` | Reference solution | +| `grading_guidelines` | `str` | Guidelines for partial credit | +| `category` | 
`str` | Main category | +| `level` | `str` | Difficulty level (IMO-easy, pre-IMO, etc.) | +| `short_answer` | `str` | Brief expected answer | +| `source` | `str` | Original source | + +### GradingBenchEntry + +| Field | Type | Description | +|-------|------|-------------| +| `grading_id` | `str` | Unique identifier (e.g., "GB-0001") | +| `problem_id` | `str` | Reference to problem | +| `problem` | `str` | Problem statement | +| `solution` | `str` | Reference solution | +| `grading_guidelines` | `str` | Grading criteria | +| `response` | `str` | The response being graded | +| `points` | `int` | Points awarded (0-10) | +| `reward` | `float` | Reward value | +| `problem_source` | `str` | Original source | + +## API Reference + +### Loading Functions + +#### `load_answerbench(**kwargs) -> list[AnswerBenchProblem]` + +Load IMO-AnswerBench dataset. + +**Parameters:** +- `category` (Optional[str]): Filter by category +- `subcategory` (Optional[str]): Filter by subcategory +- `source` (Optional[str]): Filter by source +- `validate` (bool): Enable validation (default: True) + +#### `load_proofbench(**kwargs) -> list[ProofBenchProblem]` + +Load IMO-ProofBench dataset. + +**Parameters:** +- `category` (Optional[str]): Filter by category +- `level` (Optional[str]): Filter by difficulty level +- `validate` (bool): Enable validation (default: True) + +#### `load_gradingbench(**kwargs) -> list[GradingBenchEntry] | Iterator[GradingBenchEntry]` + +Load IMO-GradingBench dataset. + +**Parameters:** +- `problem_id` (Optional[str]): Filter by problem ID +- `min_points` (Optional[int]): Minimum points threshold +- `max_points` (Optional[int]): Maximum points threshold +- `validate` (bool): Enable validation (default: True) +- `lazy` (bool): Return iterator for memory efficiency (default: False) + +### Class: IMOBenchLoader + +Advanced loader with custom data directory support. 
+ +```python +loader = IMOBenchLoader(data_dir=Path("/path/to/data")) +``` + +## Development + +### Running Tests + +```bash +# Install dev dependencies +pip install -e ".[dev]" + +# Run tests +pytest + +# Run with coverage +pytest --cov=imobench --cov-report=html +``` + +### Code Quality + +```bash +# Format code +black src/ tests/ + +# Type checking +mypy src/ + +# Linting +ruff check src/ tests/ +``` + +## Project Structure + +``` +imobench-pylib/ +├── src/ +│ └── imobench/ +│ ├── __init__.py # Public API +│ ├── types.py # Data type definitions +│ ├── loader.py # Loading functionality +│ ├── validators.py # Data validation +│ └── exceptions.py # Custom exceptions +├── tests/ +│ ├── conftest.py # Test configuration +│ ├── test_types.py # Type tests +│ ├── test_validators.py # Validation tests +│ ├── test_loader.py # Loader tests +│ └── test_integration.py # Integration tests +├── examples/ # Usage examples +├── docs/ # Documentation +├── setup.py # Package setup +├── pyproject.toml # Project configuration +└── README.md # This file +``` + +## License + +This library is licensed under the Apache License 2.0. See the main repository for full license details. + +## Citation + +```bibtex +@inproceedings{luong-etal-2025-towards, + title = "Towards Robust Mathematical Reasoning", + author = {Thang Luong and Dawsen Hwang and Hoang H. Nguyen and Golnaz Ghiasi and Yuri Chervonyi and Insuk Seo and Junsu Kim and Garrett Bingham and Jonathan Lee and Swaroop Mishra and Alex Zhai and Clara Huiyi Hu and Henryk Michalewski and Jimin Kim and Jeonghyun Ahn and Junhwi Bae and Xingyou Song and Trieu H. Trinh and Quoc V. Le and Junehyuk Jung}, + booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing", + year = "2025", + url = "https://aclanthology.org/2025.emnlp-main.1794/", +} +``` + +## Contributing + +This library is maintained as part of the Google DeepMind Superhuman Reasoning project. 
For issues or contributions related to the datasets themselves, please see the main repository. + +For library-specific issues or improvements, please open an issue describing: +- The problem or feature request +- Expected vs actual behavior +- Minimal reproducible example +- Python version and environment details + +## Support + +- **Documentation**: [https://imobench.github.io](https://imobench.github.io) +- **Issues**: [GitHub Issues](https://github.com/google-deepmind/superhuman/issues) +- **Repository**: [google-deepmind/superhuman](https://github.com/google-deepmind/superhuman) diff --git a/imobench-pylib/examples/README.md b/imobench-pylib/examples/README.md new file mode 100644 index 0000000..19a72b0 --- /dev/null +++ b/imobench-pylib/examples/README.md @@ -0,0 +1,98 @@ +# IMO Bench Python Library - Examples + +This directory contains example scripts demonstrating how to use the IMO Bench Python library. + +## Running Examples + +Make sure you have installed the library first: + +```bash +cd imobench-pylib +pip install -e . +``` + +## Available Examples + +### Quick Start (`quickstart.py`) + +Basic usage patterns for loading and working with IMO Bench datasets. + +```bash +python examples/quickstart.py +``` + +**Topics covered:** +- Basic loading of datasets +- Filtering by category, subcategory, and level +- Category and subcategory analysis +- Working with ProofBench +- Efficient GradingBench processing with lazy loading +- Analyzing problem sources + +### Advanced Usage (`advanced.py`) + +More sophisticated patterns for working with the datasets. 
+ +```bash +python examples/advanced.py +``` + +**Topics covered:** +- Custom data directories +- Data validation and error handling +- Cross-dataset analysis +- Performance optimization techniques +- Statistical analysis +- Difficulty pattern analysis + +## Common Patterns + +### Loading Data + +```python +from imobench import load_answerbench, load_proofbench, load_gradingbench + +# Load all problems +problems = load_answerbench() + +# Filter by category +algebra = load_answerbench(category="Algebra") + +# Lazy loading for efficiency +for grading in load_gradingbench(lazy=True): + process(grading) +``` + +### Custom Data Directory + +```python +from imobench import IMOBenchLoader +from pathlib import Path + +loader = IMOBenchLoader(data_dir=Path("/path/to/data")) +problems = loader.load_answerbench() +``` + +### Error Handling + +```python +from imobench.exceptions import ValidationError, DataLoadError + +try: + problems = load_answerbench(validate=True) +except ValidationError as e: + print(f"Invalid data: {e}") +except DataLoadError as e: + print(f"Loading failed: {e}") +``` + +## Tips + +1. **Use lazy loading** for GradingBench (1000 gradings, each with full problem, solution, and response text) to avoid memory issues +2. **Disable validation** (`validate=False`) for faster loading if data is trusted +3. **Filter early** using built-in parameters rather than loading everything +4. **Use type hints** to get IDE autocomplete and type checking + +## More Information + +See the main [README.md](../README.md) for complete API documentation. diff --git a/imobench-pylib/examples/advanced.py b/imobench-pylib/examples/advanced.py new file mode 100644 index 0000000..0a7884b --- /dev/null +++ b/imobench-pylib/examples/advanced.py @@ -0,0 +1,269 @@ +"""Advanced usage examples for IMO Bench library. 
+ +This script demonstrates more advanced patterns including: +- Custom data directories +- Cross-dataset analysis +- Performance optimization +- Error handling +""" + +from pathlib import Path +from typing import Dict, List +from imobench import IMOBenchLoader +from imobench.types import AnswerBenchProblem, ProofBenchProblem, GradingBenchEntry +from imobench.exceptions import ValidationError, DataLoadError + + +def example_custom_loader(): + """Example 1: Using custom data directory.""" + print("=" * 60) + print("Example 1: Custom Data Directory") + print("=" * 60) + + # Specify custom location for IMO Bench data + # Adjust this path to match your setup + repo_root = Path(__file__).parent.parent.parent + data_dir = repo_root / "imobench" + + try: + loader = IMOBenchLoader(data_dir=data_dir) + problems = loader.load_answerbench() + print(f"\nSuccessfully loaded {len(problems)} problems from: {data_dir}") + except Exception as e: + print(f"\nError loading data: {e}") + print("Adjust the data_dir path to match your setup") + + +def example_validation(): + """Example 2: Data validation and error handling.""" + print("\n" + "=" * 60) + print("Example 2: Validation and Error Handling") + print("=" * 60) + + loader = IMOBenchLoader() + + # Load with validation enabled (default) + try: + problems = loader.load_answerbench(validate=True) + print(f"\nValidation passed! 
Loaded {len(problems)} valid problems") + except ValidationError as e: + print(f"\nValidation error: {e}") + except DataLoadError as e: + print(f"\nData loading error: {e}") + + # Disable validation for faster loading (if data is trusted) + problems_fast = loader.load_answerbench(validate=False) + print(f"Fast loading (no validation): {len(problems_fast)} problems") + + +def example_cross_dataset_analysis(): + """Example 3: Analyzing relationships across datasets.""" + print("\n" + "=" * 60) + print("Example 3: Cross-Dataset Analysis") + print("=" * 60) + + loader = IMOBenchLoader() + + # Load datasets + proof_problems = loader.load_proofbench() + grading_entries = loader.load_gradingbench() + + # Build index of proof problems + proof_index: Dict[str, ProofBenchProblem] = { + p.problem_id: p for p in proof_problems + } + + # Analyze gradings per problem + gradings_per_problem: Dict[str, List[GradingBenchEntry]] = {} + for entry in grading_entries[:1000]: # Sample first 1000 + if entry.problem_id not in gradings_per_problem: + gradings_per_problem[entry.problem_id] = [] + gradings_per_problem[entry.problem_id].append(entry) + + print(f"\nAnalyzed {len(gradings_per_problem)} problems with gradings") + + # Find problems with most gradings + top_graded = sorted( + gradings_per_problem.items(), + key=lambda x: len(x[1]), + reverse=True + )[:5] + + print("\nMost graded problems:") + for problem_id, entries in top_graded: + if problem_id in proof_index: + problem = proof_index[problem_id] + avg_points = sum(e.points for e in entries) / len(entries) + print(f" {problem_id} ({problem.level}): " + f"{len(entries)} gradings, avg {avg_points:.1f} points") + + +def example_performance_optimization(): + """Example 4: Performance optimization techniques.""" + print("\n" + "=" * 60) + print("Example 4: Performance Optimization") + print("=" * 60) + + loader = IMOBenchLoader() + + # Strategy 1: Lazy loading for large datasets + print("\nStrategy 1: Lazy loading") + print(" 
Processing gradingbench entries one at a time...") + + count = 0 + for entry in loader.load_gradingbench(lazy=True): + count += 1 + if count >= 1000: + break + + print(f" Processed {count} entries without loading entire dataset") + + # Strategy 2: Filtering at load time + print("\nStrategy 2: Early filtering") + print(" Loading only high-scoring entries...") + + high_scores = loader.load_gradingbench(min_points=9) + print(f" Loaded {len(high_scores)} high-scoring entries") + + # Strategy 3: Disable validation for trusted data + print("\nStrategy 3: Fast loading (validation disabled)") + + import time + start = time.time() + problems_validated = loader.load_answerbench(validate=True) + time_validated = time.time() - start + + start = time.time() + problems_fast = loader.load_answerbench(validate=False) + time_fast = time.time() - start + + print(f" With validation: {time_validated:.3f}s") + print(f" Without validation: {time_fast:.3f}s") + print(f" Speedup: {time_validated/time_fast:.1f}x") + + +def example_statistical_analysis(): + """Example 5: Statistical analysis of datasets.""" + print("\n" + "=" * 60) + print("Example 5: Statistical Analysis") + print("=" * 60) + + loader = IMOBenchLoader() + problems = loader.load_answerbench() + + # Analyze answer formats + answer_types = { + 'numeric': 0, + 'algebraic': 0, + 'interval': 0, + 'set': 0, + 'other': 0, + } + + for problem in problems: + answer = problem.short_answer.strip() + if answer.isdigit() or (answer.startswith('-') and answer[1:].isdigit()): + answer_types['numeric'] += 1 + elif '$' in answer: + answer_types['algebraic'] += 1 + elif 'infty' in answer or '[' in answer or '(' in answer: + answer_types['interval'] += 1 + elif '{' in answer or '}' in answer: + answer_types['set'] += 1 + else: + answer_types['other'] += 1 + + print("\nAnswer format distribution:") + for format_type, count in sorted(answer_types.items(), key=lambda x: -x[1]): + pct = 100 * count / len(problems) + print(f" {format_type}: 
{count} ({pct:.1f}%)")
+
+    # Analyze problem statement lengths
+    lengths = [len(p.problem) for p in problems]
+    avg_length = sum(lengths) / len(lengths)
+    min_length = min(lengths)
+    max_length = max(lengths)
+
+    print(f"\nProblem statement statistics:")
+    print(f" Average length: {avg_length:.0f} characters")
+    print(f" Shortest: {min_length} characters")
+    print(f" Longest: {max_length} characters")
+
+
+def example_difficulty_analysis():
+    """Example 6: Analyzing difficulty patterns in GradingBench."""
+    print("\n" + "=" * 60)
+    print("Example 6: Difficulty Analysis")
+    print("=" * 60)
+
+    loader = IMOBenchLoader()
+
+    # Sample grading entries
+    entries = [e for _, e in zip(range(5000), loader.load_gradingbench(lazy=True))]  # stream the first 5000 instead of loading everything
+
+    # Score distribution
+    score_dist = {i: 0 for i in range(11)}
+    for entry in entries:
+        score_dist[entry.points] += 1
+
+    print("\nScore distribution (0-10 points):")
+    print(" Score | Count | Percentage | Bar")
+    print(" " + "-" * 50)
+
+    max_count = max(score_dist.values())
+    for score, count in sorted(score_dist.items()):
+        pct = 100 * count / len(entries) if entries else 0.0  # empty sample: report 0% rather than crash
+        bar_length = int(30 * count / max_count) if max_count else 0
+        bar = "█" * bar_length
+        print(f" {score:5d} | {count:5d} | {pct:6.1f}% | {bar}")
+
+    # Average scores by problem
+    problem_scores: Dict[str, List[int]] = {}
+    for entry in entries:
+        if entry.problem_id not in problem_scores:
+            problem_scores[entry.problem_id] = []
+        problem_scores[entry.problem_id].append(entry.points)
+
+    problem_avgs = {
+        pid: sum(scores) / len(scores)
+        for pid, scores in problem_scores.items()
+    }
+
+    # Find hardest and easiest problems
+    sorted_problems = sorted(problem_avgs.items(), key=lambda x: x[1])
+
+    print(f"\nEasiest problems (avg score):")
+    for pid, avg in sorted_problems[-3:]:
+        print(f" {pid}: {avg:.2f}/10")
+
+    print(f"\nHardest problems (avg score):")
+    for pid, avg in sorted_problems[:3]:
+        print(f" {pid}: {avg:.2f}/10")
+
+
+def main():
+    """Run all advanced examples."""
+    print("\n" + "=" * 60)
+    print("IMO BENCH LIBRARY - ADVANCED 
EXAMPLES") + print("=" * 60) + + try: + example_custom_loader() + example_validation() + example_cross_dataset_analysis() + example_performance_optimization() + example_statistical_analysis() + example_difficulty_analysis() + + print("\n" + "=" * 60) + print("All advanced examples completed!") + print("=" * 60 + "\n") + + except Exception as e: + print(f"\nError: {e}") + import traceback + traceback.print_exc() + + +if __name__ == "__main__": + main() diff --git a/imobench-pylib/examples/quickstart.py b/imobench-pylib/examples/quickstart.py new file mode 100644 index 0000000..483d1a8 --- /dev/null +++ b/imobench-pylib/examples/quickstart.py @@ -0,0 +1,200 @@ +"""Basic usage examples for IMO Bench library. + +This script demonstrates the most common use cases for loading and +working with IMO Bench datasets. +""" + +from imobench import load_answerbench, load_proofbench, load_gradingbench + + +def example_basic_loading(): + """Example 1: Basic loading of datasets.""" + print("=" * 60) + print("Example 1: Basic Loading") + print("=" * 60) + + # Load all short-answer problems + problems = load_answerbench() + print(f"\nLoaded {len(problems)} problems from AnswerBench") + + # Display first problem + first = problems[0] + print(f"\nFirst problem:") + print(f" ID: {first.problem_id}") + print(f" Category: {first.category}") + print(f" Subcategory: {first.subcategory}") + print(f" Source: {first.source}") + print(f" Problem: {first.problem[:100]}...") + print(f" Answer: {first.short_answer}") + + +def example_filtering(): + """Example 2: Filtering datasets.""" + print("\n" + "=" * 60) + print("Example 2: Filtering") + print("=" * 60) + + # Filter by category + algebra_problems = load_answerbench(category="Algebra") + print(f"\nAlgebra problems: {len(algebra_problems)}") + + # Filter by subcategory + inequalities = load_answerbench( + category="Algebra", + subcategory="Inequality" + ) + print(f"Algebra inequalities: {len(inequalities)}") + + # Filter proof problems by 
difficulty + easy_proofs = load_proofbench(level="IMO-easy") + print(f"Easy proof problems: {len(easy_proofs)}") + + +def example_category_analysis(): + """Example 3: Analyzing category distribution.""" + print("\n" + "=" * 60) + print("Example 3: Category Analysis") + print("=" * 60) + + problems = load_answerbench() + + # Count problems by category + categories = {} + for problem in problems: + categories[problem.category] = categories.get(problem.category, 0) + 1 + + print("\nProblems by category:") + for category, count in sorted(categories.items()): + print(f" {category}: {count}") + + +def example_subcategory_analysis(): + """Example 4: Analyzing subcategories within a category.""" + print("\n" + "=" * 60) + print("Example 4: Subcategory Analysis") + print("=" * 60) + + # Focus on Algebra problems + algebra = load_answerbench(category="Algebra") + + # Count subcategories + subcategories = {} + for problem in algebra: + subcategories[problem.subcategory] = \ + subcategories.get(problem.subcategory, 0) + 1 + + print(f"\nAlgebra subcategories ({len(subcategories)} total):") + for subcat, count in sorted(subcategories.items(), key=lambda x: -x[1]): + print(f" {subcat}: {count}") + + +def example_proofbench(): + """Example 5: Working with ProofBench.""" + print("\n" + "=" * 60) + print("Example 5: ProofBench Exploration") + print("=" * 60) + + # Load all proof problems + proofs = load_proofbench() + print(f"\nTotal proof problems: {len(proofs)}") + + # Analyze difficulty levels + levels = {} + for proof in proofs: + levels[proof.level] = levels.get(proof.level, 0) + 1 + + print("\nDifficulty distribution:") + for level, count in sorted(levels.items()): + print(f" {level}: {count}") + + # Show example problem + if proofs: + example = proofs[0] + print(f"\nExample problem ({example.problem_id}):") + print(f" Level: {example.level}") + print(f" Category: {example.category}") + print(f" Problem: {example.problem[:150]}...") + + +def example_gradingbench_lazy(): + 
"""Example 6: Efficient GradingBench processing with lazy loading.""" + print("\n" + "=" * 60) + print("Example 6: GradingBench with Lazy Loading") + print("=" * 60) + + # Use lazy loading to process entries one at a time + print("\nProcessing high-scoring entries (≥8 points)...") + + count = 0 + points_sum = 0 + + for entry in load_gradingbench(min_points=8, lazy=True): + count += 1 + points_sum += entry.points + + # Process first few as examples + if count <= 3: + print(f"\n Entry {count}:") + print(f" Grading ID: {entry.grading_id}") + print(f" Problem ID: {entry.problem_id}") + print(f" Points: {entry.points}/10") + print(f" Response: {entry.response[:100]}...") + + # Stop after processing 100 for this example + if count >= 100: + break + + if count > 0: + avg_points = points_sum / count + print(f"\nProcessed {count} entries") + print(f"Average points: {avg_points:.2f}/10") + + +def example_source_analysis(): + """Example 7: Analyzing problem sources.""" + print("\n" + "=" * 60) + print("Example 7: Source Analysis") + print("=" * 60) + + problems = load_answerbench() + + # Count problems by source + sources = {} + for problem in problems: + sources[problem.source] = sources.get(problem.source, 0) + 1 + + print(f"\nMost common sources:") + for source, count in sorted(sources.items(), key=lambda x: -x[1])[:10]: + print(f" {source}: {count} problems") + + +def main(): + """Run all examples.""" + print("\n" + "=" * 60) + print("IMO BENCH LIBRARY - USAGE EXAMPLES") + print("=" * 60) + + try: + example_basic_loading() + example_filtering() + example_category_analysis() + example_subcategory_analysis() + example_proofbench() + example_gradingbench_lazy() + example_source_analysis() + + print("\n" + "=" * 60) + print("All examples completed successfully!") + print("=" * 60 + "\n") + + except Exception as e: + print(f"\nError: {e}") + print("\nMake sure the imobench data directory is accessible.") + print("You may need to specify a custom data directory:") + print("\n 
from imobench import IMOBenchLoader") + print(" from pathlib import Path") + print(" loader = IMOBenchLoader(data_dir=Path('/path/to/imobench'))") + + +if __name__ == "__main__": + main() diff --git a/imobench-pylib/pyproject.toml b/imobench-pylib/pyproject.toml new file mode 100644 index 0000000..ff86356 --- /dev/null +++ b/imobench-pylib/pyproject.toml @@ -0,0 +1,104 @@ +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "imobench" +version = "0.1.0" +description = "Python library for loading and working with IMO Bench datasets" +readme = "README.md" +requires-python = ">=3.9" +license = {text = "Apache-2.0"} +authors = [ + {name = "IMO Bench Contributors"} +] +keywords = ["mathematics", "reasoning", "benchmark", "imo", "olympiad", "ai"] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.0.0", + "pytest-cov>=4.0.0", + "black>=23.0.0", + "mypy>=1.0.0", + "ruff>=0.1.0", +] +pandas = [ + "pandas>=1.5.0", +] + +[project. 
urls] +Homepage = "https://imobench.github.io" +Repository = "https://github.com/google-deepmind/superhuman" +Issues = "https://github.com/google-deepmind/superhuman/issues" + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] +addopts = [ + "-v", + "--strict-markers", + "--tb=short", +] + +[tool.black] +line-length = 100 +target-version = ["py39", "py310", "py311", "py312"] +include = '\.pyi?$' + +[tool.mypy] +python_version = "3.9" +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = true +disallow_incomplete_defs = true +check_untyped_defs = true +no_implicit_optional = true +warn_redundant_casts = true +warn_unused_ignores = true +warn_no_return = true +strict_equality = true + +[tool.ruff] +line-length = 100 +target-version = "py39" +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "I", # isort + "B", # flake8-bugbear + "C4", # flake8-comprehensions + "UP", # pyupgrade +] +ignore = [ + "E501", # line too long (handled by black) +] + +[tool.coverage.run] +source = ["src"] +omit = ["*/tests/*", "*/test_*.py"] + +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "def __repr__", + "raise AssertionError", + "raise NotImplementedError", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] diff --git a/imobench-pylib/setup.py b/imobench-pylib/setup.py new file mode 100644 index 0000000..45bc148 --- /dev/null +++ b/imobench-pylib/setup.py @@ -0,0 +1,69 @@ +"""Setup configuration for imobench package.""" + +from setuptools import setup, find_packages +from pathlib import Path + +# Read the README file +readme_file = Path(__file__).parent / "README.md" +if readme_file.exists(): + long_description = readme_file.read_text(encoding="utf-8") +else: + long_description = "Python library for loading and working with IMO Bench datasets" + +setup( + 
name="imobench", + version="0.1.0", + author="IMO Bench Contributors", + author_email="", + description="Python library for loading and working with IMO Bench mathematical reasoning benchmarks", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/google-deepmind/superhuman", + project_urls={ + "Bug Tracker": "https://github.com/google-deepmind/superhuman/issues", + "Documentation": "https://imobench.github.io", + "Source Code": "https://github.com/google-deepmind/superhuman", + }, + package_dir={"": "src"}, + packages=find_packages(where="src"), + classifiers=[ + "Development Status :: 3 - Alpha", + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries :: Python Modules", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Operating System :: OS Independent", + ], + python_requires=">=3.9", + install_requires=[ + # No external dependencies required - uses only stdlib + ], + extras_require={ + "dev": [ + "pytest>=7.0.0", + "pytest-cov>=4.0.0", + "black>=23.0.0", + "mypy>=1.0.0", + "ruff>=0.1.0", + ], + "pandas": [ + "pandas>=1.5.0", + ], + }, + keywords=[ + "mathematics", + "reasoning", + "benchmark", + "imo", + "olympiad", + "ai", + "machine-learning", + "evaluation", + ], +) diff --git a/imobench-pylib/src/imobench/__init__.py b/imobench-pylib/src/imobench/__init__.py new file mode 100644 index 0000000..9e39817 --- /dev/null +++ b/imobench-pylib/src/imobench/__init__.py @@ -0,0 +1,40 @@ +"""IMO Bench - Python library for loading and working with IMO Bench datasets. 
+
+This package provides utilities for loading, validating, and working with
+the IMO Bench mathematical reasoning benchmarks, including:
+- IMO-AnswerBench: 400 challenging short-answer problems
+- IMO-ProofBench: 60 proof-based problems
+- IMO-GradingBench: 1000 human gradings for evaluation
+
+Example:
+    >>> from imobench import load_answerbench, load_proofbench
+    >>> problems = load_answerbench()
+    >>> for problem in problems[:5]:
+    ...     print(f"{problem.problem_id}: {problem.category}")
+"""
+
+__version__ = "0.1.0"
+__author__ = "IMO Bench Contributors"
+__all__ = [
+    "load_answerbench",
+    "load_proofbench",
+    "load_gradingbench",
+    "AnswerBenchProblem",
+    "ProofBenchProblem",
+    "GradingBenchEntry",
+    "IMOBenchLoader",
+    "ValidationError",
+]
+
+from .loader import (
+    load_answerbench,
+    load_proofbench,
+    load_gradingbench,
+    IMOBenchLoader,
+)
+from .types import (
+    AnswerBenchProblem,
+    ProofBenchProblem,
+    GradingBenchEntry,
+)
+from .exceptions import ValidationError
diff --git a/imobench-pylib/src/imobench/exceptions.py b/imobench-pylib/src/imobench/exceptions.py
new file mode 100644
index 0000000..fe71d6f
--- /dev/null
+++ b/imobench-pylib/src/imobench/exceptions.py
@@ -0,0 +1,21 @@
+"""Custom exceptions for IMO Bench library."""
+
+
+class IMOBenchError(Exception):
+    """Base exception for all IMO Bench errors."""
+    pass
+
+
+class ValidationError(IMOBenchError):
+    """Raised when data validation fails."""
+    pass
+
+
+class DataLoadError(IMOBenchError):
+    """Raised when data cannot be loaded."""
+    pass
+
+
+class FileNotFoundError(DataLoadError):
+    """Raised when a dataset file cannot be found."""
+    pass
diff --git a/imobench-pylib/src/imobench/loader.py b/imobench-pylib/src/imobench/loader.py
new file mode 100644
index 0000000..34d360b
--- /dev/null
+++ b/imobench-pylib/src/imobench/loader.py
@@ -0,0 +1,321 @@
+"""Data loading functionality for IMO Bench datasets."""
+
+# Keep annotations unevaluated: the `X | Y` return annotations in this module fail at import on Python 3.9.
+from __future__ import annotations
+
+import csv
+from pathlib import Path
+from typing import Iterator, Optional, 
Callable + +from .types import ( + AnswerBenchProblem, + ProofBenchProblem, + GradingBenchEntry, + AnswerBenchDataset, + ProofBenchDataset, + GradingBenchDataset, +) +from .exceptions import DataLoadError, FileNotFoundError as IMOFileNotFoundError +from .validators import ( + validate_answerbench_row, + validate_proofbench_row, + validate_gradingbench_row, +) + + +class IMOBenchLoader: + """Main loader class for IMO Bench datasets. + + This class provides methods to load datasets from CSV files with + support for filtering, lazy loading, and validation. + + Args: + data_dir: Path to the directory containing CSV files. + Defaults to looking for '../imobench' relative to package. + + Example: + >>> loader = IMOBenchLoader() + >>> problems = loader.load_answerbench(category="Algebra") + >>> print(f"Loaded {len(problems)} algebra problems") + """ + + def __init__(self, data_dir: Optional[Path] = None): + if data_dir is None: + # Default: look for imobench directory at repo root + package_dir = Path(__file__).parent + data_dir = package_dir.parent.parent.parent / "imobench" + + self.data_dir = Path(data_dir) + if not self.data_dir.exists(): + raise IMOFileNotFoundError( + f"Data directory not found: {self.data_dir}\n" + f"Please provide the correct path to the imobench directory." 
+ ) + + def _load_csv(self, filename: str) -> list[dict[str, str]]: + """Load a CSV file and return rows as dictionaries.""" + filepath = self.data_dir / filename + + if not filepath.exists(): + raise IMOFileNotFoundError( + f"Dataset file not found: {filepath}\n" + f"Expected location: {filepath}" + ) + + try: + with open(filepath, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + return list(reader) + except Exception as e: + raise DataLoadError(f"Error reading {filename}: {e}") from e + + def load_answerbench( + self, + category: Optional[str] = None, + subcategory: Optional[str] = None, + source: Optional[str] = None, + validate: bool = True, + ) -> AnswerBenchDataset: + """Load IMO-AnswerBench dataset. + + Args: + category: Filter by category (e.g., "Algebra", "Geometry") + subcategory: Filter by subcategory + source: Filter by source (e.g., "IMO Shortlist 2021") + validate: Whether to validate each row + + Returns: + List of AnswerBenchProblem objects + + Raises: + DataLoadError: If the file cannot be loaded + ValidationError: If validation is enabled and data is invalid + """ + rows = self._load_csv("answerbench.csv") + problems = [] + + for row in rows: + if validate: + validate_answerbench_row(row) + + # Apply filters + if category and row['Category'] != category: + continue + if subcategory and row['Subcategory'] != subcategory: + continue + if source and row['Source'] != source: + continue + + problem = AnswerBenchProblem( + problem_id=row['Problem ID'], + problem=row['Problem'], + short_answer=row['Short Answer'], + category=row['Category'], + subcategory=row['Subcategory'], + source=row['Source'], + ) + problems.append(problem) + + return problems + + def load_proofbench( + self, + category: Optional[str] = None, + level: Optional[str] = None, + validate: bool = True, + ) -> ProofBenchDataset: + """Load IMO-ProofBench dataset. 
+ + Args: + category: Filter by category (e.g., "Algebra", "Geometry") + level: Filter by difficulty level (e.g., "IMO-easy", "pre-IMO") + validate: Whether to validate each row + + Returns: + List of ProofBenchProblem objects + + Raises: + DataLoadError: If the file cannot be loaded + ValidationError: If validation is enabled and data is invalid + """ + rows = self._load_csv("proofbench.csv") + problems = [] + + for row in rows: + if validate: + validate_proofbench_row(row) + + # Apply filters + if category and row['Category'] != category: + continue + if level and row['Level'] != level: + continue + + problem = ProofBenchProblem( + problem_id=row['Problem ID'], + problem=row['Problem'], + solution=row['Solution'], + grading_guidelines=row['Grading guidelines'], + category=row['Category'], + level=row['Level'], + short_answer=row['Short Answer'], + source=row['Source'], + ) + problems.append(problem) + + return problems + + def load_gradingbench( + self, + problem_id: Optional[str] = None, + min_points: Optional[int] = None, + max_points: Optional[int] = None, + validate: bool = True, + lazy: bool = False, + ) -> GradingBenchDataset | Iterator[GradingBenchEntry]: + """Load IMO-GradingBench dataset. + + Note: This dataset is large (186K lines). Consider using lazy=True + for memory-efficient iteration. 
+ + Args: + problem_id: Filter by problem ID + min_points: Filter by minimum points awarded + max_points: Filter by maximum points awarded + validate: Whether to validate each row + lazy: If True, return an iterator instead of loading all data + + Returns: + List of GradingBenchEntry objects, or Iterator if lazy=True + + Raises: + DataLoadError: If the file cannot be loaded + ValidationError: If validation is enabled and data is invalid + """ + if lazy: + return self._iter_gradingbench( + problem_id=problem_id, + min_points=min_points, + max_points=max_points, + validate=validate, + ) + + rows = self._load_csv("gradingbench.csv") + entries = [] + + for row in rows: + if validate: + validate_gradingbench_row(row) + + # Parse numeric fields + try: + points = int(row['Points']) + reward = row['Reward'].strip() + except (ValueError, KeyError) as e: + if validate: + raise DataLoadError(f"Invalid field: {e}") from e + continue + + # Apply filters + if problem_id and row['Problem ID'] != problem_id: + continue + if min_points is not None and points < min_points: + continue + if max_points is not None and points > max_points: + continue + + entry = GradingBenchEntry( + grading_id=row['Grading ID'], + problem_id=row['Problem ID'], + problem=row['Problem'], + solution=row['Solution'], + grading_guidelines=row['Grading guidelines'], + response=row['Response'], + points=points, + reward=reward, + problem_source=row['Problem Source'], + ) + entries.append(entry) + + return entries + + def _iter_gradingbench( + self, + problem_id: Optional[str] = None, + min_points: Optional[int] = None, + max_points: Optional[int] = None, + validate: bool = True, + ) -> Iterator[GradingBenchEntry]: + """Lazy iterator for gradingbench dataset.""" + filepath = self.data_dir / "gradingbench.csv" + + with open(filepath, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + + for row in reader: + if validate: + validate_gradingbench_row(row) + + try: + points = int(row['Points']) + reward = 
row['Reward'].strip() + except (ValueError, KeyError): + if validate: + raise + continue + + # Apply filters + if problem_id and row['Problem ID'] != problem_id: + continue + if min_points is not None and points < min_points: + continue + if max_points is not None and points > max_points: + continue + + yield GradingBenchEntry( + grading_id=row['Grading ID'], + problem_id=row['Problem ID'], + problem=row['Problem'], + solution=row['Solution'], + grading_guidelines=row['Grading guidelines'], + response=row['Response'], + points=points, + reward=reward, + problem_source=row['Problem Source'], + ) + + +# Convenience functions using default loader +_default_loader: Optional[IMOBenchLoader] = None + + +def _get_default_loader() -> IMOBenchLoader: + """Get or create the default loader instance.""" + global _default_loader + if _default_loader is None: + _default_loader = IMOBenchLoader() + return _default_loader + + +def load_answerbench(**kwargs) -> AnswerBenchDataset: + """Load IMO-AnswerBench using default loader. + + See IMOBenchLoader.load_answerbench() for arguments. + """ + return _get_default_loader().load_answerbench(**kwargs) + + +def load_proofbench(**kwargs) -> ProofBenchDataset: + """Load IMO-ProofBench using default loader. + + See IMOBenchLoader.load_proofbench() for arguments. + """ + return _get_default_loader().load_proofbench(**kwargs) + + +def load_gradingbench(**kwargs) -> GradingBenchDataset | Iterator[GradingBenchEntry]: + """Load IMO-GradingBench using default loader. + + See IMOBenchLoader.load_gradingbench() for arguments. 
+ """ + return _get_default_loader().load_gradingbench(**kwargs) diff --git a/imobench-pylib/src/imobench/types.py b/imobench-pylib/src/imobench/types.py new file mode 100644 index 0000000..5274510 --- /dev/null +++ b/imobench-pylib/src/imobench/types.py @@ -0,0 +1,89 @@ +"""Type definitions for IMO Bench datasets.""" + +from dataclasses import dataclass +from typing import Optional + + +@dataclass(frozen=True) +class AnswerBenchProblem: + """A problem from IMO-AnswerBench. + + Attributes: + problem_id: Unique identifier (e.g., "imo-bench-algebra-001") + problem: The problem statement in LaTeX format + short_answer: The expected answer + category: Main category (Algebra, Combinatorics, Geometry, Number theory) + subcategory: Specific subcategory within the main category + source: Original source of the problem + """ + problem_id: str + problem: str + short_answer: str + category: str + subcategory: str + source: str + + def __repr__(self) -> str: + return f"AnswerBenchProblem(id='{self.problem_id}', category='{self.category}')" + + +@dataclass(frozen=True) +class ProofBenchProblem: + """A problem from IMO-ProofBench. + + Attributes: + problem_id: Unique identifier (e.g., "PB-Basic-001") + problem: The problem statement in LaTeX format + solution: Reference solution + grading_guidelines: Guidelines for grading partial solutions + category: Main category (Algebra, Combinatorics, Geometry, Number theory) + level: Difficulty level (e.g., "IMO-easy", "pre-IMO", "IMO-hard") + short_answer: Brief expected answer + source: Original source of the problem + """ + problem_id: str + problem: str + solution: str + grading_guidelines: str + category: str + level: str + short_answer: str + source: str + + def __repr__(self) -> str: + return f"ProofBenchProblem(id='{self.problem_id}', level='{self.level}')" + + +@dataclass(frozen=True) +class GradingBenchEntry: + """An entry from IMO-GradingBench. 
+ + Attributes: + grading_id: Unique identifier (e.g., "GB-0001") + problem_id: Reference to the problem being graded + problem: The problem statement + solution: Reference solution + grading_guidelines: Grading criteria + response: The response being graded + points: Points awarded (0-10 scale) + reward: Reward category (e.g., "Correct", "Partial", "Incorrect", "Almost") + problem_source: Original source of the problem + """ + grading_id: str + problem_id: str + problem: str + solution: str + grading_guidelines: str + response: str + points: int + reward: str + problem_source: str + + def __repr__(self) -> str: + return f"GradingBenchEntry(id='{self.grading_id}', points={self.points})" + + +# Type aliases for collections +AnswerBenchDataset = list[AnswerBenchProblem] +ProofBenchDataset = list[ProofBenchProblem] +GradingBenchDataset = list[GradingBenchEntry] diff --git a/imobench-pylib/src/imobench/validators.py b/imobench-pylib/src/imobench/validators.py new file mode 100644 index 0000000..09f33fc --- /dev/null +++ b/imobench-pylib/src/imobench/validators.py @@ -0,0 +1,183 @@ +"""Validation utilities for IMO Bench datasets.""" + +from typing import Any +from .exceptions import ValidationError + + +# Required fields for each dataset +ANSWERBENCH_REQUIRED_FIELDS = { + 'Problem ID', 'Problem', 'Short Answer', 'Category', 'Subcategory', 'Source' +} + +PROOFBENCH_REQUIRED_FIELDS = { + 'Problem ID', 'Problem', 'Solution', 'Grading guidelines', + 'Category', 'Level', 'Short Answer', 'Source' +} + +GRADINGBENCH_REQUIRED_FIELDS = { + 'Grading ID', 'Problem ID', 'Problem', 'Solution', + 'Grading guidelines', 'Response', 'Points', 'Reward', 'Problem Source' +} + +# Valid categories +VALID_CATEGORIES = {'Algebra', 'Combinatorics', 'Geometry', 'Number theory'} + + +def validate_answerbench_row(row: dict[str, Any]) -> None: + """Validate a row from answerbench.csv. 
+ + Args: + row: Dictionary representing a CSV row + + Raises: + ValidationError: If validation fails + """ + # Check required fields + missing_fields = ANSWERBENCH_REQUIRED_FIELDS - set(row.keys()) + if missing_fields: + raise ValidationError( + f"Missing required fields: {', '.join(missing_fields)}" + ) + + # Check for empty values + for field in ANSWERBENCH_REQUIRED_FIELDS: + if not row[field] or not row[field].strip(): + raise ValidationError(f"Empty value for required field: {field}") + + # Validate category + if row['Category'] not in VALID_CATEGORIES: + raise ValidationError( + f"Invalid category: {row['Category']}. " + f"Must be one of: {', '.join(VALID_CATEGORIES)}" + ) + + # Validate problem ID format + if not row['Problem ID'].startswith('imo-bench-'): + raise ValidationError( + f"Invalid Problem ID format: {row['Problem ID']}. " + f"Should start with 'imo-bench-'" + ) + + +def validate_proofbench_row(row: dict[str, Any]) -> None: + """Validate a row from proofbench.csv. + + Args: + row: Dictionary representing a CSV row + + Raises: + ValidationError: If validation fails + """ + # Check required fields + missing_fields = PROOFBENCH_REQUIRED_FIELDS - set(row.keys()) + if missing_fields: + raise ValidationError( + f"Missing required fields: {', '.join(missing_fields)}" + ) + + # Check for empty values (allow empty short_answer as it can be descriptive) + for field in PROOFBENCH_REQUIRED_FIELDS - {'Short Answer'}: + if not row[field] or not row[field].strip(): + raise ValidationError(f"Empty value for required field: {field}") + + # Validate category + if row['Category'] not in VALID_CATEGORIES: + raise ValidationError( + f"Invalid category: {row['Category']}. " + f"Must be one of: {', '.join(VALID_CATEGORIES)}" + ) + + # Validate problem ID format + if not row['Problem ID'].startswith('PB-'): + raise ValidationError( + f"Invalid Problem ID format: {row['Problem ID']}. 
" + f"Should start with 'PB-'" + ) + + +def validate_gradingbench_row(row: dict[str, Any]) -> None: + """Validate a row from gradingbench.csv. + + Args: + row: Dictionary representing a CSV row + + Raises: + ValidationError: If validation fails + """ + # Check required fields + missing_fields = GRADINGBENCH_REQUIRED_FIELDS - set(row.keys()) + if missing_fields: + raise ValidationError( + f"Missing required fields: {', '.join(missing_fields)}" + ) + + # Check for empty values + for field in GRADINGBENCH_REQUIRED_FIELDS: + if field not in row or row[field] is None: + raise ValidationError(f"Missing field: {field}") + if isinstance(row[field], str) and not row[field].strip(): + raise ValidationError(f"Empty value for required field: {field}") + + # Validate grading ID format + if not row['Grading ID'].startswith('GB-'): + raise ValidationError( + f"Invalid Grading ID format: {row['Grading ID']}. " + f"Should start with 'GB-'" + ) + + # Validate numeric fields + try: + points = int(row['Points']) + if not 0 <= points <= 10: + raise ValidationError( + f"Points must be between 0 and 10, got: {points}" + ) + except ValueError: + raise ValidationError(f"Points must be an integer, got: {row['Points']}") + + # Reward is a categorical field (Correct, Partial, Incorrect, Almost, etc.) + # Just check it's not empty - validation happens in field check above + valid_rewards = {'Correct', 'Partial', 'Incorrect', 'Almost'} + if row['Reward'] not in valid_rewards: + # Allow other values but could log warning in production + pass + + +def validate_dataset_counts( + answerbench_count: int, + proofbench_count: int, + gradingbench_count: int, +) -> None: + """Validate that dataset counts match expected values. 
+
+    Args:
+        answerbench_count: Number of problems in answerbench
+        proofbench_count: Number of problems in proofbench
+        gradingbench_count: Number of entries in gradingbench
+
+    Raises:
+        ValidationError: If counts don't match expectations
+    """
+    # Based on the documentation:
+    # - answerbench: 400 problems
+    # - proofbench: 60 problems
+    # - gradingbench: 1000 human gradings
+
+    # We'll allow some flexibility since these are approximate
+    if not 390 <= answerbench_count <= 410:
+        raise ValidationError(
+            f"Unexpected answerbench count: {answerbench_count} "
+            f"(expected ~400)"
+        )
+
+    if not 55 <= proofbench_count <= 65:
+        raise ValidationError(
+            f"Unexpected proofbench count: {proofbench_count} "
+            f"(expected ~60)"
+        )
+
+    if not 900 <= gradingbench_count <= 200000:
+        raise ValidationError(
+            f"Unexpected gradingbench count: {gradingbench_count} "
+            f"(expected between 900 and 200000)"
+        )
diff --git a/imobench-pylib/tests/conftest.py b/imobench-pylib/tests/conftest.py
new file mode 100644
index 0000000..7f5d455
--- /dev/null
+++ b/imobench-pylib/tests/conftest.py
@@ -0,0 +1,8 @@
+"""Configuration for pytest."""
+
+import sys
+from pathlib import Path
+
+# Add src directory to path
+src_dir = Path(__file__).parent.parent / "src"
+sys.path.insert(0, str(src_dir))
diff --git a/imobench-pylib/tests/test_integration.py b/imobench-pylib/tests/test_integration.py
new file mode 100644
index 0000000..fee2b5c
--- /dev/null
+++ b/imobench-pylib/tests/test_integration.py
@@ -0,0 +1,142 @@
+"""Integration tests for the IMO Bench library."""
+
+import pytest
+from pathlib import Path
+from imobench import (
+    load_answerbench,
+    load_proofbench,
+    load_gradingbench,
+    IMOBenchLoader,
+)
+from imobench.exceptions import FileNotFoundError as IMOFileNotFoundError
+
+
+@pytest.fixture
+def data_dir():
+    """Get the data directory path."""
+    repo_root = Path(__file__).parent.parent.parent
+    data_dir = repo_root / "imobench"
+
+    if not data_dir.exists():
+        pytest.skip(f"Data directory not found: 
{data_dir}") + + return data_dir + + +def test_end_to_end_workflow(data_dir): + """Test a complete workflow: load, filter, analyze.""" + loader = IMOBenchLoader(data_dir=data_dir) + + # Load all algebra problems + algebra_problems = loader.load_answerbench(category="Algebra", validate=False) + + # Should have some problems + assert len(algebra_problems) > 0 + + # All should be algebra + assert all(p.category == "Algebra" for p in algebra_problems) + + # Count subcategories + subcategories = {} + for problem in algebra_problems: + subcategories[problem.subcategory] = subcategories.get(problem.subcategory, 0) + 1 + + # Should have multiple subcategories + assert len(subcategories) > 1 + + +def test_cross_dataset_consistency(data_dir): + """Test that problem IDs in gradingbench reference actual problems.""" + loader = IMOBenchLoader(data_dir=data_dir) + + # Load proofbench problems + proof_problems = loader.load_proofbench(validate=False) + proof_ids = {p.problem_id for p in proof_problems} + + # Load a sample of grading entries + grading_entries = loader.load_gradingbench(validate=False) + if not grading_entries: + pytest.skip("No grading entries found") + + # Take first 100 entries + sample = grading_entries[:100] + + # Check that referenced problem IDs exist + referenced_ids = {e.problem_id for e in sample} + + # All referenced IDs should be valid proofbench IDs + for ref_id in referenced_ids: + assert ref_id in proof_ids or ref_id.startswith('PB-'), \ + f"Invalid problem reference: {ref_id}" + + +def test_statistics_generation(data_dir): + """Test generating statistics from the datasets.""" + loader = IMOBenchLoader(data_dir=data_dir) + + # Load all datasets + answer_problems = loader.load_answerbench(validate=False) + proof_problems = loader.load_proofbench(validate=False) + + # Generate statistics + stats = { + 'answerbench': { + 'total': len(answer_problems), + 'categories': {}, + }, + 'proofbench': { + 'total': len(proof_problems), + 'levels': {}, + }, + } + + 
# Count categories in answerbench + for p in answer_problems: + stats['answerbench']['categories'][p.category] = \ + stats['answerbench']['categories'].get(p.category, 0) + 1 + + # Count levels in proofbench + for p in proof_problems: + stats['proofbench']['levels'][p.level] = \ + stats['proofbench']['levels'].get(p.level, 0) + 1 + + # Verify we have reasonable numbers + assert stats['answerbench']['total'] > 100 + assert stats['proofbench']['total'] > 10 + assert len(stats['answerbench']['categories']) >= 3 # At least 3 categories + assert len(stats['proofbench']['levels']) >= 2 # At least 2 difficulty levels + + +def test_memory_efficiency_with_lazy_loading(data_dir): + """Test that lazy loading doesn't load entire dataset into memory.""" + loader = IMOBenchLoader(data_dir=data_dir) + + # Use lazy loading + iterator = loader.load_gradingbench(lazy=True, validate=False) + + # Process first 1000 entries + count = 0 + for entry in iterator: + count += 1 + if count >= 1000: + break + + # Should have processed entries without loading entire dataset + assert count == 1000 + + +def test_filtering_combinations(data_dir): + """Test combining multiple filters.""" + loader = IMOBenchLoader(data_dir=data_dir) + + # Load with multiple filters + problems = loader.load_answerbench( + category="Algebra", + subcategory="Inequality", + validate=False + ) + + # All results should match both filters + for p in problems: + assert p.category == "Algebra" + assert p.subcategory == "Inequality" diff --git a/imobench-pylib/tests/test_loader.py b/imobench-pylib/tests/test_loader.py new file mode 100644 index 0000000..d2a8319 --- /dev/null +++ b/imobench-pylib/tests/test_loader.py @@ -0,0 +1,137 @@ +"""Tests for data loading functionality.""" + +import pytest +from pathlib import Path +from imobench.loader import IMOBenchLoader +from imobench.types import ( + AnswerBenchProblem, + ProofBenchProblem, + GradingBenchEntry, +) +from imobench.exceptions import FileNotFoundError as 
IMOFileNotFoundError + + +@pytest.fixture +def loader(): + """Create a loader instance pointing to the actual data.""" + # Assumes tests are run from repo root or with proper PYTHONPATH + repo_root = Path(__file__).parent.parent.parent + data_dir = repo_root / "imobench" + + if not data_dir.exists(): + pytest.skip(f"Data directory not found: {data_dir}") + + return IMOBenchLoader(data_dir=data_dir) + + +def test_loader_invalid_data_dir(): + """Test that loader raises error with invalid data directory.""" + with pytest.raises(IMOFileNotFoundError): + IMOBenchLoader(data_dir=Path("/nonexistent/path")) + + +def test_load_answerbench(loader): + """Test loading answerbench dataset.""" + problems = loader.load_answerbench(validate=False) + + assert len(problems) > 0 + assert all(isinstance(p, AnswerBenchProblem) for p in problems) + + # Check first problem has expected structure + first = problems[0] + assert first.problem_id.startswith('imo-bench-') + assert first.category in ['Algebra', 'Combinatorics', 'Geometry', 'Number theory'] + + +def test_load_answerbench_with_category_filter(loader): + """Test loading answerbench with category filter.""" + algebra_problems = loader.load_answerbench(category="Algebra", validate=False) + + assert len(algebra_problems) > 0 + assert all(p.category == "Algebra" for p in algebra_problems) + + +def test_load_answerbench_with_source_filter(loader): + """Test loading answerbench with source filter.""" + problems = loader.load_answerbench(source="IMO Shortlist 2021", validate=False) + + # May or may not have results depending on data + if problems: + assert all(p.source == "IMO Shortlist 2021" for p in problems) + + +def test_load_proofbench(loader): + """Test loading proofbench dataset.""" + problems = loader.load_proofbench(validate=False) + + assert len(problems) > 0 + assert all(isinstance(p, ProofBenchProblem) for p in problems) + + # Check first problem has expected structure + first = problems[0] + assert 
first.problem_id.startswith('PB-') + assert first.category in ['Algebra', 'Combinatorics', 'Geometry', 'Number theory'] + assert first.level # Should have a level + + +def test_load_proofbench_with_level_filter(loader): + """Test loading proofbench with level filter.""" + problems = loader.load_proofbench(level="IMO-easy", validate=False) + + # May or may not have results depending on data + if problems: + assert all(p.level == "IMO-easy" for p in problems) + + +def test_load_gradingbench(loader): + """Test loading gradingbench dataset.""" + # Load just a small subset for testing + entries = loader.load_gradingbench(max_points=2, validate=False) + + assert len(entries) > 0 + assert all(isinstance(e, GradingBenchEntry) for e in entries) + assert all(e.points <= 2 for e in entries) + + +def test_load_gradingbench_lazy(loader): + """Test lazy loading of gradingbench dataset.""" + iterator = loader.load_gradingbench(min_points=5, lazy=True, validate=False) + + # Get first few entries + entries = [] + for i, entry in enumerate(iterator): + entries.append(entry) + if i >= 10: # Stop after 10 + break + + assert len(entries) > 0 + assert all(isinstance(e, GradingBenchEntry) for e in entries) + assert all(e.points >= 5 for e in entries) + + +def test_load_gradingbench_with_problem_filter(loader): + """Test loading gradingbench filtered by problem ID.""" + # First get a problem ID + all_entries = loader.load_gradingbench(validate=False) + if not all_entries: + pytest.skip("No grading entries found") + + problem_id = all_entries[0].problem_id + + # Load entries for that problem + filtered = loader.load_gradingbench(problem_id=problem_id, validate=False) + + assert len(filtered) > 0 + assert all(e.problem_id == problem_id for e in filtered) + + +def test_convenience_functions(): + """Test convenience functions work.""" + from imobench import load_answerbench, load_proofbench, load_gradingbench + + # These should work if data directory is in expected location + try: + problems 
= load_answerbench(validate=False) + assert len(problems) > 0 + except IMOFileNotFoundError: + pytest.skip("Data directory not in default location") diff --git a/imobench-pylib/tests/test_types.py b/imobench-pylib/tests/test_types.py new file mode 100644 index 0000000..c0dfc79 --- /dev/null +++ b/imobench-pylib/tests/test_types.py @@ -0,0 +1,111 @@ +"""Tests for IMO Bench data types.""" + +import pytest +from imobench.types import ( + AnswerBenchProblem, + ProofBenchProblem, + GradingBenchEntry, +) + + +def test_answerbench_problem_creation(): + """Test creating an AnswerBenchProblem.""" + problem = AnswerBenchProblem( + problem_id="imo-bench-algebra-001", + problem="Find all $N$ such that...", + short_answer="3", + category="Algebra", + subcategory="Operation", + source="IMO Shortlist 2021" + ) + + assert problem.problem_id == "imo-bench-algebra-001" + assert problem.category == "Algebra" + assert problem.subcategory == "Operation" + + +def test_answerbench_problem_immutable(): + """Test that AnswerBenchProblem is immutable.""" + problem = AnswerBenchProblem( + problem_id="imo-bench-algebra-001", + problem="Find all $N$ such that...", + short_answer="3", + category="Algebra", + subcategory="Operation", + source="IMO Shortlist 2021" + ) + + with pytest.raises(AttributeError): + problem.category = "Geometry" + + +def test_answerbench_problem_repr(): + """Test AnswerBenchProblem string representation.""" + problem = AnswerBenchProblem( + problem_id="imo-bench-algebra-001", + problem="Find all $N$ such that...", + short_answer="3", + category="Algebra", + subcategory="Operation", + source="IMO Shortlist 2021" + ) + + repr_str = repr(problem) + assert "imo-bench-algebra-001" in repr_str + assert "Algebra" in repr_str + + +def test_proofbench_problem_creation(): + """Test creating a ProofBenchProblem.""" + problem = ProofBenchProblem( + problem_id="PB-Basic-001", + problem="Determine all functions...", + solution="By taking $x = 0$...", + grading_guidelines="(Partial) 
1. Guessed...", + category="Algebra", + level="IMO-easy", + short_answer="$f(x) = 0$ and $f(x) = 2x + c$", + source="IMO 2019, P1" + ) + + assert problem.problem_id == "PB-Basic-001" + assert problem.level == "IMO-easy" + assert problem.category == "Algebra" + + +def test_gradingbench_entry_creation(): + """Test creating a GradingBenchEntry.""" + entry = GradingBenchEntry( + grading_id="GB-0001", + problem_id="PB-Advanced-001", + problem="For a positive integer $n$...", + solution="Let's look at the following lemma...", + grading_guidelines="(Partial) 1. Proved...", + response="We will prove by induction...", + points=7, + reward="Partial", + problem_source="IMO Shortlist 2021" + ) + + assert entry.grading_id == "GB-0001" + assert entry.points == 7 + assert entry.reward == "Partial" + + +def test_gradingbench_entry_repr(): + """Test GradingBenchEntry string representation.""" + entry = GradingBenchEntry( + grading_id="GB-0001", + problem_id="PB-Advanced-001", + problem="For a positive integer $n$...", + solution="Let's look at the following lemma...", + grading_guidelines="(Partial) 1. 
Proved...", + response="We will prove by induction...", + points=7, + reward="Partial", + problem_source="IMO Shortlist 2021" + ) + + repr_str = repr(entry) + assert "GB-0001" in repr_str + assert "7" in repr_str diff --git a/imobench-pylib/tests/test_validators.py b/imobench-pylib/tests/test_validators.py new file mode 100644 index 0000000..12b5e1a --- /dev/null +++ b/imobench-pylib/tests/test_validators.py @@ -0,0 +1,230 @@ +"""Tests for data validation functionality.""" + +import pytest +from imobench.validators import ( + validate_answerbench_row, + validate_proofbench_row, + validate_gradingbench_row, + validate_dataset_counts, +) +from imobench.exceptions import ValidationError + + +# AnswerBench validation tests + +def test_validate_answerbench_valid_row(): + """Test validation of a valid answerbench row.""" + row = { + 'Problem ID': 'imo-bench-algebra-001', + 'Problem': 'Find all $N$ such that...', + 'Short Answer': '3', + 'Category': 'Algebra', + 'Subcategory': 'Operation', + 'Source': 'IMO Shortlist 2021' + } + + # Should not raise + validate_answerbench_row(row) + + +def test_validate_answerbench_missing_field(): + """Test validation fails with missing field.""" + row = { + 'Problem ID': 'imo-bench-algebra-001', + 'Problem': 'Find all $N$ such that...', + # Missing 'Short Answer' + 'Category': 'Algebra', + 'Subcategory': 'Operation', + 'Source': 'IMO Shortlist 2021' + } + + with pytest.raises(ValidationError, match="Missing required fields"): + validate_answerbench_row(row) + + +def test_validate_answerbench_empty_field(): + """Test validation fails with empty field.""" + row = { + 'Problem ID': 'imo-bench-algebra-001', + 'Problem': '', # Empty + 'Short Answer': '3', + 'Category': 'Algebra', + 'Subcategory': 'Operation', + 'Source': 'IMO Shortlist 2021' + } + + with pytest.raises(ValidationError, match="Empty value"): + validate_answerbench_row(row) + + +def test_validate_answerbench_invalid_category(): + """Test validation fails with invalid 
category.""" + row = { + 'Problem ID': 'imo-bench-algebra-001', + 'Problem': 'Find all $N$ such that...', + 'Short Answer': '3', + 'Category': 'InvalidCategory', + 'Subcategory': 'Operation', + 'Source': 'IMO Shortlist 2021' + } + + with pytest.raises(ValidationError, match="Invalid category"): + validate_answerbench_row(row) + + +def test_validate_answerbench_invalid_id_format(): + """Test validation fails with invalid Problem ID format.""" + row = { + 'Problem ID': 'invalid-id-001', + 'Problem': 'Find all $N$ such that...', + 'Short Answer': '3', + 'Category': 'Algebra', + 'Subcategory': 'Operation', + 'Source': 'IMO Shortlist 2021' + } + + with pytest.raises(ValidationError, match="Invalid Problem ID format"): + validate_answerbench_row(row) + + +# ProofBench validation tests + +def test_validate_proofbench_valid_row(): + """Test validation of a valid proofbench row.""" + row = { + 'Problem ID': 'PB-Basic-001', + 'Problem': 'Determine all functions...', + 'Solution': 'By taking $x = 0$...', + 'Grading guidelines': '(Partial) 1. Guessed...', + 'Category': 'Algebra', + 'Level': 'IMO-easy', + 'Short Answer': '$f(x) = 0$', + 'Source': 'IMO 2019, P1' + } + + # Should not raise + validate_proofbench_row(row) + + +def test_validate_proofbench_invalid_id(): + """Test validation fails with invalid Problem ID.""" + row = { + 'Problem ID': 'invalid-001', + 'Problem': 'Determine all functions...', + 'Solution': 'By taking $x = 0$...', + 'Grading guidelines': '(Partial) 1. 
Guessed...', + 'Category': 'Algebra', + 'Level': 'IMO-easy', + 'Short Answer': '$f(x) = 0$', + 'Source': 'IMO 2019, P1' + } + + with pytest.raises(ValidationError, match="Invalid Problem ID format"): + validate_proofbench_row(row) + + +# GradingBench validation tests + +def test_validate_gradingbench_valid_row(): + """Test validation of a valid gradingbench row.""" + row = { + 'Grading ID': 'GB-0001', + 'Problem ID': 'PB-Advanced-001', + 'Problem': 'For a positive integer $n$...', + 'Solution': "Let's look at the following lemma...", + 'Grading guidelines': '(Partial) 1. Proved...', + 'Response': 'We will prove by induction...', + 'Points': '7', + 'Reward': 'Partial', + 'Problem Source': 'IMO Shortlist 2021' + } + + # Should not raise + validate_gradingbench_row(row) + + +def test_validate_gradingbench_invalid_points(): + """Test validation fails with invalid points.""" + row = { + 'Grading ID': 'GB-0001', + 'Problem ID': 'PB-Advanced-001', + 'Problem': 'For a positive integer $n$...', + 'Solution': "Let's look at the following lemma...", + 'Grading guidelines': '(Partial) 1. Proved...', + 'Response': 'We will prove by induction...', + 'Points': '15', # Out of range + 'Reward': '0.85', + 'Problem Source': 'IMO Shortlist 2021' + } + + with pytest.raises(ValidationError, match="Points must be between 0 and 10"): + validate_gradingbench_row(row) + + +def test_validate_gradingbench_non_numeric_points(): + """Test validation fails with non-numeric points.""" + row = { + 'Grading ID': 'GB-0001', + 'Problem ID': 'PB-Advanced-001', + 'Problem': 'For a positive integer $n$...', + 'Solution': "Let's look at the following lemma...", + 'Grading guidelines': '(Partial) 1. 
Proved...', + 'Response': 'We will prove by induction...', + 'Points': 'seven', # Not a number + 'Reward': 'Partial', + 'Problem Source': 'IMO Shortlist 2021' + } + + with pytest.raises(ValidationError, match="Points must be an integer"): + validate_gradingbench_row(row) + + +def test_validate_gradingbench_invalid_reward(): + """Test validation handles various reward values.""" + # Reward is now a categorical field, so any non-empty string is valid + row = { + 'Grading ID': 'GB-0001', + 'Problem ID': 'PB-Advanced-001', + 'Problem': 'For a positive integer $n$...', + 'Solution': "Let's look at the following lemma...", + 'Grading guidelines': '(Partial) 1. Proved...', + 'Response': 'We will prove by induction...', + 'Points': '7', + 'Reward': 'Incorrect', # Valid categorical value + 'Problem Source': 'IMO Shortlist 2021' + } + + # Should not raise - categorical rewards are allowed + validate_gradingbench_row(row) + + +# Dataset count validation tests + +def test_validate_dataset_counts_valid(): + """Test validation of valid dataset counts.""" + # Should not raise + validate_dataset_counts( + answerbench_count=400, + proofbench_count=60, + gradingbench_count=1000 + ) + + +def test_validate_dataset_counts_invalid_answerbench(): + """Test validation fails with invalid answerbench count.""" + with pytest.raises(ValidationError, match="Unexpected answerbench count"): + validate_dataset_counts( + answerbench_count=100, # Too few + proofbench_count=60, + gradingbench_count=1000 + ) + + +def test_validate_dataset_counts_invalid_proofbench(): + """Test validation fails with invalid proofbench count.""" + with pytest.raises(ValidationError, match="Unexpected proofbench count"): + validate_dataset_counts( + answerbench_count=400, + proofbench_count=20, # Too few + gradingbench_count=1000 + )