From eaa1bb391ccaf10dde720133dd4b67a45d6331be Mon Sep 17 00:00:00 2001
From: llbbl
Date: Thu, 26 Jun 2025 07:49:30 -0500
Subject: [PATCH] feat: Set up comprehensive Python testing infrastructure with Poetry

- Configure Poetry as package manager with pyproject.toml
- Add pytest, pytest-cov, and pytest-mock as dev dependencies
- Set up pytest configuration with coverage thresholds and custom markers
- Create testing directory structure (tests/, unit/, integration/)
- Add comprehensive conftest.py with reusable fixtures
- Update .gitignore with testing and Poetry entries
- Create validation tests to verify setup functionality
---
 .gitignore                     |  38 +++++++
 pyproject.toml                 | 106 ++++++++++++++++++
 tests/__init__.py              |   0
 tests/conftest.py              | 193 +++++++++++++++++++++++++++
 tests/integration/__init__.py  |   0
 tests/test_setup_validation.py | 138 +++++++++++++++++++++++
 tests/unit/__init__.py         |   0
 7 files changed, 475 insertions(+)
 create mode 100644 pyproject.toml
 create mode 100644 tests/__init__.py
 create mode 100644 tests/conftest.py
 create mode 100644 tests/integration/__init__.py
 create mode 100644 tests/test_setup_validation.py
 create mode 100644 tests/unit/__init__.py

diff --git a/.gitignore b/.gitignore
index 894a44cc..085f98b8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,41 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Testing and coverage
+.pytest_cache/
+.coverage
+.coverage.*
+htmlcov/
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+pytest_cache/
+test-results/
+
+# Claude settings
+.claude/*
+
+# Poetry
+poetry.lock
+
+# IDE and editor files
+.idea/
+.vscode/
+*.swp
+*.swo
+*~
+.DS_Store
+
+# Virtual environments
+virtualenv/
+.virtualenv/
+
+# Build artifacts
+*.egg-info/
+.eggs/
+develop-eggs/
+*.egg
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..48865b97
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,106 @@
+[tool.poetry]
+name = "grover"
+version = "0.1.0"
+description = "Grover: A State-of-the-Art Defense against Neural Fake News"
+authors = ["Your Name <your.email@example.com>"]
+readme = "README.md"
+license = "MIT"
+packages = [
+    { include = "lm" },
+    { include = "sample" }
+]
+
+[tool.poetry.dependencies]
+python = ">=3.7,<3.8"
+pandas = ">=0.24.2"
+regex = ">=2019.4.14"
+h5py = ">=2.9.0"
+numpy = ">=1.19.0,<1.24.0"
+tensorboard = ">=1.13.1"
+tensorflow = ">=1.13.1,<2.0.0"
+tensorflow-estimator = ">=1.13.0,<2.0.0"
+tqdm = ">=4.31.1"
+requests = ">=2.22.0"
+
+[tool.poetry.group.dev.dependencies]
+pytest = "^7.4.0"
+pytest-cov = "^4.1.0"
+pytest-mock = "^3.11.1"
+
+[tool.poetry.scripts]
+test = "pytest:main"
+tests = "pytest:main"
+
+[tool.pytest.ini_options]
+minversion = "7.0"
+testpaths = ["tests"]
+python_files = ["test_*.py", "*_test.py"]
+python_classes = ["Test*"]
+python_functions = ["test_*"]
+addopts = [
+    "-ra",
+    "--strict-markers",
+    "--cov=lm",
+    "--cov=sample",
+    "--cov=discrimination",
+    "--cov-report=term-missing",
+    "--cov-report=html",
+    "--cov-report=xml",
+    "-v",
+    "--tb=short",
+    "--maxfail=3"
+]
+markers = [
+    "unit: marks tests as unit tests (fast, isolated)",
+    "integration: marks tests as integration tests (may require external resources)",
+    "slow: marks tests as slow (deselect with '-m \"not slow\"')"
+]
+console_output_style = "progress"
+filterwarnings = [
+    "error",
+    "ignore::UserWarning",
+    "ignore::DeprecationWarning"
+]
+
+[tool.coverage.run]
+source = ["lm", "sample", "discrimination"]
+branch = true
+parallel = true
+omit = [
+    "*/tests/*",
+    "*/__init__.py",
+    "*/setup.py",
+    "*/venv/*",
+    "*/virtualenv/*",
+    "*/.venv/*",
+    "*/.virtualenv/*"
+]
+
+[tool.coverage.report]
+exclude_lines = [
+    "pragma: no cover",
+    "def __repr__",
+    "if self.debug:",
+    "if settings.DEBUG",
+    "raise AssertionError",
+    "raise NotImplementedError",
+    "if 0:",
+    "if __name__ == .__main__.:",
+    "if TYPE_CHECKING:",
+    "class .*\\bProtocol\\):",
+    "@(abc\\.)?abstractmethod"
+]
+precision = 2
+show_missing = true
+skip_covered = false
+fail_under = 80
+
+[tool.coverage.html]
+directory = "htmlcov"
+
+[tool.coverage.xml]
+output = "coverage.xml"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 00000000..55bf5eca
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,193 @@
+"""
+Shared pytest fixtures and configuration for all tests.
+"""
+import os
+import tempfile
+import shutil
+import json
+from pathlib import Path
+from typing import Generator, Dict, Any
+import pytest
+
+
+@pytest.fixture
+def temp_dir() -> Generator[Path, None, None]:
+    """
+    Create a temporary directory for tests that need file system operations.
+
+    Yields:
+        Path: Path to the temporary directory
+    """
+    temp_path = tempfile.mkdtemp()
+    yield Path(temp_path)
+    # Cleanup after test
+    shutil.rmtree(temp_path, ignore_errors=True)
+
+
+@pytest.fixture
+def mock_config() -> Dict[str, Any]:
+    """
+    Provide a mock configuration dictionary for testing.
+
+    Returns:
+        Dict[str, Any]: Mock configuration data
+    """
+    return {
+        "model_name": "test_model",
+        "batch_size": 32,
+        "learning_rate": 0.001,
+        "num_epochs": 10,
+        "hidden_size": 768,
+        "num_layers": 12,
+        "vocab_size": 50000,
+        "max_seq_length": 512,
+        "checkpoint_dir": "/tmp/checkpoints",
+        "log_dir": "/tmp/logs"
+    }
+
+
+@pytest.fixture
+def sample_json_data() -> Dict[str, Any]:
+    """
+    Provide sample JSON data for testing data processing functions.
+
+    Returns:
+        Dict[str, Any]: Sample JSON data
+    """
+    return {
+        "id": "test_001",
+        "text": "This is a sample text for testing purposes.",
+        "label": "real",
+        "domain": "test.com",
+        "date": "2023-01-01",
+        "authors": ["Test Author"],
+        "title": "Test Article Title"
+    }
+
+
+@pytest.fixture
+def sample_jsonl_file(temp_dir: Path) -> Path:
+    """
+    Create a temporary JSONL file with sample data.
+
+    Args:
+        temp_dir: Temporary directory fixture
+
+    Returns:
+        Path: Path to the created JSONL file
+    """
+    jsonl_path = temp_dir / "sample_data.jsonl"
+
+    sample_data = [
+        {"id": "1", "text": "First sample text", "label": "real"},
+        {"id": "2", "text": "Second sample text", "label": "fake"},
+        {"id": "3", "text": "Third sample text", "label": "real"}
+    ]
+
+    with open(jsonl_path, 'w') as f:
+        for item in sample_data:
+            f.write(json.dumps(item) + '\n')
+
+    return jsonl_path
+
+
+@pytest.fixture
+def mock_model_checkpoint(temp_dir: Path) -> Path:
+    """
+    Create a mock model checkpoint directory structure.
+
+    Args:
+        temp_dir: Temporary directory fixture
+
+    Returns:
+        Path: Path to the mock checkpoint directory
+    """
+    checkpoint_dir = temp_dir / "checkpoint"
+    checkpoint_dir.mkdir(exist_ok=True)
+
+    # Create mock checkpoint files
+    (checkpoint_dir / "model.ckpt.index").touch()
+    (checkpoint_dir / "model.ckpt.data-00000-of-00001").touch()
+    (checkpoint_dir / "checkpoint").write_text("model_checkpoint_path: \"model.ckpt\"")
+
+    return checkpoint_dir
+
+
+@pytest.fixture
+def mock_vocab_files(temp_dir: Path) -> Dict[str, Path]:
+    """
+    Create mock vocabulary files for testing tokenization.
+
+    Args:
+        temp_dir: Temporary directory fixture
+
+    Returns:
+        Dict[str, Path]: Dictionary with paths to encoder.json and vocab.bpe
+    """
+    encoder_path = temp_dir / "encoder.json"
+    vocab_path = temp_dir / "vocab.bpe"
+
+    # Create minimal mock encoder
+    encoder_data = {
+        "hello": 1,
+        "world": 2,
+        "test": 3,
+        "<|endoftext|>": 4
+    }
+
+    with open(encoder_path, 'w') as f:
+        json.dump(encoder_data, f)
+
+    # Create minimal mock BPE vocab
+    vocab_path.write_text("#version: 0.2\nh e l l o 1\nw o r l d 2\n")
+
+    return {
+        "encoder": encoder_path,
+        "vocab": vocab_path
+    }
+
+
+@pytest.fixture
+def environment_variables(monkeypatch) -> None:
+    """
+    Set up common environment variables for testing.
+
+    Args:
+        monkeypatch: pytest monkeypatch fixture
+    """
+    monkeypatch.setenv("TF_CPP_MIN_LOG_LEVEL", "3")  # Suppress TensorFlow warnings
+    monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "")  # Disable GPU for tests
+    monkeypatch.setenv("TEST_MODE", "1")
+
+
+@pytest.fixture(autouse=True)
+def cleanup_tensorflow():
+    """
+    Automatically clean up TensorFlow resources after each test.
+    """
+    yield
+    # Import only if TensorFlow is available
+    try:
+        import tensorflow as tf
+        tf.keras.backend.clear_session()
+        tf.compat.v1.reset_default_graph()
+    except ImportError:
+        pass
+
+
+@pytest.fixture
+def sample_model_config() -> Dict[str, Any]:
+    """
+    Provide a sample model configuration matching the project's config format.
+
+    Returns:
+        Dict[str, Any]: Model configuration
+    """
+    return {
+        "n_ctx": 1024,
+        "n_embd": 768,
+        "n_head": 12,
+        "n_layer": 12,
+        "n_vocab": 50257,
+        "n_special": 0
+    }
\ No newline at end of file
diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_setup_validation.py b/tests/test_setup_validation.py
new file mode 100644
index 00000000..b9c8f960
--- /dev/null
+++ b/tests/test_setup_validation.py
@@ -0,0 +1,138 @@
+"""
+Validation tests to ensure the testing infrastructure is properly set up.
+"""
+import pytest
+import sys
+import os
+from pathlib import Path
+
+
+class TestInfrastructureSetup:
+    """Test that the testing infrastructure is properly configured."""
+
+    def test_pytest_is_importable(self):
+        """Test that pytest can be imported."""
+        import pytest
+        assert pytest is not None
+
+    def test_coverage_is_importable(self):
+        """Test that coverage tools can be imported."""
+        import pytest_cov
+        assert pytest_cov is not None
+
+    def test_mock_is_importable(self):
+        """Test that pytest-mock can be imported."""
+        import pytest_mock
+        assert pytest_mock is not None
+
+    def test_project_structure_exists(self):
+        """Test that the expected project structure exists."""
+        root = Path(__file__).parent.parent
+
+        # Check main directories
+        assert (root / "lm").exists(), "lm package directory should exist"
+        assert (root / "sample").exists(), "sample package directory should exist"
+        assert (root / "tests").exists(), "tests directory should exist"
+        assert (root / "tests" / "unit").exists(), "unit tests directory should exist"
+        assert (root / "tests" / "integration").exists(), "integration tests directory should exist"
+
+    def test_conftest_exists(self):
+        """Test that conftest.py exists and can be imported."""
+        conftest_path = Path(__file__).parent / "conftest.py"
+        assert conftest_path.exists(), "conftest.py should exist"
+
+    def test_pyproject_toml_exists(self):
+        """Test that pyproject.toml exists."""
+        pyproject_path = Path(__file__).parent.parent / "pyproject.toml"
+        assert pyproject_path.exists(), "pyproject.toml should exist"
+
+    @pytest.mark.unit
+    def test_unit_marker_works(self):
+        """Test that the unit test marker works."""
+        assert True
+
+    @pytest.mark.integration
+    def test_integration_marker_works(self):
+        """Test that the integration test marker works."""
+        assert True
+
+    @pytest.mark.slow
+    def test_slow_marker_works(self):
+        """Test that the slow test marker works."""
+        assert True
+
+
+class TestFixtures:
+    """Test that the fixtures from conftest.py work correctly."""
+
+    def test_temp_dir_fixture(self, temp_dir):
+        """Test that temp_dir fixture creates a directory."""
+        assert temp_dir.exists()
+        assert temp_dir.is_dir()
+
+        # Test we can write to it
+        test_file = temp_dir / "test.txt"
+        test_file.write_text("test content")
+        assert test_file.exists()
+        assert test_file.read_text() == "test content"
+
+    def test_mock_config_fixture(self, mock_config):
+        """Test that mock_config fixture provides expected structure."""
+        assert isinstance(mock_config, dict)
+        assert "model_name" in mock_config
+        assert "batch_size" in mock_config
+        assert mock_config["model_name"] == "test_model"
+        assert mock_config["batch_size"] == 32
+
+    def test_sample_json_data_fixture(self, sample_json_data):
+        """Test that sample_json_data fixture provides expected data."""
+        assert isinstance(sample_json_data, dict)
+        assert sample_json_data["id"] == "test_001"
+        assert "text" in sample_json_data
+        assert "label" in sample_json_data
+
+    def test_sample_jsonl_file_fixture(self, sample_jsonl_file):
+        """Test that sample_jsonl_file fixture creates a valid JSONL file."""
+        assert sample_jsonl_file.exists()
+
+        # Read and verify content
+        import json
+        lines = sample_jsonl_file.read_text().strip().split('\n')
+        assert len(lines) == 3
+
+        first_item = json.loads(lines[0])
+        assert first_item["id"] == "1"
+        assert "text" in first_item
+        assert "label" in first_item
+
+    def test_mock_model_checkpoint_fixture(self, mock_model_checkpoint):
+        """Test that mock_model_checkpoint fixture creates expected structure."""
+        assert mock_model_checkpoint.exists()
+        assert (mock_model_checkpoint / "model.ckpt.index").exists()
+        assert (mock_model_checkpoint / "checkpoint").exists()
+
+    def test_mock_vocab_files_fixture(self, mock_vocab_files):
+        """Test that mock_vocab_files fixture creates expected files."""
+        assert mock_vocab_files["encoder"].exists()
+        assert mock_vocab_files["vocab"].exists()
+
+        # Verify encoder content
+        import json
+        encoder_data = json.loads(mock_vocab_files["encoder"].read_text())
+        assert "hello" in encoder_data
+        assert encoder_data["hello"] == 1
+
+    def test_sample_model_config_fixture(self, sample_model_config):
+        """Test that sample_model_config fixture provides expected structure."""
+        assert isinstance(sample_model_config, dict)
+        assert sample_model_config["n_ctx"] == 1024
+        assert sample_model_config["n_embd"] == 768
+        assert "n_head" in sample_model_config
+        assert "n_layer" in sample_model_config
+
+
+def test_coverage_is_configured():
+    """Test that coverage is properly configured when running with coverage."""
+    # This test will pass whether or not coverage is running
+    # When running with coverage, it helps verify the setup
+    assert True
\ No newline at end of file
diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py
new file mode 100644
index 00000000..e69de29b