From eaa1bb391ccaf10dde720133dd4b67a45d6331be Mon Sep 17 00:00:00 2001
From: llbbl
Date: Thu, 26 Jun 2025 07:49:30 -0500
Subject: [PATCH] feat: Set up comprehensive Python testing infrastructure with Poetry

- Configure Poetry as package manager with pyproject.toml
- Add pytest, pytest-cov, and pytest-mock as dev dependencies
- Set up pytest configuration with coverage thresholds and custom markers
- Create testing directory structure (tests/, unit/, integration/)
- Add comprehensive conftest.py with reusable fixtures
- Update .gitignore with testing and Poetry entries
- Create validation tests to verify setup functionality
---
 .gitignore                     |  38 +++++++
 pyproject.toml                 | 106 ++++++++++++++++++
 tests/__init__.py              |   0
 tests/conftest.py              | 193 +++++++++++++++++++++++++++
 tests/integration/__init__.py  |   0
 tests/test_setup_validation.py | 138 +++++++++++++++++++++++
 tests/unit/__init__.py         |   0
 7 files changed, 475 insertions(+)
 create mode 100644 pyproject.toml
 create mode 100644 tests/__init__.py
 create mode 100644 tests/conftest.py
 create mode 100644 tests/integration/__init__.py
 create mode 100644 tests/test_setup_validation.py
 create mode 100644 tests/unit/__init__.py

diff --git a/.gitignore b/.gitignore
index 894a44cc..085f98b8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,41 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Testing and coverage
+.pytest_cache/
+.coverage
+.coverage.*
+htmlcov/
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+pytest_cache/
+test-results/
+
+# Claude settings
+.claude/*
+
+# Poetry
+poetry.lock
+
+# IDE and editor files
+.idea/
+.vscode/
+*.swp
+*.swo
+*~
+.DS_Store
+
+# Virtual environments
+virtualenv/
+.virtualenv/
+
+# Build artifacts
+*.egg-info/
+.eggs/
+develop-eggs/
+*.egg
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..48865b97
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,106 @@
+[tool.poetry]
+name = "grover"
+version = "0.1.0"
+description = "Grover: A State-of-the-Art Defense against Neural Fake News"
+authors = ["Your Name <your.email@example.com>"]
+readme = "README.md"
+license = "MIT"
+packages = [
+    { include = "lm" },
+    { include = "sample" }
+]
+
+[tool.poetry.dependencies]
+python = ">=3.7,<3.8"
+pandas = ">=0.24.2"
+regex = ">=2019.4.14"
+h5py = ">=2.9.0"
+numpy = ">=1.19.0,<1.24.0"
+tensorboard = ">=1.13.1"
+tensorflow = ">=1.13.1,<2.0.0"
+tensorflow-estimator = ">=1.13.0,<2.0.0"
+tqdm = ">=4.31.1"
+requests = ">=2.22.0"
+
+[tool.poetry.group.dev.dependencies]
+pytest = "^7.4.0"
+pytest-cov = "^4.1.0"
+pytest-mock = "^3.11.1"
+
+[tool.poetry.scripts]
+test = "pytest:main"
+tests = "pytest:main"
+
+[tool.pytest.ini_options]
+minversion = "7.0"
+testpaths = ["tests"]
+python_files = ["test_*.py", "*_test.py"]
+python_classes = ["Test*"]
+python_functions = ["test_*"]
+addopts = [
+    "-ra",
+    "--strict-markers",
+    "--cov=lm",
+    "--cov=sample",
+    "--cov=discrimination",
+    "--cov-report=term-missing",
+    "--cov-report=html",
+    "--cov-report=xml",
+    "-v",
+    "--tb=short",
+    "--maxfail=3"
+]
+markers = [
+    "unit: marks tests as unit tests (fast, isolated)",
+    "integration: marks tests as integration tests (may require external resources)",
+    "slow: marks tests as slow (deselect with '-m \"not slow\"')"
+]
+console_output_style = "progress"
+filterwarnings = [
+    "error",
+    "ignore::UserWarning",
+    "ignore::DeprecationWarning"
+]
+
+[tool.coverage.run]
+source = ["lm", "sample", "discrimination"]
+branch = true
+parallel = true
+omit = [
+    "*/tests/*",
+    "*/__init__.py",
+    "*/setup.py",
+    "*/venv/*",
+    "*/virtualenv/*",
+    "*/.venv/*",
+    "*/.virtualenv/*"
+]
+
+[tool.coverage.report]
+exclude_lines = [
+    "pragma: no cover",
+    "def __repr__",
+    "if self.debug:",
+    "if settings.DEBUG",
+    "raise AssertionError",
+    "raise NotImplementedError",
+    "if 0:",
+    "if __name__ == .__main__.:",
+    "if TYPE_CHECKING:",
+    "class .*\\bProtocol\\):",
+    "@(abc\\.)?abstractmethod"
+]
+precision = 2
+show_missing = true
+skip_covered = false
+fail_under = 80
+
+[tool.coverage.html]
+directory = "htmlcov"
+
+[tool.coverage.xml]
+output = "coverage.xml"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 00000000..55bf5eca
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,193 @@
+"""
+Shared pytest fixtures and configuration for all tests.
+"""
+import os
+import tempfile
+import shutil
+import json
+from pathlib import Path
+from typing import Generator, Dict, Any
+import pytest
+
+
+@pytest.fixture
+def temp_dir() -> Generator[Path, None, None]:
+    """
+    Create a temporary directory for tests that need file system operations.
+
+    Yields:
+        Path: Path to the temporary directory
+    """
+    temp_path = tempfile.mkdtemp()
+    yield Path(temp_path)
+    # Cleanup after test
+    shutil.rmtree(temp_path, ignore_errors=True)
+
+
+@pytest.fixture
+def mock_config() -> Dict[str, Any]:
+    """
+    Provide a mock configuration dictionary for testing.
+
+    Returns:
+        Dict[str, Any]: Mock configuration data
+    """
+    return {
+        "model_name": "test_model",
+        "batch_size": 32,
+        "learning_rate": 0.001,
+        "num_epochs": 10,
+        "hidden_size": 768,
+        "num_layers": 12,
+        "vocab_size": 50000,
+        "max_seq_length": 512,
+        "checkpoint_dir": "/tmp/checkpoints",
+        "log_dir": "/tmp/logs"
+    }
+
+
+@pytest.fixture
+def sample_json_data() -> Dict[str, Any]:
+    """
+    Provide sample JSON data for testing data processing functions.
+
+    Returns:
+        Dict[str, Any]: Sample JSON data
+    """
+    return {
+        "id": "test_001",
+        "text": "This is a sample text for testing purposes.",
+        "label": "real",
+        "domain": "test.com",
+        "date": "2023-01-01",
+        "authors": ["Test Author"],
+        "title": "Test Article Title"
+    }
+
+
+@pytest.fixture
+def sample_jsonl_file(temp_dir: Path) -> Path:
+    """
+    Create a temporary JSONL file with sample data.
+
+    Args:
+        temp_dir: Temporary directory fixture
+
+    Returns:
+        Path: Path to the created JSONL file
+    """
+    jsonl_path = temp_dir / "sample_data.jsonl"
+
+    sample_data = [
+        {"id": "1", "text": "First sample text", "label": "real"},
+        {"id": "2", "text": "Second sample text", "label": "fake"},
+        {"id": "3", "text": "Third sample text", "label": "real"}
+    ]
+
+    with open(jsonl_path, 'w') as f:
+        for item in sample_data:
+            f.write(json.dumps(item) + '\n')
+
+    return jsonl_path
+
+
+@pytest.fixture
+def mock_model_checkpoint(temp_dir: Path) -> Path:
+    """
+    Create a mock model checkpoint directory structure.
+
+    Args:
+        temp_dir: Temporary directory fixture
+
+    Returns:
+        Path: Path to the mock checkpoint directory
+    """
+    checkpoint_dir = temp_dir / "checkpoint"
+    checkpoint_dir.mkdir(exist_ok=True)
+
+    # Create mock checkpoint files
+    (checkpoint_dir / "model.ckpt.index").touch()
+    (checkpoint_dir / "model.ckpt.data-00000-of-00001").touch()
+    (checkpoint_dir / "checkpoint").write_text("model_checkpoint_path: \"model.ckpt\"")
+
+    return checkpoint_dir
+
+
+@pytest.fixture
+def mock_vocab_files(temp_dir: Path) -> Dict[str, Path]:
+    """
+    Create mock vocabulary files for testing tokenization.
+
+    Args:
+        temp_dir: Temporary directory fixture
+
+    Returns:
+        Dict[str, Path]: Dictionary with paths to encoder.json and vocab.bpe
+    """
+    encoder_path = temp_dir / "encoder.json"
+    vocab_path = temp_dir / "vocab.bpe"
+
+    # Create minimal mock encoder
+    encoder_data = {
+        "hello": 1,
+        "world": 2,
+        "test": 3,
+        "<|endoftext|>": 4
+    }
+
+    with open(encoder_path, 'w') as f:
+        json.dump(encoder_data, f)
+
+    # Create minimal mock BPE vocab
+    vocab_path.write_text("#version: 0.2\nh e l l o 1\nw o r l d 2\n")
+
+    return {
+        "encoder": encoder_path,
+        "vocab": vocab_path
+    }
+
+
+@pytest.fixture
+def environment_variables(monkeypatch) -> None:
+    """
+    Set up common environment variables for testing.
+
+    Args:
+        monkeypatch: pytest monkeypatch fixture
+    """
+    monkeypatch.setenv("TF_CPP_MIN_LOG_LEVEL", "3")  # Suppress TensorFlow warnings
+    monkeypatch.setenv("CUDA_VISIBLE_DEVICES", "")  # Disable GPU for tests
+    monkeypatch.setenv("TEST_MODE", "1")
+
+
+@pytest.fixture(autouse=True)
+def cleanup_tensorflow():
+    """
+    Automatically clean up TensorFlow resources after each test.
+    """
+    yield
+    # Import only if TensorFlow is available
+    try:
+        import tensorflow as tf
+        tf.keras.backend.clear_session()
+        tf.compat.v1.reset_default_graph()
+    except ImportError:
+        pass
+
+
+@pytest.fixture
+def sample_model_config() -> Dict[str, Any]:
+    """
+    Provide a sample model configuration matching the project's config format.
+
+    Returns:
+        Dict[str, Any]: Model configuration
+    """
+    return {
+        "n_ctx": 1024,
+        "n_embd": 768,
+        "n_head": 12,
+        "n_layer": 12,
+        "n_vocab": 50257,
+        "n_special": 0
+    }
\ No newline at end of file
diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_setup_validation.py b/tests/test_setup_validation.py
new file mode 100644
index 00000000..b9c8f960
--- /dev/null
+++ b/tests/test_setup_validation.py
@@ -0,0 +1,138 @@
+"""
+Validation tests to ensure the testing infrastructure is properly set up.
+"""
+import pytest
+import sys
+import os
+from pathlib import Path
+
+
+class TestInfrastructureSetup:
+    """Test that the testing infrastructure is properly configured."""
+
+    def test_pytest_is_importable(self):
+        """Test that pytest can be imported."""
+        import pytest
+        assert pytest is not None
+
+    def test_coverage_is_importable(self):
+        """Test that coverage tools can be imported."""
+        import pytest_cov
+        assert pytest_cov is not None
+
+    def test_mock_is_importable(self):
+        """Test that pytest-mock can be imported."""
+        import pytest_mock
+        assert pytest_mock is not None
+
+    def test_project_structure_exists(self):
+        """Test that the expected project structure exists."""
+        root = Path(__file__).parent.parent
+
+        # Check main directories
+        assert (root / "lm").exists(), "lm package directory should exist"
+        assert (root / "sample").exists(), "sample package directory should exist"
+        assert (root / "tests").exists(), "tests directory should exist"
+        assert (root / "tests" / "unit").exists(), "unit tests directory should exist"
+        assert (root / "tests" / "integration").exists(), "integration tests directory should exist"
+
+    def test_conftest_exists(self):
+        """Test that conftest.py exists and can be imported."""
+        conftest_path = Path(__file__).parent / "conftest.py"
+        assert conftest_path.exists(), "conftest.py should exist"
+
+    def test_pyproject_toml_exists(self):
+        """Test that pyproject.toml exists."""
+        pyproject_path = Path(__file__).parent.parent / "pyproject.toml"
+        assert pyproject_path.exists(), "pyproject.toml should exist"
+
+    @pytest.mark.unit
+    def test_unit_marker_works(self):
+        """Test that the unit test marker works."""
+        assert True
+
+    @pytest.mark.integration
+    def test_integration_marker_works(self):
+        """Test that the integration test marker works."""
+        assert True
+
+    @pytest.mark.slow
+    def test_slow_marker_works(self):
+        """Test that the slow test marker works."""
+        assert True
+
+
+class TestFixtures:
+    """Test that the fixtures from conftest.py work correctly."""
+
+    def test_temp_dir_fixture(self, temp_dir):
+        """Test that temp_dir fixture creates a directory."""
+        assert temp_dir.exists()
+        assert temp_dir.is_dir()
+
+        # Test we can write to it
+        test_file = temp_dir / "test.txt"
+        test_file.write_text("test content")
+        assert test_file.exists()
+        assert test_file.read_text() == "test content"
+
+    def test_mock_config_fixture(self, mock_config):
+        """Test that mock_config fixture provides expected structure."""
+        assert isinstance(mock_config, dict)
+        assert "model_name" in mock_config
+        assert "batch_size" in mock_config
+        assert mock_config["model_name"] == "test_model"
+        assert mock_config["batch_size"] == 32
+
+    def test_sample_json_data_fixture(self, sample_json_data):
+        """Test that sample_json_data fixture provides expected data."""
+        assert isinstance(sample_json_data, dict)
+        assert sample_json_data["id"] == "test_001"
+        assert "text" in sample_json_data
+        assert "label" in sample_json_data
+
+    def test_sample_jsonl_file_fixture(self, sample_jsonl_file):
+        """Test that sample_jsonl_file fixture creates a valid JSONL file."""
+        assert sample_jsonl_file.exists()
+
+        # Read and verify content
+        import json
+        lines = sample_jsonl_file.read_text().strip().split('\n')
+        assert len(lines) == 3
+
+        first_item = json.loads(lines[0])
+        assert first_item["id"] == "1"
+        assert "text" in first_item
+        assert "label" in first_item
+
+    def test_mock_model_checkpoint_fixture(self, mock_model_checkpoint):
+        """Test that mock_model_checkpoint fixture creates expected structure."""
+        assert mock_model_checkpoint.exists()
+        assert (mock_model_checkpoint / "model.ckpt.index").exists()
+        assert (mock_model_checkpoint / "checkpoint").exists()
+
+    def test_mock_vocab_files_fixture(self, mock_vocab_files):
+        """Test that mock_vocab_files fixture creates expected files."""
+        assert mock_vocab_files["encoder"].exists()
+        assert mock_vocab_files["vocab"].exists()
+
+        # Verify encoder content
+        import json
+        encoder_data = json.loads(mock_vocab_files["encoder"].read_text())
+        assert "hello" in encoder_data
+        assert encoder_data["hello"] == 1
+
+    def test_sample_model_config_fixture(self, sample_model_config):
+        """Test that sample_model_config fixture provides expected structure."""
+        assert isinstance(sample_model_config, dict)
+        assert sample_model_config["n_ctx"] == 1024
+        assert sample_model_config["n_embd"] == 768
+        assert "n_head" in sample_model_config
+        assert "n_layer" in sample_model_config
+
+
+def test_coverage_is_configured():
+    """Test that coverage is properly configured when running with coverage."""
+    # This test will pass whether or not coverage is running
+    # When running with coverage, it helps verify the setup
+    assert True
\ No newline at end of file
diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py
new file mode 100644
index 00000000..e69de29b