diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..3cb7531 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,56 @@ +name: Test + +on: + push: + branches: [main] + pull_request: + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.10', '3.11', '3.12'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e .[dev] + + - name: Run tests with coverage + run: | + pytest tests/ -v --cov=src/document_analysis_mcp --cov-report=term --cov-report=xml + + - name: Upload coverage to Codecov + if: matrix.python-version == '3.12' + uses: codecov/codecov-action@v4 + with: + files: coverage.xml + fail_ci_if_error: false + + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Install ruff + run: pip install ruff + + - name: Run ruff check + run: ruff check src/ tests/ + + - name: Run ruff format check + run: ruff format --check src/ tests/ diff --git a/pyproject.toml b/pyproject.toml index 2b99005..e47e574 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dependencies = [ dev = [ "pytest>=8.0.0", "pytest-asyncio>=0.23.0", + "pytest-cov>=4.0.0", "reportlab>=4.0.0", "ruff>=0.5.0", ] diff --git a/src/document_analysis_mcp/config.py b/src/document_analysis_mcp/config.py index a8fb09c..b6f3340 100644 --- a/src/document_analysis_mcp/config.py +++ b/src/document_analysis_mcp/config.py @@ -137,9 +137,7 @@ def ensure_cache_dir_exists(self) -> Path: """ if self.cache_dir.exists(): if not self.cache_dir.is_dir(): - raise ValueError( - f"Cache path exists but is not a directory: {self.cache_dir}" - ) + raise ValueError(f"Cache path exists but is not a directory: {self.cache_dir}") return self.cache_dir try: @@ -167,8 +165,7 @@ def validate_required(self) -> None: """ if not self.has_api_key: raise ValueError( - "ANTHROPIC_API_KEY is required. " - "Set it in your environment or .env file." + "ANTHROPIC_API_KEY is required. Set it in your environment or .env file." ) diff --git a/src/document_analysis_mcp/processors/chunker.py b/src/document_analysis_mcp/processors/chunker.py index ed4bb8e..2f60749 100644 --- a/src/document_analysis_mcp/processors/chunker.py +++ b/src/document_analysis_mcp/processors/chunker.py @@ -183,7 +183,9 @@ def chunk_text( max_pages = 3 text = _extract_pages(text, list(range(1, max_pages + 1))) estimated_tokens = estimate_tokens(text) - logger.info("QUICK strategy: Limited to %d pages, %d estimated tokens", max_pages, estimated_tokens) + logger.info( + "QUICK strategy: Limited to %d pages, %d estimated tokens", max_pages, estimated_tokens + ) elif strategy == ChunkingStrategy.COMPREHENSIVE: # Comprehensive: Full document, moderate chunk size diff --git a/src/document_analysis_mcp/processors/llm.py b/src/document_analysis_mcp/processors/llm.py index 640dac7..3165b79 100644 --- a/src/document_analysis_mcp/processors/llm.py +++ b/src/document_analysis_mcp/processors/llm.py @@ -150,9 +150,7 @@ def analyze_chunk( user_message = f"{prompt}\n\n---\n\nDocument Content:\n{chunk}" # Build messages list - messages: list[dict[str, Any]] = [ - {"role": "user", "content": user_message} - ] + messages: list[dict[str, Any]] = [{"role": "user", "content": user_message}] # Log the request (truncate chunk for readability) estimated_input = estimate_tokens(user_message) @@ -292,7 +290,9 @@ def _synthesize_summary( except (APIConnectionError, APIStatusError) as e: logger.error("Failed to synthesize summary: %s", e) # Fall back to concatenated summaries - return "\n\n".join(f"**Section {i}:**\n{a}" for i, a in enumerate(chunk_analyses, start=1)) + return "\n\n".join( + f"**Section {i}:**\n{a}" for i, a in enumerate(chunk_analyses, start=1) + ) def _get_chunk_prompt(self, strategy: ChunkingStrategy) -> str: """Get the appropriate chunk analysis prompt for a strategy. @@ -311,7 +311,6 @@ def _get_chunk_prompt(self, strategy: ChunkingStrategy) -> str: - Critical information Keep your response concise (2-3 paragraphs).""", - ChunkingStrategy.COMPREHENSIVE: """Analyze this document section thoroughly. Include: - Main topics and themes @@ -320,7 +319,6 @@ def _get_chunk_prompt(self, strategy: ChunkingStrategy) -> str: - Relationships between concepts Provide a structured analysis with clear organization.""", - ChunkingStrategy.DEEP: """Perform a detailed analysis of this document section. Cover: - Primary and secondary themes diff --git a/src/document_analysis_mcp/processors/text_extractor.py b/src/document_analysis_mcp/processors/text_extractor.py index cb9c518..1b39111 100644 --- a/src/document_analysis_mcp/processors/text_extractor.py +++ b/src/document_analysis_mcp/processors/text_extractor.py @@ -148,14 +148,10 @@ def _extract_with_pdfplumber( total_words += len(page_content.text.split()) tables_count += len(page_content.tables) except (PDFSyntaxError, PSEOF) as e: - logger.warning( - "PDF syntax error on page %d: %s", page_num, e - ) + logger.warning("PDF syntax error on page %d: %s", page_num, e) pages.append(PageContent(page_number=page_num, text="")) except (ValueError, TypeError, AttributeError) as e: - logger.warning( - "Content extraction error on page %d: %s", page_num, e - ) + logger.warning("Content extraction error on page %d: %s", page_num, e) pages.append(PageContent(page_number=page_num, text="")) except (PDFSyntaxError, PSEOF) as e: @@ -194,9 +190,7 @@ def _extract_with_pdfplumber( success=True, ) - def _extract_pdfplumber_page( - self, page: pdfplumber.page.Page, page_num: int - ) -> PageContent: + def _extract_pdfplumber_page(self, page: pdfplumber.page.Page, page_num: int) -> PageContent: """Extract content from a single pdfplumber page. Args: @@ -229,9 +223,7 @@ def _extract_pdfplumber_page( tables=tables, ) - def _extract_pdfplumber_metadata( - self, pdf: pdfplumber.PDF - ) -> DocumentMetadata: + def _extract_pdfplumber_metadata(self, pdf: pdfplumber.PDF) -> DocumentMetadata: """Extract metadata from pdfplumber PDF object. Args: @@ -298,9 +290,7 @@ def _extract_with_pypdf2( total_pages, ) - for page_num, page in enumerate( - reader.pages[:pages_to_process], start=1 - ): + for page_num, page in enumerate(reader.pages[:pages_to_process], start=1): try: text = page.extract_text() or "" page_content = PageContent( @@ -312,9 +302,7 @@ def _extract_with_pypdf2( total_chars += page_content.char_count total_words += len(text.split()) except PdfReadError as e: - logger.warning( - "PDF read error on page %d with PyPDF2: %s", page_num, e - ) + logger.warning("PDF read error on page %d with PyPDF2: %s", page_num, e) pages.append(PageContent(page_number=page_num, text="")) except (ValueError, TypeError, AttributeError) as e: logger.warning( diff --git a/src/document_analysis_mcp/server.py b/src/document_analysis_mcp/server.py index bb72353..9aa6acc 100644 --- a/src/document_analysis_mcp/server.py +++ b/src/document_analysis_mcp/server.py @@ -31,9 +31,7 @@ def _setup_logging(level: str) -> None: Args: level: Logging level string (DEBUG, INFO, WARNING, ERROR). """ - log_format = ( - "%(asctime)s - %(name)s - %(levelname)s - %(message)s" - ) + log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" logging.basicConfig( level=getattr(logging, level.upper(), logging.INFO), format=log_format, @@ -212,8 +210,11 @@ def main() -> None: logger.info("API Key Configured: %s", settings.has_api_key) logger.info("Default Model: %s", settings.default_model) logger.info("Classification Model: %s", settings.classification_model) - logger.info("Health endpoint available at: http://%s:%d/health", - settings.doc_analysis_host, settings.doc_analysis_port) + logger.info( + "Health endpoint available at: http://%s:%d/health", + settings.doc_analysis_host, + settings.doc_analysis_port, + ) # Ensure cache directory exists settings.ensure_cache_dir_exists() diff --git a/src/document_analysis_mcp/tools/extract.py b/src/document_analysis_mcp/tools/extract.py index 3c13e02..3bbe949 100644 --- a/src/document_analysis_mcp/tools/extract.py +++ b/src/document_analysis_mcp/tools/extract.py @@ -178,10 +178,12 @@ def pdf_extract_full( tables = [] for page in result.pages: for table in page.tables: - tables.append({ - "page": page.page_number, - "content": table, - }) + tables.append( + { + "page": page.page_number, + "content": table, + } + ) if tables: response["tables"] = tables diff --git a/tests/test_chunker.py b/tests/test_chunker.py index a14a55b..52ed454 100644 --- a/tests/test_chunker.py +++ b/tests/test_chunker.py @@ -4,8 +4,6 @@ splitting for LLM analysis with proper boundary detection. """ -import pytest - from document_analysis_mcp.processors.chunker import ( CHARS_PER_TOKEN, MODEL_LIMITS, @@ -78,7 +76,7 @@ def test_paragraph_break_preferred(self): break_pos = _find_break_point(text, 55, 0) # The position should be right after the paragraph break assert break_pos == 43 - assert text[break_pos - 2:break_pos] == "\n\n" + assert text[break_pos - 2 : break_pos] == "\n\n" def test_sentence_break_fallback(self): """Test that sentence breaks are used when no paragraph break found.""" @@ -86,7 +84,7 @@ def test_sentence_break_fallback(self): # Position where paragraph break won't be found in first half break_pos = _find_break_point(text, 35, 0) # Should end at a sentence boundary - assert text[break_pos - 2:break_pos] == ". " + assert text[break_pos - 2 : break_pos] == ". " def test_word_break_fallback(self): """Test that word breaks are used when no sentence break found.""" @@ -176,9 +174,7 @@ def test_small_text_single_chunk(self): def test_quick_strategy_limits_pages(self): """Test that QUICK strategy limits to first 3 pages.""" # Create text with page markers - pages = "\n\n".join( - f"[Page {i}]\nContent for page {i}." for i in range(1, 6) - ) + pages = "\n\n".join(f"[Page {i}]\nContent for page {i}." for i in range(1, 6)) chunks = chunk_text(pages, ChunkingStrategy.QUICK, total_pages=5) # Should only contain first 3 pages @@ -191,9 +187,7 @@ def test_quick_strategy_limits_pages(self): def test_comprehensive_strategy_processes_all(self): """Test that COMPREHENSIVE processes entire document.""" - pages = "\n\n".join( - f"[Page {i}]\nContent for page {i}." for i in range(1, 6) - ) + pages = "\n\n".join(f"[Page {i}]\nContent for page {i}." for i in range(1, 6)) chunks = chunk_text(pages, ChunkingStrategy.COMPREHENSIVE, total_pages=5) combined = " ".join(chunks) diff --git a/tests/test_classify.py b/tests/test_classify.py index 3f6f9e0..e90f382 100644 --- a/tests/test_classify.py +++ b/tests/test_classify.py @@ -9,7 +9,7 @@ from unittest.mock import MagicMock, patch import pytest -from anthropic import APIConnectionError, APIStatusError +from anthropic import APIConnectionError from reportlab.lib.pagesizes import letter from reportlab.pdfgen import canvas @@ -70,7 +70,9 @@ class TestStrategyRecommendations: def test_research_paper_strategy(self): """Test strategy for research papers.""" - assert STRATEGY_RECOMMENDATIONS[DocumentType.RESEARCH_PAPER] == ChunkingStrategy.COMPREHENSIVE + assert ( + STRATEGY_RECOMMENDATIONS[DocumentType.RESEARCH_PAPER] == ChunkingStrategy.COMPREHENSIVE + ) def test_financial_report_strategy(self): """Test strategy for financial reports (needs detailed analysis).""" @@ -216,9 +218,7 @@ def test_empty_content_raises(self): @patch("document_analysis_mcp.tools.classify.LLMProcessor") @patch("document_analysis_mcp.tools.classify.TextExtractor") @patch("document_analysis_mcp.tools.classify.get_settings") - def test_successful_classification( - self, mock_settings, mock_extractor_class, mock_llm_class - ): + def test_successful_classification(self, mock_settings, mock_extractor_class, mock_llm_class): """Test successful document classification.""" # Setup settings mock mock_settings.return_value = MagicMock( @@ -259,9 +259,7 @@ def test_successful_classification( @patch("document_analysis_mcp.tools.classify.LLMProcessor") @patch("document_analysis_mcp.tools.classify.TextExtractor") @patch("document_analysis_mcp.tools.classify.get_settings") - def test_extraction_failure( - self, mock_settings, mock_extractor_class, mock_llm_class - ): + def test_extraction_failure(self, mock_settings, mock_extractor_class, mock_llm_class): """Test handling of PDF extraction failure.""" mock_settings.return_value = MagicMock() @@ -287,9 +285,7 @@ def test_extraction_failure( @patch("document_analysis_mcp.tools.classify.LLMProcessor") @patch("document_analysis_mcp.tools.classify.TextExtractor") @patch("document_analysis_mcp.tools.classify.get_settings") - def test_empty_text_extraction( - self, mock_settings, mock_extractor_class, mock_llm_class - ): + def test_empty_text_extraction(self, mock_settings, mock_extractor_class, mock_llm_class): """Test handling when PDF has no extractable text.""" mock_settings.return_value = MagicMock() @@ -310,9 +306,7 @@ def test_empty_text_extraction( @patch("document_analysis_mcp.tools.classify.LLMProcessor") @patch("document_analysis_mcp.tools.classify.TextExtractor") @patch("document_analysis_mcp.tools.classify.get_settings") - def test_llm_initialization_failure( - self, mock_settings, mock_extractor_class, mock_llm_class - ): + def test_llm_initialization_failure(self, mock_settings, mock_extractor_class, mock_llm_class): """Test handling of LLM initialization failure (no API key).""" mock_settings.return_value = MagicMock(anthropic_api_key="") @@ -335,9 +329,7 @@ def test_llm_initialization_failure( @patch("document_analysis_mcp.tools.classify.LLMProcessor") @patch("document_analysis_mcp.tools.classify.TextExtractor") @patch("document_analysis_mcp.tools.classify.get_settings") - def test_llm_api_error( - self, mock_settings, mock_extractor_class, mock_llm_class - ): + def test_llm_api_error(self, mock_settings, mock_extractor_class, mock_llm_class): """Test handling of LLM API errors.""" mock_settings.return_value = MagicMock( anthropic_api_key="sk-test", @@ -366,9 +358,7 @@ def test_llm_api_error( @patch("document_analysis_mcp.tools.classify.LLMProcessor") @patch("document_analysis_mcp.tools.classify.TextExtractor") @patch("document_analysis_mcp.tools.classify.get_settings") - def test_uses_classification_model( - self, mock_settings, mock_extractor_class, mock_llm_class - ): + def test_uses_classification_model(self, mock_settings, mock_extractor_class, mock_llm_class): """Test that the classification model from settings is used.""" mock_settings.return_value = MagicMock( anthropic_api_key="sk-test", @@ -399,9 +389,7 @@ def test_uses_classification_model( @patch("document_analysis_mcp.tools.classify.LLMProcessor") @patch("document_analysis_mcp.tools.classify.TextExtractor") @patch("document_analysis_mcp.tools.classify.get_settings") - def test_sample_text_limited( - self, mock_settings, mock_extractor_class, mock_llm_class - ): + def test_sample_text_limited(self, mock_settings, mock_extractor_class, mock_llm_class): """Test that sample text is limited to CLASSIFICATION_SAMPLE_CHARS.""" mock_settings.return_value = MagicMock( anthropic_api_key="sk-test", @@ -436,9 +424,7 @@ def test_sample_text_limited( @patch("document_analysis_mcp.tools.classify.LLMProcessor") @patch("document_analysis_mcp.tools.classify.TextExtractor") @patch("document_analysis_mcp.tools.classify.get_settings") - def test_result_includes_metadata( - self, mock_settings, mock_extractor_class, mock_llm_class - ): + def test_result_includes_metadata(self, mock_settings, mock_extractor_class, mock_llm_class): """Test that result includes page and word counts.""" mock_settings.return_value = MagicMock( anthropic_api_key="sk-test", @@ -468,9 +454,7 @@ def test_result_includes_metadata( @patch("document_analysis_mcp.tools.classify.LLMProcessor") @patch("document_analysis_mcp.tools.classify.TextExtractor") @patch("document_analysis_mcp.tools.classify.get_settings") - def test_token_count_in_result( - self, mock_settings, mock_extractor_class, mock_llm_class - ): + def test_token_count_in_result(self, mock_settings, mock_extractor_class, mock_llm_class): """Test that token count is included in result.""" mock_settings.return_value = MagicMock( anthropic_api_key="sk-test", @@ -513,7 +497,7 @@ class TestDocumentTypeStrategies: def test_all_strategies_are_valid(self): """Test that all recommended strategies are valid ChunkingStrategy values.""" - for doc_type, strategy in STRATEGY_RECOMMENDATIONS.items(): + for _doc_type, strategy in STRATEGY_RECOMMENDATIONS.items(): assert isinstance(strategy, ChunkingStrategy) def test_deep_strategy_for_critical_docs(self): @@ -525,7 +509,8 @@ def test_deep_strategy_for_critical_docs(self): def test_comprehensive_strategy_is_common_default(self): """Test that COMPREHENSIVE is the most common recommendation.""" comprehensive_count = sum( - 1 for strategy in STRATEGY_RECOMMENDATIONS.values() + 1 + for strategy in STRATEGY_RECOMMENDATIONS.values() if strategy == ChunkingStrategy.COMPREHENSIVE ) # Should be the most common @@ -564,9 +549,7 @@ def test_real_pdf_classification(self, mock_settings, mock_llm_class): @patch("document_analysis_mcp.tools.classify.LLMProcessor") @patch("document_analysis_mcp.tools.classify.get_settings") - def test_multipage_pdf_extracts_limited_pages( - self, mock_settings, mock_llm_class - ): + def test_multipage_pdf_extracts_limited_pages(self, mock_settings, mock_llm_class): """Test that multi-page PDFs only extract first 3 pages for classification.""" mock_settings.return_value = MagicMock( anthropic_api_key="sk-test", diff --git a/tests/test_llm.py b/tests/test_llm.py index 1728823..89927f7 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -4,7 +4,6 @@ response parsing, and document analysis workflows with mocked API calls. """ -import time from unittest.mock import MagicMock, patch import pytest @@ -77,7 +76,7 @@ def test_init_with_api_key(self, mock_anthropic, mock_settings): max_tokens=4096, ) - processor = LLMProcessor(api_key="sk-test-key") + _processor = LLMProcessor(api_key="sk-test-key") mock_anthropic.assert_called_once_with(api_key="sk-test-key") @@ -91,7 +90,7 @@ def test_init_from_settings(self, mock_anthropic, mock_settings): max_tokens=4096, ) - processor = LLMProcessor() + _processor = LLMProcessor() mock_anthropic.assert_called_once_with(api_key="sk-from-settings") @@ -449,9 +448,7 @@ def test_analyze_single_chunk(self, mock_anthropic, mock_settings): @patch("document_analysis_mcp.processors.llm.get_settings") @patch("document_analysis_mcp.processors.llm.anthropic.Anthropic") - def test_analyze_multiple_chunks_with_synthesis( - self, mock_anthropic, mock_settings - ): + def test_analyze_multiple_chunks_with_synthesis(self, mock_anthropic, mock_settings): """Test analyzing multiple chunks triggers synthesis.""" mock_settings.return_value = MagicMock( anthropic_api_key="sk-test", @@ -510,9 +507,7 @@ def test_analyze_document_accumulates_tokens(self, mock_anthropic, mock_settings @patch("document_analysis_mcp.processors.llm.time.sleep") # Skip retry delays @patch("document_analysis_mcp.processors.llm.get_settings") @patch("document_analysis_mcp.processors.llm.anthropic.Anthropic") - def test_chunk_failure_continues_analysis( - self, mock_anthropic, mock_settings, mock_sleep - ): + def test_chunk_failure_continues_analysis(self, mock_anthropic, mock_settings, mock_sleep): """Test that chunk failure doesn't stop entire analysis.""" mock_settings.return_value = MagicMock( anthropic_api_key="sk-test", @@ -645,15 +640,13 @@ def test_synthesis_includes_all_sections(self, mock_anthropic, mock_settings): mock_client = MagicMock() mock_anthropic.return_value = mock_client - mock_client.messages.create.return_value = self._create_mock_response( - "Combined synthesis" - ) + mock_client.messages.create.return_value = self._create_mock_response("Combined synthesis") processor = LLMProcessor(api_key="sk-test") result = AnalysisResult() chunk_analyses = ["First analysis", "Second analysis", "Third analysis"] - summary = processor._synthesize_summary( + _summary = processor._synthesize_summary( chunk_analyses, ChunkingStrategy.COMPREHENSIVE, "claude-sonnet-4-20250514",