Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# CI workflow: runs the test suite (with coverage) across supported Python
# versions, and lints with ruff. Indentation reconstructed from the GitHub
# Actions schema — the scraped diff had lost all leading whitespace.
name: Test

on:
  push:
    branches: [main]
  pull_request:

jobs:
  # Test job: matrix over every supported CPython version.
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.10', '3.11', '3.12']

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e .[dev]

      - name: Run tests with coverage
        run: |
          pytest tests/ -v --cov=src/document_analysis_mcp --cov-report=term --cov-report=xml

      # Upload coverage only once (newest interpreter) to avoid duplicate reports.
      - name: Upload coverage to Codecov
        if: matrix.python-version == '3.12'
        uses: codecov/codecov-action@v4
        with:
          files: coverage.xml
          fail_ci_if_error: false

  # Lint job: single-version ruff check + format verification.
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install ruff
        run: pip install ruff

      - name: Run ruff check
        run: ruff check src/ tests/

      - name: Run ruff format check
        run: ruff format --check src/ tests/
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ dependencies = [
dev = [
"pytest>=8.0.0",
"pytest-asyncio>=0.23.0",
"pytest-cov>=4.0.0",
"reportlab>=4.0.0",
"ruff>=0.5.0",
]
Expand Down
7 changes: 2 additions & 5 deletions src/document_analysis_mcp/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,7 @@ def ensure_cache_dir_exists(self) -> Path:
"""
if self.cache_dir.exists():
if not self.cache_dir.is_dir():
raise ValueError(
f"Cache path exists but is not a directory: {self.cache_dir}"
)
raise ValueError(f"Cache path exists but is not a directory: {self.cache_dir}")
return self.cache_dir

try:
Expand Down Expand Up @@ -167,8 +165,7 @@ def validate_required(self) -> None:
"""
if not self.has_api_key:
raise ValueError(
"ANTHROPIC_API_KEY is required. "
"Set it in your environment or .env file."
"ANTHROPIC_API_KEY is required. Set it in your environment or .env file."
)


Expand Down
4 changes: 3 additions & 1 deletion src/document_analysis_mcp/processors/chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,9 @@ def chunk_text(
max_pages = 3
text = _extract_pages(text, list(range(1, max_pages + 1)))
estimated_tokens = estimate_tokens(text)
logger.info("QUICK strategy: Limited to %d pages, %d estimated tokens", max_pages, estimated_tokens)
logger.info(
"QUICK strategy: Limited to %d pages, %d estimated tokens", max_pages, estimated_tokens
)

elif strategy == ChunkingStrategy.COMPREHENSIVE:
# Comprehensive: Full document, moderate chunk size
Expand Down
10 changes: 4 additions & 6 deletions src/document_analysis_mcp/processors/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,7 @@ def analyze_chunk(
user_message = f"{prompt}\n\n---\n\nDocument Content:\n{chunk}"

# Build messages list
messages: list[dict[str, Any]] = [
{"role": "user", "content": user_message}
]
messages: list[dict[str, Any]] = [{"role": "user", "content": user_message}]

# Log the request (truncate chunk for readability)
estimated_input = estimate_tokens(user_message)
Expand Down Expand Up @@ -292,7 +290,9 @@ def _synthesize_summary(
except (APIConnectionError, APIStatusError) as e:
logger.error("Failed to synthesize summary: %s", e)
# Fall back to concatenated summaries
return "\n\n".join(f"**Section {i}:**\n{a}" for i, a in enumerate(chunk_analyses, start=1))
return "\n\n".join(
f"**Section {i}:**\n{a}" for i, a in enumerate(chunk_analyses, start=1)
)

def _get_chunk_prompt(self, strategy: ChunkingStrategy) -> str:
"""Get the appropriate chunk analysis prompt for a strategy.
Expand All @@ -311,7 +311,6 @@ def _get_chunk_prompt(self, strategy: ChunkingStrategy) -> str:
- Critical information

Keep your response concise (2-3 paragraphs).""",

ChunkingStrategy.COMPREHENSIVE: """Analyze this document section thoroughly.
Include:
- Main topics and themes
Expand All @@ -320,7 +319,6 @@ def _get_chunk_prompt(self, strategy: ChunkingStrategy) -> str:
- Relationships between concepts

Provide a structured analysis with clear organization.""",

ChunkingStrategy.DEEP: """Perform a detailed analysis of this document section.
Cover:
- Primary and secondary themes
Expand Down
24 changes: 6 additions & 18 deletions src/document_analysis_mcp/processors/text_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,14 +148,10 @@ def _extract_with_pdfplumber(
total_words += len(page_content.text.split())
tables_count += len(page_content.tables)
except (PDFSyntaxError, PSEOF) as e:
logger.warning(
"PDF syntax error on page %d: %s", page_num, e
)
logger.warning("PDF syntax error on page %d: %s", page_num, e)
pages.append(PageContent(page_number=page_num, text=""))
except (ValueError, TypeError, AttributeError) as e:
logger.warning(
"Content extraction error on page %d: %s", page_num, e
)
logger.warning("Content extraction error on page %d: %s", page_num, e)
pages.append(PageContent(page_number=page_num, text=""))

except (PDFSyntaxError, PSEOF) as e:
Expand Down Expand Up @@ -194,9 +190,7 @@ def _extract_with_pdfplumber(
success=True,
)

def _extract_pdfplumber_page(
self, page: pdfplumber.page.Page, page_num: int
) -> PageContent:
def _extract_pdfplumber_page(self, page: pdfplumber.page.Page, page_num: int) -> PageContent:
"""Extract content from a single pdfplumber page.

Args:
Expand Down Expand Up @@ -229,9 +223,7 @@ def _extract_pdfplumber_page(
tables=tables,
)

def _extract_pdfplumber_metadata(
self, pdf: pdfplumber.PDF
) -> DocumentMetadata:
def _extract_pdfplumber_metadata(self, pdf: pdfplumber.PDF) -> DocumentMetadata:
"""Extract metadata from pdfplumber PDF object.

Args:
Expand Down Expand Up @@ -298,9 +290,7 @@ def _extract_with_pypdf2(
total_pages,
)

for page_num, page in enumerate(
reader.pages[:pages_to_process], start=1
):
for page_num, page in enumerate(reader.pages[:pages_to_process], start=1):
try:
text = page.extract_text() or ""
page_content = PageContent(
Expand All @@ -312,9 +302,7 @@ def _extract_with_pypdf2(
total_chars += page_content.char_count
total_words += len(text.split())
except PdfReadError as e:
logger.warning(
"PDF read error on page %d with PyPDF2: %s", page_num, e
)
logger.warning("PDF read error on page %d with PyPDF2: %s", page_num, e)
pages.append(PageContent(page_number=page_num, text=""))
except (ValueError, TypeError, AttributeError) as e:
logger.warning(
Expand Down
11 changes: 6 additions & 5 deletions src/document_analysis_mcp/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,7 @@ def _setup_logging(level: str) -> None:
Args:
level: Logging level string (DEBUG, INFO, WARNING, ERROR).
"""
log_format = (
"%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(
level=getattr(logging, level.upper(), logging.INFO),
format=log_format,
Expand Down Expand Up @@ -212,8 +210,11 @@ def main() -> None:
logger.info("API Key Configured: %s", settings.has_api_key)
logger.info("Default Model: %s", settings.default_model)
logger.info("Classification Model: %s", settings.classification_model)
logger.info("Health endpoint available at: http://%s:%d/health",
settings.doc_analysis_host, settings.doc_analysis_port)
logger.info(
"Health endpoint available at: http://%s:%d/health",
settings.doc_analysis_host,
settings.doc_analysis_port,
)

# Ensure cache directory exists
settings.ensure_cache_dir_exists()
Expand Down
10 changes: 6 additions & 4 deletions src/document_analysis_mcp/tools/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,10 +178,12 @@ def pdf_extract_full(
tables = []
for page in result.pages:
for table in page.tables:
tables.append({
"page": page.page_number,
"content": table,
})
tables.append(
{
"page": page.page_number,
"content": table,
}
)
if tables:
response["tables"] = tables

Expand Down
14 changes: 4 additions & 10 deletions tests/test_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
splitting for LLM analysis with proper boundary detection.
"""

import pytest

from document_analysis_mcp.processors.chunker import (
CHARS_PER_TOKEN,
MODEL_LIMITS,
Expand Down Expand Up @@ -78,15 +76,15 @@ def test_paragraph_break_preferred(self):
break_pos = _find_break_point(text, 55, 0)
# The position should be right after the paragraph break
assert break_pos == 43
assert text[break_pos - 2:break_pos] == "\n\n"
assert text[break_pos - 2 : break_pos] == "\n\n"

def test_sentence_break_fallback(self):
"""Test that sentence breaks are used when no paragraph break found."""
text = "First sentence. Second sentence. Third sentence."
# Position where paragraph break won't be found in first half
break_pos = _find_break_point(text, 35, 0)
# Should end at a sentence boundary
assert text[break_pos - 2:break_pos] == ". "
assert text[break_pos - 2 : break_pos] == ". "

def test_word_break_fallback(self):
"""Test that word breaks are used when no sentence break found."""
Expand Down Expand Up @@ -176,9 +174,7 @@ def test_small_text_single_chunk(self):
def test_quick_strategy_limits_pages(self):
"""Test that QUICK strategy limits to first 3 pages."""
# Create text with page markers
pages = "\n\n".join(
f"[Page {i}]\nContent for page {i}." for i in range(1, 6)
)
pages = "\n\n".join(f"[Page {i}]\nContent for page {i}." for i in range(1, 6))
chunks = chunk_text(pages, ChunkingStrategy.QUICK, total_pages=5)

# Should only contain first 3 pages
Expand All @@ -191,9 +187,7 @@ def test_quick_strategy_limits_pages(self):

def test_comprehensive_strategy_processes_all(self):
"""Test that COMPREHENSIVE processes entire document."""
pages = "\n\n".join(
f"[Page {i}]\nContent for page {i}." for i in range(1, 6)
)
pages = "\n\n".join(f"[Page {i}]\nContent for page {i}." for i in range(1, 6))
chunks = chunk_text(pages, ChunkingStrategy.COMPREHENSIVE, total_pages=5)

combined = " ".join(chunks)
Expand Down
Loading