Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "document-analysis-mcp"
version = "0.3.0"
version = "0.4.0"
description = "General-purpose Document Analysis MCP server for PDF processing"
readme = "README.md"
requires-python = ">=3.10"
Expand Down
45 changes: 44 additions & 1 deletion src/document_analysis_mcp/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,13 @@
from document_analysis_mcp.config import get_settings
from document_analysis_mcp.tools.classify import pdf_classify
from document_analysis_mcp.tools.extract import pdf_extract_full
from document_analysis_mcp.tools.kb_ingest import pdf_kb_ingest
from document_analysis_mcp.tools.ocr import pdf_ocr
from document_analysis_mcp.tools.structure import pdf_extract_structure
from document_analysis_mcp.tracking import get_tracker

# Server version - should match pyproject.toml
__version__ = "0.3.0"
__version__ = "0.4.0"

# Track server startup time for uptime calculation
_startup_time: datetime | None = None
Expand Down Expand Up @@ -260,6 +261,48 @@ def pdf_extract_structure_tool(
)


@mcp.tool()
def pdf_kb_ingest_tool(
    pdf_content: str,
    title: str | None = None,
    source_url: str | None = None,
    content_type: str | None = None,
    max_chunk_size: int = 4000,
    max_file_size_mb: float = 50.0,
    use_cache: bool = True,
) -> dict[str, Any]:
    """Ingest a PDF into the knowledge bank in a single call.

    Bundles extraction, classification, and chunking into one operation
    intended for batch KB processing; the result is shaped so that
    knowledge-bank-tools can consume it directly.

    Args:
        pdf_content: Base64-encoded PDF content.
        title: Document title; extracted from the PDF when omitted.
        source_url: Source URL for the document.
        content_type: Document type (research_paper, technical_doc,
            financial_report, legal_doc, manual, other); auto-classified
            when omitted.
        max_chunk_size: Maximum characters per chunk (default 4000).
        max_file_size_mb: Maximum allowed file size in megabytes.
        use_cache: Whether to reuse cached results for previously
            processed documents.

    Returns:
        Dictionary containing success, title, content_type, chunks (with
        text, page_numbers, word_count), metadata (page_count, has_tables,
        has_toc), and processing_stats.
    """
    # Collect the tool arguments once and forward them unchanged to the
    # underlying implementation; this wrapper only exposes it via MCP.
    ingest_kwargs = {
        "pdf_content": pdf_content,
        "title": title,
        "source_url": source_url,
        "content_type": content_type,
        "max_chunk_size": max_chunk_size,
        "max_file_size_mb": max_file_size_mb,
        "use_cache": use_cache,
    }
    return pdf_kb_ingest(**ingest_kwargs)


@mcp.tool()
def cache_stats() -> dict[str, Any]:
"""Get cache statistics and usage information.
Expand Down
7 changes: 7 additions & 0 deletions src/document_analysis_mcp/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
PDF_EXTRACT_FULL_METADATA,
pdf_extract_full,
)
from document_analysis_mcp.tools.kb_ingest import (
PDF_KB_INGEST_METADATA,
pdf_kb_ingest,
)

__all__ = [
# Extract tool
Expand All @@ -18,4 +22,7 @@
"pdf_classify",
"PDF_CLASSIFY_METADATA",
"DocumentType",
# KB Ingest tool
"pdf_kb_ingest",
"PDF_KB_INGEST_METADATA",
]
Loading