Skip to content

Commit 2dcdc0c

Browse files
krisoyekrisoye13claude
authored
feat: Phase 4 KB Integration - pdf_kb_ingest tool (#11)
Implements one-shot PDF ingestion for knowledge bank integration:
- Smart text chunking respecting paragraph/sentence boundaries
- Page number tracking for source references
- Document classification (LLM with heuristic fallback)
- Caching support with parameter-specific keys
- Comprehensive input validation and error handling

Adds 43 new tests for kb_ingest functionality (266 total tests passing).

Resolves Epic #21 Phase 4: KB Integration
Refs: krisoye/project-tracker#96

Co-authored-by: Krisoye Smith <krisoye@gmail.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 7018024 commit 2dcdc0c

6 files changed

Lines changed: 1510 additions & 3 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "document-analysis-mcp"
7-
version = "0.3.0"
7+
version = "0.4.0"
88
description = "General-purpose Document Analysis MCP server for PDF processing"
99
readme = "README.md"
1010
requires-python = ">=3.10"

src/document_analysis_mcp/server.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,13 @@
1717
from document_analysis_mcp.config import get_settings
1818
from document_analysis_mcp.tools.classify import pdf_classify
1919
from document_analysis_mcp.tools.extract import pdf_extract_full
20+
from document_analysis_mcp.tools.kb_ingest import pdf_kb_ingest
2021
from document_analysis_mcp.tools.ocr import pdf_ocr
2122
from document_analysis_mcp.tools.structure import pdf_extract_structure
2223
from document_analysis_mcp.tracking import get_tracker
2324

2425
# Server version - should match pyproject.toml
25-
__version__ = "0.3.0"
26+
__version__ = "0.4.0"
2627

2728
# Track server startup time for uptime calculation
2829
_startup_time: datetime | None = None
@@ -260,6 +261,48 @@ def pdf_extract_structure_tool(
260261
)
261262

262263

264+
@mcp.tool()
def pdf_kb_ingest_tool(
    pdf_content: str,
    title: str | None = None,
    source_url: str | None = None,
    content_type: str | None = None,
    max_chunk_size: int = 4000,
    max_file_size_mb: float = 50.0,
    use_cache: bool = True,
) -> dict[str, Any]:
    """One-shot PDF ingestion for knowledge bank.

    Combines extraction, classification, and chunking into a single operation
    optimized for batch KB processing. The output format is designed to be
    directly consumable by knowledge-bank-tools.

    Args:
        pdf_content: Base64-encoded PDF content.
        title: Document title. If not provided, will be extracted from PDF.
        source_url: Source URL for the document.
        content_type: Document type (research_paper, technical_doc, financial_report,
            legal_doc, manual, other). If not provided, will be auto-classified.
        max_chunk_size: Maximum characters per chunk (default 4000).
        max_file_size_mb: Maximum allowed file size in megabytes.
        use_cache: Whether to use caching for previously processed documents.

    Returns:
        Dictionary containing success, title, content_type, chunks (with text,
        page_numbers, word_count), metadata (page_count, has_tables, has_toc),
        and processing_stats.
    """
    # NOTE(review): the docstring above is presumably surfaced to MCP clients
    # as the tool description by the @mcp.tool() decorator -- confirm before
    # rewording it.
    #
    # This wrapper adds no logic of its own: every argument is forwarded
    # unchanged, by keyword, to the underlying pdf_kb_ingest implementation.
    ingest_kwargs: dict[str, Any] = {
        "pdf_content": pdf_content,
        "title": title,
        "source_url": source_url,
        "content_type": content_type,
        "max_chunk_size": max_chunk_size,
        "max_file_size_mb": max_file_size_mb,
        "use_cache": use_cache,
    }
    return pdf_kb_ingest(**ingest_kwargs)
304+
305+
263306
@mcp.tool()
264307
def cache_stats() -> dict[str, Any]:
265308
"""Get cache statistics and usage information.

src/document_analysis_mcp/tools/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99
PDF_EXTRACT_FULL_METADATA,
1010
pdf_extract_full,
1111
)
12+
from document_analysis_mcp.tools.kb_ingest import (
13+
PDF_KB_INGEST_METADATA,
14+
pdf_kb_ingest,
15+
)
1216

1317
__all__ = [
1418
# Extract tool
@@ -18,4 +22,7 @@
1822
"pdf_classify",
1923
"PDF_CLASSIFY_METADATA",
2024
"DocumentType",
25+
# KB Ingest tool
26+
"pdf_kb_ingest",
27+
"PDF_KB_INGEST_METADATA",
2128
]

0 commit comments

Comments
 (0)