2 changes: 1 addition & 1 deletion langchain_docling/__init__.py
@@ -4,4 +4,4 @@
#
"""Docling LangChain package."""

-from langchain_docling.loader import DoclingLoader
+from langchain_docling.loader import DoclingLoader, DoclingParser
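Note: with this change the new parser is re-exported at the package root, so both entry points can be imported directly. A minimal import sketch (assuming the package is installed as langchain-docling):

from langchain_docling import DoclingLoader, DoclingParser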
103 changes: 66 additions & 37 deletions langchain_docling/loader.py
@@ -12,7 +12,7 @@
from docling.chunking import BaseChunk, BaseChunker, HybridChunker
from docling.datamodel.document import DoclingDocument
from docling.document_converter import DocumentConverter
-from langchain_core.document_loaders import BaseLoader
+from langchain_core.document_loaders import BaseBlobParser, BaseLoader, Blob
from langchain_core.documents import Document


@@ -56,6 +56,62 @@ def extract_dl_doc_meta(
return {"source": file_path}


class DoclingParser(BaseBlobParser):
"""Docling Parser."""

def __init__(
self,
*,
converter: Optional[DocumentConverter] = None,
convert_kwargs: Optional[Dict[str, Any]] = None,
export_type: ExportType = ExportType.DOC_CHUNKS,
md_export_kwargs: Optional[dict[str, Any]] = None,
chunker: Optional[BaseChunker] = None,
meta_extractor: Optional[BaseMetaExtractor] = None,
):
"""Initialize DoclingParser."""
self._converter: DocumentConverter = converter or DocumentConverter()
self._convert_kwargs = convert_kwargs if convert_kwargs is not None else {}
self._export_type = export_type
self._md_export_kwargs = (
md_export_kwargs
if md_export_kwargs is not None
else {"image_placeholder": ""}
)
if self._export_type == ExportType.DOC_CHUNKS:
self._chunker: BaseChunker = chunker or HybridChunker()
self._meta_extractor = meta_extractor or MetaExtractor()

def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""Lazy parse blob into documents."""
file_path = str(blob.source) if blob.source else str(blob.path)
conv_res = self._converter.convert(
source=file_path,
**self._convert_kwargs,
)
dl_doc = conv_res.document
if self._export_type == ExportType.MARKDOWN:
yield Document(
page_content=dl_doc.export_to_markdown(**self._md_export_kwargs),
metadata=self._meta_extractor.extract_dl_doc_meta(
file_path=file_path,
dl_doc=dl_doc,
),
)
elif self._export_type == ExportType.DOC_CHUNKS:
chunk_iter = self._chunker.chunk(dl_doc)
for chunk in chunk_iter:
yield Document(
page_content=self._chunker.contextualize(chunk=chunk),
metadata=self._meta_extractor.extract_chunk_meta(
file_path=file_path,
chunk=chunk,
),
)
else:
raise ValueError(f"Unexpected export type: {self._export_type}")


class DoclingLoader(BaseLoader):
"""Docling Loader."""

@@ -97,46 +153,19 @@ def __init__(
else [file_path]
)

-self._converter: DocumentConverter = converter or DocumentConverter()
-self._convert_kwargs = convert_kwargs if convert_kwargs is not None else {}
-self._export_type = export_type
-self._md_export_kwargs = (
-md_export_kwargs
-if md_export_kwargs is not None
-else {"image_placeholder": ""}
+self._parser = DoclingParser(
+converter=converter,
+convert_kwargs=convert_kwargs,
+export_type=export_type,
+md_export_kwargs=md_export_kwargs,
+chunker=chunker,
+meta_extractor=meta_extractor,
)
-if self._export_type == ExportType.DOC_CHUNKS:
-self._chunker: BaseChunker = chunker or HybridChunker()
-self._meta_extractor = meta_extractor or MetaExtractor()

def lazy_load(
self,
) -> Iterator[Document]:
"""Lazy load documents."""
for file_path in self._file_paths:
-conv_res = self._converter.convert(
-source=file_path,
-**self._convert_kwargs,
-)
-dl_doc = conv_res.document
-if self._export_type == ExportType.MARKDOWN:
-yield Document(
-page_content=dl_doc.export_to_markdown(**self._md_export_kwargs),
-metadata=self._meta_extractor.extract_dl_doc_meta(
-file_path=file_path,
-dl_doc=dl_doc,
-),
-)
-elif self._export_type == ExportType.DOC_CHUNKS:
-chunk_iter = self._chunker.chunk(dl_doc)
-for chunk in chunk_iter:
-yield Document(
-page_content=self._chunker.contextualize(chunk=chunk),
-metadata=self._meta_extractor.extract_chunk_meta(
-file_path=file_path,
-chunk=chunk,
-),
-)
-
-else:
-raise ValueError(f"Unexpected export type: {self._export_type}")
+blob = Blob(path=file_path)
+yield from self._parser.lazy_parse(blob)
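Taken together, the refactor moves all conversion and export logic into DoclingParser, a BaseBlobParser, while DoclingLoader now simply wraps each configured file path in a Blob and delegates to the parser. A minimal usage sketch of the new parser API, assuming a placeholder input file example.pdf and the default MetaExtractor shown above (which populates the source metadata key):

from langchain_core.document_loaders import Blob

from langchain_docling.loader import DoclingParser, ExportType

# Export each parsed document as a single Markdown-formatted LangChain Document.
parser = DoclingParser(export_type=ExportType.MARKDOWN)
blob = Blob(path="example.pdf")  # placeholder path
for doc in parser.lazy_parse(blob):
    print(doc.metadata["source"], len(doc.page_content))

Because DoclingParser implements the BaseBlobParser interface, it could also be combined with any blob-producing loader (for example LangChain's GenericLoader), though that pairing is not exercised in this diff.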
60 changes: 59 additions & 1 deletion test/test_loader.py
@@ -4,8 +4,9 @@
import pytest
from docling.chunking import HierarchicalChunker
from docling.datamodel.document import DoclingDocument
+from langchain_core.document_loaders import Blob

-from langchain_docling.loader import DoclingLoader, ExportType
+from langchain_docling.loader import DoclingLoader, DoclingParser, ExportType

from .test_data_gen_flag import GEN_TEST_DATA

@@ -79,3 +80,60 @@ def test_load_as_doc_chunks(monkeypatch: pytest.MonkeyPatch) -> None:
with open(exp_file, encoding="utf-8") as f:
exp_data = json.load(f)
assert act_data == exp_data


def test_parser_as_markdown(monkeypatch: pytest.MonkeyPatch) -> None:

mock_dl_doc = DoclingDocument.load_from_json("test/data/input/dl_doc_1.json")
mock_response = MagicMock()
mock_response.document = mock_dl_doc

monkeypatch.setattr(
"docling.document_converter.DocumentConverter.__init__",
lambda *args, **kwargs: None,
)
monkeypatch.setattr(
"docling.document_converter.DocumentConverter.convert",
lambda *args, **kwargs: mock_response,
)

parser = DoclingParser(export_type=ExportType.MARKDOWN)
blob = Blob(path="https://example.com/foo.pdf")
act_lc_docs = list(parser.lazy_parse(blob))
assert len(act_lc_docs) == 1

act_data = {"root": [lc_doc.model_dump() for lc_doc in act_lc_docs]}
exp_file = "test/data/output/lc_doc_md_1.json"
with open(exp_file, encoding="utf-8") as f:
exp_data = json.load(f)
assert act_data == exp_data


def test_parser_as_doc_chunks(monkeypatch: pytest.MonkeyPatch) -> None:

mock_dl_doc = DoclingDocument.load_from_json("test/data/input/dl_doc_1.json")
mock_response = MagicMock()
mock_response.document = mock_dl_doc

monkeypatch.setattr(
"docling.document_converter.DocumentConverter.__init__",
lambda *args, **kwargs: None,
)
monkeypatch.setattr(
"docling.document_converter.DocumentConverter.convert",
lambda *args, **kwargs: mock_response,
)

parser = DoclingParser(
export_type=ExportType.DOC_CHUNKS,
chunker=HierarchicalChunker(),
)
blob = Blob(path="https://example.com/foo.pdf")
act_lc_docs = list(parser.lazy_parse(blob))
assert len(act_lc_docs) == 2

act_data = {"root": [lc_doc.model_dump() for lc_doc in act_lc_docs]}
exp_file = "test/data/output/lc_doc_chunks_1.json"
with open(exp_file, encoding="utf-8") as f:
exp_data = json.load(f)
assert act_data == exp_data