def _unwrap_secrets(obj):
    """Recursively replace every ``SecretStr`` found in *obj* with its plain string.

    Dicts, lists, tuples, sets and frozensets are walked; any other value is
    returned unchanged. Containers are rebuilt, never mutated in place.

    Args:
        obj: An arbitrary (possibly nested) Python value, typically the output
            of ``BaseModel.model_dump(mode="python")``.

    Returns:
        A structure of the same shape in which no ``SecretStr`` instance remains.
    """
    if isinstance(obj, SecretStr):
        return obj.get_secret_value()
    if isinstance(obj, dict):
        return {k: _unwrap_secrets(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [_unwrap_secrets(v) for v in obj]
    if isinstance(obj, tuple):
        # A python-mode dump keeps tuples as tuples, so secrets can hide in them too.
        items = [_unwrap_secrets(v) for v in obj]
        # Namedtuples take their fields positionally; plain tuples take one iterable.
        return type(obj)(*items) if hasattr(obj, "_fields") else tuple(items)
    if isinstance(obj, frozenset):
        return frozenset(_unwrap_secrets(v) for v in obj)
    if isinstance(obj, set):
        return {_unwrap_secrets(v) for v in obj}
    return obj


def _dump_with_secrets(model: BaseModel):
    """Dump *model* to plain Python data with every ``SecretStr`` revealed.

    The model is dumped with ``mode="python"`` and ``round_trip=True`` and the
    result is passed through ``_unwrap_secrets``.

    Args:
        model: The pydantic model to dump.

    Returns:
        The dumped data with secret values in clear text. Treat the result as
        sensitive: it must not be logged.
    """
    return _unwrap_secrets(model.model_dump(mode="python", round_trip=True))


def _serialize_pydantic_model(model: BaseModel):
    """Describe *model* as a plain dict from which it can be re-created.

    The counterpart ``_deserialize_pydantic_model`` imports the class named by
    ``__class_path__`` and validates ``config`` against it.

    Args:
        model: The pydantic model instance to serialize.

    Returns:
        A dict with two keys: ``"__class_path__"`` (``"<module>.<class name>"``)
        and ``"config"`` (the dumped fields, secrets in clear text).
    """
    # NOTE(review): ``__name__`` (not ``__qualname__``) is used, so a class nested
    # inside another class would not be found by a module-level getattr — confirm
    # that only top-level model classes are passed here.
    return {
        "__class_path__": f"{model.__class__.__module__}.{model.__class__.__name__}",
        "config": _dump_with_secrets(model),
    }
def _deserialize_pydantic_model(data: dict):
    """Re-create an object from the dict produced by ``_serialize_pydantic_model``.

    Args:
        data: Mapping with a ``"__class_path__"`` entry (dotted
            ``"<module>.<class name>"`` string) and a ``"config"`` entry holding
            the values to validate against that class.

    Returns:
        The result of validating ``data["config"]`` against the class named by
        ``data["__class_path__"]``.
    """
    dotted_path = data["__class_path__"]
    # Split on the last dot only: everything before it is the module to import.
    mod_path, attr_name = dotted_path.rsplit(".", 1)
    # NOTE(review): this imports whatever module the payload names — only feed it
    # payloads built by this process, never external input.
    target_cls = getattr(importlib.import_module(mod_path), attr_name)
    return TypeAdapter(target_cls).validate_python(data["config"])
import BaseFileComponent -from lfx.base.data.docling_utils import docling_worker -from lfx.inputs import DropdownInput +from lfx.base.data.docling_utils import _serialize_pydantic_model, docling_worker +from lfx.inputs import BoolInput, DropdownInput, HandleInput, StrInput from lfx.schema import Data @@ -67,6 +67,26 @@ class DoclingInlineComponent(BaseFileComponent): real_time_refresh=False, value="None", ), + BoolInput( + name="do_picture_classification", + display_name="Picture classification", + info="If enabled, the Docling pipeline will classify the pictures type.", + value=False, + ), + HandleInput( + name="pic_desc_llm", + display_name="Picture description LLM", + info="If connected, the model to use for running the picture description task.", + input_types=["LanguageModel"], + required=False, + ), + StrInput( + name="pic_desc_prompt", + display_name="Picture description prompt", + value="Describe the image in three sentences. Be concise and accurate.", + info="The user prompt to use when invoking the model.", + advanced=True, + ), # TODO: expose more Docling options ] @@ -131,11 +151,7 @@ def _terminate_process_gracefully(self, proc, timeout_terminate: int = 10, timeo def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]: try: - from docling.datamodel.base_models import InputFormat - from docling.datamodel.pipeline_options import OcrOptions, PdfPipelineOptions, VlmPipelineOptions - from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption - from docling.models.factories import get_ocr_factory - from docling.pipeline.vlm_pipeline import VlmPipeline + from docling.document_converter import DocumentConverter # noqa: F401 except ImportError as e: msg = ( "Docling is an optional dependency. 
Install with `uv pip install 'langflow[docling]'` or refer to the " @@ -143,52 +159,29 @@ def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[Bas ) raise ImportError(msg) from e - # Configure the standard PDF pipeline - def _get_standard_opts() -> PdfPipelineOptions: - pipeline_options = PdfPipelineOptions() - pipeline_options.do_ocr = self.ocr_engine != "None" - if pipeline_options.do_ocr: - ocr_factory = get_ocr_factory( - allow_external_plugins=False, - ) - - ocr_options: OcrOptions = ocr_factory.create_options( - kind=self.ocr_engine, - ) - pipeline_options.ocr_options = ocr_options - return pipeline_options - - # Configure the VLM pipeline - def _get_vlm_opts() -> VlmPipelineOptions: - return VlmPipelineOptions() - - # Configure the main format options and create the DocumentConverter() - def _get_converter() -> DocumentConverter: - if self.pipeline == "standard": - pdf_format_option = PdfFormatOption( - pipeline_options=_get_standard_opts(), - ) - elif self.pipeline == "vlm": - pdf_format_option = PdfFormatOption(pipeline_cls=VlmPipeline, pipeline_options=_get_vlm_opts()) - - format_options: dict[InputFormat, FormatOption] = { - InputFormat.PDF: pdf_format_option, - InputFormat.IMAGE: pdf_format_option, - } - - return DocumentConverter(format_options=format_options) - file_paths = [file.path for file in file_list if file.path] if not file_paths: self.log("No files to process.") return file_list + pic_desc_config: dict | None = None + if self.pic_desc_llm is not None: + pic_desc_config = _serialize_pydantic_model(self.pic_desc_llm) + ctx = get_context("spawn") queue: Queue = ctx.Queue() proc = ctx.Process( target=docling_worker, - args=(file_paths, queue, self.pipeline, self.ocr_engine), + kwargs={ + "file_paths": file_paths, + "queue": queue, + "pipeline": self.pipeline, + "ocr_engine": self.ocr_engine, + "do_picture_classification": self.do_picture_classification, + "pic_desc_config": pic_desc_config, + "pic_desc_prompt": 
self.pic_desc_prompt, + }, ) result = None diff --git a/uv.lock b/uv.lock index c0860a10f779..c0a4d2fcf830 100644 --- a/uv.lock +++ b/uv.lock @@ -4550,6 +4550,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fb/42/0d0221cce6f168f644d7d96cb6c87c4e42fc55d2941da7a36e970e3ab8ab/langchain_core-0.3.75-py3-none-any.whl", hash = "sha256:03ca1fadf955ee3c7d5806a841f4b3a37b816acea5e61a7e6ba1298c05eea7f5", size = 443986, upload-time = "2025-08-26T15:24:10.883Z" }, ] +[[package]] +name = "langchain-docling" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "docling" }, + { name = "langchain-core" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fb/ab/c6e01d4830b8fdaa1e68e05e44159b78febd764debf3b70c64a8c001eaab/langchain_docling-1.1.0.tar.gz", hash = "sha256:cc9df0e438f67ac4f6cc68651c76d4929d177ff42a5242f5152b8a826426d329", size = 7542, upload-time = "2025-09-10T08:11:01.535Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/6d/19119e5de0d7976444d5fa136ff226fc4aaa93070cbdf74c372043e6aa3c/langchain_docling-1.1.0-py3-none-any.whl", hash = "sha256:3194f481e2ced092f5c0ac18a52226a9d928f66809eee7ecfbf934a2b84c4c1d", size = 7412, upload-time = "2025-09-10T08:11:00.483Z" }, +] + [[package]] name = "langchain-elasticsearch" version = "0.3.0" @@ -5108,6 +5121,7 @@ couchbase = [ ] docling = [ { name = "docling" }, + { name = "langchain-docling" }, { name = "ocrmac", marker = "sys_platform == 'darwin'" }, { name = "rapidocr-onnxruntime" }, { name = "tesserocr" }, @@ -5227,6 +5241,7 @@ requires-dist = [ { name = "langchain-chroma", specifier = "==0.1.4" }, { name = "langchain-cohere", specifier = "==0.3.3" }, { name = "langchain-community", specifier = "~=0.3.21" }, + { name = "langchain-docling", marker = "extra == 'docling'", specifier = ">=1.1.0" }, { name = "langchain-elasticsearch", specifier = "==0.3.0" }, { name = "langchain-google-calendar-tools", specifier = "==0.0.1" }, { name = 
"langchain-google-community", specifier = "==2.0.3" },