From d165f282623ea996d08f03c9b2662732e2190283 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Tue, 9 Sep 2025 15:21:27 +0200 Subject: [PATCH 1/6] configure Docling bundle with LC LLM objects Signed-off-by: Michele Dolfi --- pyproject.toml | 2 + src/frontend/package-lock.json | 1 - src/lfx/src/lfx/base/data/docling_utils.py | 29 +++++++++++- .../lfx/components/docling/docling_inline.py | 45 ++++++++++++++++++- uv.lock | 17 +++++++ 5 files changed, 90 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f8eca371534f..18c616f1d81d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -186,6 +186,7 @@ dev = [ langflow-base = { workspace = true } langflow = { workspace = true } lfx = { workspace = true } +langchain-docling = { git = "https://github.com/docling-project/docling-langchain", branch = "feat-pic-desc-plugin" } [tool.uv.workspace] members = [ @@ -204,6 +205,7 @@ Documentation = "https://docs.langflow.org" [project.optional-dependencies] docling = [ "docling>=2.36.1", + "langchain-docling[plugin]", "tesserocr>=2.8.0", "rapidocr-onnxruntime>=1.4.4", "ocrmac>=1.0.0; sys_platform == 'darwin'", diff --git a/src/frontend/package-lock.json b/src/frontend/package-lock.json index f2275b4808f2..c3717f816b51 100644 --- a/src/frontend/package-lock.json +++ b/src/frontend/package-lock.json @@ -1176,7 +1176,6 @@ }, "node_modules/@clack/prompts/node_modules/is-unicode-supported": { "version": "1.3.0", - "extraneous": true, "inBundle": true, "license": "MIT", "engines": { diff --git a/src/lfx/src/lfx/base/data/docling_utils.py b/src/lfx/src/lfx/base/data/docling_utils.py index b8f8f2b90b30..06ac586a4718 100644 --- a/src/lfx/src/lfx/base/data/docling_utils.py +++ b/src/lfx/src/lfx/base/data/docling_utils.py @@ -4,6 +4,8 @@ from contextlib import suppress from docling_core.types.doc import DoclingDocument +from langchain_openai import ChatOpenAI +from pydantic import TypeAdapter from lfx.log.logger import logger from lfx.schema.data import Data @@ -57,7 +59,16 @@ def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_ke return documents -def docling_worker(file_paths: list[str], queue, pipeline: str, ocr_engine: str): +def docling_worker( + *, + file_paths: list[str], + queue, + pipeline: str, + ocr_engine: str, + do_picture_classification: bool, + pic_desc_config: dict | None, + pic_desc_prompt: str, +): """Worker function for processing files with Docling in a separate process.""" # Signal handling for graceful shutdown shutdown_requested = False @@ -106,6 +117,7 @@ def check_shutdown() -> None: from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption from docling.models.factories import get_ocr_factory from docling.pipeline.vlm_pipeline import VlmPipeline + from langchain_docling.picture_description import PictureDescriptionLangChainOptions # Check for shutdown after imports check_shutdown() @@ -143,6 +155,21 @@ def _get_standard_opts() -> PdfPipelineOptions: kind=ocr_engine, ) pipeline_options.ocr_options = ocr_options + + pipeline_options.do_picture_classification = do_picture_classification + + if pic_desc_config: + adapter = TypeAdapter(ChatOpenAI) + pic_desc_llm = adapter.validate_python(pic_desc_config) + + logger.info("Docling enabling the picture description stage.") + pipeline_options.do_picture_description = True + pipeline_options.allow_external_plugins = True + pipeline_options.picture_description_options = PictureDescriptionLangChainOptions( + llm=pic_desc_llm, + prompt=pic_desc_prompt, + ) + return pipeline_options # Configure the VLM pipeline diff --git a/src/lfx/src/lfx/components/docling/docling_inline.py b/src/lfx/src/lfx/components/docling/docling_inline.py index 681d0e5f4237..a5506ab43fca 100644 --- a/src/lfx/src/lfx/components/docling/docling_inline.py +++ b/src/lfx/src/lfx/components/docling/docling_inline.py @@ -2,9 +2,11 @@ from multiprocessing import Queue, get_context from queue import Empty +from langchain_openai import ChatOpenAI + from lfx.base.data import BaseFileComponent from lfx.base.data.docling_utils import docling_worker -from lfx.inputs import DropdownInput +from lfx.inputs import BoolInput, DropdownInput, HandleInput, StrInput from lfx.schema import Data @@ -67,6 +69,26 @@ class DoclingInlineComponent(BaseFileComponent): real_time_refresh=False, value="None", ), + BoolInput( + name="do_picture_classification", + display_name="Picture classification", + info="If enabled, the Docling pipeline will classify the pictures type.", + value=False, + ), + HandleInput( + name="pic_desc_llm", + display_name="Picture description LLM", + info="If connected, the model to use for running the picture description task.", + input_types=["LanguageModel"], + required=False, + ), + StrInput( + name="pic_desc_prompt", + display_name="Picture description prompt", + value="Describe the image in three sentences. Be consise and accurate.", + info="The user prompt to use when invoking the model.", + advanced=True, + ), # TODO: expose more Docling options ] @@ -184,11 +206,30 @@ def _get_converter() -> DocumentConverter: self.log("No files to process.") return file_list + pic_desc_config: dict | None = None + if self.pic_desc_llm is not None: + if not isinstance(self.pic_desc_llm, ChatOpenAI): + msg = "Picture description LLM only supports models of type ChatOpenAI." + raise RuntimeError(msg) + pic_desc_config = self.pic_desc_llm.model_dump(mode="json") + if isinstance(self.pic_desc_llm.openai_api_key, str): + pic_desc_config["openai_api_key"] = self.pic_desc_llm.openai_api_key + else: + pic_desc_config["openai_api_key"] = self.pic_desc_llm.openai_api_key.get_secret_value() + ctx = get_context("spawn") queue: Queue = ctx.Queue() proc = ctx.Process( target=docling_worker, - args=(file_paths, queue, self.pipeline, self.ocr_engine), + kwargs={ + "file_paths": file_paths, + "queue": queue, + "pipeline": self.pipeline, + "ocr_engine": self.ocr_engine, + "do_picture_classification": self.do_picture_classification, + "pic_desc_config": pic_desc_config, + "pic_desc_prompt": self.pic_desc_prompt, + }, ) result = None diff --git a/uv.lock b/uv.lock index c0860a10f779..701296030fc6 100644 --- a/uv.lock +++ b/uv.lock @@ -4550,6 +4550,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fb/42/0d0221cce6f168f644d7d96cb6c87c4e42fc55d2941da7a36e970e3ab8ab/langchain_core-0.3.75-py3-none-any.whl", hash = "sha256:03ca1fadf955ee3c7d5806a841f4b3a37b816acea5e61a7e6ba1298c05eea7f5", size = 443986, upload-time = "2025-08-26T15:24:10.883Z" }, ] +[[package]] +name = "langchain-docling" +version = "1.0.0" +source = { git = "https://github.com/docling-project/docling-langchain?branch=feat-pic-desc-plugin#dad7a54440b1330d42600d2b26170666a59d775e" } +dependencies = [ + { name = "docling" }, + { name = "langchain-core" }, +] + +[package.optional-dependencies] +plugin = [ + { name = "langchain-openai", version = "0.3.23", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" }, + { name = "langchain-openai", version = "0.3.32", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" }, +] + [[package]] name = "langchain-elasticsearch" version = "0.3.0" @@ -5108,6 +5123,7 @@ couchbase = [ ] docling = [ { name = "docling" }, + { name = "langchain-docling", extra = ["plugin"] }, { name = "ocrmac", marker = "sys_platform == 'darwin'" }, { name = "rapidocr-onnxruntime" }, { name = "tesserocr" }, @@ -5227,6 +5243,7 @@ requires-dist = [ { name = "langchain-chroma", specifier = "==0.1.4" }, { name = "langchain-cohere", specifier = "==0.3.3" }, { name = "langchain-community", specifier = "~=0.3.21" }, + { name = "langchain-docling", extras = ["plugin"], marker = "extra == 'docling'", git = "https://github.com/docling-project/docling-langchain?branch=feat-pic-desc-plugin" }, { name = "langchain-elasticsearch", specifier = "==0.3.0" }, { name = "langchain-google-calendar-tools", specifier = "==0.0.1" }, { name = "langchain-google-community", specifier = "==2.0.3" }, From 5193da6e8cbc564d1f81bcd9da1c54061a7862af Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Tue, 9 Sep 2025 15:37:26 +0200 Subject: [PATCH 2/6] cleanup old not used code Signed-off-by: Michele Dolfi --- .../lfx/components/docling/docling_inline.py | 41 +------------------ 1 file changed, 1 insertion(+), 40 deletions(-) diff --git a/src/lfx/src/lfx/components/docling/docling_inline.py b/src/lfx/src/lfx/components/docling/docling_inline.py index a5506ab43fca..206b1cdab7a4 100644 --- a/src/lfx/src/lfx/components/docling/docling_inline.py +++ b/src/lfx/src/lfx/components/docling/docling_inline.py @@ -153,11 +153,7 @@ def _terminate_process_gracefully(self, proc, timeout_terminate: int = 10, timeo def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]: try: - from docling.datamodel.base_models import InputFormat - from docling.datamodel.pipeline_options import OcrOptions, PdfPipelineOptions, VlmPipelineOptions - from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption - from docling.models.factories import get_ocr_factory - from docling.pipeline.vlm_pipeline import VlmPipeline + from docling.document_converter import DocumentConverter # noqa: F401 except ImportError as e: msg = ( "Docling is an optional dependency. Install with `uv pip install 'langflow[docling]'` or refer to the " @@ -165,41 +161,6 @@ def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[Bas ) raise ImportError(msg) from e - # Configure the standard PDF pipeline - def _get_standard_opts() -> PdfPipelineOptions: - pipeline_options = PdfPipelineOptions() - pipeline_options.do_ocr = self.ocr_engine != "None" - if pipeline_options.do_ocr: - ocr_factory = get_ocr_factory( - allow_external_plugins=False, - ) - - ocr_options: OcrOptions = ocr_factory.create_options( - kind=self.ocr_engine, - ) - pipeline_options.ocr_options = ocr_options - return pipeline_options - - # Configure the VLM pipeline - def _get_vlm_opts() -> VlmPipelineOptions: - return VlmPipelineOptions() - - # Configure the main format options and create the DocumentConverter() - def _get_converter() -> DocumentConverter: - if self.pipeline == "standard": - pdf_format_option = PdfFormatOption( - pipeline_options=_get_standard_opts(), - ) - elif self.pipeline == "vlm": - pdf_format_option = PdfFormatOption(pipeline_cls=VlmPipeline, pipeline_options=_get_vlm_opts()) - - format_options: dict[InputFormat, FormatOption] = { - InputFormat.PDF: pdf_format_option, - InputFormat.IMAGE: pdf_format_option, - } - - return DocumentConverter(format_options=format_options) - file_paths = [file.path for file in file_list if file.path] if not file_paths: From b2ac95e48f7676b8aefea814d19440d71c0e7080 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Wed, 10 Sep 2025 10:18:35 +0200 Subject: [PATCH 3/6] pin released docling plugin Signed-off-by: Michele Dolfi --- pyproject.toml | 3 +-- uv.lock | 16 +++++++--------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 18c616f1d81d..6f042c452ae2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -186,7 +186,6 @@ dev = [ langflow-base = { workspace = true } langflow = { workspace = true } lfx = { workspace = true } -langchain-docling = { git = "https://github.com/docling-project/docling-langchain", branch = "feat-pic-desc-plugin" } [tool.uv.workspace] members = [ @@ -205,7 +204,7 @@ Documentation = "https://docs.langflow.org" [project.optional-dependencies] docling = [ "docling>=2.36.1", - "langchain-docling[plugin]", + "langchain-docling>=1.1.0", "tesserocr>=2.8.0", "rapidocr-onnxruntime>=1.4.4", "ocrmac>=1.0.0; sys_platform == 'darwin'", diff --git a/uv.lock b/uv.lock index 701296030fc6..c0a4d2fcf830 100644 --- a/uv.lock +++ b/uv.lock @@ -4552,17 +4552,15 @@ wheels = [ [[package]] name = "langchain-docling" -version = "1.0.0" -source = { git = "https://github.com/docling-project/docling-langchain?branch=feat-pic-desc-plugin#dad7a54440b1330d42600d2b26170666a59d775e" } +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "docling" }, { name = "langchain-core" }, ] - -[package.optional-dependencies] -plugin = [ - { name = "langchain-openai", version = "0.3.23", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12'" }, - { name = "langchain-openai", version = "0.3.32", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.12'" }, +sdist = { url = "https://files.pythonhosted.org/packages/fb/ab/c6e01d4830b8fdaa1e68e05e44159b78febd764debf3b70c64a8c001eaab/langchain_docling-1.1.0.tar.gz", hash = "sha256:cc9df0e438f67ac4f6cc68651c76d4929d177ff42a5242f5152b8a826426d329", size = 7542, upload-time = "2025-09-10T08:11:01.535Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/6d/19119e5de0d7976444d5fa136ff226fc4aaa93070cbdf74c372043e6aa3c/langchain_docling-1.1.0-py3-none-any.whl", hash = "sha256:3194f481e2ced092f5c0ac18a52226a9d928f66809eee7ecfbf934a2b84c4c1d", size = 7412, upload-time = "2025-09-10T08:11:00.483Z" }, ] [[package]] @@ -5123,7 +5121,7 @@ couchbase = [ ] docling = [ { name = "docling" }, - { name = "langchain-docling", extra = ["plugin"] }, + { name = "langchain-docling" }, { name = "ocrmac", marker = "sys_platform == 'darwin'" }, { name = "rapidocr-onnxruntime" }, { name = "tesserocr" }, @@ -5243,7 +5241,7 @@ requires-dist = [ { name = "langchain-chroma", specifier = "==0.1.4" }, { name = "langchain-cohere", specifier = "==0.3.3" }, { name = "langchain-community", specifier = "~=0.3.21" }, - { name = "langchain-docling", extras = ["plugin"], marker = "extra == 'docling'", git = "https://github.com/docling-project/docling-langchain?branch=feat-pic-desc-plugin" }, + { name = "langchain-docling", marker = "extra == 'docling'", specifier = ">=1.1.0" }, { name = "langchain-elasticsearch", specifier = "==0.3.0" }, { name = "langchain-google-calendar-tools", specifier = "==0.0.1" }, { name = "langchain-google-community", specifier = "==2.0.3" }, From 1df71aaca006fb30beccf619db52d1ae4357a72d Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Wed, 10 Sep 2025 10:45:41 +0200 Subject: [PATCH 4/6] use generic pydantic serialization Signed-off-by: Michele Dolfi --- src/lfx/src/lfx/base/data/docling_utils.py | 40 +++++++++++++++++-- .../lfx/components/docling/docling_inline.py | 13 +----- 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/src/lfx/src/lfx/base/data/docling_utils.py b/src/lfx/src/lfx/base/data/docling_utils.py index c894837ab544..9b2479b3045d 100644 --- a/src/lfx/src/lfx/base/data/docling_utils.py +++ b/src/lfx/src/lfx/base/data/docling_utils.py @@ -1,16 +1,20 @@ +import importlib import signal import sys import traceback from contextlib import suppress +from typing import TYPE_CHECKING from docling_core.types.doc import DoclingDocument -from langchain_openai import ChatOpenAI -from pydantic import TypeAdapter +from pydantic import BaseModel, SecretStr, TypeAdapter from lfx.log.logger import logger from lfx.schema.data import Data from lfx.schema.dataframe import DataFrame +if TYPE_CHECKING: + from langchain_core.language_models.chat_models import BaseChatModel + def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_key: str) -> list[DoclingDocument]: documents: list[DoclingDocument] = [] @@ -59,6 +63,35 @@ def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_ke return documents +def _unwrap_secrets(obj): + if isinstance(obj, SecretStr): + return obj.get_secret_value() + if isinstance(obj, dict): + return {k: _unwrap_secrets(v) for k, v in obj.items()} + if isinstance(obj, list): + return [_unwrap_secrets(v) for v in obj] + return obj + + +def _dump_with_secrets(model: BaseModel): + return _unwrap_secrets(model.model_dump(mode="dict", round_trip=True)) + + +def _serialize_pydantic_model(model: BaseModel): + return { + "__class_path__": f"{model.__class__.__module__}.{model.__class__.__name__}", + "config": _dump_with_secrets(model), + } + + +def _deserialize_pydantic_model(data: dict): + module_name, class_name = data["__class_path__"].rsplit(".", 1) + module = importlib.import_module(module_name) + cls = getattr(module, class_name) + adapter = TypeAdapter(cls) + return adapter.validate_python(data["config"]) + + def docling_worker( *, file_paths: list[str], @@ -159,8 +192,7 @@ def _get_standard_opts() -> PdfPipelineOptions: pipeline_options.do_picture_classification = do_picture_classification if pic_desc_config: - adapter = TypeAdapter(ChatOpenAI) - pic_desc_llm = adapter.validate_python(pic_desc_config) + pic_desc_llm: BaseChatModel = _deserialize_pydantic_model(pic_desc_config) logger.info("Docling enabling the picture description stage.") pipeline_options.do_picture_description = True diff --git a/src/lfx/src/lfx/components/docling/docling_inline.py b/src/lfx/src/lfx/components/docling/docling_inline.py index 206b1cdab7a4..49398b516047 100644 --- a/src/lfx/src/lfx/components/docling/docling_inline.py +++ b/src/lfx/src/lfx/components/docling/docling_inline.py @@ -2,10 +2,8 @@ from multiprocessing import Queue, get_context from queue import Empty -from langchain_openai import ChatOpenAI - from lfx.base.data import BaseFileComponent -from lfx.base.data.docling_utils import docling_worker +from lfx.base.data.docling_utils import _serialize_pydantic_model, docling_worker from lfx.inputs import BoolInput, DropdownInput, HandleInput, StrInput from lfx.schema import Data @@ -169,14 +167,7 @@ def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[Bas pic_desc_config: dict | None = None if self.pic_desc_llm is not None: - if not isinstance(self.pic_desc_llm, ChatOpenAI): - msg = "Picture description LLM only supports models of type ChatOpenAI." - raise RuntimeError(msg) - pic_desc_config = self.pic_desc_llm.model_dump(mode="json") - if isinstance(self.pic_desc_llm.openai_api_key, str): - pic_desc_config["openai_api_key"] = self.pic_desc_llm.openai_api_key - else: - pic_desc_config["openai_api_key"] = self.pic_desc_llm.openai_api_key.get_secret_value() + pic_desc_config = _serialize_pydantic_model(self.pic_desc_llm) ctx = get_context("spawn") queue: Queue = ctx.Queue() From 1df40c05619f174dd3a4cd0f75b9dec8b3e32f00 Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Wed, 10 Sep 2025 14:53:02 +0200 Subject: [PATCH 5/6] Apply suggestions from code review Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- src/lfx/src/lfx/base/data/docling_utils.py | 6 ++---- src/lfx/src/lfx/components/docling/docling_inline.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/lfx/src/lfx/base/data/docling_utils.py b/src/lfx/src/lfx/base/data/docling_utils.py index 9b2479b3045d..8571347794f4 100644 --- a/src/lfx/src/lfx/base/data/docling_utils.py +++ b/src/lfx/src/lfx/base/data/docling_utils.py @@ -74,8 +74,7 @@ def _unwrap_secrets(obj): def _dump_with_secrets(model: BaseModel): - return _unwrap_secrets(model.model_dump(mode="dict", round_trip=True)) - + return _unwrap_secrets(model.model_dump(mode="python", round_trip=True)) def _serialize_pydantic_model(model: BaseModel): return { @@ -192,7 +191,7 @@ def _get_standard_opts() -> PdfPipelineOptions: pipeline_options.do_picture_classification = do_picture_classification if pic_desc_config: - pic_desc_llm: BaseChatModel = _deserialize_pydantic_model(pic_desc_config) + pic_desc_llm: "BaseChatModel" = _deserialize_pydantic_model(pic_desc_config) logger.info("Docling enabling the picture description stage.") pipeline_options.do_picture_description = True @@ -201,7 +200,6 @@ def _get_standard_opts() -> PdfPipelineOptions: llm=pic_desc_llm, prompt=pic_desc_prompt, ) - return pipeline_options # Configure the VLM pipeline diff --git a/src/lfx/src/lfx/components/docling/docling_inline.py b/src/lfx/src/lfx/components/docling/docling_inline.py index 49398b516047..e8edfc78f840 100644 --- a/src/lfx/src/lfx/components/docling/docling_inline.py +++ b/src/lfx/src/lfx/components/docling/docling_inline.py @@ -83,7 +83,7 @@ class DoclingInlineComponent(BaseFileComponent): StrInput( name="pic_desc_prompt", display_name="Picture description prompt", - value="Describe the image in three sentences. Be consise and accurate.", + value="Describe the image in three sentences. Be concise and accurate.", info="The user prompt to use when invoking the model.", advanced=True, ), From 7a2ccbd88449214b0037c6a011d681f4a2135d44 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Wed, 10 Sep 2025 12:55:46 +0000 Subject: [PATCH 6/6] [autofix.ci] apply automated fixes --- src/lfx/src/lfx/base/data/docling_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lfx/src/lfx/base/data/docling_utils.py b/src/lfx/src/lfx/base/data/docling_utils.py index 8571347794f4..6be7079ee6ec 100644 --- a/src/lfx/src/lfx/base/data/docling_utils.py +++ b/src/lfx/src/lfx/base/data/docling_utils.py @@ -76,6 +76,7 @@ def _unwrap_secrets(obj): def _dump_with_secrets(model: BaseModel): return _unwrap_secrets(model.model_dump(mode="python", round_trip=True)) + def _serialize_pydantic_model(model: BaseModel): return { "__class_path__": f"{model.__class__.__module__}.{model.__class__.__name__}", @@ -191,7 +192,7 @@ def _get_standard_opts() -> PdfPipelineOptions: pipeline_options.do_picture_classification = do_picture_classification if pic_desc_config: - pic_desc_llm: "BaseChatModel" = _deserialize_pydantic_model(pic_desc_config) + pic_desc_llm: BaseChatModel = _deserialize_pydantic_model(pic_desc_config) logger.info("Docling enabling the picture description stage.") pipeline_options.do_picture_description = True