Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ Documentation = "https://docs.langflow.org"
[project.optional-dependencies]
docling = [
"docling>=2.36.1",
"langchain-docling>=1.1.0",
"tesserocr>=2.8.0",
"rapidocr-onnxruntime>=1.4.4",
"ocrmac>=1.0.0; sys_platform == 'darwin'",
Expand Down
60 changes: 59 additions & 1 deletion src/lfx/src/lfx/base/data/docling_utils.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
import importlib
import signal
import sys
import traceback
from contextlib import suppress
from typing import TYPE_CHECKING

from docling_core.types.doc import DoclingDocument
from pydantic import BaseModel, SecretStr, TypeAdapter

from lfx.log.logger import logger
from lfx.schema.data import Data
from lfx.schema.dataframe import DataFrame

if TYPE_CHECKING:
from langchain_core.language_models.chat_models import BaseChatModel


def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_key: str) -> list[DoclingDocument]:
documents: list[DoclingDocument] = []
Expand Down Expand Up @@ -57,7 +63,45 @@ def extract_docling_documents(data_inputs: Data | list[Data] | DataFrame, doc_ke
return documents


def docling_worker(file_paths: list[str], queue, pipeline: str, ocr_engine: str):
def _unwrap_secrets(obj):
    """Recursively reveal every ``SecretStr`` inside *obj*.

    Walks dicts and lists, replacing any ``SecretStr`` leaf with its
    plain-text secret so the structure can cross a process boundary.
    All other values are returned unchanged.
    """
    if isinstance(obj, SecretStr):
        return obj.get_secret_value()
    if isinstance(obj, dict):
        unwrapped: dict = {}
        for key, value in obj.items():
            unwrapped[key] = _unwrap_secrets(value)
        return unwrapped
    if isinstance(obj, list):
        return [_unwrap_secrets(item) for item in obj]
    return obj


def _dump_with_secrets(model: BaseModel):
    """Dump *model* to a plain dict with all ``SecretStr`` values revealed.

    Uses ``round_trip=True`` so the dump can be re-validated back into the
    original model class, then unwraps any secrets for serialization.
    """
    raw_dump = model.model_dump(mode="python", round_trip=True)
    return _unwrap_secrets(raw_dump)


def _serialize_pydantic_model(model: BaseModel):
    """Serialize a pydantic model into a picklable dict.

    The result records the model's fully-qualified class path under
    ``__class_path__`` and its (secret-revealed) field dump under
    ``config``, for reconstruction by ``_deserialize_pydantic_model``.
    """
    cls = type(model)
    class_path = f"{cls.__module__}.{cls.__name__}"
    return {"__class_path__": class_path, "config": _dump_with_secrets(model)}


def _deserialize_pydantic_model(data: dict):
module_name, class_name = data["__class_path__"].rsplit(".", 1)
module = importlib.import_module(module_name)
cls = getattr(module, class_name)
adapter = TypeAdapter(cls)
return adapter.validate_python(data["config"])

Comment on lines +87 to +93
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Harden deserialization: validate class path + robust errors.

Dynamic imports from unvalidated class paths are risky and brittle. Gate the module prefix, validate keys, and avoid KeyError/ValueError crashes.

-def _deserialize_pydantic_model(data: dict):
-    module_name, class_name = data["__class_path__"].rsplit(".", 1)
-    module = importlib.import_module(module_name)
-    cls = getattr(module, class_name)
-    adapter = TypeAdapter(cls)
-    return adapter.validate_python(data["config"])
+def _deserialize_pydantic_model(data: dict):
+    try:
+        class_path = data["__class_path__"]
+        module_name, class_name = class_path.rsplit(".", 1)
+    except (KeyError, ValueError) as e:
+        raise ValueError("Invalid serialized model: missing or malformed '__class_path__'.") from e
+    # Allow only known-safe prefixes to avoid arbitrary imports
+    ALLOWED_CLASS_PREFIXES = ("langchain_", "langchain", "langchain_core")
+    if not module_name.startswith(ALLOWED_CLASS_PREFIXES):
+        raise ValueError(f"Deserialization blocked for class path {class_path!r}")
+    module = importlib.import_module(module_name)
+    try:
+        cls = getattr(module, class_name)
+    except AttributeError as e:
+        raise ValueError(f"Class {class_name!r} not found in module {module_name!r}.") from e
+    adapter = TypeAdapter(cls)
+    return adapter.validate_python(data.get("config", {}))
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def _deserialize_pydantic_model(data: dict):
module_name, class_name = data["__class_path__"].rsplit(".", 1)
module = importlib.import_module(module_name)
cls = getattr(module, class_name)
adapter = TypeAdapter(cls)
return adapter.validate_python(data["config"])
def _deserialize_pydantic_model(data: dict):
- module_name, class_name = data["__class_path__"].rsplit(".", 1)
- module = importlib.import_module(module_name)
- cls = getattr(module, class_name)
- adapter = TypeAdapter(cls)
try:
class_path = data["__class_path__"]
module_name, class_name = class_path.rsplit(".", 1)
except (KeyError, ValueError) as e:
raise ValueError("Invalid serialized model: missing or malformed '__class_path__'.") from e
# Allow only known-safe prefixes to avoid arbitrary imports
ALLOWED_CLASS_PREFIXES = ("langchain_", "langchain", "langchain_core")
if not module_name.startswith(ALLOWED_CLASS_PREFIXES):
raise ValueError(f"Deserialization blocked for class path {class_path!r}")
module = importlib.import_module(module_name)
try:
cls = getattr(module, class_name)
except AttributeError as e:
raise ValueError(f"Class {class_name!r} not found in module {module_name!r}.") from e
adapter = TypeAdapter(cls)
return adapter.validate_python(data.get("config", {}))
🤖 Prompt for AI Agents
In src/lfx/src/lfx/base/data/docling_utils.py around lines 87 to 93, the
deserializer trusts data["__class_path__"] and other keys and does an
unrestricted dynamic import; validate presence and types of required keys (e.g.
ensure "__class_path__" and "config" exist and are dict-like), restrict allowed
module prefixes (maintain an allowlist/prefix whitelist and reject imports
outside it), wrap importlib.import_module/getattr/TypeAdapter.validate_python in
try/except and raise clear ValueError/TypeError with contextual messages on
failure (invalid path format, disallowed module, import error, missing class,
validation error) instead of letting KeyError/ValueError propagate; ensure
returned value is the validated model or raise a descriptive exception.


def docling_worker(
*,
file_paths: list[str],
queue,
pipeline: str,
ocr_engine: str,
do_picture_classification: bool,
pic_desc_config: dict | None,
pic_desc_prompt: str,
):
"""Worker function for processing files with Docling in a separate process."""
# Signal handling for graceful shutdown
shutdown_requested = False
Expand Down Expand Up @@ -106,6 +150,7 @@ def check_shutdown() -> None:
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.models.factories import get_ocr_factory
from docling.pipeline.vlm_pipeline import VlmPipeline
from langchain_docling.picture_description import PictureDescriptionLangChainOptions

# Check for shutdown after imports
check_shutdown()
Expand Down Expand Up @@ -143,6 +188,19 @@ def _get_standard_opts() -> PdfPipelineOptions:
kind=ocr_engine,
)
pipeline_options.ocr_options = ocr_options

pipeline_options.do_picture_classification = do_picture_classification

if pic_desc_config:
pic_desc_llm: BaseChatModel = _deserialize_pydantic_model(pic_desc_config)

logger.info("Docling enabling the picture description stage.")
pipeline_options.do_picture_description = True
pipeline_options.allow_external_plugins = True
pipeline_options.picture_description_options = PictureDescriptionLangChainOptions(
llm=pic_desc_llm,
prompt=pic_desc_prompt,
)
return pipeline_options

# Configure the VLM pipeline
Expand Down
79 changes: 36 additions & 43 deletions src/lfx/src/lfx/components/docling/docling_inline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from queue import Empty

from lfx.base.data import BaseFileComponent
from lfx.base.data.docling_utils import docling_worker
from lfx.inputs import DropdownInput
from lfx.base.data.docling_utils import _serialize_pydantic_model, docling_worker
from lfx.inputs import BoolInput, DropdownInput, HandleInput, StrInput
from lfx.schema import Data


Expand Down Expand Up @@ -67,6 +67,26 @@ class DoclingInlineComponent(BaseFileComponent):
real_time_refresh=False,
value="None",
),
BoolInput(
name="do_picture_classification",
display_name="Picture classification",
info="If enabled, the Docling pipeline will classify the pictures type.",
value=False,
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: I believe this is the default value for a BoolInput, so could be omitted if desired.

),
HandleInput(
name="pic_desc_llm",
display_name="Picture description LLM",
info="If connected, the model to use for running the picture description task.",
input_types=["LanguageModel"],
required=False,
),
StrInput(
name="pic_desc_prompt",
display_name="Picture description prompt",
value="Describe the image in three sentences. Be concise and accurate.",
info="The user prompt to use when invoking the model.",
advanced=True,
),
# TODO: expose more Docling options
]

Expand Down Expand Up @@ -131,64 +151,37 @@ def _terminate_process_gracefully(self, proc, timeout_terminate: int = 10, timeo

def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:
try:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import OcrOptions, PdfPipelineOptions, VlmPipelineOptions
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.models.factories import get_ocr_factory
from docling.pipeline.vlm_pipeline import VlmPipeline
from docling.document_converter import DocumentConverter # noqa: F401
except ImportError as e:
msg = (
"Docling is an optional dependency. Install with `uv pip install 'langflow[docling]'` or refer to the "
"documentation on how to install optional dependencies."
)
raise ImportError(msg) from e

# Configure the standard PDF pipeline
def _get_standard_opts() -> PdfPipelineOptions:
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = self.ocr_engine != "None"
if pipeline_options.do_ocr:
ocr_factory = get_ocr_factory(
allow_external_plugins=False,
)

ocr_options: OcrOptions = ocr_factory.create_options(
kind=self.ocr_engine,
)
pipeline_options.ocr_options = ocr_options
return pipeline_options

# Configure the VLM pipeline
def _get_vlm_opts() -> VlmPipelineOptions:
return VlmPipelineOptions()

# Configure the main format options and create the DocumentConverter()
def _get_converter() -> DocumentConverter:
if self.pipeline == "standard":
pdf_format_option = PdfFormatOption(
pipeline_options=_get_standard_opts(),
)
elif self.pipeline == "vlm":
pdf_format_option = PdfFormatOption(pipeline_cls=VlmPipeline, pipeline_options=_get_vlm_opts())

format_options: dict[InputFormat, FormatOption] = {
InputFormat.PDF: pdf_format_option,
InputFormat.IMAGE: pdf_format_option,
}

return DocumentConverter(format_options=format_options)

file_paths = [file.path for file in file_list if file.path]

if not file_paths:
self.log("No files to process.")
return file_list

pic_desc_config: dict | None = None
if self.pic_desc_llm is not None:
pic_desc_config = _serialize_pydantic_model(self.pic_desc_llm)

ctx = get_context("spawn")
queue: Queue = ctx.Queue()
proc = ctx.Process(
target=docling_worker,
args=(file_paths, queue, self.pipeline, self.ocr_engine),
kwargs={
"file_paths": file_paths,
"queue": queue,
"pipeline": self.pipeline,
"ocr_engine": self.ocr_engine,
"do_picture_classification": self.do_picture_classification,
"pic_desc_config": pic_desc_config,
"pic_desc_prompt": self.pic_desc_prompt,
},
)

result = None
Expand Down
15 changes: 15 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading