From c193fd747c01d55524c2903f9cf6386f2662aa3c Mon Sep 17 00:00:00 2001 From: Rico Furtado Date: Thu, 12 Mar 2026 12:15:40 -0400 Subject: [PATCH] feat: changes to ingestion flow - added docling chunker. --- flows/ingestion_flow.json | 1540 +++++++++++++++++-------------------- 1 file changed, 696 insertions(+), 844 deletions(-) diff --git a/flows/ingestion_flow.json b/flows/ingestion_flow.json index f389252bd..7cc63f8e6 100644 --- a/flows/ingestion_flow.json +++ b/flows/ingestion_flow.json @@ -1,93 +1,6 @@ { "data": { "edges": [ - { - "animated": false, - "className": "", - "data": { - "sourceHandle": { - "dataType": "DoclingRemote", - "id": "DoclingRemote-Dp3PX", - "name": "dataframe", - "output_types": [ - "DataFrame" - ] - }, - "targetHandle": { - "fieldName": "data_inputs", - "id": "ExportDoclingDocument-zZdRg", - "inputTypes": [ - "Data", - "DataFrame" - ], - "type": "other" - } - }, - "id": "xy-edge__DoclingRemote-Dp3PX{œdataTypeœ:œDoclingRemoteœ,œidœ:œDoclingRemote-Dp3PXœ,œnameœ:œdataframeœ,œoutput_typesœ:[œDataFrameœ]}-ExportDoclingDocument-zZdRg{œfieldNameœ:œdata_inputsœ,œidœ:œExportDoclingDocument-zZdRgœ,œinputTypesœ:[œDataœ,œDataFrameœ],œtypeœ:œotherœ}", - "selected": false, - "source": "DoclingRemote-Dp3PX", - "sourceHandle": "{œdataTypeœ:œDoclingRemoteœ,œidœ:œDoclingRemote-Dp3PXœ,œnameœ:œdataframeœ,œoutput_typesœ:[œDataFrameœ]}", - "target": "ExportDoclingDocument-zZdRg", - "targetHandle": "{œfieldNameœ:œdata_inputsœ,œidœ:œExportDoclingDocument-zZdRgœ,œinputTypesœ:[œDataœ,œDataFrameœ],œtypeœ:œotherœ}" - }, - { - "animated": false, - "className": "", - "data": { - "sourceHandle": { - "dataType": "ExportDoclingDocument", - "id": "ExportDoclingDocument-zZdRg", - "name": "dataframe", - "output_types": [ - "DataFrame" - ] - }, - "targetHandle": { - "fieldName": "df", - "id": "DataFrameOperations-1BWXB", - "inputTypes": [ - "DataFrame" - ], - "type": "other" - } - }, - "id": "xy-edge__ExportDoclingDocument-zZdRg{œdataTypeœ:œExportDoclingDocumentœ,œidœ:œExportDoclingDocument-zZdRgœ,œnameœ:œdataframeœ,œoutput_typesœ:[œDataFrameœ]}-DataFrameOperations-1BWXB{œfieldNameœ:œdfœ,œidœ:œDataFrameOperations-1BWXBœ,œinputTypesœ:[œDataFrameœ],œtypeœ:œotherœ}", - "selected": false, - "source": "ExportDoclingDocument-zZdRg", - "sourceHandle": "{œdataTypeœ:œExportDoclingDocumentœ,œidœ:œExportDoclingDocument-zZdRgœ,œnameœ:œdataframeœ,œoutput_typesœ:[œDataFrameœ]}", - "target": "DataFrameOperations-1BWXB", - "targetHandle": "{œfieldNameœ:œdfœ,œidœ:œDataFrameOperations-1BWXBœ,œinputTypesœ:[œDataFrameœ],œtypeœ:œotherœ}" - }, - { - "animated": false, - "className": "", - "data": { - "sourceHandle": { - "dataType": "DataFrameOperations", - "id": "DataFrameOperations-N80fC", - "name": "output", - "output_types": [ - "DataFrame" - ] - }, - "targetHandle": { - "fieldName": "data_inputs", - "id": "SplitText-QIKhg", - "inputTypes": [ - "Data", - "DataFrame", - "Message" - ], - "type": "other" - } - }, - "id": "xy-edge__DataFrameOperations-N80fC{œdataTypeœ:œDataFrameOperationsœ,œidœ:œDataFrameOperations-N80fCœ,œnameœ:œoutputœ,œoutput_typesœ:[œDataFrameœ]}-SplitText-QIKhg{œfieldNameœ:œdata_inputsœ,œidœ:œSplitText-QIKhgœ,œinputTypesœ:[œDataœ,œDataFrameœ,œMessageœ],œtypeœ:œotherœ}", - "selected": false, - "source": "DataFrameOperations-N80fC", - "sourceHandle": "{œdataTypeœ:œDataFrameOperationsœ,œidœ:œDataFrameOperations-N80fCœ,œnameœ:œoutputœ,œoutput_typesœ:[œDataFrameœ]}", - "target": "SplitText-QIKhg", - "targetHandle": "{œfieldNameœ:œdata_inputsœ,œidœ:œSplitText-QIKhgœ,œinputTypesœ:[œDataœ,œDataFrameœ,œMessageœ],œtypeœ:œotherœ}" - }, { "animated": false, "className": "", @@ -256,35 +169,6 @@ "target": "OpenSearchVectorStoreComponentMultimodalMultiEmbedding-By9U4", "targetHandle": "{œfieldNameœ:œembeddingœ,œidœ:œOpenSearchVectorStoreComponentMultimodalMultiEmbedding-By9U4œ,œinputTypesœ:[œEmbeddingsœ],œtypeœ:œotherœ}" }, - { - "animated": false, - "className": "", - "data": { - "sourceHandle": { - "dataType": "SplitText", - "id": "SplitText-QIKhg", - "name": "dataframe", - "output_types": [ - "DataFrame" - ] - }, - "targetHandle": { - "fieldName": "ingest_data", - "id": "OpenSearchVectorStoreComponentMultimodalMultiEmbedding-By9U4", - "inputTypes": [ - "Data", - "DataFrame" - ], - "type": "other" - } - }, - "id": "xy-edge__SplitText-QIKhg{œdataTypeœ:œSplitTextœ,œidœ:œSplitText-QIKhgœ,œnameœ:œdataframeœ,œoutput_typesœ:[œDataFrameœ]}-OpenSearchVectorStoreComponentMultimodalMultiEmbedding-By9U4{œfieldNameœ:œingest_dataœ,œidœ:œOpenSearchVectorStoreComponentMultimodalMultiEmbedding-By9U4œ,œinputTypesœ:[œDataœ,œDataFrameœ],œtypeœ:œotherœ}", - "selected": false, - "source": "SplitText-QIKhg", - "sourceHandle": "{œdataTypeœ:œSplitTextœ,œidœ:œSplitText-QIKhgœ,œnameœ:œdataframeœ,œoutput_typesœ:[œDataFrameœ]}", - "target": "OpenSearchVectorStoreComponentMultimodalMultiEmbedding-By9U4", - "targetHandle": "{œfieldNameœ:œingest_dataœ,œidœ:œOpenSearchVectorStoreComponentMultimodalMultiEmbedding-By9U4œ,œinputTypesœ:[œDataœ,œDataFrameœ],œtypeœ:œotherœ}" - }, { "animated": false, "className": "", @@ -516,373 +400,182 @@ "sourceHandle": "{œdataTypeœ:œTextInputœ,œidœ:œTextInput-68n9Lœ,œnameœ:œtextœ,œoutput_typesœ:[œMessageœ]}", "target": "AdvancedDynamicFormBuilder-81Exw", "targetHandle": "{œfieldNameœ:œdynamic_owner_nameœ,œidœ:œAdvancedDynamicFormBuilder-81Exwœ,œinputTypesœ:[œTextœ,œMessageœ],œtypeœ:œstrœ}" + }, + { + "animated": false, + "className": "", + "data": { + "sourceHandle": { + "dataType": "DataFrameOperations", + "id": "DataFrameOperations-N80fC", + "name": "output", + "output_types": [ + "DataFrame" + ] + }, + "targetHandle": { + "fieldName": "ingest_data", + "id": "OpenSearchVectorStoreComponentMultimodalMultiEmbedding-By9U4", + "inputTypes": [ + "Data", + "DataFrame" + ], + "type": "other" + } + }, + "id": "xy-edge__DataFrameOperations-N80fC{œdataTypeœ:œDataFrameOperationsœ,œidœ:œDataFrameOperations-N80fCœ,œnameœ:œoutputœ,œoutput_typesœ:[œDataFrameœ]}-OpenSearchVectorStoreComponentMultimodalMultiEmbedding-By9U4{œfieldNameœ:œingest_dataœ,œidœ:œOpenSearchVectorStoreComponentMultimodalMultiEmbedding-By9U4œ,œinputTypesœ:[œDataœ,œDataFrameœ],œtypeœ:œotherœ}", + "selected": false, + "source": "DataFrameOperations-N80fC", + "sourceHandle": "{œdataTypeœ:œDataFrameOperationsœ,œidœ:œDataFrameOperations-N80fCœ,œnameœ:œoutputœ,œoutput_typesœ:[œDataFrameœ]}", + "target": "OpenSearchVectorStoreComponentMultimodalMultiEmbedding-By9U4", + "targetHandle": "{œfieldNameœ:œingest_dataœ,œidœ:œOpenSearchVectorStoreComponentMultimodalMultiEmbedding-By9U4œ,œinputTypesœ:[œDataœ,œDataFrameœ],œtypeœ:œotherœ}" + }, + { + "animated": false, + "className": "", + "data": { + "sourceHandle": { + "dataType": "ChunkDoclingDocument", + "id": "ChunkDoclingDocument-DdOYd", + "name": "dataframe", + "output_types": [ + "DataFrame" + ] + }, + "targetHandle": { + "fieldName": "df", + "id": "DataFrameOperations-1BWXB", + "inputTypes": [ + "DataFrame" + ], + "type": "other" + } + }, + "id": "xy-edge__ChunkDoclingDocument-DdOYd{œdataTypeœ:œChunkDoclingDocumentœ,œidœ:œChunkDoclingDocument-DdOYdœ,œnameœ:œdataframeœ,œoutput_typesœ:[œDataFrameœ]}-DataFrameOperations-1BWXB{œfieldNameœ:œdfœ,œidœ:œDataFrameOperations-1BWXBœ,œinputTypesœ:[œDataFrameœ],œtypeœ:œotherœ}", + "selected": false, + "source": "ChunkDoclingDocument-DdOYd", + "sourceHandle": "{œdataTypeœ:œChunkDoclingDocumentœ,œidœ:œChunkDoclingDocument-DdOYdœ,œnameœ:œdataframeœ,œoutput_typesœ:[œDataFrameœ]}", + "target": "DataFrameOperations-1BWXB", + "targetHandle": "{œfieldNameœ:œdfœ,œidœ:œDataFrameOperations-1BWXBœ,œinputTypesœ:[œDataFrameœ],œtypeœ:œotherœ}" + }, + { + "animated": false, + "className": "", + "data": { + "sourceHandle": { + "dataType": "DoclingRemote", + "id": "DoclingRemote-Dp3PX", + "name": "dataframe", + "output_types": [ + "DataFrame" + ] + }, + "targetHandle": { + "fieldName": "data_inputs", + "id": "ChunkDoclingDocument-DdOYd", + "inputTypes": [ + "Data", + "DataFrame" + ], + "type": "other" + } + }, + "id": "xy-edge__DoclingRemote-Dp3PX{œdataTypeœ:œDoclingRemoteœ,œidœ:œDoclingRemote-Dp3PXœ,œnameœ:œdataframeœ,œoutput_typesœ:[œDataFrameœ]}-ChunkDoclingDocument-DdOYd{œfieldNameœ:œdata_inputsœ,œidœ:œChunkDoclingDocument-DdOYdœ,œinputTypesœ:[œDataœ,œDataFrameœ],œtypeœ:œotherœ}", + "selected": false, + "source": "DoclingRemote-Dp3PX", + "sourceHandle": "{œdataTypeœ:œDoclingRemoteœ,œidœ:œDoclingRemote-Dp3PXœ,œnameœ:œdataframeœ,œoutput_typesœ:[œDataFrameœ]}", + "target": "ChunkDoclingDocument-DdOYd", + "targetHandle": "{œfieldNameœ:œdata_inputsœ,œidœ:œChunkDoclingDocument-DdOYdœ,œinputTypesœ:[œDataœ,œDataFrameœ],œtypeœ:œotherœ}" } ], "nodes": [ { "data": { - "description": "Split text into chunks based on specified criteria.", - "display_name": "Split Text", - "id": "SplitText-QIKhg", + "id": "AdvancedDynamicFormBuilder-81Exw", "node": { "base_classes": [ - "DataFrame" + "Data", + "Message" ], "beta": false, "conditional_paths": [], "custom_fields": {}, - "description": "Split text into chunks based on specified criteria.", - "display_name": "Split Text", - "documentation": "https://docs.langflow.org/split-text", - "edited": false, + "description": "Creates dynamic input fields that can receive data from other components or manual input.", + "display_name": "Create Data", + "documentation": "", + "edited": true, "field_order": [ - "data_inputs", - "chunk_overlap", - "chunk_size", - "separator", - "text_key", - "keep_separator", - "clean_output" + "form_fields", + "include_metadata" ], "frozen": false, - "icon": "scissors-line-dashed", + "icon": "braces", + "last_updated": "2026-03-12T15:33:27.314Z", "legacy": false, - "metadata": { - "code_hash": "29ae597d2d86", - "dependencies": { - "dependencies": [ - { - "name": "langchain_text_splitters", - "version": "0.3.11" - }, - { - "name": "lfx", - "version": null - } - ], - "total_dependencies": 2 - }, - "module": "custom_components.split_text" - }, + "lf_version": "1.7.0.dev21", + "metadata": {}, "minimized": false, "output_types": [], "outputs": [ { "allows_loop": false, "cache": true, - "display_name": "Chunks", + "display_name": "Data", "group_outputs": false, "hidden": null, "loop_types": null, - "method": "split_text", - "name": "dataframe", + "method": "process_form", + "name": "form_data", "options": null, "required_inputs": null, - "selected": "DataFrame", + "selected": "Data", "tool_mode": true, "types": [ - "DataFrame" + "Data" + ], + "value": "__UNDEFINED__" + }, + { + "allows_loop": false, + "cache": true, + "display_name": "Message", + "group_outputs": false, + "hidden": null, + "loop_types": null, + "method": "get_message", + "name": "message", + "options": null, + "required_inputs": null, + "selected": "Message", + "tool_mode": true, + "types": [ + "Message" ], "value": "__UNDEFINED__" } ], "pinned": false, "template": { + "_frontend_node_flow_id": { + "value": "5488df7c-b93f-4f87-a446-b67028bc0813" + }, + "_frontend_node_folder_id": { + "value": "573ff6c4-003c-4e1d-aa3f-8a83e1f3e020" + }, "_type": "Component", - "chunk_overlap": { - "_input_type": "IntInput", - "advanced": false, - "display_name": "Chunk Overlap", - "dynamic": false, - "info": "Number of characters to overlap between chunks.", + "code": { + "advanced": true, + "dynamic": true, + "fileTypes": [], + "file_path": "", + "info": "", "list": false, - "list_add_label": "Add More", - "name": "chunk_overlap", - "override_skip": false, + "load_from_db": false, + "multiline": true, + "name": "code", + "password": false, "placeholder": "", - "required": false, - "show": true, - "title_case": false, - "tool_mode": false, - "trace_as_metadata": true, - "track_in_telemetry": true, - "type": "int", - "value": 200 - }, - "chunk_size": { - "_input_type": "IntInput", - "advanced": false, - "display_name": "Chunk Size", - "dynamic": false, - "info": "The maximum length of each chunk. Text is first split by separator, then chunks are merged up to this size. Individual splits larger than this won't be further divided.", - "list": false, - "list_add_label": "Add More", - "name": "chunk_size", - "override_skip": false, - "placeholder": "", - "required": false, - "show": true, - "title_case": false, - "tool_mode": false, - "trace_as_metadata": true, - "track_in_telemetry": true, - "type": "int", - "value": 1000 - }, - "clean_output": { - "_input_type": "BoolInput", - "advanced": true, - "display_name": "Clean Output", - "dynamic": false, - "info": "When enabled, only the text column is included in the output. Metadata columns are removed.", - "list": false, - "list_add_label": "Add More", - "name": "clean_output", - "override_skip": false, - "placeholder": "", - "required": false, - "show": true, - "title_case": false, - "tool_mode": false, - "trace_as_metadata": true, - "track_in_telemetry": true, - "type": "bool", - "value": false - }, - "code": { - "advanced": true, - "dynamic": true, - "fileTypes": [], - "file_path": "", - "info": "", - "list": false, - "load_from_db": false, - "multiline": true, - "name": "code", - "password": false, - "placeholder": "", - "required": true, - "show": true, - "title_case": false, - "type": "code", - "value": "from langchain_text_splitters import CharacterTextSplitter\n\nfrom lfx.custom.custom_component.component import Component\nfrom lfx.io import BoolInput, DropdownInput, HandleInput, IntInput, MessageTextInput, Output\nfrom lfx.schema.data import Data\nfrom lfx.schema.dataframe import DataFrame\nfrom lfx.schema.message import Message\nfrom lfx.utils.util import unescape_string\n\n\nclass SplitTextComponent(Component):\n display_name: str = \"Split Text\"\n description: str = \"Split text into chunks based on specified criteria.\"\n documentation: str = \"https://docs.langflow.org/split-text\"\n icon = \"scissors-line-dashed\"\n name = \"SplitText\"\n\n inputs = [\n HandleInput(\n name=\"data_inputs\",\n display_name=\"Input\",\n info=\"The data with texts to split in chunks.\",\n input_types=[\"Data\", \"DataFrame\", \"Message\"],\n required=True,\n ),\n IntInput(\n name=\"chunk_overlap\",\n display_name=\"Chunk Overlap\",\n info=\"Number of characters to overlap between chunks.\",\n value=200,\n ),\n IntInput(\n name=\"chunk_size\",\n display_name=\"Chunk Size\",\n info=(\n \"The maximum length of each chunk. Text is first split by separator, \"\n \"then chunks are merged up to this size. \"\n \"Individual splits larger than this won't be further divided.\"\n ),\n value=1000,\n ),\n MessageTextInput(\n name=\"separator\",\n display_name=\"Separator\",\n info=(\n \"The character to split on. Use \\\\n for newline. \"\n \"Examples: \\\\n\\\\n for paragraphs, \\\\n for lines, . for sentences\"\n ),\n value=\"\\n\",\n ),\n MessageTextInput(\n name=\"text_key\",\n display_name=\"Text Key\",\n info=\"The key to use for the text column.\",\n value=\"text\",\n advanced=True,\n ),\n DropdownInput(\n name=\"keep_separator\",\n display_name=\"Keep Separator\",\n info=\"Whether to keep the separator in the output chunks and where to place it.\",\n options=[\"False\", \"True\", \"Start\", \"End\"],\n value=\"False\",\n advanced=True,\n ),\n BoolInput(\n name=\"clean_output\",\n display_name=\"Clean Output\",\n info=\"When enabled, only the text column is included in the output. Metadata columns are removed.\",\n value=False,\n advanced=True,\n ),\n ]\n\n outputs = [\n Output(display_name=\"Chunks\", name=\"dataframe\", method=\"split_text\"),\n ]\n\n def _docs_to_data(self, docs, *, clean: bool = False) -> list[Data]:\n return [\n Data(text=doc.page_content) if clean else Data(text=doc.page_content, data=doc.metadata) for doc in docs\n ]\n\n def _fix_separator(self, separator: str) -> str:\n \"\"\"Fix common separator issues and convert to proper format.\"\"\"\n if separator == \"/n\":\n return \"\\n\"\n if separator == \"/t\":\n return \"\\t\"\n return separator\n\n def split_text_base(self):\n separator = self._fix_separator(self.separator)\n separator = unescape_string(separator)\n\n if isinstance(self.data_inputs, DataFrame):\n if not len(self.data_inputs):\n msg = \"DataFrame is empty\"\n raise TypeError(msg)\n\n self.data_inputs.text_key = self.text_key\n try:\n documents = self.data_inputs.to_lc_documents()\n except Exception as e:\n msg = f\"Error converting DataFrame to documents: {e}\"\n raise TypeError(msg) from e\n elif isinstance(self.data_inputs, Message):\n self.data_inputs = [self.data_inputs.to_data()]\n return self.split_text_base()\n else:\n if not self.data_inputs:\n msg = \"No data inputs provided\"\n raise TypeError(msg)\n\n documents = []\n if isinstance(self.data_inputs, Data):\n self.data_inputs.text_key = self.text_key\n documents = [self.data_inputs.to_lc_document()]\n else:\n try:\n documents = [input_.to_lc_document() for input_ in self.data_inputs if isinstance(input_, Data)]\n if not documents:\n msg = f\"No valid Data inputs found in {type(self.data_inputs)}\"\n raise TypeError(msg)\n except AttributeError as e:\n msg = f\"Invalid input type in collection: {e}\"\n raise TypeError(msg) from e\n try:\n # Convert string 'False'/'True' to boolean\n keep_sep = self.keep_separator\n if isinstance(keep_sep, str):\n if keep_sep.lower() == \"false\":\n keep_sep = False\n elif keep_sep.lower() == \"true\":\n keep_sep = True\n # 'start' and 'end' are kept as strings\n\n splitter = CharacterTextSplitter(\n chunk_overlap=self.chunk_overlap,\n chunk_size=self.chunk_size,\n separator=separator,\n keep_separator=keep_sep,\n )\n return splitter.split_documents(documents)\n except Exception as e:\n msg = f\"Error splitting text: {e}\"\n raise TypeError(msg) from e\n\n def split_text(self) -> DataFrame:\n docs = self.split_text_base()\n df = DataFrame(self._docs_to_data(docs, clean=self.clean_output))\n return df if self.clean_output else df.smart_column_order()\n" - }, - "data_inputs": { - "_input_type": "HandleInput", - "advanced": false, - "display_name": "Input", - "dynamic": false, - "info": "The data with texts to split in chunks.", - "input_types": [ - "Data", - "DataFrame", - "Message" - ], - "list": false, - "list_add_label": "Add More", - "name": "data_inputs", - "override_skip": false, - "placeholder": "", - "required": true, - "show": true, - "title_case": false, - "trace_as_metadata": true, - "track_in_telemetry": false, - "type": "other", - "value": "" - }, - "keep_separator": { - "_input_type": "DropdownInput", - "advanced": true, - "combobox": false, - "dialog_inputs": {}, - "display_name": "Keep Separator", - "dynamic": false, - "external_options": {}, - "info": "Whether to keep the separator in the output chunks and where to place it.", - "name": "keep_separator", - "options": [ - "False", - "True", - "Start", - "End" - ], - "options_metadata": [], - "override_skip": false, - "placeholder": "", - "required": false, - "show": true, - "title_case": false, - "toggle": false, - "tool_mode": false, - "trace_as_metadata": true, - "track_in_telemetry": true, - "type": "str", - "value": "False" - }, - "separator": { - "_input_type": "MessageTextInput", - "advanced": false, - "display_name": "Separator", - "dynamic": false, - "info": "The character to split on. Use \\n for newline. Examples: \\n\\n for paragraphs, \\n for lines, . for sentences", - "input_types": [ - "Message" - ], - "list": false, - "list_add_label": "Add More", - "load_from_db": false, - "name": "separator", - "override_skip": false, - "placeholder": "", - "required": false, - "show": true, - "title_case": false, - "tool_mode": false, - "trace_as_input": true, - "trace_as_metadata": true, - "track_in_telemetry": false, - "type": "str", - "value": "\n" - }, - "text_key": { - "_input_type": "MessageTextInput", - "advanced": true, - "display_name": "Text Key", - "dynamic": false, - "info": "The key to use for the text column.", - "input_types": [ - "Message" - ], - "list": false, - "list_add_label": "Add More", - "load_from_db": false, - "name": "text_key", - "override_skip": false, - "placeholder": "", - "required": false, - "show": true, - "title_case": false, - "tool_mode": false, - "trace_as_input": true, - "trace_as_metadata": true, - "track_in_telemetry": false, - "type": "str", - "value": "text" - } - }, - "tool_mode": false - }, - "selected_output": "chunks", - "showNode": false, - "type": "SplitText" - }, - "dragging": false, - "height": 475, - "id": "SplitText-QIKhg", - "measured": { - "height": 475, - "width": 320 - }, - "position": { - "x": 1353.7989679128482, - "y": 1785.418981315661 - }, - "positionAbsolute": { - "x": 1683.4543896546102, - "y": 1350.7871623588553 - }, - "selected": false, - "type": "genericNode", - "width": 320 - }, - { - "data": { - "id": "AdvancedDynamicFormBuilder-81Exw", - "node": { - "base_classes": [ - "Data", - "Message" - ], - "beta": false, - "conditional_paths": [], - "custom_fields": {}, - "description": "Creates dynamic input fields that can receive data from other components or manual input.", - "display_name": "Create Data", - "documentation": "", - "edited": true, - "field_order": [ - "form_fields", - "include_metadata" - ], - "frozen": false, - "icon": "braces", - "last_updated": "2026-02-27T19:50:35.544Z", - "legacy": false, - "lf_version": "1.7.0.dev21", - "metadata": {}, - "minimized": false, - "output_types": [], - "outputs": [ - { - "allows_loop": false, - "cache": true, - "display_name": "Data", - "group_outputs": false, - "hidden": null, - "loop_types": null, - "method": "process_form", - "name": "form_data", - "options": null, - "required_inputs": null, - "selected": "Data", - "tool_mode": true, - "types": [ - "Data" - ], - "value": "__UNDEFINED__" - }, - { - "allows_loop": false, - "cache": true, - "display_name": "Message", - "group_outputs": false, - "hidden": null, - "loop_types": null, - "method": "get_message", - "name": "message", - "options": null, - "required_inputs": null, - "selected": "Message", - "tool_mode": true, - "types": [ - "Message" - ], - "value": "__UNDEFINED__" - } - ], - "pinned": false, - "template": { - "_frontend_node_flow_id": { - "value": "5488df7c-b93f-4f87-a446-b67028bc0813" - }, - "_frontend_node_folder_id": { - "value": "bbfbd352-e931-4894-afe8-6552a3f0cc2c" - }, - "_type": "Component", - "code": { - "advanced": true, - "dynamic": true, - "fileTypes": [], - "file_path": "", - "info": "", - "list": false, - "load_from_db": false, - "multiline": true, - "name": "code", - "password": false, - "placeholder": "", - "required": true, + "required": true, "show": true, "title_case": false, "type": "code", @@ -1324,7 +1017,9 @@ ], "frozen": false, "icon": "Docling", + "last_updated": "2026-03-12T15:10:50.735Z", "legacy": false, + "lf_version": "1.8.0", "metadata": { "code_hash": "409d771a961e", "dependencies": { @@ -1374,6 +1069,12 @@ ], "pinned": false, "template": { + "_frontend_node_flow_id": { + "value": "5488df7c-b93f-4f87-a446-b67028bc0813" + }, + "_frontend_node_folder_id": { + "value": "573ff6c4-003c-4e1d-aa3f-8a83e1f3e020" + }, "_type": "Component", "api_headers": { "_input_type": "NestedDictInput", @@ -1557,6 +1258,7 @@ "type": "bool", "value": true }, + "is_refresh": false, "max_concurrency": { "_input_type": "IntInput", "advanced": true, @@ -1632,315 +1334,46 @@ "potm", "ppsm", "pptx", - "tiff", - "txt", - "xls", - "xlsx", - "xhtml", - "xml", - "webp", - "zip", - "tar", - "tgz", - "bz2", - "gz" - ], - "file_path": [], - "info": "Supported file extensions: adoc, asciidoc, asc, bmp, csv, dotx, dotm, docm, docx, htm, html, jpeg, jpg, json, md, pdf, png, potx, ppsx, pptm, potm, ppsm, pptx, tiff, txt, xls, xlsx, xhtml, xml, webp; optionally bundled in file extensions: zip, tar, tgz, bz2, gz", - "list": true, - "list_add_label": "Add More", - "name": "path", - "override_skip": false, - "placeholder": "", - "required": false, - "show": true, - "temp_file": false, - "title_case": false, - "tool_mode": true, - "trace_as_metadata": true, - "track_in_telemetry": false, - "type": "file", - "value": "" - }, - "separator": { - "_input_type": "StrInput", - "advanced": true, - "display_name": "Separator", - "dynamic": false, - "info": "Specify the separator to use between multiple outputs in Message format.", - "list": false, - "list_add_label": "Add More", - "load_from_db": false, - "name": "separator", - "override_skip": false, - "placeholder": "", - "required": false, - "show": true, - "title_case": false, - "tool_mode": false, - "trace_as_metadata": true, - "track_in_telemetry": false, - "type": "str", - "value": "\n\n" - }, - "silent_errors": { - "_input_type": "BoolInput", - "advanced": true, - "display_name": "Silent Errors", - "dynamic": false, - "info": "If true, errors will not raise an exception.", - "list": false, - "list_add_label": "Add More", - "name": "silent_errors", - "override_skip": false, - "placeholder": "", - "required": false, - "show": true, - "title_case": false, - "tool_mode": false, - "trace_as_metadata": true, - "track_in_telemetry": true, - "type": "bool", - "value": false - } - }, - "tool_mode": false - }, - "showNode": false, - "type": "DoclingRemote" - }, - "dragging": false, - "id": "DoclingRemote-Dp3PX", - "measured": { - "height": 52, - "width": 192 - }, - "position": { - "x": -18.22506037537059, - "y": 1767.7398128168159 - }, - "selected": false, - "type": "genericNode" - }, - { - "data": { - "description": "Export DoclingDocument to markdown, html or other formats.", - "display_name": "Export DoclingDocument", - "id": "ExportDoclingDocument-zZdRg", - "node": { - "base_classes": [ - "Data", - "DataFrame" - ], - "beta": false, - "conditional_paths": [], - "custom_fields": {}, - "description": "Export DoclingDocument to markdown, html or other formats.", - "display_name": "Export DoclingDocument", - "documentation": "https://docling-project.github.io/docling/", - "edited": false, - "field_order": [ - "data_inputs", - "export_format", - "image_mode", - "md_image_placeholder", - "md_page_break_placeholder", - "doc_key" - ], - "frozen": false, - "icon": "Docling", - "legacy": false, - "metadata": { - "code_hash": "32577a7e396b", - "dependencies": { - "dependencies": [ - { - "name": "docling_core", - "version": "2.60.1" - }, - { - "name": "lfx", - "version": null - } - ], - "total_dependencies": 2 - }, - "module": "custom_components.export_doclingdocument" - }, - "minimized": false, - "output_types": [], - "outputs": [ - { - "allows_loop": false, - "cache": true, - "display_name": "Exported data", - "group_outputs": false, - "loop_types": null, - "method": "export_document", - "name": "data", - "options": null, - "required_inputs": null, - "selected": "Data", - "tool_mode": true, - "types": [ - "Data" - ], - "value": "__UNDEFINED__" - }, - { - "allows_loop": false, - "cache": true, - "display_name": "DataFrame", - "group_outputs": false, - "loop_types": null, - "method": "as_dataframe", - "name": "dataframe", - "options": null, - "required_inputs": null, - "selected": "DataFrame", - "tool_mode": true, - "types": [ - "DataFrame" - ], - "value": "__UNDEFINED__" - } - ], - "pinned": false, - "template": { - "_type": "Component", - "code": { - "advanced": true, - "dynamic": true, - "fileTypes": [], - "file_path": "", - "info": "", - "list": false, - "load_from_db": false, - "multiline": true, - "name": "code", - "password": false, - "placeholder": "", - "required": true, - "show": true, - "title_case": false, - "type": "code", - "value": "from typing import Any\n\nfrom docling_core.types.doc import ImageRefMode\n\nfrom lfx.base.data.docling_utils import extract_docling_documents\nfrom lfx.custom import Component\nfrom lfx.io import DropdownInput, HandleInput, MessageTextInput, Output, StrInput\nfrom lfx.schema import Data, DataFrame\n\n\nclass ExportDoclingDocumentComponent(Component):\n display_name: str = \"Export DoclingDocument\"\n description: str = \"Export DoclingDocument to markdown, html or other formats.\"\n documentation = \"https://docling-project.github.io/docling/\"\n icon = \"Docling\"\n name = \"ExportDoclingDocument\"\n\n inputs = [\n HandleInput(\n name=\"data_inputs\",\n display_name=\"Data or DataFrame\",\n info=\"The data with documents to export.\",\n input_types=[\"Data\", \"DataFrame\"],\n required=True,\n ),\n DropdownInput(\n name=\"export_format\",\n display_name=\"Export format\",\n options=[\"Markdown\", \"HTML\", \"Plaintext\", \"DocTags\"],\n info=\"Select the export format to convert the input.\",\n value=\"Markdown\",\n real_time_refresh=True,\n ),\n DropdownInput(\n name=\"image_mode\",\n display_name=\"Image export mode\",\n options=[\"placeholder\", \"embedded\"],\n info=(\n \"Specify how images are exported in the output. Placeholder will replace the images with a string, \"\n \"whereas Embedded will include them as base64 encoded images.\"\n ),\n value=\"placeholder\",\n ),\n StrInput(\n name=\"md_image_placeholder\",\n display_name=\"Image placeholder\",\n info=\"Specify the image placeholder for markdown exports.\",\n value=\"\",\n advanced=True,\n ),\n StrInput(\n name=\"md_page_break_placeholder\",\n display_name=\"Page break placeholder\",\n info=\"Add this placeholder betweek pages in the markdown output.\",\n value=\"\",\n advanced=True,\n ),\n MessageTextInput(\n name=\"doc_key\",\n display_name=\"Doc Key\",\n info=\"The key to use for the DoclingDocument column.\",\n value=\"doc\",\n advanced=True,\n ),\n ]\n\n outputs = [\n Output(display_name=\"Exported data\", name=\"data\", method=\"export_document\"),\n Output(display_name=\"DataFrame\", name=\"dataframe\", method=\"as_dataframe\"),\n ]\n\n def update_build_config(self, build_config: dict, field_value: Any, field_name: str | None = None) -> dict:\n if field_name == \"export_format\" and field_value == \"Markdown\":\n build_config[\"md_image_placeholder\"][\"show\"] = True\n build_config[\"md_page_break_placeholder\"][\"show\"] = True\n build_config[\"image_mode\"][\"show\"] = True\n elif field_name == \"export_format\" and field_value == \"HTML\":\n build_config[\"md_image_placeholder\"][\"show\"] = False\n build_config[\"md_page_break_placeholder\"][\"show\"] = False\n build_config[\"image_mode\"][\"show\"] = True\n elif field_name == \"export_format\" and field_value in {\"Plaintext\", \"DocTags\"}:\n build_config[\"md_image_placeholder\"][\"show\"] = False\n build_config[\"md_page_break_placeholder\"][\"show\"] = False\n build_config[\"image_mode\"][\"show\"] = False\n\n return build_config\n\n def export_document(self) -> list[Data]:\n documents, warning = extract_docling_documents(self.data_inputs, self.doc_key)\n if warning:\n self.status = warning\n\n results: list[Data] = []\n try:\n image_mode = ImageRefMode(self.image_mode)\n for doc in documents:\n content = \"\"\n if self.export_format == \"Markdown\":\n content = doc.export_to_markdown(\n image_mode=image_mode,\n image_placeholder=self.md_image_placeholder,\n page_break_placeholder=self.md_page_break_placeholder,\n )\n elif self.export_format == \"HTML\":\n content = doc.export_to_html(image_mode=image_mode)\n elif self.export_format == \"Plaintext\":\n content = doc.export_to_text()\n elif self.export_format == \"DocTags\":\n content = doc.export_to_doctags()\n\n results.append(Data(text=content))\n except Exception as e:\n msg = f\"Error splitting text: {e}\"\n raise TypeError(msg) from e\n\n return results\n\n def as_dataframe(self) -> DataFrame:\n return DataFrame(self.export_document())\n" - }, - "data_inputs": { - "_input_type": "HandleInput", - "advanced": false, - "display_name": "Data or DataFrame", - "dynamic": false, - "info": "The data with documents to export.", - "input_types": [ - "Data", - "DataFrame" - ], - "list": false, - "list_add_label": "Add More", - "name": "data_inputs", - "override_skip": false, - "placeholder": "", - "required": true, - "show": true, - "title_case": false, - "trace_as_metadata": true, - "track_in_telemetry": false, - "type": "other", - "value": "" - }, - "doc_key": { - "_input_type": "MessageTextInput", - "advanced": true, - "display_name": "Doc Key", - "dynamic": false, - "info": "The key to use for the DoclingDocument column.", - "input_types": [ - "Message" - ], - "list": false, - "list_add_label": "Add More", - "load_from_db": false, - "name": "doc_key", - "override_skip": false, - "placeholder": "", - "required": false, - "show": true, - "title_case": false, - "tool_mode": false, - "trace_as_input": true, - "trace_as_metadata": true, - "track_in_telemetry": false, - "type": "str", - "value": "doc" - }, - "export_format": { - "_input_type": "DropdownInput", - "advanced": false, - "combobox": false, - "dialog_inputs": {}, - "display_name": "Export format", - "dynamic": false, - "external_options": {}, - "info": "Select the export format to convert the input.", - "name": "export_format", - "options": [ - "Markdown", - "HTML", - "Plaintext", - "DocTags" - ], - "options_metadata": [], - "override_skip": false, - "placeholder": "", - "real_time_refresh": true, - "required": false, - "show": true, - "title_case": false, - "toggle": false, - "tool_mode": false, - "trace_as_metadata": true, - "track_in_telemetry": true, - "type": "str", - "value": "Markdown" - }, - "image_mode": { - "_input_type": "DropdownInput", - "advanced": false, - "combobox": false, - "dialog_inputs": {}, - "display_name": "Image export mode", - "dynamic": false, - "external_options": {}, - "info": "Specify how images are exported in the output. Placeholder will replace the images with a string, whereas Embedded will include them as base64 encoded images.", - "name": "image_mode", - "options": [ - "placeholder", - "embedded" + "tiff", + "txt", + "xls", + "xlsx", + "xhtml", + "xml", + "webp", + "zip", + "tar", + "tgz", + "bz2", + "gz" ], - "options_metadata": [], + "file_path": [], + "info": "Supported file extensions: adoc, asciidoc, asc, bmp, csv, dotx, dotm, docm, docx, htm, html, jpeg, jpg, json, md, pdf, png, potx, ppsx, pptm, potm, ppsm, pptx, tiff, txt, xls, xlsx, xhtml, xml, webp; optionally bundled in file extensions: zip, tar, tgz, bz2, gz", + "list": true, + "list_add_label": "Add More", + "name": "path", "override_skip": false, "placeholder": "", "required": false, "show": true, + "temp_file": false, "title_case": false, - "toggle": false, - "tool_mode": false, + "tool_mode": true, "trace_as_metadata": true, - "track_in_telemetry": true, - "type": "str", - "value": "placeholder" + "track_in_telemetry": false, + "type": "file", + "value": "" }, - "md_image_placeholder": { + "separator": { "_input_type": "StrInput", "advanced": true, - "display_name": "Image placeholder", + "display_name": "Separator", "dynamic": false, - "info": "Specify the image placeholder for markdown exports.", + "info": "Specify the separator to use between multiple outputs in Message format.", "list": false, "list_add_label": "Add More", "load_from_db": false, - "name": "md_image_placeholder", + "name": "separator", "override_skip": false, "placeholder": "", "required": false, @@ -1950,18 +1383,17 @@ "trace_as_metadata": true, "track_in_telemetry": false, "type": "str", - "value": "" + "value": "\n\n" }, - "md_page_break_placeholder": { - "_input_type": "StrInput", + "silent_errors": { + "_input_type": "BoolInput", "advanced": true, - "display_name": "Page break placeholder", + "display_name": "Silent Errors", "dynamic": false, - "info": "Add this placeholder betweek pages in the markdown output.", + "info": "If true, errors will not raise an exception.", "list": false, "list_add_label": "Add More", - "load_from_db": false, - "name": "md_page_break_placeholder", + "name": "silent_errors", "override_skip": false, "placeholder": "", "required": false, @@ -1969,26 +1401,25 @@ "title_case": false, "tool_mode": false, "trace_as_metadata": true, - "track_in_telemetry": false, - "type": "str", - "value": "" + "track_in_telemetry": true, + "type": "bool", + "value": false } }, "tool_mode": false }, - "selected_output": "dataframe", "showNode": false, - "type": "ExportDoclingDocument" + "type": "DoclingRemote" }, "dragging": false, - "id": "ExportDoclingDocument-zZdRg", + "id": "DoclingRemote-Dp3PX", "measured": { "height": 52, "width": 192 }, "position": { - "x": 240.80357530808504, - "y": 1768.6446880643246 + "x": 96.24733042785462, + "y": 1776.8766711328894 }, "selected": false, "type": "genericNode" @@ -2438,8 +1869,8 @@ "width": 192 }, "position": { - "x": 533.2200473013957, - "y": 1769.3212323101711 + "x": 705.5150898330688, + "y": 1778.4580906262447 }, "selected": false, "type": "genericNode" @@ -2889,8 +2320,8 @@ "width": 192 }, "position": { - "x": 1070.158304947694, - "y": 1767.9220352210484 + "x": 1318.1587449554054, + "y": 1780.9746899582965 }, "selected": false, "type": "genericNode" @@ -3340,8 +2771,8 @@ "width": 192 }, "position": { - "x": 787.135897166392, - "y": 1770.3912596804048 + "x": 1015.5573550682316, + "y": 1779.5281179964784 }, "selected": false, "type": "genericNode" @@ -4224,7 +3655,7 @@ "value": "5488df7c-b93f-4f87-a446-b67028bc0813" }, "_frontend_node_folder_id": { - "value": "bbfbd352-e931-4894-afe8-6552a3f0cc2c" + "value": "573ff6c4-003c-4e1d-aa3f-8a83e1f3e020" }, "_type": "Component", "api_base": { @@ -4492,6 +3923,40 @@ "name": "text-embedding-ada-002", "provider": "OpenAI" }, + { + "category": "Ollama", + "icon": "Ollama", + "metadata": { + "embedding_class": "OllamaEmbeddings", + "model_type": "embeddings", + "param_mapping": { + "base_url": "base_url", + "model": "model", + "model_kwargs": "model_kwargs", + "num_ctx": "num_ctx", + "request_timeout": "request_timeout" + } + }, + "name": "nomic-embed-text:latest", + "provider": "Ollama" + }, + { + "category": "Ollama", + "icon": "Ollama", + "metadata": { + "embedding_class": "OllamaEmbeddings", + "model_type": "embeddings", + "param_mapping": { + "base_url": "base_url", + "model": "model", + "model_kwargs": "model_kwargs", + "num_ctx": "num_ctx", + "request_timeout": "request_timeout" + } + }, + "name": "qwen3-embedding:latest", + "provider": "Ollama" + }, { "category": "Google Generative AI", "icon": "GoogleGenerativeAI", @@ -4543,7 +4008,7 @@ "show_progress_bar": "show_progress_bar" } }, - "name": "text-embedding-3-large", + "name": "text-embedding-3-small", "provider": "OpenAI" } ] @@ -4579,7 +4044,7 @@ ], "list": false, "list_add_label": "Add More", - "load_from_db": true, + "load_from_db": false, "name": "project_id", "override_skip": false, "placeholder": "", @@ -4591,7 +4056,7 @@ "trace_as_metadata": true, "track_in_telemetry": false, "type": "str", - "value": "WATSONX_PROJECT_ID" + "value": "" }, "request_timeout": { "_input_type": "FloatInput", @@ -4752,7 +4217,7 @@ "value": "5488df7c-b93f-4f87-a446-b67028bc0813" }, "_frontend_node_folder_id": { - "value": "bbfbd352-e931-4894-afe8-6552a3f0cc2c" + "value": "573ff6c4-003c-4e1d-aa3f-8a83e1f3e020" }, "_type": "Component", "api_base": { @@ -5020,6 +4485,40 @@ "name": "text-embedding-ada-002", "provider": "OpenAI" }, + { + "category": "Ollama", + "icon": "Ollama", + "metadata": { + "embedding_class": "OllamaEmbeddings", + "model_type": "embeddings", + "param_mapping": { + "base_url": "base_url", + "model": "model", + "model_kwargs": "model_kwargs", + "num_ctx": "num_ctx", + "request_timeout": "request_timeout" + } + }, + "name": "nomic-embed-text:latest", + "provider": "Ollama" + }, + { + "category": "Ollama", + "icon": "Ollama", + "metadata": { + "embedding_class": "OllamaEmbeddings", + "model_type": "embeddings", + "param_mapping": { + "base_url": "base_url", + "model": "model", + "model_kwargs": "model_kwargs", + "num_ctx": "num_ctx", + "request_timeout": "request_timeout" + } + }, + "name": "qwen3-embedding:latest", + "provider": "Ollama" + }, { "category": "Google Generative AI", "icon": "GoogleGenerativeAI", @@ -5279,7 +4778,7 @@ "value": "5488df7c-b93f-4f87-a446-b67028bc0813" }, "_frontend_node_folder_id": { - "value": "bbfbd352-e931-4894-afe8-6552a3f0cc2c" + "value": "573ff6c4-003c-4e1d-aa3f-8a83e1f3e020" }, "_type": "Component", "api_base": { @@ -5547,6 +5046,40 @@ "name": "text-embedding-ada-002", "provider": "OpenAI" }, + { + "category": "Ollama", + "icon": "Ollama", + "metadata": { + "embedding_class": "OllamaEmbeddings", + "model_type": "embeddings", + "param_mapping": { + "base_url": "base_url", + "model": "model", + "model_kwargs": "model_kwargs", + "num_ctx": "num_ctx", + "request_timeout": "request_timeout" + } + }, + "name": "nomic-embed-text:latest", + "provider": "Ollama" + }, + { + "category": "Ollama", + "icon": "Ollama", + "metadata": { + "embedding_class": "OllamaEmbeddings", + "model_type": "embeddings", + "param_mapping": { + "base_url": "base_url", + "model": "model", + "model_kwargs": "model_kwargs", + "num_ctx": "num_ctx", + "request_timeout": "request_timeout" + } + }, + "name": "qwen3-embedding:latest", + "provider": "Ollama" + }, { "category": "Google Generative AI", "icon": "GoogleGenerativeAI", @@ -6034,7 +5567,160 @@ }, { "data": { - "id": "TextInput-OGCeZ", + "id": "TextInput-OGCeZ", + "node": { + "base_classes": [ + "Message" + ], + "beta": false, + "conditional_paths": [], + "custom_fields": {}, + "description": "Get user text inputs.", + "display_name": "Text Input", + "documentation": "https://docs.langflow.org/text-input-and-output", + "edited": false, + "field_order": [ + "input_value", + "use_global_variable" + ], + "frozen": false, + "icon": "type", + "last_updated": "2026-02-27T18:37:07.463Z", + "legacy": false, + "metadata": { + "code_hash": "518f16485886", + "dependencies": { + "dependencies": [ + { + "name": "lfx", + "version": null + } + ], + "total_dependencies": 1 + }, + "module": "lfx.components.input_output.text.TextInputComponent" + }, + "minimized": false, + "output_types": [], + "outputs": [ + { + "allows_loop": false, + "cache": true, + "display_name": "Output Text", + "group_outputs": false, + "loop_types": null, + "method": "text_response", + "name": "text", + "options": null, + "required_inputs": null, + "selected": "Message", + "tool_mode": true, + "types": [ + "Message" + ], + "value": "__UNDEFINED__" + } + ], + "pinned": false, + "template": { + "_frontend_node_flow_id": { + "value": "5488df7c-b93f-4f87-a446-b67028bc0813" + }, + "_frontend_node_folder_id": { + "value": "2bef1fdd-4d60-4bb6-8fd2-c0a3eae09d1e" + }, + "_type": "Component", + "code": { + "advanced": true, + "dynamic": true, + "fileTypes": [], + "file_path": "", + "info": "", + "list": false, + "load_from_db": false, + "multiline": true, + "name": "code", + "password": false, + "placeholder": "", + "required": true, + "show": true, + "title_case": false, + "type": "code", + "value": "from typing import Any\n\nfrom lfx.base.io.text import TextComponent\nfrom lfx.io import BoolInput, MultilineInput, Output\nfrom lfx.schema.message import Message\n\n\nclass TextInputComponent(TextComponent):\n display_name = \"Text Input\"\n description = \"Get user text inputs.\"\n documentation: str = \"https://docs.langflow.org/text-input-and-output\"\n icon = \"type\"\n name = \"TextInput\"\n\n inputs = [\n MultilineInput(\n name=\"input_value\",\n display_name=\"Text\",\n info=\"Text to be passed as input.\",\n ),\n BoolInput(\n name=\"use_global_variable\",\n display_name=\"Use Global Variable\",\n info=\"Enable to select from global variables (shows globe icon). Disables multiline editing.\",\n value=False,\n advanced=True,\n real_time_refresh=True,\n ),\n ]\n outputs = [\n Output(display_name=\"Output Text\", name=\"text\", method=\"text_response\"),\n ]\n\n def update_build_config(self, build_config: dict, field_value: Any, field_name: str | None = None) -> dict:\n if field_name == \"use_global_variable\":\n if field_value:\n # Enable global variable mode: single-line with password masking and globe dropdown\n build_config[\"input_value\"][\"multiline\"] = False\n build_config[\"input_value\"][\"password\"] = True\n else:\n # Default mode: multiline text editing\n build_config[\"input_value\"][\"multiline\"] = True\n build_config[\"input_value\"][\"password\"] = False\n return build_config\n\n def text_response(self) -> Message:\n return Message(\n text=self.input_value,\n )\n" + }, + "input_value": { + "_input_type": "MultilineInput", + "advanced": false, + "ai_enabled": false, + "copy_field": false, + "display_name": "Text", + "dynamic": false, + "info": "Text to be passed as input.", + "input_types": [ + "Message" + ], + "list": false, + "list_add_label": "Add More", + "load_from_db": true, + "multiline": false, + "name": "input_value", + "override_skip": false, + "password": true, + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "tool_mode": false, + "trace_as_input": true, + "trace_as_metadata": true, + "track_in_telemetry": false, + "type": "str", + "value": "CONNECTOR_TYPE" + }, + "is_refresh": false, + "use_global_variable": { + "_input_type": "BoolInput", + "advanced": true, + "display_name": "Use Global Variable", + "dynamic": false, + "info": "Enable to select from global variables (shows globe icon). Disables multiline editing.", + "list": false, + "list_add_label": "Add More", + "name": "use_global_variable", + "override_skip": false, + "placeholder": "", + "real_time_refresh": true, + "required": false, + "show": true, + "title_case": false, + "tool_mode": false, + "trace_as_metadata": true, + "track_in_telemetry": true, + "type": "bool", + "value": true + } + }, + "tool_mode": false + }, + "showNode": false, + "type": "TextInput" + }, + "dragging": false, + "id": "TextInput-OGCeZ", + "measured": { + "height": 52, + "width": 192 + }, + "position": { + "x": 1034.5767520283541, + "y": 1227.2295750493238 + }, + "selected": false, + "type": "genericNode" + }, + { + "data": { + "id": "TextInput-PI6at", "node": { "base_classes": [ "Message" @@ -6142,7 +5828,7 @@ "trace_as_metadata": true, "track_in_telemetry": false, "type": "str", - "value": "CONNECTOR_TYPE" + "value": "DOCUMENT_ID" }, "is_refresh": false, "use_global_variable": { @@ -6173,21 +5859,21 @@ "type": "TextInput" }, "dragging": false, - "id": "TextInput-OGCeZ", + "id": "TextInput-PI6at", "measured": { "height": 52, "width": 192 }, "position": { - "x": 1034.5767520283541, - "y": 1227.2295750493238 + "x": 1034.5767520283543, + "y": 1310.364074191445 }, "selected": false, "type": "genericNode" }, { "data": { - "id": "TextInput-PI6at", + "id": "TextInput-gRPNR", "node": { "base_classes": [ "Message" @@ -6295,7 +5981,7 @@ "trace_as_metadata": true, "track_in_telemetry": false, "type": "str", - "value": "DOCUMENT_ID" + "value": "OWNER" }, "is_refresh": false, "use_global_variable": { @@ -6326,21 +6012,21 @@ "type": "TextInput" }, "dragging": false, - "id": "TextInput-PI6at", + "id": "TextInput-gRPNR", "measured": { "height": 52, "width": 192 }, "position": { - "x": 1034.5767520283543, - "y": 1310.364074191445 + "x": 1034.5767520283541, + "y": 1393.498573333566 }, "selected": false, "type": "genericNode" }, { "data": { - "id": "TextInput-gRPNR", + "id": "TextInput-lTHSx", "node": { "base_classes": [ "Message" @@ -6448,7 +6134,7 @@ "trace_as_metadata": true, "track_in_telemetry": false, "type": "str", - "value": "OWNER" + "value": "OWNER_EMAIL" }, "is_refresh": false, "use_global_variable": { @@ -6479,21 +6165,21 @@ "type": "TextInput" }, "dragging": false, - "id": "TextInput-gRPNR", + "id": "TextInput-lTHSx", "measured": { "height": 52, "width": 192 }, "position": { - "x": 1034.5767520283541, - "y": 1393.498573333566 + "x": 1032.6873315933058, + "y": 1476.633072475687 }, "selected": false, "type": "genericNode" }, { "data": { - "id": "TextInput-lTHSx", + "id": "TextInput-68n9L", "node": { "base_classes": [ "Message" @@ -6601,7 +6287,7 @@ "trace_as_metadata": true, "track_in_telemetry": false, "type": "str", - "value": "OWNER_EMAIL" + "value": "OWNER_NAME" }, "is_refresh": false, "use_global_variable": { @@ -6632,21 +6318,21 @@ "type": "TextInput" }, "dragging": false, - "id": "TextInput-lTHSx", + "id": "TextInput-68n9L", "measured": { "height": 52, "width": 192 }, "position": { - "x": 1032.6873315933058, - "y": 1476.633072475687 + "x": 1032.687331593306, + "y": 1552.2098898776146 }, "selected": false, "type": "genericNode" }, { "data": { - "id": "TextInput-68n9L", + "id": "TextInput-UZQ8v", "node": { "base_classes": [ "Message" @@ -6754,7 +6440,7 @@ "trace_as_metadata": true, "track_in_telemetry": false, "type": "str", - "value": "OWNER_NAME" + "value": "SOURCE_URL" }, "is_refresh": false, "use_global_variable": { @@ -6785,52 +6471,66 @@ "type": "TextInput" }, "dragging": false, - "id": "TextInput-68n9L", + "id": "TextInput-UZQ8v", "measured": { "height": 52, "width": 192 }, "position": { - "x": 1032.687331593306, - "y": 1552.2098898776146 + "x": 1034.5767520283541, + "y": 1639.123229889832 }, "selected": false, "type": "genericNode" }, { "data": { - "id": "TextInput-UZQ8v", + "id": "ChunkDoclingDocument-DdOYd", "node": { "base_classes": [ - "Message" + "DataFrame" ], "beta": false, "conditional_paths": [], "custom_fields": {}, - "description": "Get user text inputs.", - "display_name": "Text Input", - "documentation": "https://docs.langflow.org/text-input-and-output", + "description": "Use the DocumentDocument chunkers to split the document into chunks.", + "display_name": "Chunk DoclingDocument", + "documentation": "https://docling-project.github.io/docling/concepts/chunking/", "edited": false, "field_order": [ - "input_value", - "use_global_variable" + "data_inputs", + "chunker", + "provider", + "hf_model_name", + "openai_model_name", + "max_tokens", + "merge_peers", + "always_emit_headings", + "doc_key" ], "frozen": false, - "icon": "type", - "last_updated": "2026-02-27T18:37:07.463Z", + "icon": "Docling", "legacy": false, "metadata": { - "code_hash": "518f16485886", + "code_hash": "49d762d97039", "dependencies": { "dependencies": [ + { + "name": "tiktoken", + "version": "0.12.0" + }, + { + "name": "docling_core", + "version": "2.60.1" + }, { "name": "lfx", "version": null } ], - "total_dependencies": 1 + "total_dependencies": 3 }, - "module": "lfx.components.input_output.text.TextInputComponent" + "module": "lfx.components.docling.chunk_docling_document.ChunkDoclingDocumentComponent" }, "minimized": false, "output_types": [], @@ -6838,30 +6538,72 @@ { "allows_loop": false, "cache": true, - "display_name": "Output Text", + "display_name": "DataFrame", "group_outputs": false, - "loop_types": null, - "method": "text_response", - "name": "text", - "options": null, - "required_inputs": null, - "selected": "Message", + "method": "chunk_documents", + "name": "dataframe", + "selected": "DataFrame", "tool_mode": true, "types": [ - "Message" + "DataFrame" ], "value": "__UNDEFINED__" } ], "pinned": false, "template": { - "_frontend_node_flow_id": { - "value": "5488df7c-b93f-4f87-a446-b67028bc0813" + "_type": "Component", + "always_emit_headings": { + "_input_type": "BoolInput", + "advanced": true, + "display_name": "Always emit headings", + "dynamic": true, + "info": "Emit headings even for empty sections.", + "list": false, + "list_add_label": "Add More", + "name": "always_emit_headings", + "override_skip": false, + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "tool_mode": false, + "trace_as_metadata": true, + "track_in_telemetry": true, + "type": "bool", + "value": false }, - "_frontend_node_folder_id": { - "value": "2bef1fdd-4d60-4bb6-8fd2-c0a3eae09d1e" + "chunker": { + "_input_type": "DropdownInput", + "advanced": false, + "combobox": false, + "dialog_inputs": {}, + "display_name": "Chunker", + "dynamic": false, + "external_options": {}, + "info": "Which chunker to use.", + "input_types": [ + "Message" + ], + "name": "chunker", + "options": [ + "HybridChunker", + "HierarchicalChunker" + ], + "options_metadata": [], + "override_skip": false, + "placeholder": "", + "real_time_refresh": true, + "required": false, + "show": true, + "title_case": false, + "toggle": false, + "tool_mode": false, + "trace_as_metadata": true, + "track_in_telemetry": true, + "type": "str", + "value": "HybridChunker" }, - "_type": "Component", "code": { "advanced": true, "dynamic": true, @@ -6878,26 +6620,45 @@ "show": true, "title_case": false, "type": "code", - "value": "from typing import Any\n\nfrom lfx.base.io.text import TextComponent\nfrom lfx.io import BoolInput, MultilineInput, Output\nfrom lfx.schema.message import Message\n\n\nclass TextInputComponent(TextComponent):\n display_name = \"Text Input\"\n description = \"Get user text inputs.\"\n documentation: str = \"https://docs.langflow.org/text-input-and-output\"\n icon = \"type\"\n name = \"TextInput\"\n\n inputs = [\n MultilineInput(\n name=\"input_value\",\n display_name=\"Text\",\n info=\"Text to be passed as input.\",\n ),\n BoolInput(\n name=\"use_global_variable\",\n display_name=\"Use Global Variable\",\n info=\"Enable to select from global variables (shows globe icon). Disables multiline editing.\",\n value=False,\n advanced=True,\n real_time_refresh=True,\n ),\n ]\n outputs = [\n Output(display_name=\"Output Text\", name=\"text\", method=\"text_response\"),\n ]\n\n def update_build_config(self, build_config: dict, field_value: Any, field_name: str | None = None) -> dict:\n if field_name == \"use_global_variable\":\n if field_value:\n # Enable global variable mode: single-line with password masking and globe dropdown\n build_config[\"input_value\"][\"multiline\"] = False\n build_config[\"input_value\"][\"password\"] = True\n else:\n # Default mode: multiline text editing\n build_config[\"input_value\"][\"multiline\"] = True\n build_config[\"input_value\"][\"password\"] = False\n return build_config\n\n def text_response(self) -> Message:\n return Message(\n text=self.input_value,\n )\n" + "value": "import json\n\nimport tiktoken\nfrom docling_core.transforms.chunker import BaseChunker, DocMeta\nfrom docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker\n\nfrom lfx.base.data.docling_utils import extract_docling_documents\nfrom lfx.custom import Component\nfrom lfx.io import BoolInput, DropdownInput, HandleInput, IntInput, MessageTextInput, Output, StrInput\nfrom lfx.schema import Data, DataFrame\n\n\nclass ChunkDoclingDocumentComponent(Component):\n display_name: str = \"Chunk DoclingDocument\"\n description: str = \"Use the DocumentDocument chunkers to split the document into chunks.\"\n documentation = \"https://docling-project.github.io/docling/concepts/chunking/\"\n icon = \"Docling\"\n name = \"ChunkDoclingDocument\"\n\n inputs = [\n HandleInput(\n name=\"data_inputs\",\n display_name=\"Data or DataFrame\",\n info=\"The data with documents to split in chunks.\",\n input_types=[\"Data\", \"DataFrame\"],\n required=True,\n ),\n DropdownInput(\n name=\"chunker\",\n display_name=\"Chunker\",\n options=[\"HybridChunker\", \"HierarchicalChunker\"],\n info=(\"Which chunker to use.\"),\n value=\"HybridChunker\",\n real_time_refresh=True,\n input_types=[\"Message\"],\n ),\n DropdownInput(\n name=\"provider\",\n display_name=\"Provider\",\n options=[\"Hugging Face\", \"OpenAI\"],\n info=(\"Which tokenizer provider.\"),\n value=\"Hugging Face\",\n show=True,\n real_time_refresh=True,\n advanced=True,\n dynamic=True,\n ),\n StrInput(\n name=\"hf_model_name\",\n display_name=\"HF model name\",\n info=(\n \"Model name of the tokenizer to use with the HybridChunker when Hugging Face is chosen as a tokenizer.\"\n ),\n value=\"sentence-transformers/all-MiniLM-L6-v2\",\n show=True,\n advanced=True,\n dynamic=True,\n ),\n StrInput(\n name=\"openai_model_name\",\n display_name=\"OpenAI model name\",\n info=(\"Model name of the tokenizer to use with the HybridChunker when OpenAI is chosen as a tokenizer.\"),\n value=\"gpt-4o\",\n show=False,\n advanced=True,\n dynamic=True,\n ),\n IntInput(\n name=\"max_tokens\",\n display_name=\"Maximum tokens\",\n info=(\"Maximum number of tokens for the HybridChunker.\"),\n show=True,\n required=False,\n advanced=True,\n dynamic=True,\n input_types=[\"Message\"],\n ),\n BoolInput(\n name=\"merge_peers\",\n display_name=\"Merge peers\",\n info=\"Merge undersized chunks sharing the same relevant metadata.\",\n value=True,\n show=True,\n advanced=True,\n dynamic=True,\n ),\n BoolInput(\n name=\"always_emit_headings\",\n display_name=\"Always emit headings\",\n info=\"Emit headings even for empty sections.\",\n value=False,\n show=True,\n advanced=True,\n dynamic=True,\n ),\n MessageTextInput(\n name=\"doc_key\",\n display_name=\"Doc Key\",\n info=\"The key to use for the DoclingDocument column.\",\n value=\"doc\",\n advanced=True,\n ),\n ]\n\n outputs = [\n Output(display_name=\"DataFrame\", name=\"dataframe\", method=\"chunk_documents\"),\n ]\n\n def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:\n \"\"\"Update build_config to show/hide fields based on chunker and provider selection.\"\"\"\n if field_name == \"chunker\":\n provider_type = build_config[\"provider\"][\"value\"]\n is_hf = provider_type == \"Hugging Face\"\n is_openai = provider_type == \"OpenAI\"\n if field_value == \"HybridChunker\":\n build_config[\"provider\"][\"show\"] = True\n build_config[\"hf_model_name\"][\"show\"] = is_hf\n build_config[\"openai_model_name\"][\"show\"] = is_openai\n build_config[\"max_tokens\"][\"show\"] = True\n build_config[\"merge_peers\"][\"show\"] = True\n build_config[\"always_emit_headings\"][\"show\"] = True\n else:\n build_config[\"provider\"][\"show\"] = False\n build_config[\"hf_model_name\"][\"show\"] = False\n build_config[\"openai_model_name\"][\"show\"] = False\n build_config[\"max_tokens\"][\"show\"] = False\n build_config[\"merge_peers\"][\"show\"] = False\n build_config[\"always_emit_headings\"][\"show\"] = False\n elif field_name == \"provider\" and build_config[\"chunker\"][\"value\"] == \"HybridChunker\":\n if field_value == \"Hugging Face\":\n build_config[\"hf_model_name\"][\"show\"] = True\n build_config[\"openai_model_name\"][\"show\"] = False\n elif field_value == \"OpenAI\":\n build_config[\"hf_model_name\"][\"show\"] = False\n build_config[\"openai_model_name\"][\"show\"] = True\n\n return build_config\n\n def _docs_to_data(self, docs) -> list[Data]:\n return [Data(text=doc.page_content, data=doc.metadata) for doc in docs]\n\n def chunk_documents(self) -> DataFrame:\n documents, warning = extract_docling_documents(self.data_inputs, self.doc_key)\n if warning:\n self.status = warning\n\n chunker: BaseChunker\n if self.chunker == \"HybridChunker\":\n try:\n from docling_core.transforms.chunker.hybrid_chunker import HybridChunker\n except ImportError as e:\n msg = (\n \"HybridChunker is not installed. Please install it with `uv pip install docling-core[chunking] \"\n \"or `uv pip install transformers`\"\n )\n raise ImportError(msg) from e\n max_tokens: int | None = self.max_tokens if self.max_tokens else None\n if self.provider == \"Hugging Face\":\n try:\n from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer\n except ImportError as e:\n msg = (\n \"HuggingFaceTokenizer is not installed.\"\n \" Please install it with `uv pip install docling-core[chunking]`\"\n )\n raise ImportError(msg) from e\n tokenizer = HuggingFaceTokenizer.from_pretrained(\n model_name=self.hf_model_name,\n max_tokens=max_tokens,\n )\n elif self.provider == \"OpenAI\":\n try:\n from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer\n except ImportError as e:\n msg = (\n \"OpenAITokenizer is not installed.\"\n \" Please install it with `uv pip install docling-core[chunking]`\"\n \" or `uv pip install transformers`\"\n )\n raise ImportError(msg) from e\n if max_tokens is None:\n max_tokens = 128 * 1024 # context window length required for OpenAI tokenizers\n tokenizer = OpenAITokenizer(\n tokenizer=tiktoken.encoding_for_model(self.openai_model_name), max_tokens=max_tokens\n )\n chunker = HybridChunker(\n tokenizer=tokenizer,\n merge_peers=bool(self.merge_peers),\n always_emit_headings=bool(self.always_emit_headings),\n )\n\n elif self.chunker == \"HierarchicalChunker\":\n chunker = HierarchicalChunker()\n else:\n msg = f\"Unknown chunker: {self.chunker}\"\n raise ValueError(msg)\n\n results: list[Data] = []\n try:\n for doc in documents:\n for chunk in chunker.chunk(dl_doc=doc):\n enriched_text = chunker.contextualize(chunk=chunk)\n meta = DocMeta.model_validate(chunk.meta)\n\n results.append(\n Data(\n data={\n \"text\": enriched_text,\n \"document_id\": f\"{doc.origin.binary_hash}\",\n \"doc_items\": json.dumps([item.self_ref for item in meta.doc_items]),\n }\n )\n )\n\n except Exception as e:\n msg = f\"Error splitting text: {e}\"\n raise TypeError(msg) from e\n\n return DataFrame(results)\n" }, - "input_value": { - "_input_type": "MultilineInput", + "data_inputs": { + "_input_type": "HandleInput", "advanced": false, - "ai_enabled": false, - "copy_field": false, - "display_name": "Text", + "display_name": "Data or DataFrame", "dynamic": false, - "info": "Text to be passed as input.", + "info": "The data with documents to split in chunks.", + "input_types": [ + "Data", + "DataFrame" + ], + "list": false, + "list_add_label": "Add More", + "name": "data_inputs", + "override_skip": false, + "placeholder": "", + "required": true, + "show": true, + "title_case": false, + "trace_as_metadata": true, + "track_in_telemetry": false, + "type": "other", + "value": "" + }, + "doc_key": { + "_input_type": "MessageTextInput", + "advanced": true, + "display_name": "Doc Key", + "dynamic": false, + "info": "The key to use for the DoclingDocument column.", "input_types": [ "Message" ], "list": false, "list_add_label": "Add More", - "load_from_db": true, - "multiline": false, - "name": "input_value", + "load_from_db": false, + "name": "doc_key", "override_skip": false, - "password": true, "placeholder": "", "required": false, "show": true, @@ -6907,21 +6668,63 @@ "trace_as_metadata": true, "track_in_telemetry": false, "type": "str", - "value": "SOURCE_URL" + "value": "doc" }, - "is_refresh": false, - "use_global_variable": { + "hf_model_name": { + "_input_type": "StrInput", + "advanced": true, + "display_name": "HF model name", + "dynamic": true, + "info": "Model name of the tokenizer to use with the HybridChunker when Hugging Face is chosen as a tokenizer.", + "list": false, + "list_add_label": "Add More", + "load_from_db": false, + "name": "hf_model_name", + "override_skip": false, + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "tool_mode": false, + "trace_as_metadata": true, + "track_in_telemetry": false, + "type": "str", + "value": "sentence-transformers/all-MiniLM-L6-v2" + }, + "max_tokens": { + "_input_type": "IntInput", + "advanced": true, + "display_name": "Maximum tokens", + "dynamic": true, + "info": "Maximum number of tokens for the HybridChunker.", + "input_types": [ + "Message" + ], + "list": false, + "list_add_label": "Add More", + "name": "max_tokens", + "override_skip": false, + "placeholder": "", + "required": false, + "show": true, + "title_case": false, + "tool_mode": false, + "trace_as_metadata": true, + "track_in_telemetry": true, + "type": "int", + "value": 0 + }, + "merge_peers": { "_input_type": "BoolInput", "advanced": true, - "display_name": "Use Global Variable", - "dynamic": false, - "info": "Enable to select from global variables (shows globe icon). Disables multiline editing.", + "display_name": "Merge peers", + "dynamic": true, + "info": "Merge undersized chunks sharing the same relevant metadata.", "list": false, "list_add_label": "Add More", - "name": "use_global_variable", + "name": "merge_peers", "override_skip": false, "placeholder": "", - "real_time_refresh": true, "required": false, "show": true, "title_case": false, @@ -6930,31 +6733,80 @@ "track_in_telemetry": true, "type": "bool", "value": true + }, + "openai_model_name": { + "_input_type": "StrInput", + "advanced": true, + "display_name": "OpenAI model name", + "dynamic": true, + "info": "Model name of the tokenizer to use with the HybridChunker when OpenAI is chosen as a tokenizer.", + "list": false, + "list_add_label": "Add More", + "load_from_db": false, + "name": "openai_model_name", + "override_skip": false, + "placeholder": "", + "required": false, + "show": false, + "title_case": false, + "tool_mode": false, + "trace_as_metadata": true, + "track_in_telemetry": false, + "type": "str", + "value": "gpt-4o" + }, + "provider": { + "_input_type": "DropdownInput", + "advanced": true, + "combobox": false, + "dialog_inputs": {}, + "display_name": "Provider", + "dynamic": true, + "external_options": {}, + "info": "Which tokenizer provider.", + "name": "provider", + "options": [ + "Hugging Face", + "OpenAI" + ], + "options_metadata": [], + "override_skip": false, + "placeholder": "", + "real_time_refresh": true, + "required": false, + "show": true, + "title_case": false, + "toggle": false, + "tool_mode": false, + "trace_as_metadata": true, + "track_in_telemetry": true, + "type": "str", + "value": "Hugging Face" } }, "tool_mode": false }, "showNode": false, - "type": "TextInput" + "type": "ChunkDoclingDocument" }, "dragging": false, - "id": "TextInput-UZQ8v", + "id": "ChunkDoclingDocument-DdOYd", "measured": { "height": 52, "width": 192 }, "position": { - "x": 1034.5767520283541, - "y": 1639.123229889832 + "x": 386.5505235051394, + "y": 1778.4620067663204 }, "selected": false, "type": "genericNode" } ], "viewport": { - "x": 202.3992870671026, - "y": -396.95628496879374, - "zoom": 0.50751717990745 + "x": 214.73712842346947, + "y": -385.1444291751076, + "zoom": 0.49162047009596377 } }, "description": "Load your data for chat context with Retrieval Augmented Generation.",