diff --git a/EXTERNAL_PROPERTIES_FEATURE.md b/EXTERNAL_PROPERTIES_FEATURE.md new file mode 100644 index 00000000..a17600a9 --- /dev/null +++ b/EXTERNAL_PROPERTIES_FEATURE.md @@ -0,0 +1,83 @@ +# External Properties Feature - Implementation Summary + +## Overview + +Added a flexible external properties feature that allows adding any business-specific properties from source document metadata to chunk nodes in the graph database. + +## Changes Made + +### 1. Configuration (`lexical-graph/src/graphrag_toolkit/lexical_graph/config.py`) +- Added `chunk_external_properties` property to `GraphRAGConfig` +- Accepts dictionary mapping chunk property names to source metadata keys +- Supports environment variable: `CHUNK_EXTERNAL_PROPERTIES` (JSON format) +- Default: `None` (feature disabled) + +### 2. Chunk Node Builder (`lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_node_builder.py`) +- Extracts multiple properties from validated source metadata when configured +- Iterates through property mapping and adds each available property +- Adds to chunk metadata: `metadata['chunk']['metadata'][property_name]` (nested structure matching source metadata) +- Uses `_get_source_info_metadata()` to ensure only valid (non-collection-based) metadata is used + +### 3. 
Chunk Graph Builder (`lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py`) +- Stores all external properties as properties on chunk nodes +- Reads from nested `metadata['chunk']['metadata']` dictionary +- Dynamically generates SET statements for each property +- Uses: `SET chunk.property_name = params.property_name` + +## Usage + +```python +from graphrag_toolkit.lexical_graph import GraphRAGConfig +from llama_index.core.schema import Document + +# Configure multiple properties +GraphRAGConfig.chunk_external_properties = { + 'article_code': 'article_id', + 'document_type': 'doc_type', + 'department': 'dept_code' +} + +# Create document with metadata +doc = Document( + text="Your content...", + metadata={ + 'article_id': 'ART-2024-001', + 'doc_type': 'research', + 'dept_code': 'ENG' + } +) + +# Build graph - chunks will have all configured properties +``` + +## Query Examples + +```cypher +// Find chunks by article code +MATCH (chunk:__Chunk__ {article_code: 'ART-2024-001'}) +RETURN chunk + +// Find chunks by document type +MATCH (chunk:__Chunk__ {document_type: 'research'}) +RETURN chunk + +// Complex multi-property query +MATCH (chunk:__Chunk__) +WHERE chunk.document_type = 'research' + AND chunk.department = 'ENG' +RETURN chunk +``` + +## Files Modified + +- `lexical-graph/src/graphrag_toolkit/lexical_graph/config.py` +- `lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_node_builder.py` +- `lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py` + +## Key Features + +- **Flexible**: Support any number of properties +- **Configurable**: Dictionary-based mapping +- **Graceful**: Handles missing metadata keys +- **Backward Compatible**: No breaking changes +- **Safe**: Uses validated source metadata to avoid write failures diff --git a/lexical-graph/src/graphrag_toolkit/lexical_graph/__init__.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/__init__.py index 
9d32969c..56546536 100644 --- a/lexical-graph/src/graphrag_toolkit/lexical_graph/__init__.py +++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/__init__.py @@ -40,7 +40,7 @@ def _asyncio_run(coro): from .tenant_id import TenantId, DEFAULT_TENANT_ID, DEFAULT_TENANT_NAME, TenantIdType, to_tenant_id from .config import GraphRAGConfig as GraphRAGConfig, LLMType, EmbeddingType -from .errors import ModelError, BatchJobError, IndexError, GraphQueryError +from .errors import ModelError, BatchJobError, IndexError, GraphQueryError, ConfigurationError from .logging import set_logging_config, set_advanced_logging_config from .lexical_graph_query_engine import LexicalGraphQueryEngine from .lexical_graph_index import LexicalGraphIndex diff --git a/lexical-graph/src/graphrag_toolkit/lexical_graph/config.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/config.py index 73f40e52..504ed9f7 100644 --- a/lexical-graph/src/graphrag_toolkit/lexical_graph/config.py +++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/config.py @@ -19,6 +19,8 @@ from boto3 import Session as Boto3Session from botocore.config import Config +from graphrag_toolkit.lexical_graph.errors import ConfigurationError + from llama_index.llms.bedrock_converse import BedrockConverse from llama_index.embeddings.bedrock import BedrockEmbedding from llama_index.core.settings import Settings @@ -49,6 +51,7 @@ DEFAULT_METADATA_DATETIME_SUFFIXES = ['_date', '_datetime'] DEFAULT_OPENSEARCH_ENGINE = 'nmslib' DEFAULT_ENABLE_VERSIONING = False +DEFAULT_CHUNK_EXTERNAL_PROPERTIES = None def _is_json_string(s): """ @@ -287,6 +290,7 @@ class _GraphRAGConfig: _metadata_datetime_suffixes: Optional[List[str]] = None _opensearch_engine: Optional[str] = None _enable_versioning = None + _chunk_external_properties: Optional[Dict[str, str]] = None @contextlib.contextmanager def _validate_sso_token(self, profile): @@ -1167,5 +1171,59 @@ def enable_versioning(self) -> bool: def enable_versioning(self, enable_versioning: bool) -> 
None:
         self._enable_versioning = enable_versioning
 
+    @property
+    def chunk_external_properties(self) -> Optional[Dict[str, str]]:
+        """
+        Gets the mapping of external property names to source metadata keys.
+
+        This property allows you to configure which metadata fields from source documents
+        should be extracted and added as properties on chunk nodes in the graph database.
+        This enables querying and filtering chunks by business-specific identifiers.
+
+        The mapping is a dictionary where:
+        - Key: The property name to use on the chunk node (e.g., 'article_code', 'document_id')
+        - Value: The metadata key to extract from source document (e.g., 'article_id', 'doc_ref')
+
+        Example:
+            {
+                'article_code': 'article_id',    # chunk.article_code from metadata['article_id']
+                'document_type': 'doc_type',     # chunk.document_type from metadata['doc_type']
+                'department': 'dept_code'        # chunk.department from metadata['dept_code']
+            }
+
+        While no mapping has been set, the value is read from the
+        CHUNK_EXTERNAL_PROPERTIES environment variable, which must hold a JSON
+        object. A mapping loaded this way is validated exactly like one
+        assigned through the setter.
+
+        Returns:
+            Optional[Dict[str, str]]: Dictionary mapping chunk property names to metadata keys,
+            or None if not configured.
+
+        Raises:
+            ConfigurationError: If CHUNK_EXTERNAL_PROPERTIES is not valid JSON, is not a
+            JSON object, or contains a reserved or invalid property name.
+        """
+        if self._chunk_external_properties is None:
+            env_value = os.environ.get('CHUNK_EXTERNAL_PROPERTIES', DEFAULT_CHUNK_EXTERNAL_PROPERTIES)
+            if isinstance(env_value, str):
+                env_value = env_value.strip()
+                if env_value:
+                    try:
+                        env_value = json.loads(env_value)
+                    except ValueError as err:
+                        # A malformed value must not be stored as a raw string:
+                        # consumers expect a dict and would otherwise silently
+                        # disable the feature.
+                        raise ConfigurationError(
+                            'CHUNK_EXTERNAL_PROPERTIES must be a JSON object mapping '
+                            'chunk property names to source metadata keys'
+                        ) from err
+                else:
+                    env_value = None
+            if env_value is not None:
+                # Go through the setter so environment-supplied mappings get
+                # the same type and reserved-name checks as assigned ones.
+                self.chunk_external_properties = env_value
+        return self._chunk_external_properties
+
+    @chunk_external_properties.setter
+    def chunk_external_properties(self, chunk_external_properties: Optional[Dict[str, str]]) -> None:
+        """
+        Sets the mapping of external property names to source metadata keys.
+
+        Args:
+            chunk_external_properties: Dictionary mapping chunk property names to metadata keys,
+                                      or None to disable the feature.
+
+        Raises:
+            ConfigurationError: If the value is neither a dictionary nor None, if it
+            contains a reserved key ('text', 'chunkId', 'chunk_id', 'value'), or if a
+            key is not a plain identifier.
+
+        Example:
+            GraphRAGConfig.chunk_external_properties = {
+                'article_code': 'article_id',
+                'document_type': 'doc_type'
+            }
+        """
+        if chunk_external_properties is not None:
+            if not isinstance(chunk_external_properties, dict):
+                raise ConfigurationError('chunk_external_properties must be a dictionary or None')
+            # The chunk insert query passes 'chunk_id' and 'text' as parameters,
+            # merges on the 'chunkId' node id and stores the chunk text in
+            # chunk.value; an external property with any of these names would
+            # overwrite them.
+            for reserved_name in ('text', 'chunkId', 'chunk_id', 'value'):
+                if reserved_name in chunk_external_properties:
+                    raise ConfigurationError(f"chunk_external_properties cannot contain a '{reserved_name}' key")
+            # Property names are interpolated unquoted into the query as
+            # 'chunk.<name> = params.<name>', so only plain identifiers are safe.
+            for property_name in chunk_external_properties:
+                if not (isinstance(property_name, str) and property_name.isidentifier()):
+                    raise ConfigurationError(
+                        f"chunk_external_properties key '{property_name}' is not a valid property name"
+                    )
+        self._chunk_external_properties = chunk_external_properties
+
 GraphRAGConfig = _GraphRAGConfig()
\ No newline at end of file
diff --git a/lexical-graph/src/graphrag_toolkit/lexical_graph/errors.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/errors.py
index 4a51e8b0..fa7e149e 100644
--- a/lexical-graph/src/graphrag_toolkit/lexical_graph/errors.py
+++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/errors.py
@@ -1,6 +1,9 @@
 # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+class ConfigurationError(Exception):
+    pass
+
 class ModelError(Exception):
     pass
 
diff --git a/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py
index e035745d..3ace9de5 100644
--- a/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py
+++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py
@@ -50,9 +50,9 @@ def build(self, node:BaseNode, graph_client: GraphStore, **kwargs:Any):
             graph_client: The graph client interface to interact with the graph database.
             **kwargs: Additional optional parameters for configuring the operation.
""" - chunk_metadata = node.metadata.get('chunk', {}) - chunk_id = chunk_metadata.get('chunkId', None) - + chunk_id = node.metadata.get('chunk', {}).get('chunkId', None) + chunk_metadata = node.metadata.get('chunk', {}).get('metadata', {}) + if chunk_id: logger.debug(f'Inserting chunk [chunk_id: {chunk_id}]') @@ -60,15 +60,27 @@ def build(self, node:BaseNode, graph_client: GraphStore, **kwargs:Any): statements_c = [ '// insert chunks', 'UNWIND $params AS params', - f'MERGE (chunk:`__Chunk__`{{{graph_client.node_id("chunkId")}: params.chunk_id}})', - 'ON CREATE SET chunk.value = params.text ON MATCH SET chunk.value = params.text' + f'MERGE (chunk:`__Chunk__`{{{graph_client.node_id("chunkId")}: params.chunk_id}})' ] + chunk_property_setters = [ + 'chunk.value = params.text' + ] + properties_c = { 'chunk_id': chunk_id, 'text': node.text } + # Add external properties if present + for key, value in chunk_metadata.items(): + if key != 'chunkId': # Skip the ID field + chunk_property_setters.append(f'chunk.{key} = params.{key}') + properties_c[key] = value + + setter_statement = f"ON CREATE SET {', '.join(chunk_property_setters)} ON MATCH SET {', '.join(chunk_property_setters)}" + statements_c.append(setter_statement) + query_c = '\n'.join(statements_c) graph_client.execute_query_with_retry(query_c, self._to_params(properties_c), max_attempts=5, max_wait=7) @@ -134,4 +146,4 @@ def insert_chunk_to_chunk_relationship(node_id:str, relationship_type:str): insert_chunk_to_chunk_relationship(node_id, 'next') else: - logger.warning(f'chunk_id missing from chunk node [node_id: {node.node_id}]') \ No newline at end of file + logger.warning(f'chunk_id missing from chunk node [node_id: {node.node_id}]') diff --git a/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_node_builder.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_node_builder.py index a336a0af..4a29fc0f 100644 --- 
a/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_node_builder.py +++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_node_builder.py @@ -6,6 +6,7 @@ from llama_index.core.schema import BaseNode, DEFAULT_TEXT_NODE_TMPL from llama_index.core.schema import NodeRelationship +from graphrag_toolkit.lexical_graph import GraphRAGConfig from graphrag_toolkit.lexical_graph.indexing.build.node_builder import NodeBuilder from graphrag_toolkit.lexical_graph.indexing.constants import TOPICS_KEY from graphrag_toolkit.lexical_graph.storage.constants import INDEX_KEY @@ -90,14 +91,26 @@ def build_nodes(self, nodes:List[BaseNode], **kwargs): 'sourceId': source_id }, 'chunk': { - 'chunkId': chunk_id + 'chunkId': chunk_id, + 'metadata': {} }, 'topics': topics } if source_info.metadata: + metadata['source'].update(self._get_source_info_metadata(source_info.metadata)) + # Add external properties if configured + external_props = GraphRAGConfig.chunk_external_properties + if external_props and isinstance(external_props, dict): + valid_source_metadata = metadata['source']['metadata'] + chunk_metadata = metadata['chunk']['metadata'] + for prop_name, metadata_key in external_props.items(): + if metadata_key in valid_source_metadata: + chunk_metadata[prop_name] = valid_source_metadata[metadata_key] + + metadata = self._update_metadata_with_versioning_info(metadata, node, build_timestamp) metadata[INDEX_KEY] = {