Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions EXTERNAL_PROPERTIES_FEATURE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# External Properties Feature - Implementation Summary

## Overview

Added a flexible external-properties feature that copies configured, business-specific fields from a source document's metadata onto the chunk nodes created for that document in the graph database.

## Changes Made

### 1. Configuration (`lexical-graph/src/graphrag_toolkit/lexical_graph/config.py`)
- Added `chunk_external_properties` property to `GraphRAGConfig`
- Accepts a dictionary mapping chunk property names to source metadata keys; the property names `text` and `chunkId` are reserved and raise `ConfigurationError`
- Supports environment variable: `CHUNK_EXTERNAL_PROPERTIES` (JSON object, e.g. `{"article_code": "article_id", "document_type": "doc_type"}`)
- Default: `None` (feature disabled)

### 2. Chunk Node Builder (`lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_node_builder.py`)
- Extracts multiple properties from validated source metadata when configured
- Iterates through property mapping and adds each available property
- Adds to chunk metadata: `metadata['chunk']['metadata'][property_name]` (nested structure matching source metadata)
- Uses `_get_source_info_metadata()` to ensure only valid (non-collection-based) metadata is used

### 3. Chunk Graph Builder (`lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py`)
- Stores all external properties as properties on chunk nodes
- Reads from nested `metadata['chunk']['metadata']` dictionary
- Dynamically generates SET statements for each property
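- Uses `SET chunk.property_name = params.property_name`, applied in both the `ON CREATE` and `ON MATCH` clauses so an existing chunk's properties are updated when it is written again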

## Usage

```python
from graphrag_toolkit.lexical_graph import GraphRAGConfig
from llama_index.core.schema import Document

# Configure multiple properties
GraphRAGConfig.chunk_external_properties = {
'article_code': 'article_id',
'document_type': 'doc_type',
'department': 'dept_code'
}

# Create document with metadata
doc = Document(
text="Your content...",
metadata={
'article_id': 'ART-2024-001',
'doc_type': 'research',
'dept_code': 'ENG'
}
)

# Build graph - chunks will have all configured properties
```

## Query Examples

```cypher
// Find chunks by article code
MATCH (chunk:__Chunk__ {article_code: 'ART-2024-001'})
RETURN chunk

// Find chunks by document type
MATCH (chunk:__Chunk__ {document_type: 'research'})
RETURN chunk

// Complex multi-property query
MATCH (chunk:__Chunk__)
WHERE chunk.document_type = 'research'
AND chunk.department = 'ENG'
RETURN chunk
```

## Files Modified

- `lexical-graph/src/graphrag_toolkit/lexical_graph/config.py`
- `lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_node_builder.py`
- `lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py`

## Key Features

- **Flexible**: Support any number of properties
- **Configurable**: Dictionary-based mapping
- **Graceful**: A mapped metadata key that is missing from a source document is skipped; the chunk is written without that property
- **Backward Compatible**: No breaking changes
- **Safe**: Uses validated source metadata to avoid write failures
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def _asyncio_run(coro):

from .tenant_id import TenantId, DEFAULT_TENANT_ID, DEFAULT_TENANT_NAME, TenantIdType, to_tenant_id
from .config import GraphRAGConfig as GraphRAGConfig, LLMType, EmbeddingType
from .errors import ModelError, BatchJobError, IndexError, GraphQueryError
from .errors import ModelError, BatchJobError, IndexError, GraphQueryError, ConfigurationError
from .logging import set_logging_config, set_advanced_logging_config
from .lexical_graph_query_engine import LexicalGraphQueryEngine
from .lexical_graph_index import LexicalGraphIndex
Expand Down
58 changes: 58 additions & 0 deletions lexical-graph/src/graphrag_toolkit/lexical_graph/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
from boto3 import Session as Boto3Session
from botocore.config import Config

from graphrag_toolkit.lexical_graph.errors import ConfigurationError

from llama_index.llms.bedrock_converse import BedrockConverse
from llama_index.embeddings.bedrock import BedrockEmbedding
from llama_index.core.settings import Settings
Expand Down Expand Up @@ -49,6 +51,7 @@
DEFAULT_METADATA_DATETIME_SUFFIXES = ['_date', '_datetime']
DEFAULT_OPENSEARCH_ENGINE = 'nmslib'
DEFAULT_ENABLE_VERSIONING = False
DEFAULT_CHUNK_EXTERNAL_PROPERTIES = None

def _is_json_string(s):
"""
Expand Down Expand Up @@ -287,6 +290,7 @@ class _GraphRAGConfig:
_metadata_datetime_suffixes: Optional[List[str]] = None
_opensearch_engine: Optional[str] = None
_enable_versioning = None
_chunk_external_properties: Optional[Dict[str, str]] = None

@contextlib.contextmanager
def _validate_sso_token(self, profile):
Expand Down Expand Up @@ -1167,5 +1171,59 @@ def enable_versioning(self) -> bool:
def enable_versioning(self, enable_versioning: bool) -> None:
self._enable_versioning = enable_versioning

@property
def chunk_external_properties(self) -> Optional[Dict[str, str]]:
    """
    Gets the mapping of external property names to source metadata keys.

    This property allows you to configure which metadata fields from source documents
    should be extracted and added as properties on chunk nodes in the graph database.
    This enables querying and filtering chunks by business-specific identifiers.

    The mapping is a dictionary where:
    - Key: The property name to use on the chunk node (e.g., 'article_code', 'document_id')
    - Value: The metadata key to extract from source document (e.g., 'article_id', 'doc_ref')

    Example:
        {
            'article_code': 'article_id',    # chunk.article_code from metadata['article_id']
            'document_type': 'doc_type',     # chunk.document_type from metadata['doc_type']
            'department': 'dept_code'        # chunk.department from metadata['dept_code']
        }

    When no value has been set explicitly, the CHUNK_EXTERNAL_PROPERTIES environment
    variable is consulted. Its value must be a JSON object and is validated with the
    same rules as a value assigned through the setter.

    Returns:
        Optional[Dict[str, str]]: Dictionary mapping chunk property names to metadata keys,
        or None if not configured.

    Raises:
        ConfigurationError: If CHUNK_EXTERNAL_PROPERTIES is set but is not JSON, is not a
        JSON object, or contains a property name the setter rejects.
    """
    if self._chunk_external_properties is None:
        env_value = os.environ.get('CHUNK_EXTERNAL_PROPERTIES', DEFAULT_CHUNK_EXTERNAL_PROPERTIES)
        if env_value:
            if not _is_json_string(env_value):
                raise ConfigurationError(
                    'CHUNK_EXTERNAL_PROPERTIES must be a JSON object mapping chunk property names to metadata keys'
                )
            # Assign through the property setter so an environment-supplied mapping
            # gets exactly the same validation as one assigned in code.
            self.chunk_external_properties = json.loads(env_value)
    return self._chunk_external_properties

@chunk_external_properties.setter
def chunk_external_properties(self, chunk_external_properties: Optional[Dict[str, str]]) -> None:
    """
    Sets the mapping of external property names to source metadata keys.

    Args:
        chunk_external_properties: Dictionary mapping chunk property names to metadata keys,
            or None to disable the feature.

    Raises:
        ConfigurationError: If the value is neither a dictionary nor None, if it uses a
        reserved property name ('text', 'chunkId', 'chunk_id', 'value'), or if a property
        name is not a plain identifier.

    Example:
        GraphRAGConfig.chunk_external_properties = {
            'article_code': 'article_id',
            'document_type': 'doc_type'
        }
    """
    if chunk_external_properties is not None:
        if not isinstance(chunk_external_properties, dict):
            raise ConfigurationError('chunk_external_properties must be a dictionary or None')

        # 'text' and 'chunk_id' are the query parameter names used when chunks are
        # written, 'chunkId' identifies the chunk node, and 'value' is the chunk
        # property that stores the chunk text; reusing any of them would clobber
        # the chunk write.
        for reserved_name in ('text', 'chunkId', 'chunk_id', 'value'):
            if reserved_name in chunk_external_properties:
                raise ConfigurationError(f"chunk_external_properties cannot contain a '{reserved_name}' key")

        # Property names are interpolated directly into the generated
        # 'SET chunk.<name> = params.<name>' statement, so anything other than a
        # plain identifier would yield a broken (or injected) query.
        for property_name in chunk_external_properties:
            if not isinstance(property_name, str) or not property_name.isidentifier():
                raise ConfigurationError(
                    f"chunk_external_properties key {property_name!r} is not a valid property name"
                )

    self._chunk_external_properties = chunk_external_properties


GraphRAGConfig = _GraphRAGConfig()
3 changes: 3 additions & 0 deletions lexical-graph/src/graphrag_toolkit/lexical_graph/errors.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

class ConfigurationError(Exception):
    """Raised when a configuration value is rejected as invalid."""

class ModelError(Exception):
    """Exception type for model-related failures.

    NOTE(review): the raise sites are not visible alongside this definition;
    confirm the exact conditions against the callers.
    """

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,25 +50,37 @@ def build(self, node:BaseNode, graph_client: GraphStore, **kwargs:Any):
graph_client: The graph client interface to interact with the graph database.
**kwargs: Additional optional parameters for configuring the operation.
"""
chunk_metadata = node.metadata.get('chunk', {})
chunk_id = chunk_metadata.get('chunkId', None)

chunk_id = node.metadata.get('chunk', {}).get('chunkId', None)
chunk_metadata = node.metadata.get('chunk', {}).get('metadata', {})
if chunk_id:

logger.debug(f'Inserting chunk [chunk_id: {chunk_id}]')

statements_c = [
'// insert chunks',
'UNWIND $params AS params',
f'MERGE (chunk:`__Chunk__`{{{graph_client.node_id("chunkId")}: params.chunk_id}})',
'ON CREATE SET chunk.value = params.text ON MATCH SET chunk.value = params.text'
f'MERGE (chunk:`__Chunk__`{{{graph_client.node_id("chunkId")}: params.chunk_id}})'
]

chunk_property_setters = [
'chunk.value = params.text'
]

properties_c = {
'chunk_id': chunk_id,
'text': node.text
}

# Add external properties if present
for key, value in chunk_metadata.items():
if key != 'chunkId': # Skip the ID field
chunk_property_setters.append(f'chunk.{key} = params.{key}')
properties_c[key] = value

setter_statement = f"ON CREATE SET {', '.join(chunk_property_setters)} ON MATCH SET {', '.join(chunk_property_setters)}"
statements_c.append(setter_statement)

query_c = '\n'.join(statements_c)

graph_client.execute_query_with_retry(query_c, self._to_params(properties_c), max_attempts=5, max_wait=7)
Expand Down Expand Up @@ -134,4 +146,4 @@ def insert_chunk_to_chunk_relationship(node_id:str, relationship_type:str):
insert_chunk_to_chunk_relationship(node_id, 'next')

else:
logger.warning(f'chunk_id missing from chunk node [node_id: {node.node_id}]')
logger.warning(f'chunk_id missing from chunk node [node_id: {node.node_id}]')
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from llama_index.core.schema import BaseNode, DEFAULT_TEXT_NODE_TMPL
from llama_index.core.schema import NodeRelationship

from graphrag_toolkit.lexical_graph import GraphRAGConfig
from graphrag_toolkit.lexical_graph.indexing.build.node_builder import NodeBuilder
from graphrag_toolkit.lexical_graph.indexing.constants import TOPICS_KEY
from graphrag_toolkit.lexical_graph.storage.constants import INDEX_KEY
Expand Down Expand Up @@ -90,14 +91,26 @@ def build_nodes(self, nodes:List[BaseNode], **kwargs):
'sourceId': source_id
},
'chunk': {
'chunkId': chunk_id
'chunkId': chunk_id,
'metadata': {}
},
'topics': topics
}

if source_info.metadata:

metadata['source'].update(self._get_source_info_metadata(source_info.metadata))

# Add external properties if configured
external_props = GraphRAGConfig.chunk_external_properties
if external_props and isinstance(external_props, dict):
valid_source_metadata = metadata['source']['metadata']
chunk_metadata = metadata['chunk']['metadata']
for prop_name, metadata_key in external_props.items():
if metadata_key in valid_source_metadata:
chunk_metadata[prop_name] = valid_source_metadata[metadata_key]


metadata = self._update_metadata_with_versioning_info(metadata, node, build_timestamp)

metadata[INDEX_KEY] = {
Expand Down