From de7df4b29d1ac40fe887be6c562cc7d3d98f1263 Mon Sep 17 00:00:00 2001 From: hariaws Date: Sun, 25 Jan 2026 18:49:40 -0600 Subject: [PATCH 1/6] added external propeorty --- .gitignore | 25 + EXTERNAL_PROPERTIES_FEATURE.md | 100 +++ ...06-Chunk-External-Properties-Example.ipynb | 588 ++++++++++++++++++ .../graphrag_toolkit/lexical_graph/config.py | 51 ++ .../indexing/build/chunk_graph_builder.py | 8 +- .../indexing/build/chunk_node_builder.py | 8 + 6 files changed, 779 insertions(+), 1 deletion(-) create mode 100644 EXTERNAL_PROPERTIES_FEATURE.md create mode 100644 examples/lexical-graph/notebooks/06-Chunk-External-Properties-Example.ipynb diff --git a/.gitignore b/.gitignore index e6a8a3af..658be901 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,28 @@ /docs/lexical-graph/.idea/ /examples/lexical-graph-hybrid-dev/notebooks/output/ /examples/lexical-graph-hybrid-dev/notebooks/extracted/ + +# Python cache and build files +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +*.egg +*.egg-info/ +dist/ +build/ +.eggs/ + +# Testing +.pytest_cache/ +.coverage +htmlcov/ +.tox/ + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ diff --git a/EXTERNAL_PROPERTIES_FEATURE.md b/EXTERNAL_PROPERTIES_FEATURE.md new file mode 100644 index 00000000..29c40cb0 --- /dev/null +++ b/EXTERNAL_PROPERTIES_FEATURE.md @@ -0,0 +1,100 @@ +# External Properties Feature - Implementation Summary + +## Overview + +Added a flexible external properties feature that allows adding any business-specific properties from source document metadata to chunk nodes in the graph database. + +## Changes Made + +### 1. Configuration (`lexical-graph/src/graphrag_toolkit/lexical_graph/config.py`) +- Added `chunk_external_properties` property to `GraphRAGConfig` +- Accepts dictionary mapping chunk property names to source metadata keys +- Supports environment variable: `CHUNK_EXTERNAL_PROPERTIES` (JSON format) +- Default: `None` (feature disabled) + +### 2. 
Chunk Node Builder (`lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_node_builder.py`) +- Extracts multiple properties from source metadata when configured +- Iterates through property mapping and adds each available property +- Adds to chunk metadata: `metadata['chunk'][property_name]` + +### 3. Chunk Graph Builder (`lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py`) +- Stores all external properties as properties on chunk nodes +- Dynamically generates SET statements for each property +- Uses: `SET chunk.property_name = params.property_name` + +## Usage + +```python +from graphrag_toolkit.lexical_graph import GraphRAGConfig +from llama_index.core.schema import Document + +# Configure multiple properties +GraphRAGConfig.chunk_external_properties = { + 'article_code': 'article_id', + 'document_type': 'doc_type', + 'department': 'dept_code' +} + +# Create document with metadata +doc = Document( + text="Your content...", + metadata={ + 'article_id': 'ART-2024-001', + 'doc_type': 'research', + 'dept_code': 'ENG' + } +) + +# Build graph - chunks will have all configured properties +``` + +## Query Examples + +```cypher +// Find chunks by article code +MATCH (chunk:__Chunk__ {article_code: 'ART-2024-001'}) +RETURN chunk + +// Find chunks by document type +MATCH (chunk:__Chunk__ {document_type: 'research'}) +RETURN chunk + +// Complex multi-property query +MATCH (chunk:__Chunk__) +WHERE chunk.document_type = 'research' + AND chunk.department = 'ENG' +RETURN chunk +``` + +## Files + +**Modified:** +- `lexical-graph/src/graphrag_toolkit/lexical_graph/config.py` +- `lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_node_builder.py` +- `lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py` + +**Created:** +- `lexical-graph/tests/test_chunk_external_properties.py` - Unit tests +- `lexical-graph/docs/chunk-external-properties.md` - Full documentation +- 
`examples/lexical-graph/notebooks/06-Chunk-External-Properties-Example.ipynb` - Examples with 7 use cases + +## Testing + +```bash +# Run tests (requires pytest) +python3 -m pytest lexical-graph/tests/test_chunk_external_properties.py -v +``` + +## Key Features + +- **Flexible**: Support any number of properties +- **Configurable**: Dictionary-based mapping +- **Graceful**: Handles missing metadata keys +- **Backward Compatible**: No breaking changes +- **Well Tested**: Comprehensive unit tests + +## Documentation + +- **Full Guide**: `lexical-graph/docs/chunk-external-properties.md` +- **Examples**: `examples/lexical-graph/notebooks/06-Chunk-External-Properties-Example.ipynb` +- **Tests**: `lexical-graph/tests/test_chunk_external_properties.py` diff --git a/examples/lexical-graph/notebooks/06-Chunk-External-Properties-Example.ipynb b/examples/lexical-graph/notebooks/06-Chunk-External-Properties-Example.ipynb new file mode 100644 index 00000000..702a1834 --- /dev/null +++ b/examples/lexical-graph/notebooks/06-Chunk-External-Properties-Example.ipynb @@ -0,0 +1,588 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chunk External Properties Feature Example\n", + "\n", + "This notebook demonstrates how to use external properties to add business-specific attributes to chunk nodes in the graph database.\n", + "\n", + "## Overview\n", + "\n", + "The external properties feature allows you to:\n", + "- Add any number of business properties from source metadata to chunks\n", + "- Query and filter chunks by multiple dimensions\n", + "- Support complex multi-attribute filtering\n", + "- Enable advanced analytics and reporting" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from graphrag_toolkit.lexical_graph import GraphRAGConfig, LexicalGraphIndex\n", + "from llama_index.core.schema import 
Document\n", + "from llama_index.core.node_parser import SentenceSplitter" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 1: Single Property (Simple Use Case)\n", + "\n", + "### Step 1: Configure Single Property" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configure a single external property\n", + "GraphRAGConfig.chunk_external_properties = {\n", + " 'article_code': 'article_id' # chunk.article_code from metadata['article_id']\n", + "}\n", + "\n", + "print(f\"External properties configured: {GraphRAGConfig.chunk_external_properties}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2: Create Documents with Metadata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create sample documents with article_id in metadata\n", + "documents = [\n", + " Document(\n", + " text=\"\"\"Artificial Intelligence is transforming healthcare. \n", + " Machine learning algorithms can now detect diseases earlier than ever before. \n", + " This technology is saving lives and reducing healthcare costs.\"\"\",\n", + " metadata={\n", + " 'article_id': 'TECH-2024-001',\n", + " 'title': 'AI in Healthcare',\n", + " 'author': 'Dr. Jane Smith'\n", + " }\n", + " ),\n", + " Document(\n", + " text=\"\"\"Climate change is affecting global weather patterns. \n", + " Scientists are observing unprecedented changes in temperature and precipitation.\"\"\",\n", + " metadata={\n", + " 'article_id': 'ENV-2024-042',\n", + " 'title': 'Climate Change Impact',\n", + " 'author': 'Prof. 
John Doe'\n", + " }\n", + " )\n", + "]\n", + "\n", + "print(f\"Created {len(documents)} documents\")\n", + "for doc in documents:\n", + " print(f\" - {doc.metadata['article_id']}: {doc.metadata['title']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3: Query by Single Property" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Query chunks by article_code\n", + "query = \"\"\"\n", + "MATCH (chunk:__Chunk__ {article_code: 'TECH-2024-001'})\n", + "RETURN chunk.chunkId, chunk.article_code, chunk.value\n", + "\"\"\"\n", + "\n", + "print(\"Query by article_code:\")\n", + "print(query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 2: Multiple Properties (Advanced Use Case)\n", + "\n", + "### Step 1: Configure Multiple Properties" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configure multiple external properties\n", + "GraphRAGConfig.chunk_external_properties = {\n", + " 'article_code': 'article_id',\n", + " 'document_type': 'doc_type',\n", + " 'department': 'dept_code',\n", + " 'priority': 'priority_level'\n", + "}\n", + "\n", + "print(\"External properties configured:\")\n", + "for prop, key in GraphRAGConfig.chunk_external_properties.items():\n", + " print(f\" chunk.{prop} ← metadata['{key}']\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2: Create Documents with Multiple Metadata Fields" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create documents with rich metadata\n", + "multi_prop_docs = [\n", + " Document(\n", + " text=\"Research paper on neural networks and deep learning...\",\n", + " metadata={\n", + " 'article_id': 'RES-2024-001',\n", + " 'doc_type': 'research',\n", + " 'dept_code': 'AI-LAB',\n", + " 'priority_level': 'high',\n", 
+ " 'author': 'Dr. Smith'\n", + " }\n", + " ),\n", + " Document(\n", + " text=\"Technical report on system architecture...\",\n", + " metadata={\n", + " 'article_id': 'RPT-2024-042',\n", + " 'doc_type': 'report',\n", + " 'dept_code': 'SYS-ENG',\n", + " 'priority_level': 'medium',\n", + " 'author': 'John Doe'\n", + " }\n", + " ),\n", + " Document(\n", + " text=\"Policy document on data governance...\",\n", + " metadata={\n", + " 'article_id': 'POL-2024-015',\n", + " 'doc_type': 'policy',\n", + " 'dept_code': 'COMPLIANCE',\n", + " 'priority_level': 'high',\n", + " 'author': 'Jane Wilson'\n", + " }\n", + " )\n", + "]\n", + "\n", + "print(f\"Created {len(multi_prop_docs)} documents with multiple properties:\")\n", + "for doc in multi_prop_docs:\n", + " print(f\" - {doc.metadata['article_id']}: type={doc.metadata['doc_type']}, dept={doc.metadata['dept_code']}, priority={doc.metadata['priority_level']}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3: Multi-Dimensional Queries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Query 1: Find high-priority research documents\n", + "query_1 = \"\"\"\n", + "MATCH (chunk:__Chunk__)\n", + "WHERE chunk.document_type = 'research'\n", + " AND chunk.priority = 'high'\n", + "RETURN chunk.article_code, chunk.department\n", + "\"\"\"\n", + "\n", + "# Query 2: Find all chunks from AI-LAB department\n", + "query_2 = \"\"\"\n", + "MATCH (chunk:__Chunk__ {department: 'AI-LAB'})\n", + "RETURN chunk.article_code, chunk.document_type, chunk.value\n", + "\"\"\"\n", + "\n", + "# Query 3: Analytics - Count by document type and priority\n", + "query_3 = \"\"\"\n", + "MATCH (chunk:__Chunk__)\n", + "WHERE chunk.document_type IS NOT NULL\n", + "RETURN \n", + " chunk.document_type,\n", + " chunk.priority,\n", + " count(chunk) as chunk_count\n", + "ORDER BY chunk_count DESC\n", + "\"\"\"\n", + "\n", + "print(\"Multi-dimensional queries:\")\n", 
+ "print(\"\\n1. High-priority research:\")\n", + "print(query_1)\n", + "print(\"\\n2. AI-LAB department:\")\n", + "print(query_2)\n", + "print(\"\\n3. Analytics by type and priority:\")\n", + "print(query_3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 3: Research Paper Management\n", + "\n", + "Use case: Managing research papers with DOI, journal, and field" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configure for research papers\n", + "GraphRAGConfig.chunk_external_properties = {\n", + " 'doi': 'doi',\n", + " 'journal': 'journal_name',\n", + " 'year': 'publication_year',\n", + " 'field': 'research_field'\n", + "}\n", + "\n", + "research_papers = [\n", + " Document(\n", + " text=\"Abstract: This paper presents advances in neural networks...\",\n", + " metadata={\n", + " 'doi': '10.1234/nn.2024.001',\n", + " 'journal_name': 'Nature AI',\n", + " 'publication_year': '2024',\n", + " 'research_field': 'Machine Learning'\n", + " }\n", + " ),\n", + " Document(\n", + " text=\"Abstract: Quantum computing applications in cryptography...\",\n", + " metadata={\n", + " 'doi': '10.1234/qc.2024.042',\n", + " 'journal_name': 'Science',\n", + " 'publication_year': '2024',\n", + " 'research_field': 'Quantum Computing'\n", + " }\n", + " )\n", + "]\n", + "\n", + "print(\"Research papers configured with properties:\")\n", + "for paper in research_papers:\n", + " print(f\" - DOI: {paper.metadata['doi']}\")\n", + " print(f\" Journal: {paper.metadata['journal_name']}, Field: {paper.metadata['research_field']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Query research papers\n", + "research_query = \"\"\"\n", + "MATCH (chunk:__Chunk__)\n", + "WHERE chunk.journal = 'Nature AI'\n", + " AND chunk.year = '2024'\n", + " AND chunk.field = 'Machine Learning'\n", + "RETURN chunk.doi, chunk.value\n", + 
"\"\"\"\n", + "\n", + "print(\"Query for ML papers in Nature AI (2024):\")\n", + "print(research_query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 4: Organizational Hierarchy\n", + "\n", + "Use case: Corporate documents with department, division, and project tracking" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configure for organizational hierarchy\n", + "GraphRAGConfig.chunk_external_properties = {\n", + " 'department': 'dept',\n", + " 'division': 'div',\n", + " 'team': 'team',\n", + " 'project': 'project_code'\n", + "}\n", + "\n", + "corporate_docs = [\n", + " Document(\n", + " text=\"Project specification for the new AI platform...\",\n", + " metadata={\n", + " 'dept': 'Engineering',\n", + " 'div': 'R&D',\n", + " 'team': 'AI-Platform',\n", + " 'project_code': 'PROJ-2024-AI-001'\n", + " }\n", + " ),\n", + " Document(\n", + " text=\"Security audit report for cloud infrastructure...\",\n", + " metadata={\n", + " 'dept': 'Engineering',\n", + " 'div': 'Infrastructure',\n", + " 'team': 'Security',\n", + " 'project_code': 'PROJ-2024-SEC-042'\n", + " }\n", + " )\n", + "]\n", + "\n", + "print(\"Corporate documents with organizational hierarchy:\")\n", + "for doc in corporate_docs:\n", + " print(f\" - {doc.metadata['dept']} > {doc.metadata['div']} > {doc.metadata['team']}\")\n", + " print(f\" Project: {doc.metadata['project_code']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Query by organizational hierarchy\n", + "org_query = \"\"\"\n", + "MATCH (chunk:__Chunk__)\n", + "WHERE chunk.department = 'Engineering'\n", + " AND chunk.division = 'R&D'\n", + "RETURN chunk.project, chunk.team, count(chunk) as chunks_per_project\n", + "ORDER BY chunks_per_project DESC\n", + "\"\"\"\n", + "\n", + "print(\"Query Engineering R&D projects:\")\n", + "print(org_query)" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "## Example 5: Handling Partial Metadata\n", + "\n", + "The feature gracefully handles cases where some metadata keys are missing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Configure multiple properties\n", + "GraphRAGConfig.chunk_external_properties = {\n", + " 'article_code': 'article_id',\n", + " 'document_type': 'doc_type',\n", + " 'department': 'dept_code'\n", + "}\n", + "\n", + "# Create documents with partial metadata\n", + "partial_docs = [\n", + " Document(\n", + " text=\"Document with all properties...\",\n", + " metadata={\n", + " 'article_id': 'DOC-001',\n", + " 'doc_type': 'report',\n", + " 'dept_code': 'ENG'\n", + " }\n", + " ),\n", + " Document(\n", + " text=\"Document with partial properties...\",\n", + " metadata={\n", + " 'article_id': 'DOC-002',\n", + " 'doc_type': 'memo'\n", + " # Missing 'dept_code'\n", + " }\n", + " ),\n", + " Document(\n", + " text=\"Document with minimal properties...\",\n", + " metadata={\n", + " 'article_id': 'DOC-003'\n", + " # Missing 'doc_type' and 'dept_code'\n", + " }\n", + " )\n", + "]\n", + "\n", + "print(\"Documents with varying metadata:\")\n", + "for doc in partial_docs:\n", + " props = [k for k in ['article_id', 'doc_type', 'dept_code'] if k in doc.metadata]\n", + " print(f\" - {doc.metadata.get('article_id', 'N/A')}: has {props}\")\n", + "\n", + "print(\"\\nResult: Only available properties will be added to chunks.\")\n", + "print(\"No errors will be raised for missing keys.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 6: Disabling the Feature" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Disable external properties feature\n", + "GraphRAGConfig.chunk_external_properties = None\n", + "\n", + "print(f\"External properties feature disabled: 
{GraphRAGConfig.chunk_external_properties}\")\n", + "print(\"\\nChunks created after this will NOT include external properties.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 7: Advanced Analytics\n", + "\n", + "Use external properties for complex analytics and reporting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Analytics query 1: Distribution by multiple dimensions\n", + "analytics_1 = \"\"\"\n", + "MATCH (chunk:__Chunk__)\n", + "WHERE chunk.document_type IS NOT NULL\n", + " AND chunk.department IS NOT NULL\n", + "RETURN \n", + " chunk.document_type,\n", + " chunk.department,\n", + " count(chunk) as chunk_count,\n", + " count(DISTINCT chunk.article_code) as article_count\n", + "ORDER BY chunk_count DESC\n", + "\"\"\"\n", + "\n", + "# Analytics query 2: Find related content\n", + "analytics_2 = \"\"\"\n", + "MATCH (chunk1:__Chunk__ {article_code: 'RES-2024-001'})\n", + "MATCH (chunk2:__Chunk__)\n", + "WHERE chunk2.department = chunk1.department\n", + " AND chunk2.article_code <> chunk1.article_code\n", + "RETURN DISTINCT chunk2.article_code, chunk2.document_type\n", + "\"\"\"\n", + "\n", + "# Analytics query 3: Aggregation with filtering\n", + "analytics_3 = \"\"\"\n", + "MATCH (chunk:__Chunk__)\n", + "WHERE chunk.priority = 'high'\n", + "RETURN \n", + " chunk.department,\n", + " count(DISTINCT chunk.article_code) as high_priority_docs,\n", + " count(chunk) as total_chunks\n", + "ORDER BY high_priority_docs DESC\n", + "\"\"\"\n", + "\n", + "print(\"Advanced Analytics Queries:\")\n", + "print(\"\\n1. Distribution by type and department:\")\n", + "print(analytics_1)\n", + "print(\"\\n2. Find related content:\")\n", + "print(analytics_2)\n", + "print(\"\\n3. 
High-priority documents by department:\")\n", + "print(analytics_3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "The external properties feature provides:\n", + "\n", + "### Key Benefits\n", + "\n", + "1. **Flexible**: Add any number of business properties\n", + "2. **Multi-dimensional**: Query by multiple attributes simultaneously\n", + "3. **Graceful**: Handles missing metadata without errors\n", + "4. **Powerful**: Enables complex analytics and filtering\n", + "5. **Domain-specific**: Tailor to your business needs\n", + "\n", + "### Common Use Cases\n", + "\n", + "- **Document Management**: article_code, document_type, status\n", + "- **Research Papers**: doi, journal, year, field\n", + "- **Corporate Docs**: department, division, team, project\n", + "- **Compliance**: classification, retention_policy, compliance_tag\n", + "- **Content Management**: content_id, status, version, author\n", + "\n", + "### Best Practices\n", + "\n", + "- Start with 2-3 essential properties\n", + "- Use clear, descriptive property names\n", + "- Validate metadata before building graph\n", + "- Document your property mapping\n", + "- Consider property cardinality for queries\n", + "\n", + "### Next Steps\n", + "\n", + "- Define properties for your domain\n", + "- Add metadata to your documents\n", + "- Build the graph with configured properties\n", + "- Create queries leveraging multiple properties\n", + "- Combine with vector search for hybrid retrieval" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git 
a/lexical-graph/src/graphrag_toolkit/lexical_graph/config.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/config.py index 73f40e52..e931d9e4 100644 --- a/lexical-graph/src/graphrag_toolkit/lexical_graph/config.py +++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/config.py @@ -49,6 +49,7 @@ DEFAULT_METADATA_DATETIME_SUFFIXES = ['_date', '_datetime'] DEFAULT_OPENSEARCH_ENGINE = 'nmslib' DEFAULT_ENABLE_VERSIONING = False +DEFAULT_CHUNK_EXTERNAL_PROPERTIES = None def _is_json_string(s): """ @@ -287,6 +288,7 @@ class _GraphRAGConfig: _metadata_datetime_suffixes: Optional[List[str]] = None _opensearch_engine: Optional[str] = None _enable_versioning = None + _chunk_external_properties: Optional[Dict[str, str]] = None @contextlib.contextmanager def _validate_sso_token(self, profile): @@ -1167,5 +1169,54 @@ def enable_versioning(self) -> bool: def enable_versioning(self, enable_versioning: bool) -> None: self._enable_versioning = enable_versioning + @property + def chunk_external_properties(self) -> Optional[Dict[str, str]]: + """ + Gets the mapping of external property names to source metadata keys. + + This property allows you to configure which metadata fields from source documents + should be extracted and added as properties on chunk nodes in the graph database. + This enables querying and filtering chunks by business-specific identifiers. + + The mapping is a dictionary where: + - Key: The property name to use on the chunk node (e.g., 'article_code', 'document_id') + - Value: The metadata key to extract from source document (e.g., 'article_id', 'doc_ref') + + Example: + { + 'article_code': 'article_id', # chunk.article_code from metadata['article_id'] + 'document_type': 'doc_type', # chunk.document_type from metadata['doc_type'] + 'department': 'dept_code' # chunk.department from metadata['dept_code'] + } + + Returns: + Optional[Dict[str, str]]: Dictionary mapping chunk property names to metadata keys, + or None if not configured. 
+ """ + if self._chunk_external_properties is None: + env_value = os.environ.get('CHUNK_EXTERNAL_PROPERTIES', DEFAULT_CHUNK_EXTERNAL_PROPERTIES) + if env_value and _is_json_string(env_value): + self._chunk_external_properties = json.loads(env_value) + else: + self._chunk_external_properties = env_value + return self._chunk_external_properties + + @chunk_external_properties.setter + def chunk_external_properties(self, chunk_external_properties: Optional[Dict[str, str]]) -> None: + """ + Sets the mapping of external property names to source metadata keys. + + Args: + chunk_external_properties: Dictionary mapping chunk property names to metadata keys, + or None to disable the feature. + + Example: + GraphRAGConfig.chunk_external_properties = { + 'article_code': 'article_id', + 'document_type': 'doc_type' + } + """ + self._chunk_external_properties = chunk_external_properties + GraphRAGConfig = _GraphRAGConfig() \ No newline at end of file diff --git a/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py index e035745d..d4914099 100644 --- a/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py +++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py @@ -63,11 +63,17 @@ def build(self, node:BaseNode, graph_client: GraphStore, **kwargs:Any): f'MERGE (chunk:`__Chunk__`{{{graph_client.node_id("chunkId")}: params.chunk_id}})', 'ON CREATE SET chunk.value = params.text ON MATCH SET chunk.value = params.text' ] - + properties_c = { 'chunk_id': chunk_id, 'text': node.text } + + # Add external properties if present + for key, value in chunk_metadata.items(): + if key != 'chunkId': # Skip the ID field + statements_c.append(f'SET chunk.{key} = params.{key}') + properties_c[key] = value query_c = '\n'.join(statements_c) diff --git 
a/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_node_builder.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_node_builder.py index f8a3dbf6..bbd1d052 100644 --- a/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_node_builder.py +++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_node_builder.py @@ -6,6 +6,7 @@ from llama_index.core.schema import BaseNode, DEFAULT_TEXT_NODE_TMPL from llama_index.core.schema import NodeRelationship +from graphrag_toolkit.lexical_graph import GraphRAGConfig from graphrag_toolkit.lexical_graph.indexing.build.node_builder import NodeBuilder from graphrag_toolkit.lexical_graph.indexing.constants import TOPICS_KEY from graphrag_toolkit.lexical_graph.storage.constants import INDEX_KEY @@ -97,6 +98,13 @@ def build_nodes(self, nodes:List[BaseNode], **kwargs): if source_info.metadata: metadata['source']['metadata'] = source_info.metadata + + # Add external properties if configured + external_props = GraphRAGConfig.chunk_external_properties + if external_props and isinstance(external_props, dict): + for prop_name, metadata_key in external_props.items(): + if metadata_key in source_info.metadata: + metadata['chunk'][prop_name] = source_info.metadata[metadata_key] metadata = self._update_metadata_with_versioning_info(metadata, node, build_timestamp) From 5bed7496831d2ece30f9a4f278bb1ed63bde5c96 Mon Sep 17 00:00:00 2001 From: ianrob Date: Thu, 29 Jan 2026 10:22:50 +0000 Subject: [PATCH 2/6] Update chunk property setters when creating chunk graph nodes --- .../src/graphrag_toolkit/lexical_graph/__init__.py | 2 +- .../src/graphrag_toolkit/lexical_graph/config.py | 7 +++++++ .../src/graphrag_toolkit/lexical_graph/errors.py | 3 +++ .../indexing/build/chunk_graph_builder.py | 14 ++++++++++---- .../storage/vector/opensearch_vector_indexes.py | 2 +- 5 files changed, 22 insertions(+), 6 deletions(-) diff --git 
a/lexical-graph/src/graphrag_toolkit/lexical_graph/__init__.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/__init__.py index 9d32969c..56546536 100644 --- a/lexical-graph/src/graphrag_toolkit/lexical_graph/__init__.py +++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/__init__.py @@ -40,7 +40,7 @@ def _asyncio_run(coro): from .tenant_id import TenantId, DEFAULT_TENANT_ID, DEFAULT_TENANT_NAME, TenantIdType, to_tenant_id from .config import GraphRAGConfig as GraphRAGConfig, LLMType, EmbeddingType -from .errors import ModelError, BatchJobError, IndexError, GraphQueryError +from .errors import ModelError, BatchJobError, IndexError, GraphQueryError, ConfigurationError from .logging import set_logging_config, set_advanced_logging_config from .lexical_graph_query_engine import LexicalGraphQueryEngine from .lexical_graph_index import LexicalGraphIndex diff --git a/lexical-graph/src/graphrag_toolkit/lexical_graph/config.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/config.py index e931d9e4..504ed9f7 100644 --- a/lexical-graph/src/graphrag_toolkit/lexical_graph/config.py +++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/config.py @@ -19,6 +19,8 @@ from boto3 import Session as Boto3Session from botocore.config import Config +from graphrag_toolkit.lexical_graph.errors import ConfigurationError + from llama_index.llms.bedrock_converse import BedrockConverse from llama_index.embeddings.bedrock import BedrockEmbedding from llama_index.core.settings import Settings @@ -1216,6 +1218,11 @@ def chunk_external_properties(self, chunk_external_properties: Optional[Dict[str 'document_type': 'doc_type' } """ + if chunk_external_properties and isinstance(chunk_external_properties, dict): + if 'text' in chunk_external_properties: + raise ConfigurationError("chunk_external_properties cannot contain a 'text' key") + if 'chunkId' in chunk_external_properties: + raise ConfigurationError("chunk_external_properties cannot contain a 'chunkId' key") 
self._chunk_external_properties = chunk_external_properties diff --git a/lexical-graph/src/graphrag_toolkit/lexical_graph/errors.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/errors.py index 4a51e8b0..fa7e149e 100644 --- a/lexical-graph/src/graphrag_toolkit/lexical_graph/errors.py +++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/errors.py @@ -1,6 +1,9 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: Apache-2.0 +class ConfigurationError(Exception): + pass + class ModelError(Exception): pass diff --git a/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py index d4914099..476b62a8 100644 --- a/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py +++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py @@ -60,21 +60,27 @@ def build(self, node:BaseNode, graph_client: GraphStore, **kwargs:Any): statements_c = [ '// insert chunks', 'UNWIND $params AS params', - f'MERGE (chunk:`__Chunk__`{{{graph_client.node_id("chunkId")}: params.chunk_id}})', - 'ON CREATE SET chunk.value = params.text ON MATCH SET chunk.value = params.text' + f'MERGE (chunk:`__Chunk__`{{{graph_client.node_id("chunkId")}: params.chunk_id}})' + ] + + chunk_property_setters = [ + 'chunk.value = params.text' ] properties_c = { 'chunk_id': chunk_id, 'text': node.text } - + # Add external properties if present for key, value in chunk_metadata.items(): if key != 'chunkId': # Skip the ID field - statements_c.append(f'SET chunk.{key} = params.{key}') + chunk_property_setters.append(f'chunk.{key} = params.{key}') properties_c[key] = value + setter_statement = f"ON CREATE SET {', '.join(chunk_property_setters)} ON MATCH SET {', '.join(chunk_property_setters)}" + statements_c.append(setter_statement) + query_c = '\n'.join(statements_c) 
graph_client.execute_query_with_retry(query_c, self._to_params(properties_c), max_attempts=5, max_wait=7) diff --git a/lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector/opensearch_vector_indexes.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector/opensearch_vector_indexes.py index 6ad8ca87..d7453d3b 100644 --- a/lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector/opensearch_vector_indexes.py +++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector/opensearch_vector_indexes.py @@ -283,7 +283,7 @@ def index_exists(endpoint, index_name, dimensions, writeable) -> bool: index_exists = False try: - index_exists = client.indices.exists(index_name) + index_exists = client.indices.exists(index=index_name) if not index_exists and writeable: logger.debug(f'Creating OpenSearch index [index_name: {index_name}, endpoint: {endpoint}]') client.indices.create(index=index_name, body=idx_conf) From bb38c74b5a0a88c9709b4f3f1fbd4c8ada415031 Mon Sep 17 00:00:00 2001 From: goviha01 Date: Wed, 4 Feb 2026 21:58:04 -0600 Subject: [PATCH 3/6] add example notebook for external property --- .gitignore | 3 +- .../lexical-graph/notebooks/00-Setup.ipynb | 2 +- ...06-Chunk-External-Properties-Example.ipynb | 576 +++++------------- 3 files changed, 141 insertions(+), 440 deletions(-) diff --git a/.gitignore b/.gitignore index 658be901..6f1a379a 100644 --- a/.gitignore +++ b/.gitignore @@ -17,7 +17,8 @@ __pycache__/ dist/ build/ .eggs/ - +*.env +*.DS_Store # Testing .pytest_cache/ .coverage diff --git a/examples/lexical-graph/notebooks/00-Setup.ipynb b/examples/lexical-graph/notebooks/00-Setup.ipynb index 1a277f62..8cef0564 100644 --- a/examples/lexical-graph/notebooks/00-Setup.ipynb +++ b/examples/lexical-graph/notebooks/00-Setup.ipynb @@ -23,7 +23,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install --only-binary :all: https://github.com/awslabs/graphrag-toolkit/archive/refs/tags/v3.15.2.zip#subdirectory=lexical-graph" + "!pip
install -e ../../../lexical-graph" ] }, { diff --git a/examples/lexical-graph/notebooks/06-Chunk-External-Properties-Example.ipynb b/examples/lexical-graph/notebooks/06-Chunk-External-Properties-Example.ipynb index 702a1834..ef1ab94f 100644 --- a/examples/lexical-graph/notebooks/06-Chunk-External-Properties-Example.ipynb +++ b/examples/lexical-graph/notebooks/06-Chunk-External-Properties-Example.ipynb @@ -21,505 +21,231 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from graphrag_toolkit.lexical_graph import GraphRAGConfig, LexicalGraphIndex\n", - "from llama_index.core.schema import Document\n", - "from llama_index.core.node_parser import SentenceSplitter" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Example 1: Single Property (Simple Use Case)\n", + "## Setup\n", "\n", - "### Step 1: Configure Single Property" + "If you haven't already, install the toolkit and dependencies using the [Setup](./00-Setup.ipynb) notebook." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Configure a single external property\n", - "GraphRAGConfig.chunk_external_properties = {\n", - " 'article_code': 'article_id' # chunk.article_code from metadata['article_id']\n", - "}\n", - "\n", - "print(f\"External properties configured: {GraphRAGConfig.chunk_external_properties}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 2: Create Documents with Metadata" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create sample documents with article_id in metadata\n", - "documents = [\n", - " Document(\n", - " text=\"\"\"Artificial Intelligence is transforming healthcare. \n", - " Machine learning algorithms can now detect diseases earlier than ever before. 
\n", - " This technology is saving lives and reducing healthcare costs.\"\"\",\n", - " metadata={\n", - " 'article_id': 'TECH-2024-001',\n", - " 'title': 'AI in Healthcare',\n", - " 'author': 'Dr. Jane Smith'\n", - " }\n", - " ),\n", - " Document(\n", - " text=\"\"\"Climate change is affecting global weather patterns. \n", - " Scientists are observing unprecedented changes in temperature and precipitation.\"\"\",\n", - " metadata={\n", - " 'article_id': 'ENV-2024-042',\n", - " 'title': 'Climate Change Impact',\n", - " 'author': 'Prof. John Doe'\n", - " }\n", - " )\n", - "]\n", - "\n", - "print(f\"Created {len(documents)} documents\")\n", - "for doc in documents:\n", - " print(f\" - {doc.metadata['article_id']}: {doc.metadata['title']}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 3: Query by Single Property" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "# Query chunks by article_code\n", - "query = \"\"\"\n", - "MATCH (chunk:__Chunk__ {article_code: 'TECH-2024-001'})\n", - "RETURN chunk.chunkId, chunk.article_code, chunk.value\n", + "env_content = \"\"\"AWS_REGION=\"us-east-1\"\n", + "EMBEDDINGS_MODEL=\"cohere.embed-english-v3\"\n", + "EMBEDDINGS_DIMENSIONS=1024\n", + "EXTRACTION_MODEL=\"us.anthropic.claude-3-7-sonnet-20250219-v1:0\"\n", + "RESPONSE_MODEL=\"us.anthropic.claude-3-7-sonnet-20250219-v1:0\"\n", + "GRAPH_STORE=\"graph-store-endpoint:port\"\n", + "VECTOR_STORE=\"postgres-endpoint\"\n", "\"\"\"\n", "\n", - "print(\"Query by article_code:\")\n", - "print(query)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Example 2: Multiple Properties (Advanced Use Case)\n", - "\n", - "### Step 1: Configure Multiple Properties" + "with open('.env', 'w') as f:\n", + " f.write(env_content)\n", + "print(\"Updated .env\")" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + 
"metadata": { + "tags": [] + }, "outputs": [], "source": [ - "# Configure multiple external properties\n", - "GraphRAGConfig.chunk_external_properties = {\n", - " 'article_code': 'article_id',\n", - " 'document_type': 'doc_type',\n", - " 'department': 'dept_code',\n", - " 'priority': 'priority_level'\n", - "}\n", + "%reload_ext dotenv\n", + "%dotenv\n", "\n", - "print(\"External properties configured:\")\n", - "for prop, key in GraphRAGConfig.chunk_external_properties.items():\n", - " print(f\" chunk.{prop} ← metadata['{key}']\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 2: Create Documents with Multiple Metadata Fields" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create documents with rich metadata\n", - "multi_prop_docs = [\n", - " Document(\n", - " text=\"Research paper on neural networks and deep learning...\",\n", - " metadata={\n", - " 'article_id': 'RES-2024-001',\n", - " 'doc_type': 'research',\n", - " 'dept_code': 'AI-LAB',\n", - " 'priority_level': 'high',\n", - " 'author': 'Dr. 
Smith'\n", - " }\n", - " ),\n", - " Document(\n", - " text=\"Technical report on system architecture...\",\n", - " metadata={\n", - " 'article_id': 'RPT-2024-042',\n", - " 'doc_type': 'report',\n", - " 'dept_code': 'SYS-ENG',\n", - " 'priority_level': 'medium',\n", - " 'author': 'John Doe'\n", - " }\n", - " ),\n", - " Document(\n", - " text=\"Policy document on data governance...\",\n", - " metadata={\n", - " 'article_id': 'POL-2024-015',\n", - " 'doc_type': 'policy',\n", - " 'dept_code': 'COMPLIANCE',\n", - " 'priority_level': 'high',\n", - " 'author': 'Jane Wilson'\n", - " }\n", - " )\n", - "]\n", + "import os\n", + "from pathlib import Path\n", "\n", - "print(f\"Created {len(multi_prop_docs)} documents with multiple properties:\")\n", - "for doc in multi_prop_docs:\n", - " print(f\" - {doc.metadata['article_id']}: type={doc.metadata['doc_type']}, dept={doc.metadata['dept_code']}, priority={doc.metadata['priority_level']}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 3: Multi-Dimensional Queries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Query 1: Find high-priority research documents\n", - "query_1 = \"\"\"\n", - "MATCH (chunk:__Chunk__)\n", - "WHERE chunk.document_type = 'research'\n", - " AND chunk.priority = 'high'\n", - "RETURN chunk.article_code, chunk.department\n", - "\"\"\"\n", - "\n", - "# Query 2: Find all chunks from AI-LAB department\n", - "query_2 = \"\"\"\n", - "MATCH (chunk:__Chunk__ {department: 'AI-LAB'})\n", - "RETURN chunk.article_code, chunk.document_type, chunk.value\n", - "\"\"\"\n", + "from graphrag_toolkit.lexical_graph import GraphRAGConfig, LexicalGraphIndex, set_logging_config\n", + "from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory, VectorStoreFactory\n", + "from llama_index.core import SimpleDirectoryReader\n", "\n", - "# Query 3: Analytics - Count by document type and priority\n", - "query_3 = 
\"\"\"\n", - "MATCH (chunk:__Chunk__)\n", - "WHERE chunk.document_type IS NOT NULL\n", - "RETURN \n", - " chunk.document_type,\n", - " chunk.priority,\n", - " count(chunk) as chunk_count\n", - "ORDER BY chunk_count DESC\n", - "\"\"\"\n", + "set_logging_config('INFO')\n", "\n", - "print(\"Multi-dimensional queries:\")\n", - "print(\"\\n1. High-priority research:\")\n", - "print(query_1)\n", - "print(\"\\n2. AI-LAB department:\")\n", - "print(query_2)\n", - "print(\"\\n3. Analytics by type and priority:\")\n", - "print(query_3)" + "print(f\"Graph Store: {os.environ.get('GRAPH_STORE', 'Not set')}\")\n", + "print(f\"Vector Store: {os.environ.get('VECTOR_STORE', 'Not set')}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Example 3: Research Paper Management\n", + "## Configure Document Source Folder\n", "\n", - "Use case: Managing research papers with DOI, journal, and field" + "Set the path to your local folder containing documents to index." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "# Configure for research papers\n", - "GraphRAGConfig.chunk_external_properties = {\n", - " 'doi': 'doi',\n", - " 'journal': 'journal_name',\n", - " 'year': 'publication_year',\n", - " 'field': 'research_field'\n", - "}\n", - "\n", - "research_papers = [\n", - " Document(\n", - " text=\"Abstract: This paper presents advances in neural networks...\",\n", - " metadata={\n", - " 'doi': '10.1234/nn.2024.001',\n", - " 'journal_name': 'Nature AI',\n", - " 'publication_year': '2024',\n", - " 'research_field': 'Machine Learning'\n", - " }\n", - " ),\n", - " Document(\n", - " text=\"Abstract: Quantum computing applications in cryptography...\",\n", - " metadata={\n", - " 'doi': '10.1234/qc.2024.042',\n", - " 'journal_name': 'Science',\n", - " 'publication_year': '2024',\n", - " 'research_field': 'Quantum Computing'\n", - " }\n", - " )\n", - "]\n", + "# Set the path to your documents 
folder\n", + "# Change this to point to your local folder\n", + "DOCUMENTS_FOLDER = \"./data\"\n", "\n", - "print(\"Research papers configured with properties:\")\n", - "for paper in research_papers:\n", - " print(f\" - DOI: {paper.metadata['doi']}\")\n", - " print(f\" Journal: {paper.metadata['journal_name']}, Field: {paper.metadata['research_field']}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Query research papers\n", - "research_query = \"\"\"\n", - "MATCH (chunk:__Chunk__)\n", - "WHERE chunk.journal = 'Nature AI'\n", - " AND chunk.year = '2024'\n", - " AND chunk.field = 'Machine Learning'\n", - "RETURN chunk.doi, chunk.value\n", - "\"\"\"\n", + "# Create the folder if it doesn't exist (for demo purposes)\n", + "Path(DOCUMENTS_FOLDER).mkdir(parents=True, exist_ok=True)\n", "\n", - "print(\"Query for ML papers in Nature AI (2024):\")\n", - "print(research_query)" + "print(f\"Documents folder: {os.path.abspath(DOCUMENTS_FOLDER)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Example 4: Organizational Hierarchy\n", + "## Configure External Properties\n", "\n", - "Use case: Corporate documents with department, division, and project tracking" + "Define which metadata fields should be added as properties on chunk nodes." 
] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "# Configure for organizational hierarchy\n", + "# Configure external properties - these will be added to chunk nodes\n", + "# Format: {'chunk_property_name': 'metadata_key'}\n", "GraphRAGConfig.chunk_external_properties = {\n", - " 'department': 'dept',\n", - " 'division': 'div',\n", - " 'team': 'team',\n", - " 'project': 'project_code'\n", + " 'file_name': 'file_name',\n", + " 'file_path': 'file_path',\n", + " 'file_type': 'file_type',\n", + " 'article_code': 'article_id',\n", + " 'department': 'department'\n", "}\n", "\n", - "corporate_docs = [\n", - " Document(\n", - " text=\"Project specification for the new AI platform...\",\n", - " metadata={\n", - " 'dept': 'Engineering',\n", - " 'div': 'R&D',\n", - " 'team': 'AI-Platform',\n", - " 'project_code': 'PROJ-2024-AI-001'\n", - " }\n", - " ),\n", - " Document(\n", - " text=\"Security audit report for cloud infrastructure...\",\n", - " metadata={\n", - " 'dept': 'Engineering',\n", - " 'div': 'Infrastructure',\n", - " 'team': 'Security',\n", - " 'project_code': 'PROJ-2024-SEC-042'\n", - " }\n", - " )\n", - "]\n", - "\n", - "print(\"Corporate documents with organizational hierarchy:\")\n", - "for doc in corporate_docs:\n", - " print(f\" - {doc.metadata['dept']} > {doc.metadata['div']} > {doc.metadata['team']}\")\n", - " print(f\" Project: {doc.metadata['project_code']}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Query by organizational hierarchy\n", - "org_query = \"\"\"\n", - "MATCH (chunk:__Chunk__)\n", - "WHERE chunk.department = 'Engineering'\n", - " AND chunk.division = 'R&D'\n", - "RETURN chunk.project, chunk.team, count(chunk) as chunks_per_project\n", - "ORDER BY chunks_per_project DESC\n", - "\"\"\"\n", - "\n", - "print(\"Query Engineering R&D projects:\")\n", - "print(org_query)" + "print(\"External 
properties configured:\")\n", + "for prop, key in GraphRAGConfig.chunk_external_properties.items():\n", + " print(f\" chunk.{prop} ← metadata['{key}']\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Example 5: Handling Partial Metadata\n", - "\n", - "The feature gracefully handles cases where some metadata keys are missing" + "## Load Documents from Local Folder" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "# Configure multiple properties\n", - "GraphRAGConfig.chunk_external_properties = {\n", - " 'article_code': 'article_id',\n", - " 'document_type': 'doc_type',\n", - " 'department': 'dept_code'\n", - "}\n", - "\n", - "# Create documents with partial metadata\n", - "partial_docs = [\n", - " Document(\n", - " text=\"Document with all properties...\",\n", - " metadata={\n", - " 'article_id': 'DOC-001',\n", - " 'doc_type': 'report',\n", - " 'dept_code': 'ENG'\n", - " }\n", - " ),\n", - " Document(\n", - " text=\"Document with partial properties...\",\n", - " metadata={\n", - " 'article_id': 'DOC-002',\n", - " 'doc_type': 'memo'\n", - " # Missing 'dept_code'\n", - " }\n", - " ),\n", - " Document(\n", - " text=\"Document with minimal properties...\",\n", - " metadata={\n", - " 'article_id': 'DOC-003'\n", - " # Missing 'doc_type' and 'dept_code'\n", - " }\n", - " )\n", - "]\n", - "\n", - "print(\"Documents with varying metadata:\")\n", - "for doc in partial_docs:\n", - " props = [k for k in ['article_id', 'doc_type', 'dept_code'] if k in doc.metadata]\n", - " print(f\" - {doc.metadata.get('article_id', 'N/A')}: has {props}\")\n", - "\n", - "print(\"\\nResult: Only available properties will be added to chunks.\")\n", - "print(\"No errors will be raised for missing keys.\")" + "# Define a function to add custom metadata to documents\n", + "def add_custom_metadata(file_path: str) -> dict:\n", + " \"\"\"Add custom metadata based on file name or path.\"\"\"\n", + " 
filename = os.path.basename(file_path)\n", + " \n", + " # Example: derive metadata from filename\n", + " metadata = {\n", + " 'file_name': filename,\n", + " 'file_path': file_path,\n", + " 'file_type': os.path.splitext(filename)[1].lstrip('.'),\n", + " 'article_id': os.path.splitext(filename)[0].lstrip('.')\n", + " }\n", + " \n", + " return metadata\n", + "\n", + "# Load documents from the folder\n", + "reader = SimpleDirectoryReader(\n", + " input_dir=DOCUMENTS_FOLDER,\n", + " recursive=True,\n", + " file_metadata=add_custom_metadata\n", + ")\n", + "\n", + "documents = reader.load_data()\n", + "\n", + "print(f\"Loaded {len(documents)} documents from {DOCUMENTS_FOLDER}\")\n", + "print(\"\\nDocuments with metadata:\")\n", + "for doc in documents:\n", + " print(f\" - {doc.metadata.get('file_name', 'N/A')}\")\n", + " print(f\" article_id: {doc.metadata.get('article_id', 'N/A')}\")\n", + " print(f\" department: {doc.metadata.get('department', 'N/A')}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Example 6: Disabling the Feature" + "## Build the Graph with External Properties" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "# Disable external properties feature\n", - "GraphRAGConfig.chunk_external_properties = None\n", + "# Build the graph - external properties will be added to chunk nodes\n", + "with (\n", + " GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE']) as graph_store,\n", + " VectorStoreFactory.for_vector_store(os.environ['VECTOR_STORE']) as vector_store\n", + "):\n", + " graph_index = LexicalGraphIndex(\n", + " graph_store, \n", + " vector_store\n", + " )\n", + " \n", + " # Extract and build - this will add external properties to chunks\n", + " graph_index.extract_and_build(documents, show_progress=True)\n", "\n", - "print(f\"External properties feature disabled: {GraphRAGConfig.chunk_external_properties}\")\n", - "print(\"\\nChunks created 
after this will NOT include external properties.\")" + "print('\\nGraph build complete!')\n", + "print('Chunks now have external properties from document metadata.')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Example 7: Advanced Analytics\n", - "\n", - "Use external properties for complex analytics and reporting" + "## Query Chunks by External Properties" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "# Analytics query 1: Distribution by multiple dimensions\n", - "analytics_1 = \"\"\"\n", - "MATCH (chunk:__Chunk__)\n", - "WHERE chunk.document_type IS NOT NULL\n", - " AND chunk.department IS NOT NULL\n", - "RETURN \n", - " chunk.document_type,\n", - " chunk.department,\n", - " count(chunk) as chunk_count,\n", - " count(DISTINCT chunk.article_code) as article_count\n", - "ORDER BY chunk_count DESC\n", - "\"\"\"\n", - "\n", - "# Analytics query 2: Find related content\n", - "analytics_2 = \"\"\"\n", - "MATCH (chunk1:__Chunk__ {article_code: 'RES-2024-001'})\n", - "MATCH (chunk2:__Chunk__)\n", - "WHERE chunk2.department = chunk1.department\n", - " AND chunk2.article_code <> chunk1.article_code\n", - "RETURN DISTINCT chunk2.article_code, chunk2.document_type\n", - "\"\"\"\n", - "\n", - "# Analytics query 3: Aggregation with filtering\n", - "analytics_3 = \"\"\"\n", - "MATCH (chunk:__Chunk__)\n", - "WHERE chunk.priority = 'high'\n", - "RETURN \n", - " chunk.department,\n", - " count(DISTINCT chunk.article_code) as high_priority_docs,\n", - " count(chunk) as total_chunks\n", - "ORDER BY high_priority_docs DESC\n", - "\"\"\"\n", - "\n", - "print(\"Advanced Analytics Queries:\")\n", - "print(\"\\n1. Distribution by type and department:\")\n", - "print(analytics_1)\n", - "print(\"\\n2. Find related content:\")\n", - "print(analytics_2)\n", - "print(\"\\n3. 
High-priority documents by department:\")\n", - "print(analytics_3)" + "# Verify that external properties are stored on chunk nodes\n", + "with GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE']) as graph_store:\n", + " \n", + " print(\"All chunks with their external properties:\")\n", + " print(\"=\" * 70)\n", + " \n", + " query = \"\"\"\n", + " MATCH (chunk:`__Chunk__`)\n", + " WHERE chunk.file_name IS NOT NULL\n", + " RETURN chunk.chunkId as chunkId,\n", + " chunk.file_name as file_name,\n", + " chunk.article_code as article_code,\n", + " chunk.department as department\n", + " ORDER BY chunk.file_name\n", + " \"\"\"\n", + " \n", + " results = graph_store.execute_query(query)\n", + " for r in results:\n", + " print(f\"File: {r.get('file_name', 'N/A')}\")\n", + " print(f\"article_id: {r.get('article_code', 'N/A')}\")\n", + " print()" ] }, { @@ -528,39 +254,13 @@ "source": [ "## Summary\n", "\n", - "The external properties feature provides:\n", - "\n", - "### Key Benefits\n", - "\n", - "1. **Flexible**: Add any number of business properties\n", - "2. **Multi-dimensional**: Query by multiple attributes simultaneously\n", - "3. **Graceful**: Handles missing metadata without errors\n", - "4. **Powerful**: Enables complex analytics and filtering\n", - "5. 
**Domain-specific**: Tailor to your business needs\n", - "\n", - "### Common Use Cases\n", - "\n", - "- **Document Management**: article_code, document_type, status\n", - "- **Research Papers**: doi, journal, year, field\n", - "- **Corporate Docs**: department, division, team, project\n", - "- **Compliance**: classification, retention_policy, compliance_tag\n", - "- **Content Management**: content_id, status, version, author\n", - "\n", - "### Best Practices\n", - "\n", - "- Start with 2-3 essential properties\n", - "- Use clear, descriptive property names\n", - "- Validate metadata before building graph\n", - "- Document your property mapping\n", - "- Consider property cardinality for queries\n", - "\n", - "### Next Steps\n", + "This notebook demonstrated:\n", "\n", - "- Define properties for your domain\n", - "- Add metadata to your documents\n", - "- Build the graph with configured properties\n", - "- Create queries leveraging multiple properties\n", - "- Combine with vector search for hybrid retrieval" + "1. **Loading documents from a local folder** using `SimpleDirectoryReader`\n", + "2. **Adding custom metadata** via the `file_metadata` callback\n", + "3. **Configuring external properties** via `GraphRAGConfig.chunk_external_properties`\n", + "4. **Building the graph** with documents that have metadata\n", + "5. 
**Querying chunks** using the external properties for filtering" ] } ], @@ -580,7 +280,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.0" + "version": "3.10.8" } }, "nbformat": 4, From 6e05a1127798ee6608945c790d473be235c0045c Mon Sep 17 00:00:00 2001 From: Hariharan Govindharajan <33079555+goviha01@users.noreply.github.com> Date: Wed, 25 Feb 2026 00:49:02 -0600 Subject: [PATCH 4/6] Update installation command for lexical-graph --- examples/lexical-graph/notebooks/00-Setup.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/lexical-graph/notebooks/00-Setup.ipynb b/examples/lexical-graph/notebooks/00-Setup.ipynb index 8cef0564..0b24b72b 100644 --- a/examples/lexical-graph/notebooks/00-Setup.ipynb +++ b/examples/lexical-graph/notebooks/00-Setup.ipynb @@ -23,7 +23,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -e ../../../lexical-graph" + "!pip install --only-binary :all: https://github.com/awslabs/graphrag-toolkit/archive/refs/tags/v3.15.2.zip#subdirectory=lexical-graph" ] }, { From 0b456870ccf90c148382dd91a2cd088e81468106 Mon Sep 17 00:00:00 2001 From: Hariharan Govindharajan <33079555+goviha01@users.noreply.github.com> Date: Wed, 25 Feb 2026 00:49:29 -0600 Subject: [PATCH 5/6] Update 00-Setup.ipynb --- examples/lexical-graph/notebooks/00-Setup.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/lexical-graph/notebooks/00-Setup.ipynb b/examples/lexical-graph/notebooks/00-Setup.ipynb index 0b24b72b..1a277f62 100644 --- a/examples/lexical-graph/notebooks/00-Setup.ipynb +++ b/examples/lexical-graph/notebooks/00-Setup.ipynb @@ -23,7 +23,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install --only-binary :all: https://github.com/awslabs/graphrag-toolkit/archive/refs/tags/v3.15.2.zip#subdirectory=lexical-graph" + "!pip install --only-binary :all: 
https://github.com/awslabs/graphrag-toolkit/archive/refs/tags/v3.15.2.zip#subdirectory=lexical-graph" ] }, { From 12d00511f647a521ac6bef1a7c3cffac0ecc5125 Mon Sep 17 00:00:00 2001 From: goviha01 Date: Thu, 26 Feb 2026 18:58:23 -0600 Subject: [PATCH 6/6] fixed review comments --- .gitignore | 3 +- EXTERNAL_PROPERTIES_FEATURE.md | 29 +- ...06-Chunk-External-Properties-Example.ipynb | 288 ------------------ .../indexing/build/chunk_graph_builder.py | 8 +- .../indexing/build/chunk_node_builder.py | 13 +- 5 files changed, 20 insertions(+), 321 deletions(-) delete mode 100644 examples/lexical-graph/notebooks/06-Chunk-External-Properties-Example.ipynb diff --git a/.gitignore b/.gitignore index 6f1a379a..658be901 100644 --- a/.gitignore +++ b/.gitignore @@ -17,8 +17,7 @@ __pycache__/ dist/ build/ .eggs/ -*.env -*.DS_Store + # Testing .pytest_cache/ .coverage diff --git a/EXTERNAL_PROPERTIES_FEATURE.md b/EXTERNAL_PROPERTIES_FEATURE.md index 29c40cb0..a17600a9 100644 --- a/EXTERNAL_PROPERTIES_FEATURE.md +++ b/EXTERNAL_PROPERTIES_FEATURE.md @@ -13,12 +13,14 @@ Added a flexible external properties feature that allows adding any business-spe - Default: `None` (feature disabled) ### 2. Chunk Node Builder (`lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_node_builder.py`) -- Extracts multiple properties from source metadata when configured +- Extracts multiple properties from validated source metadata when configured - Iterates through property mapping and adds each available property -- Adds to chunk metadata: `metadata['chunk'][property_name]` +- Adds to chunk metadata: `metadata['chunk']['metadata'][property_name]` (nested structure matching source metadata) +- Uses `_get_source_info_metadata()` to ensure only valid (non-collection-based) metadata is used ### 3. 
Chunk Graph Builder (`lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py`) - Stores all external properties as properties on chunk nodes +- Reads from nested `metadata['chunk']['metadata']` dictionary - Dynamically generates SET statements for each property - Uses: `SET chunk.property_name = params.property_name` @@ -66,35 +68,16 @@ WHERE chunk.document_type = 'research' RETURN chunk ``` -## Files +## Files Modified -**Modified:** - `lexical-graph/src/graphrag_toolkit/lexical_graph/config.py` - `lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_node_builder.py` - `lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py` -**Created:** -- `lexical-graph/tests/test_chunk_external_properties.py` - Unit tests -- `lexical-graph/docs/chunk-external-properties.md` - Full documentation -- `examples/lexical-graph/notebooks/06-Chunk-External-Properties-Example.ipynb` - Examples with 7 use cases - -## Testing - -```bash -# Run tests (requires pytest) -python3 -m pytest lexical-graph/tests/test_chunk_external_properties.py -v -``` - ## Key Features - **Flexible**: Support any number of properties - **Configurable**: Dictionary-based mapping - **Graceful**: Handles missing metadata keys - **Backward Compatible**: No breaking changes -- **Well Tested**: Comprehensive unit tests - -## Documentation - -- **Full Guide**: `lexical-graph/docs/chunk-external-properties.md` -- **Examples**: `examples/lexical-graph/notebooks/06-Chunk-External-Properties-Example.ipynb` -- **Tests**: `lexical-graph/tests/test_chunk_external_properties.py` +- **Safe**: Uses validated source metadata to avoid write failures diff --git a/examples/lexical-graph/notebooks/06-Chunk-External-Properties-Example.ipynb b/examples/lexical-graph/notebooks/06-Chunk-External-Properties-Example.ipynb deleted file mode 100644 index ef1ab94f..00000000 --- a/examples/lexical-graph/notebooks/06-Chunk-External-Properties-Example.ipynb +++ 
/dev/null @@ -1,288 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Chunk External Properties Feature Example\n", - "\n", - "This notebook demonstrates how to use external properties to add business-specific attributes to chunk nodes in the graph database.\n", - "\n", - "## Overview\n", - "\n", - "The external properties feature allows you to:\n", - "- Add any number of business properties from source metadata to chunks\n", - "- Query and filter chunks by multiple dimensions\n", - "- Support complex multi-attribute filtering\n", - "- Enable advanced analytics and reporting" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup\n", - "\n", - "If you haven't already, install the toolkit and dependencies using the [Setup](./00-Setup.ipynb) notebook." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "env_content = \"\"\"AWS_REGION=\"us-east-1\"\n", - "EMBEDDINGS_MODEL=\"cohere.embed-english-v3\"\n", - "EMBEDDINGS_DIMENSIONS=1024\n", - "EXTRACTION_MODEL=\"us.anthropic.claude-3-7-sonnet-20250219-v1:0\"\n", - "RESPONSE_MODEL=\"us.anthropic.claude-3-7-sonnet-20250219-v1:0\"\n", - "GRAPH_STORE=\"graph-store-endpoint:port\"\n", - "VECTOR_STORE=\"postgres-endpoint\"\n", - "\"\"\"\n", - "\n", - "with open('.env', 'w') as f:\n", - " f.write(env_content)\n", - "print(\"Updated .env\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "%reload_ext dotenv\n", - "%dotenv\n", - "\n", - "import os\n", - "from pathlib import Path\n", - "\n", - "from graphrag_toolkit.lexical_graph import GraphRAGConfig, LexicalGraphIndex, set_logging_config\n", - "from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory, VectorStoreFactory\n", - "from llama_index.core import SimpleDirectoryReader\n", - "\n", - "set_logging_config('INFO')\n", - "\n", - 
"print(f\"Graph Store: {os.environ.get('GRAPH_STORE', 'Not set')}\")\n", - "print(f\"Vector Store: {os.environ.get('VECTOR_STORE', 'Not set')}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configure Document Source Folder\n", - "\n", - "Set the path to your local folder containing documents to index." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Set the path to your documents folder\n", - "# Change this to point to your local folder\n", - "DOCUMENTS_FOLDER = \"./data\"\n", - "\n", - "# Create the folder if it doesn't exist (for demo purposes)\n", - "Path(DOCUMENTS_FOLDER).mkdir(parents=True, exist_ok=True)\n", - "\n", - "print(f\"Documents folder: {os.path.abspath(DOCUMENTS_FOLDER)}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configure External Properties\n", - "\n", - "Define which metadata fields should be added as properties on chunk nodes." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Configure external properties - these will be added to chunk nodes\n", - "# Format: {'chunk_property_name': 'metadata_key'}\n", - "GraphRAGConfig.chunk_external_properties = {\n", - " 'file_name': 'file_name',\n", - " 'file_path': 'file_path',\n", - " 'file_type': 'file_type',\n", - " 'article_code': 'article_id',\n", - " 'department': 'department'\n", - "}\n", - "\n", - "print(\"External properties configured:\")\n", - "for prop, key in GraphRAGConfig.chunk_external_properties.items():\n", - " print(f\" chunk.{prop} ← metadata['{key}']\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load Documents from Local Folder" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Define a function to add custom metadata to documents\n", - "def add_custom_metadata(file_path: str) -> dict:\n", - " \"\"\"Add custom metadata based on file name or path.\"\"\"\n", - " filename = os.path.basename(file_path)\n", - " \n", - " # Example: derive metadata from filename\n", - " metadata = {\n", - " 'file_name': filename,\n", - " 'file_path': file_path,\n", - " 'file_type': os.path.splitext(filename)[1].lstrip('.'),\n", - " 'article_id': os.path.splitext(filename)[0].lstrip('.')\n", - " }\n", - " \n", - " return metadata\n", - "\n", - "# Load documents from the folder\n", - "reader = SimpleDirectoryReader(\n", - " input_dir=DOCUMENTS_FOLDER,\n", - " recursive=True,\n", - " file_metadata=add_custom_metadata\n", - ")\n", - "\n", - "documents = reader.load_data()\n", - "\n", - "print(f\"Loaded {len(documents)} documents from {DOCUMENTS_FOLDER}\")\n", - "print(\"\\nDocuments with metadata:\")\n", - "for doc in documents:\n", - " print(f\" - {doc.metadata.get('file_name', 'N/A')}\")\n", - " print(f\" article_id: {doc.metadata.get('article_id', 
'N/A')}\")\n", - " print(f\" department: {doc.metadata.get('department', 'N/A')}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Build the Graph with External Properties" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Build the graph - external properties will be added to chunk nodes\n", - "with (\n", - " GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE']) as graph_store,\n", - " VectorStoreFactory.for_vector_store(os.environ['VECTOR_STORE']) as vector_store\n", - "):\n", - " graph_index = LexicalGraphIndex(\n", - " graph_store, \n", - " vector_store\n", - " )\n", - " \n", - " # Extract and build - this will add external properties to chunks\n", - " graph_index.extract_and_build(documents, show_progress=True)\n", - "\n", - "print('\\nGraph build complete!')\n", - "print('Chunks now have external properties from document metadata.')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Query Chunks by External Properties" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Verify that external properties are stored on chunk nodes\n", - "with GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE']) as graph_store:\n", - " \n", - " print(\"All chunks with their external properties:\")\n", - " print(\"=\" * 70)\n", - " \n", - " query = \"\"\"\n", - " MATCH (chunk:`__Chunk__`)\n", - " WHERE chunk.file_name IS NOT NULL\n", - " RETURN chunk.chunkId as chunkId,\n", - " chunk.file_name as file_name,\n", - " chunk.article_code as article_code,\n", - " chunk.department as department\n", - " ORDER BY chunk.file_name\n", - " \"\"\"\n", - " \n", - " results = graph_store.execute_query(query)\n", - " for r in results:\n", - " print(f\"File: {r.get('file_name', 'N/A')}\")\n", - " print(f\"article_id: {r.get('article_code', 'N/A')}\")\n", - " 
print()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Summary\n", - "\n", - "This notebook demonstrated:\n", - "\n", - "1. **Loading documents from a local folder** using `SimpleDirectoryReader`\n", - "2. **Adding custom metadata** via the `file_metadata` callback\n", - "3. **Configuring external properties** via `GraphRAGConfig.chunk_external_properties`\n", - "4. **Building the graph** with documents that have metadata\n", - "5. **Querying chunks** using the external properties for filtering" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py index 476b62a8..3ace9de5 100644 --- a/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py +++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_graph_builder.py @@ -50,9 +50,9 @@ def build(self, node:BaseNode, graph_client: GraphStore, **kwargs:Any): graph_client: The graph client interface to interact with the graph database. **kwargs: Additional optional parameters for configuring the operation. 
""" - chunk_metadata = node.metadata.get('chunk', {}) - chunk_id = chunk_metadata.get('chunkId', None) - + chunk_id = node.metadata.get('chunk', {}).get('chunkId', None) + chunk_metadata = node.metadata.get('chunk', {}).get('metadata', {}) + if chunk_id: logger.debug(f'Inserting chunk [chunk_id: {chunk_id}]') @@ -146,4 +146,4 @@ def insert_chunk_to_chunk_relationship(node_id:str, relationship_type:str): insert_chunk_to_chunk_relationship(node_id, 'next') else: - logger.warning(f'chunk_id missing from chunk node [node_id: {node.node_id}]') \ No newline at end of file + logger.warning(f'chunk_id missing from chunk node [node_id: {node.node_id}]') diff --git a/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_node_builder.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_node_builder.py index bbd1d052..4a29fc0f 100644 --- a/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_node_builder.py +++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/build/chunk_node_builder.py @@ -91,20 +91,25 @@ def build_nodes(self, nodes:List[BaseNode], **kwargs): 'sourceId': source_id }, 'chunk': { - 'chunkId': chunk_id + 'chunkId': chunk_id, + 'metadata': {} }, 'topics': topics } if source_info.metadata: - metadata['source']['metadata'] = source_info.metadata + + metadata['source'].update(self._get_source_info_metadata(source_info.metadata)) # Add external properties if configured external_props = GraphRAGConfig.chunk_external_properties if external_props and isinstance(external_props, dict): + valid_source_metadata = metadata['source']['metadata'] + chunk_metadata = metadata['chunk']['metadata'] for prop_name, metadata_key in external_props.items(): - if metadata_key in source_info.metadata: - metadata['chunk'][prop_name] = source_info.metadata[metadata_key] + if metadata_key in valid_source_metadata: + chunk_metadata[prop_name] = valid_source_metadata[metadata_key] + metadata = 
self._update_metadata_with_versioning_info(metadata, node, build_timestamp)