From 16bf5be0adf65fe8e330b77de26aa99913bf53cd Mon Sep 17 00:00:00 2001 From: fbmz-improving Date: Sun, 22 Feb 2026 10:18:28 -0800 Subject: [PATCH 1/4] Add GitHub Actions workflow for lexical graph release and update versioning in pyproject.toml --- .github/workflows/lexical-graph-release.yml | 60 +-------------------- 1 file changed, 1 insertion(+), 59 deletions(-) diff --git a/.github/workflows/lexical-graph-release.yml b/.github/workflows/lexical-graph-release.yml index a33e3d80..6094ee8c 100644 --- a/.github/workflows/lexical-graph-release.yml +++ b/.github/workflows/lexical-graph-release.yml @@ -36,11 +36,6 @@ jobs: - name: Build package run: .venv/bin/python -m build - - name: Create latest dist copies - run: | - cp dist/*.whl dist/lexical-graph-latest.whl - cp dist/*.tar.gz dist/lexical-graph-latest.tar.gz - - name: Upload build artifacts uses: actions/upload-artifact@v4 with: @@ -48,65 +43,12 @@ jobs: path: lexical-graph/dist/ retention-days: 30 - - name: Extract version - id: version - run: | - if [[ "$GITHUB_REF" == refs/tags/* ]]; then - VERSION="${GITHUB_REF_NAME#lexical-graph/}" - else - VERSION="v$(grep '^version = ' pyproject.toml | sed 's/version = "\(.*\)"/\1/')" - fi - echo "version=${VERSION}" >> $GITHUB_OUTPUT - - - name: Update pip install version in lexical-graph notebooks - working-directory: examples/lexical-graph/notebooks - run: python3 -c "import re; v='${{ steps.version.outputs.version }}'; nb=open('00-Setup.ipynb').read(); updated=re.sub(r'/refs/tags/v[^#]+\.zip','/refs/tags/'+v+'.zip',nb); assert updated!=nb,'No match in 00-Setup.ipynb'; open('00-Setup.ipynb','w').write(updated); print('Updated to',v)" - - - name: Zip lexical-graph notebooks - working-directory: examples/lexical-graph/notebooks - run: | - zip lexical-graph-examples.zip *.ipynb - cp lexical-graph-examples.zip lexical-graph-examples-latest.zip - - - name: Upload lexical-graph-examples.zip - uses: actions/upload-artifact@v4 - with: - name: lexical-graph-examples - path: | 
- examples/lexical-graph/notebooks/lexical-graph-examples.zip - examples/lexical-graph/notebooks/lexical-graph-examples-latest.zip - retention-days: 30 - - - name: Update pip install version in lexical-graph-hybrid-dev notebooks - working-directory: examples/lexical-graph-hybrid-dev/notebooks - run: python3 -c "import re; v='${{ steps.version.outputs.version }}'; nb=open('00-Setup.ipynb').read(); updated=re.sub(r'/refs/tags/v[^#]+\.zip','/refs/tags/'+v+'.zip',nb); assert updated!=nb,'No match in 00-Setup.ipynb'; open('00-Setup.ipynb','w').write(updated); print('Updated to',v)" - - - name: Zip lexical-graph-hybrid-dev notebooks - working-directory: examples/lexical-graph-hybrid-dev/notebooks - run: | - zip lexical-graph-hybrid-dev-examples.zip *.ipynb - cp lexical-graph-hybrid-dev-examples.zip lexical-graph-hybrid-dev-examples-latest.zip - - - name: Upload lexical-graph-hybrid-dev-examples.zip - uses: actions/upload-artifact@v4 - with: - name: lexical-graph-hybrid-dev-examples - path: | - examples/lexical-graph-hybrid-dev/notebooks/lexical-graph-hybrid-dev-examples.zip - examples/lexical-graph-hybrid-dev/notebooks/lexical-graph-hybrid-dev-examples-latest.zip - retention-days: 30 - # Automatically create a GitHub Release when a new tag is pushed - name: Create GitHub Release if: startsWith(github.ref, 'refs/tags/') uses: softprops/action-gh-release@v2 with: - files: | - lexical-graph/dist/* - examples/lexical-graph/notebooks/lexical-graph-examples.zip - examples/lexical-graph/notebooks/lexical-graph-examples-latest.zip - examples/lexical-graph-hybrid-dev/notebooks/lexical-graph-hybrid-dev-examples.zip - examples/lexical-graph-hybrid-dev/notebooks/lexical-graph-hybrid-dev-examples-latest.zip + files: lexical-graph/dist/* generate_release_notes: true prerelease: >- ${{ From 038ca800c0a696f66ac525081dde9e24f63b758d Mon Sep 17 00:00:00 2001 From: fbmz-improving Date: Mon, 23 Feb 2026 19:04:51 -0800 Subject: [PATCH 2/4] Enhance GitHub Actions workflow for lexical graph 
release by adding steps to create and upload latest distribution copies and update notebook versions. --- .github/workflows/lexical-graph-release.yml | 64 ++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/.github/workflows/lexical-graph-release.yml b/.github/workflows/lexical-graph-release.yml index 6094ee8c..e0081214 100644 --- a/.github/workflows/lexical-graph-release.yml +++ b/.github/workflows/lexical-graph-release.yml @@ -36,6 +36,11 @@ jobs: - name: Build package run: .venv/bin/python -m build + - name: Create latest dist copies + run: | + cp dist/*.whl dist/lexical-graph-latest.whl + cp dist/*.tar.gz dist/lexical-graph-latest.tar.gz + - name: Upload build artifacts uses: actions/upload-artifact@v4 with: @@ -43,12 +48,69 @@ jobs: path: lexical-graph/dist/ retention-days: 30 + - name: Extract version + id: version + run: | + if [[ "$GITHUB_REF" == refs/tags/* ]]; then + VERSION="${GITHUB_REF_NAME#lexical-graph/}" + else + VERSION="v$(grep '^version = ' pyproject.toml | sed 's/version = "\(.*\)"/\1/')" + fi + echo "version=${VERSION}" >> $GITHUB_OUTPUT + + - name: Update pip install version in lexical-graph notebooks + working-directory: examples/lexical-graph/notebooks + env: + VERSION: ${{ steps.version.outputs.version }} + run: sed -i "s|/refs/tags/v[^#\"]*\.zip|/refs/tags/${VERSION}.zip|g" 00-Setup.ipynb + + - name: Zip lexical-graph notebooks + working-directory: examples/lexical-graph/notebooks + run: | + zip lexical-graph-examples.zip *.ipynb + cp lexical-graph-examples.zip lexical-graph-examples-latest.zip + + - name: Upload lexical-graph-examples.zip + uses: actions/upload-artifact@v4 + with: + name: lexical-graph-examples + path: | + examples/lexical-graph/notebooks/lexical-graph-examples.zip + examples/lexical-graph/notebooks/lexical-graph-examples-latest.zip + retention-days: 30 + + - name: Update pip install version in lexical-graph-hybrid-dev notebooks + working-directory: 
examples/lexical-graph-hybrid-dev/notebooks + env: + VERSION: ${{ steps.version.outputs.version }} + run: sed -i "s|/refs/tags/v[^#\"]*\.zip|/refs/tags/${VERSION}.zip|g" 00-Setup.ipynb + + - name: Zip lexical-graph-hybrid-dev notebooks + working-directory: examples/lexical-graph-hybrid-dev/notebooks + run: | + zip lexical-graph-hybrid-dev-examples.zip *.ipynb + cp lexical-graph-hybrid-dev-examples.zip lexical-graph-hybrid-dev-examples-latest.zip + + - name: Upload lexical-graph-hybrid-dev-examples.zip + uses: actions/upload-artifact@v4 + with: + name: lexical-graph-hybrid-dev-examples + path: | + examples/lexical-graph-hybrid-dev/notebooks/lexical-graph-hybrid-dev-examples.zip + examples/lexical-graph-hybrid-dev/notebooks/lexical-graph-hybrid-dev-examples-latest.zip + retention-days: 30 + # Automatically create a GitHub Release when a new tag is pushed - name: Create GitHub Release if: startsWith(github.ref, 'refs/tags/') uses: softprops/action-gh-release@v2 with: - files: lexical-graph/dist/* + files: | + lexical-graph/dist/* + examples/lexical-graph/notebooks/lexical-graph-examples.zip + examples/lexical-graph/notebooks/lexical-graph-examples-latest.zip + examples/lexical-graph-hybrid-dev/notebooks/lexical-graph-hybrid-dev-examples.zip + examples/lexical-graph-hybrid-dev/notebooks/lexical-graph-hybrid-dev-examples-latest.zip generate_release_notes: true prerelease: >- ${{ From b74cfcfa1c8d222681cb041c9898c07b84ad995f Mon Sep 17 00:00:00 2001 From: fbmz-improving Date: Mon, 23 Feb 2026 19:58:41 -0800 Subject: [PATCH 3/4] Refactor notebook version update steps to use Python script for improved readability and maintainability --- .github/workflows/lexical-graph-release.yml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/lexical-graph-release.yml b/.github/workflows/lexical-graph-release.yml index e0081214..a33e3d80 100644 --- a/.github/workflows/lexical-graph-release.yml +++ b/.github/workflows/lexical-graph-release.yml @@ 
-60,9 +60,7 @@ jobs: - name: Update pip install version in lexical-graph notebooks working-directory: examples/lexical-graph/notebooks - env: - VERSION: ${{ steps.version.outputs.version }} - run: sed -i "s|/refs/tags/v[^#\"]*\.zip|/refs/tags/${VERSION}.zip|g" 00-Setup.ipynb + run: python3 -c "import re; v='${{ steps.version.outputs.version }}'; nb=open('00-Setup.ipynb').read(); updated=re.sub(r'/refs/tags/v[^#]+\.zip','/refs/tags/'+v+'.zip',nb); assert updated!=nb,'No match in 00-Setup.ipynb'; open('00-Setup.ipynb','w').write(updated); print('Updated to',v)" - name: Zip lexical-graph notebooks working-directory: examples/lexical-graph/notebooks @@ -81,9 +79,7 @@ jobs: - name: Update pip install version in lexical-graph-hybrid-dev notebooks working-directory: examples/lexical-graph-hybrid-dev/notebooks - env: - VERSION: ${{ steps.version.outputs.version }} - run: sed -i "s|/refs/tags/v[^#\"]*\.zip|/refs/tags/${VERSION}.zip|g" 00-Setup.ipynb + run: python3 -c "import re; v='${{ steps.version.outputs.version }}'; nb=open('00-Setup.ipynb').read(); updated=re.sub(r'/refs/tags/v[^#]+\.zip','/refs/tags/'+v+'.zip',nb); assert updated!=nb,'No match in 00-Setup.ipynb'; open('00-Setup.ipynb','w').write(updated); print('Updated to',v)" - name: Zip lexical-graph-hybrid-dev notebooks working-directory: examples/lexical-graph-hybrid-dev/notebooks From 9e6e6b48780ed7d2d5055e727cdfdb81a466fa0c Mon Sep 17 00:00:00 2001 From: fbmz-improving Date: Wed, 25 Feb 2026 18:59:10 -0800 Subject: [PATCH 4/4] Fix inaccuracies and gaps in configuration.md - Remove streaming and dimensions fields from JSON examples (silently ignored) - Fix embed model type name: Bedrock -> BedrockEmbedding - Fix aws_region default: hardcoded us-east-1 -> default boto3 session region - Fix include_domain_labels env var: DEFAULT_INCLUDE_DOMAIN_LABELS -> INCLUDE_DOMAIN_LABELS - Add missing parameters: include_classification_in_entity_id, filename (logging) - Correct wildcard support description for 
set_advanced_logging_config --- docs/lexical-graph/configuration.md | 72 +++- docs/lexical-graph/faq.md | 6 + docs/lexical-graph/indexing.md | 33 +- docs/lexical-graph/multi-tenancy.md | 18 +- docs/lexical-graph/querying.md | 104 ++++- docs/lexical-graph/versioned-updates.md | 14 +- lexical-graph/DOC_REVIEW_FINDINGS.md | 389 ++++++++++++++++++ lexical-graph/README.md | 160 +++++-- .../lexical_graph/lexical_graph_index.py | 27 +- .../lexical_graph_query_engine.py | 2 +- .../prompts/prompt_provider_config.py | 50 --- .../storage/vector_store_factory.py | 4 +- .../lexical_graph/tenant_id.py | 5 +- 13 files changed, 735 insertions(+), 149 deletions(-) create mode 100644 lexical-graph/DOC_REVIEW_FINDINGS.md diff --git a/docs/lexical-graph/configuration.md b/docs/lexical-graph/configuration.md index 21d97191..dfe294b0 100644 --- a/docs/lexical-graph/configuration.md +++ b/docs/lexical-graph/configuration.md @@ -21,9 +21,18 @@ The lexical-graph also allows you to set the logging level and apply logging fil ### GraphRAGConfig -`GraphRAGConfig` allows you to configure LLMs, embedding models, and the extract and build processes. +`GraphRAGConfig` is a module-level singleton (not a class to instantiate). It is created once at import time ([`config.py`](../../lexical-graph/src/graphrag_toolkit/lexical_graph/config.py#L1171)) and shared across the process. Set attributes directly on the imported object: -**Important**: If you want to change any of these values, do so early in your code, prior to creating a graph store or vector store. +```python +from graphrag_toolkit.lexical_graph import GraphRAGConfig + +GraphRAGConfig.aws_region = 'eu-west-1' +GraphRAGConfig.extraction_llm = 'anthropic.claude-3-5-sonnet-20241022-v2:0' +``` + +Setting `aws_profile` or `aws_region` automatically clears all cached boto3 clients. + +**Important**: Change configuration values early in your code, before creating any graph store or vector store. 
The configuration includes the following parameters: @@ -40,21 +49,37 @@ The configuration includes the following parameters: | `build_batch_size` | The number of input nodes to be processed in parallel across all workers in the build stage | `4` | `BUILD_BATCH_SIZE` | | `build_batch_write_size` | The number of elements to be written in a bulk operation to the graph and vector stores (see [Batch writes](#batch-writes)) | `25` | `BUILD_BATCH_WRITE_SIZE` | | `batch_writes_enabled` | Determines whether, on a per-worker basis, to write all elements (nodes and edges, or vectors) emitted by a batch of input nodes as a bulk operation, or singly, to the graph and vector stores (see [Batch writes](#batch-writes)) | `True` | `BATCH_WRITES_ENABLED` | -| `include_domain_labels` | Determines whether entities will have a domain-specific label (e.g. `Company`) as well as the [graph model's](./graph-model.md#entity-relationship-tier) `__Entity__` label | `False` | `DEFAULT_INCLUDE_DOMAIN_LABELS` | +| `include_domain_labels` | Determines whether entities will have a domain-specific label (e.g. 
`Company`) as well as the [graph model's](./graph-model.md#entity-relationship-tier) `__Entity__` label | `False` | `INCLUDE_DOMAIN_LABELS` | +| `include_local_entities` | Whether to include local-context entities in the graph | `False` | `INCLUDE_LOCAL_ENTITIES` | +| `include_classification_in_entity_id` | Whether to include an entity's classification in its graph node id | `True` | `INCLUDE_CLASSIFICATION_IN_ENTITY_ID` | +| `enable_versioning` | Whether to enable versioned updates (see [Versioned Updates](./versioned-updates.md)) | `False` | `ENABLE_VERSIONING` | | `enable_cache` | Determines whether the results of LLM calls to models on Amazon Bedrock are cached to the local filesystem (see [Caching Amazon Bedrock LLM responses](#caching-amazon-bedrock-llm-responses)) | `False` | `ENABLE_CACHE` | | `aws_profile` | AWS CLI named profile used to authenticate requests to Bedrock and other services | *None* | `AWS_PROFILE` | -| `aws_region` | AWS region used to scope Bedrock service calls | `us-east-1` | `AWS_REGION` | +| `aws_region` | AWS region used to scope Bedrock service calls | *Default boto3 session region* | `AWS_REGION` | + +The following parameters configure the rerankers used by query retrievers: + +| Parameter | Description | Default | Environment Variable | +| ------------- | ------------- | ------------- | ------------- | +| `reranking_model` | Local reranker model (mixedbread-ai) | `mixedbread-ai/mxbai-rerank-xsmall-v1` | `RERANKING_MODEL` | +| `bedrock_reranking_model` | Amazon Bedrock reranker model | `cohere.rerank-v3-5:0` | `BEDROCK_RERANKING_MODEL` | + +The following parameter applies only when using Amazon OpenSearch Serverless as a vector store: + +| Parameter | Description | Default | Environment Variable | +| ------------- | ------------- | ------------- | ------------- | +| `opensearch_engine` | OpenSearch kNN engine | `nmslib` | `OPENSEARCH_ENGINE` | To set a configuration parameter in your application code: ```python from 
graphrag_toolkit.lexical_graph import GraphRAGConfig -GraphRAGConfig.response_llm = 'anthropic.claude-3-haiku-20240307-v1:0' +GraphRAGConfig.response_llm = 'anthropic.claude-3-haiku-20240307-v1:0' GraphRAGConfig.extraction_num_workers = 4 ``` -You can also set configuration parameters via environment variables, as per the variable names in the table above. +You can also set any of these via environment variables using the variable names in the tables above. #### LLM configuration @@ -68,8 +93,7 @@ The `extraction_llm` and `response_llm` configuration parameters accept three di { "model": "anthropic.claude-3-7-sonnet-20250219-v1:0", "temperature": 0.0, - "max_tokens": 4096, - "streaming": true + "max_tokens": 4096 } ``` @@ -79,16 +103,20 @@ The `embed_model` configuration parameter accepts three different types of value - You can pass an instance of a LlamaIndex `BaseEmbedding` object. This allows you to configure the lexical-graph for embedding backends other than Amazon Bedrock. - You can pass the model name of an Amazon Bedrock model. For example: `amazon.titan-embed-text-v1`. - - You can pass a JSON string representation of a LlamaIndex `Bedrock` instance. For example: + - You can pass a JSON string representation of a LlamaIndex `BedrockEmbedding` instance. For example: ``` { - "model_name": "amazon.titan-embed-text-v2:0", - "dimensions": 512 + "model_name": "amazon.titan-embed-text-v2:0" } ``` - -When configuring an embedding model, you must also set the `embed_dimensions` configuration parameter. + +When configuring an embedding model, you must also set the `embed_dimensions` configuration parameter to match the model's output dimensions. 
For example: + +```python +GraphRAGConfig.embed_model = '{"model_name": "amazon.titan-embed-text-v2:0"}' +GraphRAGConfig.embed_dimensions = 512 +``` #### Batch writes @@ -116,7 +144,7 @@ The `graphrag_toolkit` provides two methods for configuring logging in your appl #### set_logging_config -The `set_logging_config` method allows you to configure logging with a basic set of options, such as logging level and module filters. Wildcards are supported for module names, and you can pass either a single string or a list of strings for included or excluded modules. For example: +The `set_logging_config` method allows you to configure logging with a basic set of options, such as logging level and module filters. Wildcards are supported for module names, and you can pass either a single string or a list of strings for included or excluded modules. You can optionally provide a `filename` to write log output to a file in addition to stdout. For example: ```python from graphrag_toolkit.lexical_graph import set_logging_config @@ -124,13 +152,14 @@ from graphrag_toolkit.lexical_graph import set_logging_config set_logging_config( logging_level='DEBUG', # or logging.DEBUG debug_include_modules='graphrag_toolkit.lexical_graph.storage', # single string or list of strings - debug_exclude_modules=['opensearch', 'boto'] # single string or list of strings + debug_exclude_modules=['opensearch', 'boto'], # single string or list of strings + filename='output.log' # optional: also write logs to a file ) ``` #### set_advanced_logging_config -The `set_advanced_logging_config` method provides more advanced logging configuration options, including the ability to specify filters for included and excluded modules or messages based on logging levels. Wildcards are supported only for module names, and you can pass either a single string or a list of strings for modules. This method offers greater flexibility and control over the logging behavior. 
+The `set_advanced_logging_config` method provides more advanced logging configuration options, including the ability to specify filters for included and excluded modules or messages based on logging levels. Wildcards are supported for module names and included messages, and you can pass either a single string or a list of strings for modules or messages. This method offers greater flexibility and control over the logging behavior. ##### Parameters @@ -141,6 +170,7 @@ The `set_advanced_logging_config` method provides more advanced logging configur | `excluded_modules` | `dict[int, str \| list[str]]` | Modules to exclude from logging, grouped by logging level. Wildcards are supported. | `None` | | `included_messages` | `dict[int, str \| list[str]]` | Specific messages to include in logging, grouped by logging level. Wildcards are supported. | `None` | | `excluded_messages` | `dict[int, str \| list[str]]` | Specific messages to exclude from logging, grouped by logging level. | `None` | +| `filename` | `str` | If provided, log output is also written to this file in addition to stdout. | `None` | ##### Example Usage @@ -186,6 +216,12 @@ export AWS_PROFILE=padmin export AWS_REGION=us-east-1 ``` -If no profile or region is set explicitly, the system will fall back to environment variables or use the default AWS CLI configuration. +If no profile or region is set explicitly, the system falls back to environment variables or the default AWS CLI configuration. + +See [Using AWS Profiles in `GraphRAGConfig`](./aws-profile.md) for more details on configuring and using AWS named profiles. + +#### Resilient clients and SSO token refresh + +All boto3 clients created by `GraphRAGConfig` are wrapped in a `ResilientClient` ([`config.py:94`](https://github.com/awslabs/graphrag-toolkit/blob/main/lexical-graph/src/graphrag_toolkit/lexical_graph/config.py#L94)). 
On `ExpiredToken`, `RequestExpired`, or `InvalidClientTokenId` errors the client is refreshed automatically and the call is retried. -See [Using AWS Profiles in `GraphRAGConfig`](./aws-profile.md) for more details on configuring and using **AWS named profiles** in the lexical-graph by leveraging the `GraphRAGConfig` class. +When an AWS SSO profile is in use, the client wrapper also validates the SSO token age. If the token is more than one hour old, it runs `aws sso login` automatically before retrying. This is relevant for long-running indexing jobs and any environment where SSO sessions can expire mid-run. diff --git a/docs/lexical-graph/faq.md b/docs/lexical-graph/faq.md index 2b98c2b1..879be385 100644 --- a/docs/lexical-graph/faq.md +++ b/docs/lexical-graph/faq.md @@ -97,6 +97,12 @@ To fix, [enable access](https://docs.aws.amazon.com/bedrock/latest/userguide/mo --- +#### Importing the package patches llama_index async internals + +When you import `graphrag_toolkit.lexical_graph`, the package patches `llama_index.core.async_utils.asyncio_run` unconditionally ([`__init__.py:34`](https://github.com/awslabs/graphrag-toolkit/blob/main/lexical-graph/src/graphrag_toolkit/lexical_graph/__init__.py#L34)). The patch makes LlamaIndex's internal async runner work inside Jupyter notebooks by re-using the existing event loop instead of creating a new one. If no running loop is found, it falls back to `asyncio.run()`. This can interact unexpectedly with other code using LlamaIndex in the same process, particularly if that code relies on `asyncio_run` starting a clean event loop. There is currently no opt-out. + +--- + #### WARNING:graph_store:Retrying query in x seconds because it raised ConcurrentModificationException While indexing data in Amazon Neptune Database, Neptune can sometimes issue a `ConcurrentModificationException`. 
This occurs because multiple workers are attempting to [update the same set of vertices](https://docs.aws.amazon.com/neptune/latest/userguide/transactions-exceptions.html). The GraphRAG Toolkit automatically retries transactionsb that are cancelled because of a `ConcurrentModificationException`. If the maximum number of retries is exceeded and the indexing fails, consider reducing the number of workers in the build stage using [`GraphRAGConfig.build_num_workers`](./configuration.md#graphragconfig). diff --git a/docs/lexical-graph/indexing.md b/docs/lexical-graph/indexing.md index cf9d89e7..42d6bea7 100644 --- a/docs/lexical-graph/indexing.md +++ b/docs/lexical-graph/indexing.md @@ -256,9 +256,10 @@ The `IndexingConfig` object has the following parameters: | Parameter | Description | Default Value | | ------------- | ------------- | ------------- | -| `chunking` | A list of node parsers (e.g. LlamaIndex `SentenceSplitter`) to be used for chunking source documents. Set `chunking` to `None` to skip chunking. | `SentenceSplitter` with `chunk_size=256` and `chunk_overlap=20` | +| `chunking` | A list of node parsers (e.g. LlamaIndex `SentenceSplitter`) to be used for chunking source documents. Set `chunking` to `None` to skip chunking. | `SentenceSplitter` with `chunk_size=256` and `chunk_overlap=25` | | `extraction` | An `ExtractionConfig` object specifying extraction options | `ExtractionConfig` with default values | -| `batch_config` | Batch configuration to be used if performing [batch extraction](./batch-extraction.md). If `batch_config` is `None`, the toolkit will perform chunk-by-chunk extraction. | `None` | +| `build` | A `BuildConfig` object specifying build options | `BuildConfig` with default values | +| `batch_config` | Batch configuration to be used if performing [batch extraction](./batch-extraction.md). If `batch_config` is `None`, the toolkit will perform chunk-by-chunk extraction. 
| `None` | The `ExtractionConfig` object has the following parameters: @@ -266,11 +267,22 @@ The `ExtractionConfig` object has the following parameters: | ------------- | ------------- | ------------- | | `enable_proposition_extraction` | Perform proposition extraction before extracting topics, statements, facts and entities | `True` | | `preferred_entity_classifications` | Comma-separated list of preferred entity classifications used to seed the entity extraction | `DEFAULT_ENTITY_CLASSIFICATIONS` | -| `infer_entity_classifications` | Determines whether to pre-process documents to identify significant domain entity classifications. Supply either `True` or `False`, or an `InferClassificationsConfig` object. | `False` | +| `preferred_topics` | List of preferred topic names (or a callable that returns them) supplied to the LLM to seed topic extraction. Accepts the same type as `preferred_entity_classifications`. | `[]` | +| `infer_entity_classifications` | Determines whether to pre-process documents to identify significant domain entity classifications. Supply either `True` or `False`, or an `InferClassificationsConfig` object. When `True`, an `InferClassifications` step runs as a **pre-processor** before the main extraction loop — one extra LLM round-trip per batch, not per document. | `False` | | `extract_propositions_prompt_template` | Prompt used to extract propositions from chunks. If `None`, the [default extract propositions template](https://github.com/awslabs/graphrag-toolkit/blob/main/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/prompts.py#L29-L72) is used. See [Custom prompts](#custom-prompts) below. | `None` | | `extract_topics_prompt_template` | Prompt used to extract topics, statements and entities from chunks. If `None`, the [default extract topics template](https://github.com/awslabs/graphrag-toolkit/blob/main/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/prompts.py#L74-L191) is used. 
See [Custom prompts](#custom-prompts) below. | `None` | +The `BuildConfig` object has the following parameters: + +| Parameter | Description | Default Value | +| ------------- | ------------- | ------------- | +| `build_filters` | A `BuildFilters` object to include or exclude specific node types during the build stage | `BuildFilters()` | +| `include_domain_labels` | Whether to add a domain-specific label (e.g. `Company`) to entity nodes in addition to `__Entity__` | `None` (falls back to `GraphRAGConfig.include_domain_labels`) | +| `include_local_entities` | Whether to include local-context entities in the graph | `None` (falls back to `GraphRAGConfig.include_local_entities`) | +| `source_metadata_formatter` | A `SourceMetadataFormatter` instance for customising source metadata written to the graph | `DefaultSourceMetadataFormatter()` | +| `enable_versioning` | Whether to enable versioned updates. Overrides `GraphRAGConfig.enable_versioning` when set. | `None` | + The `InferClassificationsConfig` object has the following parameters: | Parameter | Description | Default Value | @@ -353,6 +365,21 @@ topic: topic You can use [Amazon Bedrock batch inference](https://docs.aws.amazon.com/bedrock/latest/userguide/batch-inference.html) with the extract stage of the indexing process. See [Batch Extraction](./batch-extraction.md) for more details. 
+`BatchConfig` ([`indexing/extract/batch_config.py`](https://github.com/awslabs/graphrag-toolkit/blob/main/lexical-graph/src/graphrag_toolkit/lexical_graph/indexing/extract/batch_config.py)) accepts the following parameters: + +| Parameter | Description | Required | +| ------------- | ------------- | ------------- | +| `role_arn` | ARN of the IAM role Bedrock will assume to run batch jobs | Yes | +| `region` | AWS region where batch jobs will run | Yes | +| `bucket_name` | S3 bucket for batch job input/output | Yes | +| `key_prefix` | S3 key prefix for job files | No | +| `s3_encryption_key_id` | KMS key ID for S3 object encryption | No | +| `subnet_ids` | VPC subnet IDs for the batch job network configuration | No | +| `security_group_ids` | VPC security group IDs | No | +| `max_batch_size` | Maximum records per batch job (Bedrock limit: 50,000; jobs under 100 records are skipped and processed inline) | `25000` | +| `max_num_concurrent_batches` | Maximum concurrent batch jobs per worker | `3` | +| `delete_on_success` | Whether to delete S3 job files after a successful run | `True` | + #### Metadata filtering You can add metadata to source documents on ingest, and then use this metadata to filter documents during the extract and build stages. Source metadata is also used for metadata filtering when querying a lexical graph. See the [Metadata Filtering](./metadata-filtering.md) section for more details. diff --git a/docs/lexical-graph/multi-tenancy.md b/docs/lexical-graph/multi-tenancy.md index e6c97788..d0b10990 100644 --- a/docs/lexical-graph/multi-tenancy.md +++ b/docs/lexical-graph/multi-tenancy.md @@ -16,11 +16,13 @@ Multi-tenancy allows you to host multiple separate lexical graphs in the same un ### Tenant Id -To use the multi-tenancy feature, you must supply a tenant id when creating a `LexicalGraphIndex` or `LexicalGraphQueryEngine`. A tenant id is a string containing 1-10 lower case characters and numbers. 
If you don't supply a tenant id, the index and query engine will use the _default tenant_ (i.e. a tenant id value of `None`). +To use the multi-tenancy feature, supply a tenant id when creating a `LexicalGraphIndex` or `LexicalGraphQueryEngine`. A tenant id is a string of 1–25 lowercase letters, numbers, and periods (periods cannot appear at the start or end). If you don't supply a tenant id, the index and query engine use the _default tenant_ (a tenant id value of `None`). + +See [`tenant_id.py`](../../lexical-graph/src/graphrag_toolkit/lexical_graph/tenant_id.py) for the validation logic. ### Indexing and multi-tenancy -The following example creates a `LexicalGraphIndex` for tenant 'user123': +The following example creates a `LexicalGraphIndex` for tenant `user123`: ```python from graphrag_toolkit.lexical_graph import LexicalGraphIndex @@ -29,17 +31,17 @@ graph_store = ... vector_store = ... graph_index = LexicalGraphIndex( - graph_store, + graph_store, vector_store, tenant_id='user123' ) ``` -The `LexicalGraphIndex` always uses the _default tenant_ for the [extract stage](https://github.com/awslabs/graphrag-toolkit/blob/main/docs/indexing.md#extract), even if you supply a different tenant id. The [build stage](https://github.com/awslabs/graphrag-toolkit/blob/main/docs/indexing.md#build), however, will use the tenant id. The reason for this is so that you can extract once, and then build many times, potentially for different tenants. +**Important:** the extract stage always writes under the _default_ tenant, regardless of the tenant id you set. This is intentional — it lets you extract once and build for multiple tenants from the same extracted output. Only the build stage applies the tenant id. A warning is logged when a non-default tenant id is set ([`lexical_graph_index.py:445`](../../lexical-graph/src/graphrag_toolkit/lexical_graph/lexical_graph_index.py#L445)). 
### Querying and multi-tenancy -The following example creates a `LexicalGraphQueryEngine` for tenant 'user123': +The following example creates a `LexicalGraphQueryEngine` for tenant `user123`: ```python from graphrag_toolkit.lexical_graph import LexicalGraphQueryEngine @@ -48,14 +50,14 @@ graph_store = ... vector_store = ... query_engine = LexicalGraphQueryEngine.for_traversal_based_search( - graph_store, + graph_store, vector_store, tenant_id='user123' ) ``` -If a lexical graph does not exist for the specified tenant id, the underlying retrievers will return an empty set of results. +If a lexical graph does not exist for the specified tenant id, the retrievers return an empty result set. ### Implementation details -Multi-tenancy works by using tenant-specific node labels for nodes in the graph, and tenant-specific indexes in the vector store. For example, chunk nodes in a graph belonging to tenant 'user123' will be labelled `__Chunk__user123__`, while the chunk vector index will be named `chunk_user123`. +Multi-tenancy works by using tenant-specific node labels and index names. For example, chunk nodes for tenant `user123` are labelled `__Chunk__user123__`, and the chunk vector index is named `chunk_user123`. diff --git a/docs/lexical-graph/querying.md b/docs/lexical-graph/querying.md index 0961e955..b37c6fa1 100644 --- a/docs/lexical-graph/querying.md +++ b/docs/lexical-graph/querying.md @@ -1,16 +1,104 @@ [[Home](./)] ## Querying - -The primary unit of context presented to the LLM by the lexical-graph is the *statement*, which is a standalone assertion or proposition. Source documents are broken into chunks, and from these chunks are extracted statements. In the graphrag-toolkit's [graph model](./graph-model.md), statements are thematically grouped by topic, and supported by facts. At question-answering time, the lexical-graph retrieves groups of statements, and presents them in the context window to the LLM. 
-The lexical-graph uses a [traversal-based search](./traversal-based-search.md) strategy to perform hybrid top-down and bottom-up similarity and graph-based searches for sets of statements grouped by topic and source. (The lexical-graph also includes a [semantic-guided search](./semantic-guided-search.md) approach which will likely be retired in future versions). +The primary unit of context presented to the LLM is the *statement* — a standalone assertion or proposition extracted from a source chunk. Statements are grouped by topic and source, and that grouping is what the query engine presents to the LLM. -Querying supports [metadata filtering](./metadata-filtering.md) and [multi-tenancy](multi-tenancy.md). Metadata filtering allows you to retrieve a constrained set of sources, topics and statements based on metadata filters and associated values when querying a lexical graph. Multi-tenancy allows you to query different lexical graphs hosted in the same backend graph and vector stores. +The lexical-graph uses a [traversal-based search](./traversal-based-search.md) strategy that combines similarity search with graph traversal. A [semantic-guided search](./semantic-guided-search.md) approach also exists but is likely to be retired in a future release. + +Querying supports [metadata filtering](./metadata-filtering.md) and [multi-tenancy](multi-tenancy.md). + +### Topics + +- [Factory methods](#factory-methods) +- [Context format](#context-format) +- [Verbose mode](#verbose-mode) +- [Async querying](#async-querying) +- [Managing indexed sources](#managing-indexed-sources) + +### Factory methods + +Use `LexicalGraphQueryEngine.for_traversal_based_search()` for most workloads. Use `for_semantic_guided_search()` if you specifically need the semantic-guided strategy. + +Both factory methods accept `graph_store`, `vector_store`, `tenant_id`, `post_processors`, `filter_config`, and `**kwargs`. 
The versioning parameter name differs between the two ([`lexical_graph_query_engine.py:67`](https://github.com/awslabs/graphrag-toolkit/blob/main/lexical-graph/src/graphrag_toolkit/lexical_graph/lexical_graph_query_engine.py#L67)): + +| Factory method | versioning parameter | +| --- | --- | +| `for_traversal_based_search` | `versioning` | +| `for_semantic_guided_search` | `enable_versioning` | + +You can also construct `LexicalGraphQueryEngine` directly, passing `system_prompt`, `user_prompt`, or a `prompt_provider` kwarg. See [Using Custom Prompt Providers](./prompts.md). + +### Context format + +The `context_format` kwarg controls how retrieved statements are serialised before being injected into the LLM prompt. Supported values ([`lexical_graph_query_engine.py:408`](https://github.com/awslabs/graphrag-toolkit/blob/main/lexical-graph/src/graphrag_toolkit/lexical_graph/lexical_graph_query_engine.py#L408)): + +| Value | Description | Default for | +| --- | --- | --- | +| `'json'` | JSON array of topic/statement objects | `__init__` direct construction | +| `'yaml'` | YAML representation of the same structure | — | +| `'xml'` | XML representation of the same structure | — | +| `'text'` | Plain text, one topic heading per group | `for_traversal_based_search` | +| `'bedrock_xml'` | Pre-formatted XML produced by a `BedrockContextFormat` post-processor | `for_semantic_guided_search` (hardcoded) | + +`for_semantic_guided_search` always uses `'bedrock_xml'` and ignores any `context_format` kwarg you pass. `for_traversal_based_search` defaults to `'text'` but accepts any of the values above. + +### Verbose mode + +The `verbose` kwarg (default `True`) controls answer length. When `True`, the LLM is instructed to answer fully; when `False`, concisely. This only affects the non-streaming code path ([`lexical_graph_query_engine.py:356`](https://github.com/awslabs/graphrag-toolkit/blob/main/lexical-graph/src/graphrag_toolkit/lexical_graph/lexical_graph_query_engine.py#L356)). 
+ +```python +query_engine = LexicalGraphQueryEngine.for_traversal_based_search( + graph_store, + vector_store, + verbose=False +) +``` + +### Async querying + +`LexicalGraphQueryEngine` does not implement async querying. Calling `await query_engine.aquery(...)` will raise a `NotImplementedError`. Use `query_engine.query(...)` instead ([`lexical_graph_query_engine.py:563`](https://github.com/awslabs/graphrag-toolkit/blob/main/lexical-graph/src/graphrag_toolkit/lexical_graph/lexical_graph_query_engine.py#L563)). + +### Managing indexed sources + +`LexicalGraphIndex` exposes three methods for inspecting and managing what has been indexed ([`lexical_graph_index.py:596`](https://github.com/awslabs/graphrag-toolkit/blob/main/lexical-graph/src/graphrag_toolkit/lexical_graph/lexical_graph_index.py#L596)): + +#### `get_stats()` + +Returns a dict with node counts and two graph connectivity metrics: + +```python +stats = graph_index.get_stats() +# { +# 'source': 12, 'chunk': 180, 'topic': 950, +# 'statement': 4200, 'fact': 3100, 'entity': 820, +# 'localConnectivity': 1.23456, +# 'globalConnectivity': 0.98765, +# ... +# } +``` + +#### `get_sources(...)` + +Queries the graph for source document metadata. Accepts a `source_id` (str), `source_ids` (list), `filter` (`FilterConfig`, dict, or list of dicts), an optional `versioning_config`, and an optional `order_by` field name or list. + +```python +sources = graph_index.get_sources(filter={'url': 'https://example.com/page'}) +``` + +#### `delete_sources(...)` + +Same filter API as `get_sources`. Removes matching sources from both the graph store and the vector store and returns the list of deleted source IDs. 
+ +```python +deleted = graph_index.delete_sources(source_id='chunk::abc123') +``` + +--- See also: - - [Traversal-Based Search](./traversal-based-search.md) - - [Configuring and Tuning Traversal-Based Search](./traversal-based-search-configuration.md) - - [Metadata Filtering](./metadata-filtering.md) - - [Multi-Tenancy](./multi-tenancy.md) +- [Traversal-Based Search](./traversal-based-search.md) +- [Configuring and Tuning Traversal-Based Search](./traversal-based-search-configuration.md) +- [Metadata Filtering](./metadata-filtering.md) +- [Multi-Tenancy](./multi-tenancy.md) diff --git a/docs/lexical-graph/versioned-updates.md b/docs/lexical-graph/versioned-updates.md index 3277f761..8ecd0786 100644 --- a/docs/lexical-graph/versioned-updates.md +++ b/docs/lexical-graph/versioned-updates.md @@ -55,7 +55,19 @@ If you have an existing graph and vector store built by a version of the graphra Indexed documents are versioned based on _extraction_ timestamps. A document will be `valid_from` the timestamp when it was extracted. If a different version of the document is subsequently indexed, the old version will be considered `valid_to` the extraction timestamp of the new version. -When _extracting_ data (using `LexicalGraphIndex.extract()` or `LexicalGraphIndex.extract_and_build()`), you must add the names of _version-independent metadata fields_ to the metadata of each document you want to update and version. +When _extracting_ data (using `LexicalGraphIndex.extract()` or `LexicalGraphIndex.extract_and_build()`), you must add the names of _version-independent metadata fields_ to the metadata of each document you want to update and version. 
Use the `add_versioning_info` helper to do this ([`versioning.py:35`](https://github.com/awslabs/graphrag-toolkit/blob/main/lexical-graph/src/graphrag_toolkit/lexical_graph/versioning.py#L35)): + +```python +from graphrag_toolkit.lexical_graph import add_versioning_info + +metadata = add_versioning_info( + metadata={}, + id_fields=['url'], # metadata fields that together identify this document across versions + valid_from=1761899971000 # optional: unix timestamp (ms) when this version became valid +) +``` + +Both `id_fields` and `valid_from` are optional. `id_fields` accepts a string or a list of strings. When _building_ a lexical graph (using `LexicalGraphIndex.build()` or `LexicalGraphIndex.extract_and_build()`), you must enable versioning, using either the `GraphRAGConfig.enable_versioning=True` global configuration parameter, or by passing a `BuildConfig(enable_versioning=True)` configuration object to the `LexicalGraphIndex` constructor, or by passing `enable_versioning=True` to the `LexicalGraphIndex.build()` or `LexicalGraphIndex.extract_and_build()` methods. diff --git a/lexical-graph/DOC_REVIEW_FINDINGS.md b/lexical-graph/DOC_REVIEW_FINDINGS.md new file mode 100644 index 00000000..c9cdad5f --- /dev/null +++ b/lexical-graph/DOC_REVIEW_FINDINGS.md @@ -0,0 +1,389 @@ +# Documentation Review Findings — `lexical-graph` + +> **Ground truth: the source code.** Everything below compares `README.md` against the actual implementation. Items are grouped by severity: **Critical** (code contradicts docs), **Gap** (significant undocumented behaviour), and **Minor** (style / polish). + +--- + +## Summary + +The README is intentionally brief — it is a landing page that points to `../docs/`. The critical issues are mostly in the code itself (docstring bugs, API inconsistencies) rather than in the README. The biggest gap is that none of the operational knobs (environment variables, prompt customisation, additional public API surface) are surfaced anywhere in the README. 
+ +--- + +## Critical — Code Contradicts Documentation + +### 1. Method names: `extract_only` / `build_only` do not exist + +The README and the class-level docstring of `LexicalGraphIndex` describe two methods — `extract_only` and `build_only`. The actual public methods are: + +```python +# What the code has: +index.extract(nodes, ...) +index.build(nodes, ...) +index.extract_and_build(nodes, ...) +``` + +Anyone reading the docs and trying `.extract_only()` or `.build_only()` will get an `AttributeError`. The documentation must either rename these methods or the code must add aliases. + +**Files:** [lexical_graph_index.py:421](src/graphrag_toolkit/lexical_graph/lexical_graph_index.py#L421), [lexical_graph_index.py:475](src/graphrag_toolkit/lexical_graph/lexical_graph_index.py#L475) + +--- + +### 2. `IndexingConfig` docstring: chunk overlap is wrong + +The docstring says: + +> _"a default `SentenceSplitter` is used with a chunk size of 256 and an overlap of **20**"_ + +The code is: + +```python +SentenceSplitter(chunk_size=256, chunk_overlap=25) # overlap = 25, not 20 +``` + +**File:** [lexical_graph_index.py:181](src/graphrag_toolkit/lexical_graph/lexical_graph_index.py#L181) + +--- + +### 3. `TenantId` docstring: max length is wrong + +The class docstring says tenant IDs are _"between 1 to **10** characters"_. The validation code allows up to **25**: + +```python +if len(value) > 25: # actual limit + return False +``` + +**File:** [tenant_id.py:47](src/graphrag_toolkit/lexical_graph/tenant_id.py#L47) + +--- + +### 4. `to_indexing_config` is defined twice + +The function is defined at line 190 and again at line 209 in the same file. The second definition (with the docstring) silently overwrites the first. This is dead code and could confuse anyone auditing the module. + +**File:** [lexical_graph_index.py:190-250](src/graphrag_toolkit/lexical_graph/lexical_graph_index.py#L190) + +--- + +### 5. 
`VectorStoreFactory.for_composite` has a variable-name clash
+
+```python
+for k, v in v.indexes:  # outer loop variable 'v' is shadowed by inner 'v'
+```
+
+This loop rebinds `v` — the very object being iterated — on each pass, clobbering the outer value. (Python evaluates `v.indexes` only once, before iteration starts, so the loop itself may complete; the damage is that every subsequent use of the outer `v` operates on the wrong object.) The method cannot behave as intended.
+
+**File:** [vector_store_factory.py:119](src/graphrag_toolkit/lexical_graph/storage/vector_store_factory.py#L119)
+
+---
+
+### 6. Async queries are silently unimplemented
+
+`LexicalGraphQueryEngine` inherits from `BaseQueryEngine` and must implement `_aquery`. The current implementation is:
+
+```python
+async def _aquery(self, query_bundle: QueryBundle) -> RESPONSE_TYPE:
+    pass  # returns None
+```
+
+There is no error, no warning, and no documentation saying async is unsupported. Callers using `await query_engine.aquery(...)` will silently receive `None`.
+
+**File:** [lexical_graph_query_engine.py:563](src/graphrag_toolkit/lexical_graph/lexical_graph_query_engine.py#L563)
+
+---
+
+### 7. `for_traversal_based_search` and `for_semantic_guided_search` use different versioning parameter names
+
+| Factory method | versioning param |
+|---|---|
+| `for_traversal_based_search` | `versioning` |
+| `for_semantic_guided_search` | `enable_versioning` |
+
+These should be identical since they are sibling factory methods on the same class.
+
+**File:** [lexical_graph_query_engine.py:67-247](src/graphrag_toolkit/lexical_graph/lexical_graph_query_engine.py#L67)
+
+---
+
+## Gaps — Significant Undocumented Behaviour
+
+### 8. Extraction always uses the default tenant, regardless of `tenant_id`
+
+When `tenant_id` is set to a non-default value, the extraction phase (proposition + topic extraction) still writes output under the **default** tenant. Only the build phase uses the custom tenant. 
The code emits a warning, but this is a non-obvious semantic that affects anyone running multi-tenant indexing: + +```python +if not self.tenant_id.is_default_tenant(): + logger.warning('TenantId has been set to non-default tenant id, but extraction will use default tenant id') +``` + +This behaviour should be clearly documented in the multi-tenancy guide. + +**File:** [lexical_graph_index.py:445-455](src/graphrag_toolkit/lexical_graph/lexical_graph_index.py#L445) + +--- + +### 9. Context format defaults differ between factory methods and the constructor + +| Entry point | `context_format` default | +|---|---| +| `for_traversal_based_search` | `'text'` | +| `for_semantic_guided_search` | always `'bedrock_xml'`, ignores kwarg | +| `__init__` direct | `'json'` | + +The supported values (`'json'`, `'yaml'`, `'xml'`, `'text'`, `'bedrock_xml'`) are nowhere documented. `'bedrock_xml'` also automatically appends a `BedrockContextFormat` post-processor. + +**File:** [lexical_graph_query_engine.py:132-244](src/graphrag_toolkit/lexical_graph/lexical_graph_query_engine.py#L132) + +--- + +### 10. 
All configuration can be driven by environment variables + +None of the following env vars appear in the README: + +| Variable | Default | Purpose | +|---|---|---| +| `AWS_PROFILE` | — | AWS named profile | +| `AWS_REGION` | boto3 default | AWS region | +| `EXTRACTION_MODEL` | `us.anthropic.claude-3-7-sonnet-20250219-v1:0` | LLM for extraction | +| `RESPONSE_MODEL` | `us.anthropic.claude-3-7-sonnet-20250219-v1:0` | LLM for query responses | +| `EMBEDDINGS_MODEL` | `cohere.embed-english-v3` | Embedding model | +| `EMBEDDINGS_DIMENSIONS` | `1024` | Embedding vector size | +| `EXTRACTION_NUM_WORKERS` | `2` | Worker processes for extraction | +| `EXTRACTION_BATCH_SIZE` | `4` | Docs per batch | +| `EXTRACTION_NUM_THREADS_PER_WORKER` | `4` | Threads inside each worker | +| `BUILD_NUM_WORKERS` | `2` | Worker processes for build | +| `BUILD_BATCH_SIZE` | `4` | Items per build batch | +| `BUILD_BATCH_WRITE_SIZE` | `25` | Items per graph write call | +| `BATCH_WRITES_ENABLED` | `true` | Enable/disable batch graph writes | +| `INCLUDE_DOMAIN_LABELS` | `false` | Add domain labels to graph nodes | +| `INCLUDE_LOCAL_ENTITIES` | `false` | Include local-context entities | +| `INCLUDE_CLASSIFICATION_IN_ENTITY_ID` | `true` | Include classification in entity hash | +| `ENABLE_CACHE` | `false` | LLM response caching | +| `RERANKING_MODEL` | `mixedbread-ai/mxbai-rerank-xsmall-v1` | Local reranker | +| `BEDROCK_RERANKING_MODEL` | `cohere.rerank-v3-5:0` | Bedrock reranker | +| `OPENSEARCH_ENGINE` | `nmslib` | OpenSearch kNN engine | +| `ENABLE_VERSIONING` | `false` | Enable versioned updates | + +**File:** [config.py](src/graphrag_toolkit/lexical_graph/config.py) + +--- + +### 11. 
Connection string formats for stores are not documented + +The README shows one example (`neptune-db://…`, `aoss://…`) but never explains the full set of accepted prefixes: + +| Store | Connection string prefix | +|---|---| +| Neptune Analytics | `neptune-graph://[graph-id]` | +| Neptune Database | `neptune-db://[hostname]` or any hostname ending `.neptune.amazonaws.com` | +| Neo4j | `bolt://`, `bolt+ssc://`, `bolt+s://`, `neo4j://`, `neo4j+ssc://`, `neo4j+s://` | +| OpenSearch Serverless | `aoss://[url]` | +| pgvector | resolved via `PGVectorIndexFactory` | +| S3 Vectors | resolved via `S3VectorIndexFactory` | +| Dummy (no-op) | `None` / any unrecognised string → `DummyGraphStore`/`DummyVectorIndex` | + +**Files:** [neptune_graph_stores.py:22-24](src/graphrag_toolkit/lexical_graph/storage/graph/neptune_graph_stores.py#L22), [neo4j_graph_store_factory.py:8](src/graphrag_toolkit/lexical_graph/storage/graph/neo4j_graph_store_factory.py#L8) + +--- + +### 12. Prompt customisation system is completely undocumented + +The query engine supports four prompt provider backends that control the system and user prompts sent to the LLM: + +| Provider | Trigger | Config class | +|---|---|---| +| `StaticPromptProvider` | default (no config needed) | `StaticPromptProviderConfig` | +| `FilePromptProvider` | env var `PROMPT_PATH` | `FilePromptProviderConfig` | +| `S3PromptProvider` | env var `PROMPT_S3_BUCKET` | `S3PromptProviderConfig` | +| `BedrockPromptProvider` | env vars `SYSTEM_PROMPT_ARN` / `USER_PROMPT_ARN` | `BedrockPromptProviderConfig` | + +The `PromptProviderFactory` auto-detects which to use based on which env vars are set. Custom prompts can also be passed directly to `LexicalGraphQueryEngine.__init__` via `system_prompt` and `user_prompt` parameters, or via a `prompt_provider` kwarg. + +**Files:** [prompts/](src/graphrag_toolkit/lexical_graph/prompts/) + +--- + +### 13. 
Undocumented public API on `LexicalGraphIndex`
+
+Beyond `extract_and_build`, the index class has:
+
+- **`get_stats()`** — returns a dict with node counts (`source`, `chunk`, `topic`, `statement`, `fact`, `entity`) and two connectivity metrics (`localConnectivity`, `globalConnectivity`).
+- **`get_sources(...)`** — queries the graph for source document metadata, supports filtering by `source_id`, list of IDs, `FilterConfig`, or dict, plus versioning and ordering.
+- **`delete_sources(...)`** — same filter API as `get_sources`, deletes matching sources from both the graph store and the vector store.
+
+**File:** [lexical_graph_index.py:596-785](src/graphrag_toolkit/lexical_graph/lexical_graph_index.py#L596)
+
+---
+
+### 14. `add_versioning_info()` utility is a public API
+
+This function is exported from `__init__.py` but is never described:
+
+```python
+from graphrag_toolkit.lexical_graph import add_versioning_info
+
+metadata = add_versioning_info(
+    metadata={},
+    id_fields=['url'],           # fields that determine document identity across versions
+    valid_from=1234567890000     # unix timestamp (ms) when this version became valid
+)
+```
+
+It adds the internal versioning keys to a document's metadata dict before indexing, enabling point-in-time querying.
+
+**File:** [versioning.py:35-44](src/graphrag_toolkit/lexical_graph/versioning.py#L35)
+
+---
+
+### 15. `BatchConfig` is undocumented for all required fields
+
+The README mentions batch extraction in passing but never shows a `BatchConfig` construction. 
The required parameters are non-obvious AWS infrastructure values: + +```python +BatchConfig( + role_arn='arn:aws:iam::…', # IAM role for Bedrock batch jobs + region='us-east-1', + bucket_name='my-bucket', # S3 bucket for job I/O + key_prefix='batch/', # Optional S3 prefix + s3_encryption_key_id=None, # Optional KMS key + subnet_ids=[], # VPC subnet IDs + security_group_ids=[], # VPC security groups + max_batch_size=25000, + max_num_concurrent_batches=3, + delete_on_success=True +) +``` + +**File:** [extract/batch_config.py](src/graphrag_toolkit/lexical_graph/indexing/extract/batch_config.py) + +--- + +### 16. Built-in document readers are not documented + +The `indexing/load/` sub-package includes pre-built reader providers for many source types that can be composed via `FileBasedDocs` and `S3BasedDocs`. None appear in the README: + +PDF, Advanced PDF, DOCX, PPTX, CSV, JSON, Markdown, Web pages, Wikipedia, YouTube transcripts, GitHub repositories, Database tables, S3 directories, and a general Directory reader. + +**Directory:** [indexing/load/readers/providers/](src/graphrag_toolkit/lexical_graph/indexing/load/readers/providers/) + +--- + +### 17. Asyncio patching is a silent side effect of importing the module + +On import, `__init__.py` patches `llama_index.core.async_utils.asyncio_run` to support Jupyter notebooks. This is done unconditionally and silently. It can interact unexpectedly with other code using LlamaIndex in the same process. + +**File:** [__init__.py:16-38](src/graphrag_toolkit/lexical_graph/__init__.py#L16) + +--- + +### 18. `ExtractionConfig.preferred_topics` is missing from the class docstring + +The docstring lists `preferred_entity_classifications` and `infer_entity_classifications` but omits `preferred_topics`, which accepts the same type (`PREFERRED_VALUES_PROVIDER_TYPE` — a list or a callable provider) and seeds the LLM with preferred topic names during extraction. 
+ +Similarly, `BuildConfig.enable_versioning` is present in `__init__` but absent from both the class docstring and the attribute list. + +**File:** [lexical_graph_index.py:44-131](src/graphrag_toolkit/lexical_graph/lexical_graph_index.py#L44) + +--- + +### 19. `GraphRAGConfig` is a singleton instance, not a class to instantiate + +The README and configuration docs may imply users should create a `GraphRAGConfig()` object. It is actually a module-level singleton: + +```python +GraphRAGConfig = _GraphRAGConfig() # single shared instance +``` + +Usage is always mutation of this singleton: + +```python +from graphrag_toolkit.lexical_graph import GraphRAGConfig +GraphRAGConfig.aws_region = 'eu-west-1' +GraphRAGConfig.extraction_llm = 'anthropic.claude-3-5-sonnet-20241022-v2:0' +``` + +Setting `aws_profile` or `aws_region` automatically clears all cached boto3 clients. + +**File:** [config.py:1171](src/graphrag_toolkit/lexical_graph/config.py#L1171) + +--- + +### 20. README installation version is behind the codebase + +The README install URL pins `v3.15.5`: + +``` +pip install https://github.com/awslabs/graphrag-toolkit/archive/refs/tags/v3.15.5.zip#... +``` + +`pyproject.toml` shows version `3.16.2-SNAPSHOT`. The README should either always reference `main` or be updated on each release. + +--- + +### 21. `ResilientClient` and SSO login flow are undocumented + +`GraphRAGConfig` wraps all boto3 clients in `ResilientClient`, which retries on `ExpiredToken` / `RequestExpired` / `InvalidClientTokenId` errors by refreshing the client. When an SSO profile is detected, it validates the token age (1 hour) and will automatically run `aws sso login` if the token is stale. This is significant behaviour for users running in SSO environments or long-running jobs. + +**File:** [config.py:94-229](src/graphrag_toolkit/lexical_graph/config.py#L94) + +--- + +## Minor — Polish / Clarity + +### 22. 
Querying example in README does not use a context manager + +The indexing example wraps stores in `with ... as store:`, the querying example does not. This inconsistency could mislead users into thinking store connections are not closeable resources during query time. Either both examples should use context managers, or the README should note when it is and is not required. + +### 23. `_get_prompts`, `_get_prompt_modules`, `_update_prompts` are stub implementations + +These three methods from the LlamaIndex `PromptMixin` interface all `pass` (return `None`). They will silently fail if any LlamaIndex tooling tries to inspect or modify prompts on the query engine. + +**File:** [lexical_graph_query_engine.py:566-573](src/graphrag_toolkit/lexical_graph/lexical_graph_query_engine.py#L566) + +### 24. `prompt_provider_config.py` contains a "Suggested Next Enhancements" comment block + +A 10-item enhancement wishlist is embedded in a production source file as a comment block. This should be either moved to an issue tracker or removed. + +**File:** [prompts/prompt_provider_config.py:237-282](src/graphrag_toolkit/lexical_graph/prompts/prompt_provider_config.py#L237) + +### 25. `LexicalGraphQueryEngine.verbose` controls answer verbosity + +The `verbose` kwarg (default `True`) is passed to the LLM prompt as `answer_mode='fully' if self.verbose else 'concisely'`. This is not documented. It only affects the non-streaming code path. + +**File:** [lexical_graph_query_engine.py:356-361](src/graphrag_toolkit/lexical_graph/lexical_graph_query_engine.py#L356) + +### 26. `InferClassifications` runs as a pre-processor before chunking/extraction + +When `infer_entity_classifications=True` in `ExtractionConfig`, an `InferClassifications` step is added as a **pre-processor** (not a pipeline component). It samples documents and uses an LLM to infer entity classification names before the main extraction loop begins. This means an extra LLM round-trip happens once per batch, not per document. 
+ +**File:** [lexical_graph_index.py:365-388](src/graphrag_toolkit/lexical_graph/lexical_graph_index.py#L365) + +### 27. S3 Vectors additional dependency is undocumented in README + +The README lists installation extras for OpenSearch, pgvector, and Neo4j. There is no equivalent section for S3 Vectors even though an `S3VectorIndexFactory` and `s3_vector_indexes.py` exist. + +--- + +## Recommended Actions + +| Priority | Action | +|---|---| +| Critical | Rename `extract_only`/`build_only` in docs to `extract`/`build` (or add aliases to code) | +| Critical | Fix `to_indexing_config` duplicate definition | +| Critical | Fix `VectorStoreFactory.for_composite` variable shadowing bug | +| Critical | Document or implement `_aquery` | +| High | Align `versioning` / `enable_versioning` parameter names across factory methods | +| High | Fix `TenantId` and `IndexingConfig` docstring length/overlap bugs | +| High | Add environment variable reference table to configuration docs | +| High | Document connection string formats for all stores | +| High | Document prompt customisation system | +| High | Document `BatchConfig` required fields with an example | +| Medium | Document `get_stats`, `get_sources`, `delete_sources` API | +| Medium | Document `add_versioning_info` utility | +| Medium | Clarify tenant-extraction behaviour (always default tenant) | +| Medium | Update install URL to current release | +| Medium | Document `context_format` options and their defaults per factory | +| Low | Move `prompt_provider_config.py` enhancement notes out of source | +| Low | Document `verbose` kwarg on query engine | +| Low | Add S3 Vectors to the "Additional dependencies" section of README | diff --git a/lexical-graph/README.md b/lexical-graph/README.md index caadab42..0370d6c3 100644 --- a/lexical-graph/README.md +++ b/lexical-graph/README.md @@ -1,6 +1,6 @@ ## Lexical Graph -The lexical-graph package provides a framework for automating the construction of a [hierarchical lexical 
graph](../docs/lexical-graph/graph-model.md) from unstructured data, and composing question-answering strategies that query this graph when answering user questions. +The lexical-graph package provides a framework for automating the construction of a [hierarchical lexical graph](../docs/lexical-graph/graph-model.md) from unstructured data, and composing question-answering strategies that query this graph when answering user questions. ### Features @@ -8,18 +8,25 @@ The lexical-graph package provides a framework for automating the construction o - Built-in vector store support for Neptune Analytics, [Amazon OpenSearch Serverless](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/serverless.html), [Amazon S3 Vectors](https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-vectors.html) and Postgres with the pgvector extension. - Built-in support for foundation models (LLMs and embedding models) on [Amazon Bedrock](https://docs.aws.amazon.com/bedrock/). - Easily extended to support additional graph and vector stores and model backends. - - [Multi-tenancy](../docs/lexical-graph/multi-tenancy.md) – multiple separate lexical graphs in the same underlying graph and vector stores. + - [Multi-tenancy](../docs/lexical-graph/multi-tenancy.md) – multiple separate lexical graphs in the same underlying graph and vector stores. - Continuous ingest and [batch extraction](../docs/lexical-graph/batch-extraction.md) (using [Bedrock batch inference](https://docs.aws.amazon.com/bedrock/latest/userguide/batch-inference.html)) modes. - - [Versioned updates](../docs/lexical-graph/versioned-updates.md) for updating source documents and querying the state of the graph and vector stores at a -point in time. + - [Versioned updates](../docs/lexical-graph/versioned-updates.md) for updating source documents and querying the state of the graph and vector stores at a point in time. 
- Quickstart [AWS CloudFormation templates](../examples/lexical-graph/cloudformation-templates/) for Neptune Database, OpenSearch Serverless, and Amazon Aurora Postgres. ## Installation -The lexical-graph requires Python and [pip](http://www.pip-installer.org/en/latest/) to install. You can install the lexical-graph using pip: +The lexical-graph requires Python 3.10 or greater and [pip](http://www.pip-installer.org/en/latest/). + +Install from the latest release tag: ``` -$ pip install https://github.com/awslabs/graphrag-toolkit/archive/refs/tags/v3.15.5.zip#subdirectory=lexical-graph +$ pip install https://github.com/awslabs/graphrag-toolkit/archive/refs/tags/v3.16.2.zip#subdirectory=lexical-graph +``` + +Or install from the `main` branch to get the latest changes: + +``` +$ pip install https://github.com/awslabs/graphrag-toolkit/archive/refs/heads/main.zip#subdirectory=lexical-graph ``` If you're running on AWS, you must run your application in an AWS region containing the Amazon Bedrock foundation models used by the lexical graph (see the [configuration](../docs/lexical-graph/configuration.md#graphragconfig) section in the documentation for details on the default models used), and must [enable access](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access.html) to these models before running any part of the solution. 
@@ -30,25 +37,42 @@ You will need to install additional dependencies for specific graph and vector s #### Amazon OpenSearch Serverless -``` +```bash $ pip install opensearch-py llama-index-vector-stores-opensearch ``` #### Postgres with pgvector -``` +```bash $ pip install psycopg2-binary pgvector ``` -#### Neo4j +#### Amazon S3 Vectors +```bash +$ pip install boto3 ``` + +#### Neo4j + +``` bash $ pip install neo4j ``` -### Supported Python versions +### Connection strings + +Pass a connection string to `GraphStoreFactory.for_graph_store()` or `VectorStoreFactory.for_vector_store()` to select a backend: -The lexical-graph requires Python 3.10 or greater. +| Store | Connection string | +| --- | --- | +| Neptune Analytics (graph) | `neptune-graph://` | +| Neptune Database (graph) | `neptune-db://` or any hostname ending `.neptune.amazonaws.com` | +| Neo4j (graph) | `bolt://`, `bolt+ssc://`, `bolt+s://`, `neo4j://`, `neo4j+ssc://`, or `neo4j+s://` URLs | +| OpenSearch Serverless (vector) | `aoss://` | +| Neptune Analytics (vector) | `neptune-graph://` | +| pgvector (vector) | constructed via `PGVectorIndexFactory` | +| S3 Vectors (vector) | constructed via `S3VectorIndexFactory` | +| Dummy / no-op | `None` or any unrecognised string — falls back to `DummyGraphStore` / `DummyVectorIndex` | ## Example of use @@ -74,7 +98,7 @@ def run_extract_and_build(): ): graph_index = LexicalGraphIndex( - graph_store, + graph_store, vector_store ) @@ -105,24 +129,25 @@ from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory def run_query(): - graph_store = GraphStoreFactory.for_graph_store( - 'neptune-db://my-graph.cluster-abcdefghijkl.us-east-1.neptune.amazonaws.com' - ) - - vector_store = VectorStoreFactory.for_vector_store( - 'aoss://https://abcdefghijkl.us-east-1.aoss.amazonaws.com' - ) - - query_engine = LexicalGraphQueryEngine.for_traversal_based_search( - graph_store, - vector_store - ) - - response = query_engine.query('''What are the differences between Neptune 
Database - and Neptune Analytics?''') - - print(response.response) - + with ( + GraphStoreFactory.for_graph_store( + 'neptune-db://my-graph.cluster-abcdefghijkl.us-east-1.neptune.amazonaws.com' + ) as graph_store, + VectorStoreFactory.for_vector_store( + 'aoss://https://abcdefghijkl.us-east-1.aoss.amazonaws.com' + ) as vector_store + ): + + query_engine = LexicalGraphQueryEngine.for_traversal_based_search( + graph_store, + vector_store + ) + + response = query_engine.query('''What are the differences between Neptune Database + and Neptune Analytics?''') + + print(response.response) + if __name__ == '__main__': run_query() ``` @@ -131,20 +156,83 @@ if __name__ == '__main__': - [Overview](../docs/lexical-graph/overview.md) - [Graph Model](../docs/lexical-graph/graph-model.md) - - [Storage Model](../docs/lexical-graph/storage-model.md) - - [Indexing](../docs/lexical-graph/indexing.md) - - [Batch Extraction](../docs/lexical-graph/batch-extraction.md) + - [Storage Model](../docs/lexical-graph/storage-model.md) + - [Indexing](../docs/lexical-graph/indexing.md) + - [Batch Extraction](../docs/lexical-graph/batch-extraction.md) - [Configuring Batch Extraction](../docs/lexical-graph/configuring-batch-extraction.md) - [Versioned Updates](../docs/lexical-graph/versioned-updates.md) - - [Querying](../docs/lexical-graph/querying.md) + - [Querying](../docs/lexical-graph/querying.md) - [Traversal-Based Search](../docs/lexical-graph/traversal-based-search.md) - [Traversal-Based Search Configuration](../docs/lexical-graph/traversal-based-search-configuration.md) - - [Configuration](../docs/lexical-graph/configuration.md) + - [Configuration](../docs/lexical-graph/configuration.md) - [Security](../docs/lexical-graph/security.md) - [FAQ](../docs/lexical-graph/faq.md) +## Suggested Future Enhancements + +Here are several points that could be added to improve the [`prompt_provider_config.py`](./src/graphrag_toolkit/lexical_graph/prompts/prompt_provider_config.py) + +### 1. 
Unified PromptProviderRegistry or Factory + +- Introduce a registry that maps provider types to config classes, e.g.: + + ```python + registry = { + "static": StaticPromptProviderConfig, + "file": FilePromptProviderConfig, + "s3": S3PromptProviderConfig, + "bedrock": BedrockPromptProviderConfig + } + ``` + +- Enable initialization from a config dict: `registry[type](**params).build()` + +### 2. Config Serialization + +- Add `.to_dict()` and `.from_dict()` methods to each config class for CLI/JSON compatibility. +- Useful for web UIs or YAML-driven orchestration. + +### 3. Validation & Type Enforcement + +- Use Pydantic or `__post_init__()` methods to validate inputs (e.g., ARN format, S3 bucket name). +- Example: validate AWS region format or prompt ARN prefix. + +### 4. Logging Enhancements + +- Add verbose logging on each provider (e.g., which prompt path or ARN was loaded). +- Include diagnostics for STS calls and client creation failures. + +### 5. Caching Layer + +- Cache resolved prompt text in memory or on disk (especially for S3 and Bedrock). +- Avoid unnecessary repeated fetches in batch queries. + +### 6. Runtime Provider Switching + +- Allow query-time override of prompt provider (e.g., via `query_engine.query(..., prompt_provider=...)`). +- Enables experimentation with different prompt strategies. + +### 7. Prompt Fallback Strategy + +- Support fallback to defaults or static provider if S3/Bedrock fails. +- Enables robust operation in partially degraded environments. + +### 8. Custom Prompt Variables + +- Support variable interpolation in prompt templates (e.g., using `{tenant_id}` or `{user_role}`). +- Useful for multi-tenant or role-specific prompting. + +### 9. Multi-Language Prompt Support + +- Load prompt variants based on locale/language code. +- Supports internationalization of RAG applications. + +### 10. Bedrock Caching with Prompt Versioning + +- Cache based on `(ARN, version)` tuple. 
+- Useful when managing multiple versions in experiments or A/B testing. + ## License This project is licensed under the Apache-2.0 License. - diff --git a/lexical-graph/src/graphrag_toolkit/lexical_graph/lexical_graph_index.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/lexical_graph_index.py index 42cc7d7d..35f6a632 100644 --- a/lexical-graph/src/graphrag_toolkit/lexical_graph/lexical_graph_index.py +++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/lexical_graph_index.py @@ -56,6 +56,9 @@ class ExtractionConfig(): preferred_entity_classifications (List[str]): A list of preferred entity classifications to focus on during the extraction process. Defaults to DEFAULT_ENTITY_CLASSIFICATIONS if not specified. + preferred_topics: A list of preferred topic names (or a callable that + returns them) used to seed the LLM during topic extraction. Defaults + to an empty list. infer_entity_classifications (Union[InferClassificationsConfig, bool]): Specifies whether to infer entity classifications, using either a configuration object or a boolean flag. Defaults to False. @@ -101,6 +104,9 @@ class BuildConfig(): domain labels as part of the build output. source_metadata_formatter (Optional[SourceMetadataFormatter]): Formatter responsible for handling source metadata during the build. + enable_versioning (Optional[bool]): Whether to enable versioned updates + during the build stage. Overrides GraphRAGConfig.enable_versioning when + set. """ def __init__(self, build_filters: Optional[BuildFilters] = None, @@ -144,7 +150,7 @@ class IndexingConfig(): chunking (Optional[List[NodeParser]]): List of chunking strategies to be applied during indexing. If no chunking strategies are provided, a default `SentenceSplitter` is used with a chunk size of 256 and an - overlap of 20. + overlap of 25. extraction (Optional[ExtractionConfig]): Configuration for data extraction, defaulting to a new instance of `ExtractionConfig` if not provided. 
build (Optional[BuildConfig]): Build-specific configuration, defaulting to @@ -187,25 +193,6 @@ def __init__(self, IndexingConfigType = Union[IndexingConfig, ExtractionConfig, BuildConfig, BatchConfig, List[NodeParser]] -def to_indexing_config(indexing_config:Optional[IndexingConfigType]=None) -> IndexingConfig: - if not indexing_config: - return IndexingConfig() - if isinstance(indexing_config, IndexingConfig): - return indexing_config - elif isinstance(indexing_config, ExtractionConfig): - return IndexingConfig(extraction=indexing_config) - elif isinstance(indexing_config, BuildConfig): - return IndexingConfig(build=indexing_config) - elif isinstance(indexing_config, BatchConfig): - return IndexingConfig(batch_config=indexing_config) - elif isinstance(indexing_config, list): - for np in indexing_config: - if not isinstance(np, NodeParser): - raise ValueError(f'Invalid indexing config type: {type(np)}') - return IndexingConfig(chunking=indexing_config) - else: - raise ValueError(f'Invalid indexing config type: {type(indexing_config)}') - def to_indexing_config(indexing_config: Optional[IndexingConfigType] = None) -> IndexingConfig: """ Converts a given indexing configuration into an `IndexingConfig` object. diff --git a/lexical-graph/src/graphrag_toolkit/lexical_graph/lexical_graph_query_engine.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/lexical_graph_query_engine.py index 9e6a79c1..0a251149 100644 --- a/lexical-graph/src/graphrag_toolkit/lexical_graph/lexical_graph_query_engine.py +++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/lexical_graph_query_engine.py @@ -561,7 +561,7 @@ def _query(self, query_bundle: QueryBundle) -> RESPONSE_TYPE: raise async def _aquery(self, query_bundle: QueryBundle) -> RESPONSE_TYPE: - pass + raise NotImplementedError("Async querying is not supported. 
Use query() instead.") def _get_prompts(self) -> PromptDictType: pass diff --git a/lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/prompt_provider_config.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/prompt_provider_config.py index 1783daaf..5f9f6759 100644 --- a/lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/prompt_provider_config.py +++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/prompts/prompt_provider_config.py @@ -230,53 +230,3 @@ def build() -> PromptProvider: from graphrag_toolkit.lexical_graph.prompts.static_prompt_provider import StaticPromptProvider return StaticPromptProvider() - -# ------------------------------------------------------------------------------ -# Suggested Next Enhancements (Optional) -# ------------------------------------------------------------------------------ - -# 1. Unified PromptProviderRegistry or Factory -# - Introduce a registry that maps provider types to config classes, e.g., -# registry = { -# "static": StaticPromptProviderConfig, -# "file": FilePromptProviderConfig, -# "s3": S3PromptProviderConfig, -# "bedrock": BedrockPromptProviderConfig -# } -# - Enable initialization from a config dict: registry[type](**params).build() - -# 2. Config Serialization -# - Add `.to_dict()` and `.from_dict()` methods to each config class for CLI/JSON compatibility. -# - Useful for web UIs or YAML-driven orchestration. - -# 3. Validation & Type Enforcement -# - Use Pydantic or `__post_init__()` methods to validate inputs (e.g., ARN format, S3 bucket name). -# - Example: validate AWS region format or prompt ARN prefix. - -# 4. Logging Enhancements -# - Add verbose logging on each provider (e.g., which prompt path or ARN was loaded). -# - Include diagnostics for STS calls and client creation failures. - -# 5. Caching Layer -# - Cache resolved prompt text in memory or on disk (especially for S3 and Bedrock). -# - Avoid unnecessary repeated fetches in batch queries. - -# 6. 
Runtime Provider Switching -# - Allow query-time override of prompt provider (e.g., via `query_engine.query(..., prompt_provider=...)`). -# - Enables experimentation with different prompt strategies. - -# 7. Prompt Fallback Strategy -# - Support fallback to defaults or static provider if S3/Bedrock fails. -# - Enables robust operation in partially degraded environments. - -# 8. Custom Prompt Variables -# - Support variable interpolation in prompt templates (e.g., using `{tenant_id}` or `{user_role}`). -# - Useful for multi-tenant or role-specific prompting. - -# 9. Multi-Language Prompt Support -# - Load prompt variants based on locale/language code. -# - Supports internationalization of RAG applications. - -# 10. Bedrock Caching with Prompt Versioning -# - Cache based on (ARN, version) tuple. -# - Useful when managing multiple versions in experiments or A/B testing. diff --git a/lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector_store_factory.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector_store_factory.py index 5f30dfde..b3ee4b48 100644 --- a/lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector_store_factory.py +++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/storage/vector_store_factory.py @@ -115,8 +115,8 @@ def for_composite(vector_store_list:List[VectorStore]): VectorStore instances. 
""" indexes = {} - for v in vector_store_list: - for k, v in v.indexes: + for vs in vector_store_list: + for k, v in vs.indexes.items(): indexes[k] = v return VectorStore(indexes=indexes) \ No newline at end of file diff --git a/lexical-graph/src/graphrag_toolkit/lexical_graph/tenant_id.py b/lexical-graph/src/graphrag_toolkit/lexical_graph/tenant_id.py index bbc6aa82..16b5f264 100644 --- a/lexical-graph/src/graphrag_toolkit/lexical_graph/tenant_id.py +++ b/lexical-graph/src/graphrag_toolkit/lexical_graph/tenant_id.py @@ -14,8 +14,9 @@ class TenantId(BaseModel): This class provides functionality to validate and handle tenant identifiers, with optional formatting methods that adapt the given label, index name, hashable string, or ID based on whether the tenant is default or custom. - Tenant IDs are validated to ensure they are alphanumeric, lowercase, - and between 1 to 10 characters in length. + Tenant IDs are validated to ensure they consist of lowercase letters, + numbers, and periods (not at start/end), and are between 1 and 25 + characters in length. Attributes: value (Optional[str]): The tenant identifier. None indicates the default tenant.