diff --git a/docs/config.html b/docs/config.html
index 26187bb9..4c0d066e 100644
--- a/docs/config.html
+++ b/docs/config.html
@@ -988,6 +988,12 @@
LlamaStackConfiguration
Path to configuration file used when Llama Stack is run in library
mode |
+
+ | timeout |
+ integer |
+ Timeout in seconds for requests to Llama Stack service. Default is
+180 seconds (3 minutes) to accommodate long-running RAG queries. |
+
ModelContextProtocolServer
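
For illustration, a llama_stack block that sets the new timeout option might look like the sketch below. It is only a hypothetical instance of the LlamaStackConfiguration schema documented above; the URL and API key are placeholders and the rest of the configuration file is omitted.

    "llama_stack": {
      "url": "http://localhost:8321",
      "api_key": "placeholder-api-key",
      "use_as_library_client": false,
      "timeout": 300
    }
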
diff --git a/docs/config.json b/docs/config.json
index f9f1e023..124c005a 100644
--- a/docs/config.json
+++ b/docs/config.json
@@ -6,6 +6,40 @@
},
"components": {
"schemas": {
+ "A2AStateConfiguration": {
+ "additionalProperties": false,
+ "description": "A2A protocol persistent state configuration.\n\nConfigures how A2A task state and context-to-conversation mappings are\nstored. For multi-worker deployments, use SQLite or PostgreSQL to ensure\nstate is shared across all workers.\n\nIf no configuration is provided, in-memory storage is used (default).\nThis is suitable for single-worker deployments but state will be lost\non restarts and not shared across workers.\n\nAttributes:\n sqlite: SQLite database configuration for A2A state storage.\n postgres: PostgreSQL database configuration for A2A state storage.",
+ "properties": {
+ "sqlite": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/SQLiteDatabaseConfiguration"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "SQLite database configuration for A2A state storage.",
+ "title": "SQLite configuration"
+ },
+ "postgres": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/PostgreSQLDatabaseConfiguration"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "PostgreSQL database configuration for A2A state storage.",
+ "title": "PostgreSQL configuration"
+ }
+ },
+ "title": "A2AStateConfiguration",
+ "type": "object"
+ },
"APIKeyTokenConfiguration": {
"additionalProperties": false,
"description": "API Key Token configuration.",
@@ -78,7 +112,11 @@
"get_config",
"info",
"model_override",
- "rlsapi_v1_infer"
+ "rlsapi_v1_infer",
+ "a2a_agent_card",
+ "a2a_task_execution",
+ "a2a_message",
+ "a2a_jsonrpc"
],
"title": "Action",
"type": "string"
@@ -97,6 +135,12 @@
"title": "Skip Tls Verification",
"type": "boolean"
},
+ "skip_for_health_probes": {
+ "default": false,
+ "description": "Skip authorization for readiness and liveness probes",
+ "title": "Skip authorization for probes",
+ "type": "boolean"
+ },
"k8s_cluster_api": {
"type": "string",
"nullable": true,
@@ -162,6 +206,43 @@
"title": "AuthorizationConfiguration",
"type": "object"
},
+ "AzureEntraIdConfiguration": {
+ "additionalProperties": false,
+ "description": "Microsoft Entra ID authentication attributes for Azure.",
+ "properties": {
+ "tenant_id": {
+ "format": "password",
+ "title": "Tenant Id",
+ "type": "string",
+ "writeOnly": true
+ },
+ "client_id": {
+ "format": "password",
+ "title": "Client Id",
+ "type": "string",
+ "writeOnly": true
+ },
+ "client_secret": {
+ "format": "password",
+ "title": "Client Secret",
+ "type": "string",
+ "writeOnly": true
+ },
+ "scope": {
+ "default": "https://cognitiveservices.azure.com/.default",
+ "description": "Azure Cognitive Services scope for token requests. Override only if using a different Azure service.",
+ "title": "Token scope",
+ "type": "string"
+ }
+ },
+ "required": [
+ "tenant_id",
+ "client_id",
+ "client_secret"
+ ],
+ "title": "AzureEntraIdConfiguration",
+ "type": "object"
+ },
"ByokRag": {
"additionalProperties": false,
"description": "BYOK (Bring Your Own Knowledge) RAG configuration.",
@@ -346,10 +427,45 @@
"title": "BYOK RAG configuration",
"type": "array"
},
+ "a2a_state": {
+ "$ref": "#/components/schemas/A2AStateConfiguration",
+ "description": "Configuration for A2A protocol persistent state storage.",
+ "title": "A2A state configuration"
+ },
"quota_handlers": {
"$ref": "#/components/schemas/QuotaHandlersConfiguration",
"description": "Quota handlers configuration",
"title": "Quota handlers"
+ },
+ "azure_entra_id": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/AzureEntraIdConfiguration"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null
+ },
+ "splunk": {
+ "anyOf": [
+ {
+ "$ref": "#/components/schemas/SplunkConfiguration"
+ },
+ {
+ "type": "null"
+ }
+ ],
+ "default": null,
+ "description": "Splunk HEC configuration for sending telemetry events.",
+ "title": "Splunk configuration"
+ },
+ "deployment_environment": {
+ "default": "development",
+ "description": "Deployment environment name (e.g., 'development', 'staging', 'production'). Used in telemetry events.",
+ "title": "Deployment environment",
+ "type": "string"
}
},
"required": [
@@ -466,6 +582,18 @@
"default": null,
"title": "System Prompt"
},
+ "agent_card_path": {
+ "type": "string",
+ "nullable": true,
+ "default": null,
+ "title": "Agent Card Path"
+ },
+ "agent_card_config": {
+ "type": "object",
+ "nullable": true,
+ "default": null,
+ "title": "Agent Card Config"
+ },
"custom_profile": {
"anyOf": [
{
@@ -713,6 +841,21 @@
"description": "URL of the MCP server",
"title": "MCP server URL",
"type": "string"
+ },
+ "authorization_headers": {
+ "additionalProperties": {
+ "type": "string"
+ },
+ "description": "Headers to send to the MCP server. The map contains the header name and the path to a file containing the header value (secret). There are 2 special cases: 1. Usage of the kubernetes token in the header. To specify this use a string 'kubernetes' instead of the file path. 2. Usage of the client provided token in the header. To specify this use a string 'client' instead of the file path.",
+ "title": "Authorization headers",
+ "type": "object"
+ },
+ "timeout": {
+ "type": "integer",
+ "nullable": true,
+ "default": null,
+ "description": "Timeout in seconds for requests to the MCP server. If not specified, the default timeout from Llama Stack will be used. Note: This field is reserved for future use when Llama Stack adds timeout support.",
+ "title": "Request timeout"
}
},
"required": [
@@ -900,6 +1043,20 @@
"minimum": 0,
"title": "Period",
"type": "integer"
+ },
+ "database_reconnection_count": {
+ "default": 10,
+ "description": "Database reconnection count on startup. When database for quota is not available on startup, the service tries to reconnect N times with specified delay.",
+ "minimum": 0,
+ "title": "Database reconnection count on startup",
+ "type": "integer"
+ },
+ "database_reconnection_delay": {
+ "default": 1,
+ "description": "Database reconnection delay specified in seconds. When database for quota is not available on startup, the service tries to reconnect N times with specified delay.",
+ "minimum": 0,
+ "title": "Database reconnection delay",
+ "type": "integer"
}
},
"title": "QuotaSchedulerConfiguration",
@@ -953,6 +1110,13 @@
"title": "Port",
"type": "integer"
},
+ "base_url": {
+ "type": "string",
+ "nullable": true,
+ "default": null,
+ "description": "Externally reachable base URL for the service; needed for A2A support.",
+ "title": "Base URL"
+ },
"auth_enabled": {
"default": false,
"description": "Enables the authentication subsystem",
@@ -992,6 +1156,60 @@
"title": "ServiceConfiguration",
"type": "object"
},
+ "SplunkConfiguration": {
+ "additionalProperties": false,
+ "description": "Splunk HEC (HTTP Event Collector) configuration.\n\nSplunk HEC allows sending events directly to Splunk over HTTP/HTTPS.\nThis configuration is used to send telemetry events for inference\nrequests to the corporate Splunk deployment.\n\nUseful resources:\n\n - [Splunk HEC Docs](https://docs.splunk.com/Documentation/SplunkCloud)\n - [About HEC](https://docs.splunk.com/Documentation/Splunk/latest/Data)",
+ "properties": {
+ "enabled": {
+ "default": false,
+ "description": "Enable or disable Splunk HEC integration.",
+ "title": "Enabled",
+ "type": "boolean"
+ },
+ "url": {
+ "type": "string",
+ "nullable": true,
+ "default": null,
+ "description": "Splunk HEC endpoint URL.",
+ "title": "HEC URL"
+ },
+ "token_path": {
+ "type": "string",
+ "nullable": true,
+ "default": null,
+ "description": "Path to file containing the Splunk HEC authentication token.",
+ "title": "Token path"
+ },
+ "index": {
+ "type": "string",
+ "nullable": true,
+ "default": null,
+ "description": "Target Splunk index for events.",
+ "title": "Index"
+ },
+ "source": {
+ "default": "lightspeed-stack",
+ "description": "Event source identifier.",
+ "title": "Source",
+ "type": "string"
+ },
+ "timeout": {
+ "default": 5,
+ "description": "HTTP timeout in seconds for HEC requests.",
+ "minimum": 0,
+ "title": "Timeout",
+ "type": "integer"
+ },
+ "verify_ssl": {
+ "default": true,
+ "description": "Whether to verify SSL certificates for HEC endpoint.",
+ "title": "Verify SSL",
+ "type": "boolean"
+ }
+ },
+ "title": "SplunkConfiguration",
+ "type": "object"
+ },
"TLSConfiguration": {
"additionalProperties": false,
"description": "TLS configuration.\n\nTransport Layer Security (TLS) is a cryptographic protocol designed to\nprovide communications security over a computer network, such as the\nInternet. The protocol is widely used in applications such as email,\ninstant messaging, and voice over IP, but its use in securing HTTPS remains\nthe most publicly visible.\n\nUseful resources:\n\n - [FastAPI HTTPS Deployment](https://fastapi.tiangolo.com/deployment/https/)\n - [Transport Layer Security Overview](https://en.wikipedia.org/wiki/Transport_Layer_Security)\n - [What is TLS](https://www.ssltrust.eu/learning/ssl/transport-layer-security-tls)",
diff --git a/docs/config.md b/docs/config.md
index e6d461e0..1c5dd1cf 100644
--- a/docs/config.md
+++ b/docs/config.md
@@ -345,6 +345,7 @@ Useful resources:
| api_key | string | API key to access Llama Stack service |
| use_as_library_client | boolean | When set to true Llama Stack will be used in library mode, not in server mode (default) |
| library_client_config_path | string | Path to configuration file used when Llama Stack is run in library mode |
+| timeout | integer | Timeout in seconds for requests to Llama Stack service. Default is 180 seconds (3 minutes) to accommodate long-running RAG queries. |
## ModelContextProtocolServer
diff --git a/docs/config.puml b/docs/config.puml
index 250c4a4b..bec2d1f2 100644
--- a/docs/config.puml
+++ b/docs/config.puml
@@ -1,5 +1,12 @@
@startuml classes
set namespaceSeparator none
+class "A2AStateConfiguration" as src.models.config.A2AStateConfiguration {
+ config
+ postgres : Optional[PostgreSQLDatabaseConfiguration]
+ sqlite : Optional[SQLiteDatabaseConfiguration]
+ storage_type
+ check_a2a_state_configuration() -> Self
+}
class "APIKeyTokenConfiguration" as src.models.config.APIKeyTokenConfiguration {
api_key
}
@@ -20,12 +27,19 @@ class "AuthenticationConfiguration" as src.models.config.AuthenticationConfigura
module : str
rh_identity_config : Optional[RHIdentityConfiguration]
rh_identity_configuration
+ skip_for_health_probes : bool
skip_tls_verification : bool
check_authentication_model() -> Self
}
class "AuthorizationConfiguration" as src.models.config.AuthorizationConfiguration {
access_rules : list[AccessRule]
}
+class "AzureEntraIdConfiguration" as src.models.config.AzureEntraIdConfiguration {
+ client_id
+ client_secret
+ scope : str
+ tenant_id
+}
class "ByokRag" as src.models.config.ByokRag {
db_path
embedding_dimension
@@ -42,20 +56,25 @@ class "CORSConfiguration" as src.models.config.CORSConfiguration {
check_cors_configuration() -> Self
}
class "Configuration" as src.models.config.Configuration {
+ a2a_state
authentication
authorization : Optional[AuthorizationConfiguration]
+ azure_entra_id : Optional[AzureEntraIdConfiguration]
byok_rag : list[ByokRag]
conversation_cache
customization : Optional[Customization]
database
+ deployment_environment : str
inference
llama_stack
mcp_servers : list[ModelContextProtocolServer]
name : str
quota_handlers
service
+ splunk : Optional[SplunkConfiguration]
user_data_collection
- dump(filename: str) -> None
+ dump(filename: str | Path) -> None
+ validate_mcp_auth_headers() -> Self
}
class "ConfigurationBase" as src.models.config.ConfigurationBase {
model_config
@@ -64,7 +83,7 @@ class "ConversationHistoryConfiguration" as src.models.config.ConversationHistor
memory : Optional[InMemoryCacheConfig]
postgres : Optional[PostgreSQLDatabaseConfiguration]
sqlite : Optional[SQLiteDatabaseConfiguration]
- type : Literal['noop', 'memory', 'sqlite', 'postgres'] | None
+ type : Optional[Literal['noop', 'memory', 'sqlite', 'postgres']]
check_cache_configuration() -> Self
}
class "CustomProfile" as src.models.config.CustomProfile {
@@ -73,6 +92,8 @@ class "CustomProfile" as src.models.config.CustomProfile {
get_prompts() -> dict[str, str]
}
class "Customization" as src.models.config.Customization {
+ agent_card_config : Optional[dict[str, Any]]
+ agent_card_path : Optional[FilePath]
custom_profile : Optional[CustomProfile]
disable_query_system_prompt : bool
profile_path : Optional[str]
@@ -121,14 +142,19 @@ class "JwtRoleRule" as src.models.config.JwtRoleRule {
class "LlamaStackConfiguration" as src.models.config.LlamaStackConfiguration {
api_key : Optional[SecretStr]
library_client_config_path : Optional[str]
+ timeout
url : Optional[str]
use_as_library_client : Optional[bool]
check_llama_stack_model() -> Self
}
class "ModelContextProtocolServer" as src.models.config.ModelContextProtocolServer {
+ authorization_headers : dict[str, str]
name : str
provider_id : str
+ resolved_authorization_headers
+ timeout : Optional[PositiveInt]
url : str
+ resolve_auth_headers() -> Self
}
class "PostgreSQLDatabaseConfiguration" as src.models.config.PostgreSQLDatabaseConfiguration {
ca_cert_path : Optional[FilePath]
@@ -170,6 +196,7 @@ class "SQLiteDatabaseConfiguration" as src.models.config.SQLiteDatabaseConfigura
class "ServiceConfiguration" as src.models.config.ServiceConfiguration {
access_log : bool
auth_enabled : bool
+ base_url : Optional[str]
color_log : bool
cors
host : str
@@ -178,6 +205,16 @@ class "ServiceConfiguration" as src.models.config.ServiceConfiguration {
workers
check_service_configuration() -> Self
}
+class "SplunkConfiguration" as src.models.config.SplunkConfiguration {
+ enabled : bool
+ index : Optional[str]
+ source : str
+ timeout
+ token_path : Optional[FilePath]
+ url : Optional[str]
+ verify_ssl : bool
+ check_splunk_configuration() -> Self
+}
class "TLSConfiguration" as src.models.config.TLSConfiguration {
tls_certificate_path : Optional[FilePath]
tls_key_password : Optional[FilePath]
@@ -191,10 +228,12 @@ class "UserDataCollection" as src.models.config.UserDataCollection {
transcripts_storage : Optional[str]
check_storage_location_is_set_when_needed() -> Self
}
+src.models.config.A2AStateConfiguration --|> src.models.config.ConfigurationBase
src.models.config.APIKeyTokenConfiguration --|> src.models.config.ConfigurationBase
src.models.config.AccessRule --|> src.models.config.ConfigurationBase
src.models.config.AuthenticationConfiguration --|> src.models.config.ConfigurationBase
src.models.config.AuthorizationConfiguration --|> src.models.config.ConfigurationBase
+src.models.config.AzureEntraIdConfiguration --|> src.models.config.ConfigurationBase
src.models.config.ByokRag --|> src.models.config.ConfigurationBase
src.models.config.CORSConfiguration --|> src.models.config.ConfigurationBase
src.models.config.Configuration --|> src.models.config.ConfigurationBase
@@ -215,8 +254,10 @@ src.models.config.QuotaSchedulerConfiguration --|> src.models.config.Configurati
src.models.config.RHIdentityConfiguration --|> src.models.config.ConfigurationBase
src.models.config.SQLiteDatabaseConfiguration --|> src.models.config.ConfigurationBase
src.models.config.ServiceConfiguration --|> src.models.config.ConfigurationBase
+src.models.config.SplunkConfiguration --|> src.models.config.ConfigurationBase
src.models.config.TLSConfiguration --|> src.models.config.ConfigurationBase
src.models.config.UserDataCollection --|> src.models.config.ConfigurationBase
+src.models.config.A2AStateConfiguration --* src.models.config.Configuration : a2a_state
src.models.config.AuthenticationConfiguration --* src.models.config.Configuration : authentication
src.models.config.CORSConfiguration --* src.models.config.ServiceConfiguration : cors
src.models.config.ConversationHistoryConfiguration --* src.models.config.Configuration : conversation_cache
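
To illustrate the AzureEntraIdConfiguration class added above, an azure_entra_id block conforming to its schema might look like this sketch; both IDs and the secret are placeholders, and the scope line simply restates the documented default.

    "azure_entra_id": {
      "tenant_id": "00000000-0000-0000-0000-000000000000",
      "client_id": "00000000-0000-0000-0000-000000000000",
      "client_secret": "placeholder-client-secret",
      "scope": "https://cognitiveservices.azure.com/.default"
    }
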
diff --git a/docs/openapi.json b/docs/openapi.json
index 693ff1b9..f329d4ed 100644
--- a/docs/openapi.json
+++ b/docs/openapi.json
@@ -3664,7 +3664,7 @@
"rlsapi-v1"
],
"summary": "Infer Endpoint",
- "description": "Handle rlsapi v1 /infer requests for stateless inference.\n\nThis endpoint serves requests from the RHEL Lightspeed Command Line Assistant (CLA).\n\nAccepts a question with optional context (stdin, attachments, terminal output,\nsystem info) and returns an LLM-generated response.\n\nArgs:\n infer_request: The inference request containing question and context.\n auth: Authentication tuple from the configured auth provider.\n\nReturns:\n RlsapiV1InferResponse containing the generated response text and request ID.\n\nRaises:\n HTTPException: 503 if the LLM service is unavailable.",
+ "description": "Handle rlsapi v1 /infer requests for stateless inference.\n\nThis endpoint serves requests from the RHEL Lightspeed Command Line Assistant (CLA).\n\nAccepts a question with optional context (stdin, attachments, terminal output,\nsystem info) and returns an LLM-generated response.\n\nArgs:\n infer_request: The inference request containing question and context.\n request: The FastAPI request object for accessing headers and state.\n background_tasks: FastAPI background tasks for async Splunk event sending.\n auth: Authentication tuple from the configured auth provider.\n\nReturns:\n RlsapiV1InferResponse containing the generated response text and request ID.\n\nRaises:\n HTTPException: 503 if the LLM service is unavailable.",
"operationId": "infer_endpoint_v1_infer_post",
"requestBody": {
"content": {
@@ -4290,7 +4290,7 @@
],
"summary": "Handle A2A Jsonrpc",
"description": "Handle A2A JSON-RPC requests following the A2A protocol specification.\n\nThis endpoint uses the DefaultRequestHandler from the A2A SDK to handle\nall JSON-RPC requests including message/send, message/stream, etc.\n\nThe A2A SDK application is created per-request to include authentication\ncontext while still leveraging FastAPI's authorization middleware.\n\nAutomatically detects streaming requests (message/stream JSON-RPC method)\nand returns a StreamingResponse to enable real-time chunk delivery.\n\nArgs:\n request: FastAPI request object\n auth: Authentication tuple\n mcp_headers: MCP headers for context propagation\n\nReturns:\n JSON-RPC response or streaming response",
- "operationId": "handle_a2a_jsonrpc_a2a_get",
+ "operationId": "handle_a2a_jsonrpc_a2a_post",
"responses": {
"200": {
"description": "Successful Response",
@@ -4308,7 +4308,7 @@
],
"summary": "Handle A2A Jsonrpc",
"description": "Handle A2A JSON-RPC requests following the A2A protocol specification.\n\nThis endpoint uses the DefaultRequestHandler from the A2A SDK to handle\nall JSON-RPC requests including message/send, message/stream, etc.\n\nThe A2A SDK application is created per-request to include authentication\ncontext while still leveraging FastAPI's authorization middleware.\n\nAutomatically detects streaming requests (message/stream JSON-RPC method)\nand returns a StreamingResponse to enable real-time chunk delivery.\n\nArgs:\n request: FastAPI request object\n auth: Authentication tuple\n mcp_headers: MCP headers for context propagation\n\nReturns:\n JSON-RPC response or streaming response",
- "operationId": "handle_a2a_jsonrpc_a2a_get",
+ "operationId": "handle_a2a_jsonrpc_a2a_post",
"responses": {
"200": {
"description": "Successful Response",
@@ -6957,6 +6957,13 @@
],
"title": "Llama Stack configuration path",
"description": "Path to configuration file used when Llama Stack is run in library mode"
+ },
+ "timeout": {
+ "type": "integer",
+ "exclusiveMinimum": 0.0,
+ "title": "Request timeout",
+ "description": "Timeout in seconds for requests to Llama Stack service. Default is 180 seconds (3 minutes) to accommodate long-running RAG queries.",
+ "default": 180
}
},
"additionalProperties": false,
diff --git a/docs/openapi.md b/docs/openapi.md
index f576011e..74315b24 100644
--- a/docs/openapi.md
+++ b/docs/openapi.md
@@ -3200,6 +3200,8 @@ system info) and returns an LLM-generated response.
Args:
infer_request: The inference request containing question and context.
+ request: The FastAPI request object for accessing headers and state.
+ background_tasks: FastAPI background tasks for async Splunk event sending.
auth: Authentication tuple from the configured auth provider.
Returns:
@@ -4860,6 +4862,7 @@ Useful resources:
| api_key | | API key to access Llama Stack service |
| use_as_library_client | | When set to true Llama Stack will be used in library mode, not in server mode (default) |
| library_client_config_path | | Path to configuration file used when Llama Stack is run in library mode |
+| timeout | integer | Timeout in seconds for requests to Llama Stack service. Default is 180 seconds (3 minutes) to accommodate long-running RAG queries. |
## MCPClientAuthOptionsResponse