5 changes: 4 additions & 1 deletion .gitignore
@@ -196,4 +196,7 @@ dev-tools/mcp-mock-server/.certs/
 requirements.*.backup

 # Local run files
-local-run.yaml
+local-run.yaml
+
+datadir/
+INSTRUCTIONS.md
35 changes: 13 additions & 22 deletions lightspeed-stack.yaml
@@ -1,33 +1,24 @@
-name: Lightspeed Core Service (LCS)
-
-authentication:
-  module: "noop"
+# To get llama stack logs `export LLAMA_STACK_LOGGING=all=debug`
+name: OpenStack Lightspeed

 service:
   host: 0.0.0.0
   port: 8080
   base_url: http://localhost:8080
   auth_enabled: false
   workers: 1
   color_log: true
   access_log: true

 llama_stack:
-  # Uses a remote llama-stack service
-  # The instance would have already been started with a llama-stack-run.yaml file
   use_as_library_client: false
-  # Alternative for "as library use"
-  # use_as_library_client: true
-  # library_client_config_path: <path-to-llama-stack-run.yaml-file>
-  url: http://llama-stack:8321
-  api_key: xyzzy
-user_data_collection:
-  feedback_enabled: true
-  feedback_storage: "/tmp/data/feedback"
-  transcripts_enabled: true
-  transcripts_storage: "/tmp/data/transcripts"
-
-# Conversation cache for storing Q&A history
-conversation_cache:
-  type: "sqlite"
-  sqlite:
-    db_path: "/tmp/data/conversation-cache.db" # Persistent across requests, can be deleted between test runs
+  url: http://localhost:8321
+
+authentication:
+  module: "noop"
+user_data_collection:
+  feedback_enabled: false
+  transcripts_enabled: false
+
+customization:
+  system_prompt_path: ${env.PWD}/system-prompt.yaml
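
Taken together, the configuration above runs LCS on port 8080 against a remote llama-stack server on localhost:8321 (defined in the new llama-stack.yaml below). A minimal smoke test of that wiring might look like the following sketch; the /v1/query path and the request/response fields are assumptions inferred from the endpoint modules later in this diff, not confirmed by it.

# Sketch: one round-trip against the service configured above.
# Assumes LCS is serving on http://localhost:8080 (service.base_url) and
# exposes POST /v1/query -- the path is inferred from
# src/app/endpoints/query.py and may differ in practice.
import requests

resp = requests.post(
    "http://localhost:8080/v1/query",
    json={"query": "What is OpenStack Lightspeed?"},
    timeout=60,
)
resp.raise_for_status()
body = resp.json()
# Passing conversation_id back in the next request continues the thread,
# which is what the user-memory extraction below builds on.
print(body.get("conversation_id"), body.get("response"))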
118 changes: 118 additions & 0 deletions llama-stack.yaml
@@ -0,0 +1,118 @@
version: 2

apis:
- agents
- datasetio
- eval
- files
- inference
- safety
- scoring
- vector_io
- tool_runtime

benchmarks: []
datasets: []
image_name: starter

providers:
inference:
- provider_id: llmprovider
provider_type: remote::anthropic
config:
api_key: ${env.LLM_KEY}

# OpenAI
#- provider_id: llmprovider
# provider_type: remote::openai
# config:
# api_key: ${env.LLM_KEY}

# Gemini
#- provider_id: llmprovider
# provider_type: remote::gemini
# config:
# api_key: ${env.LLM_KEY}

files:
- config:
metadata_store:
table_name: files_metadata
backend: sql_default
storage_dir: ${env.PWD}/datadir/files
provider_id: meta-reference-files
provider_type: inline::localfs

vector_io: []
agents:
- config:
persistence:
agent_state:
namespace: agents_state
backend: kv_default
responses:
table_name: agents_responses
backend: sql_default
provider_id: meta-reference
provider_type: inline::meta-reference
datasetio:
- config:
kvstore:
namespace: huggingface_datasetio
backend: kv_default
provider_id: huggingface
provider_type: remote::huggingface
- config:
kvstore:
namespace: localfs_datasetio
backend: kv_default
provider_id: localfs
provider_type: inline::localfs
eval:
- config:
kvstore:
namespace: eval_store
backend: kv_default
provider_id: meta-reference
provider_type: inline::meta-reference
registered_resources:
models:
- metadata: {}
model_id: "${env.LLM_MODEL}"
provider_id: llmprovider
provider_model_id: "${env.LLM_MODEL}"
model_type: llm
scoring_fns: []
server:
port: 8321
storage:
backends:
kv_default:
type: kv_sqlite
      db_path: ${env.PWD}/datadir/kv_store.db
sql_default:
type: sql_sqlite
      db_path: ${env.PWD}/datadir/sql_store.db
stores:
metadata:
namespace: registry
backend: kv_default
inference:
table_name: inference_store
backend: sql_default
max_write_queue_size: 10000
num_writers: 4
conversations:
table_name: openai_conversations
backend: sql_default
prompts:
namespace: prompts
backend: kv_default
shields: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups: []
vector_stores: {}
telemetry:
enabled: false
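
This new file registers a single Anthropic-backed model under the llmprovider id (with commented-out OpenAI and Gemini alternatives) and keeps its SQLite stores under ./datadir, which the .gitignore change above excludes. A quick way to verify the server and the registered model, assuming the standard llama-stack-client package:

# Sketch: confirm llama-stack is up on :8321 (server.port) and that the
# model from registered_resources is visible. Assumes the
# `llama-stack-client` package is installed; adjust base_url as needed.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")
for model in client.models.list():
    # Expect ${env.LLM_MODEL} with provider_id "llmprovider"
    print(model.identifier, model.provider_id)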
1 change: 1 addition & 0 deletions pyproject.toml
@@ -69,6 +69,7 @@ dependencies = [
     "pyasn1>=0.6.3", # LCORE-1490
     # Used for system prompt template variable rendering
     "jinja2>=3.1.0",
+    "anthropic>=0.86.0",
 ]

15 changes: 15 additions & 0 deletions src/app/endpoints/query.py
@@ -65,6 +65,7 @@
     ShieldModerationResult,
     TurnSummary,
 )
+from utils.user_memory import build_instructions_with_preferences, user_memory
 from utils.vector_search import build_rag_context

 logger = get_logger(__name__)
@@ -183,6 +184,19 @@ async def query_endpoint_handler(
         inline_rag_context=inline_rag_context.context_text,
     )

+    # Extract user preferences from conversation history
+    user_preferences = await user_memory(
+        user_id,
+        client,
+        responses_params.model,
+        _skip_userid_check,
+        new_conversation=not query_request.conversation_id,
+    )
+    if user_preferences:
+        responses_params.instructions = build_instructions_with_preferences(
+            responses_params.instructions, user_preferences
+        )
+
     # Handle Azure token refresh if needed
     if (
         responses_params.model.startswith("azure")
@@ -293,6 +307,7 @@ async def retrieve_response(
             id=moderation_result.moderation_id, llm_response=moderation_result.message
         )
     try:
+        logger.info(responses_params.model_dump(exclude_none=True))
         response = await client.responses.create(
             **responses_params.model_dump(exclude_none=True)
         )
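
Both endpoints import user_memory and build_instructions_with_preferences from utils.user_memory, a module not included in this section. From the call sites, user_memory returns an extracted-preferences string (or nothing), and the second helper folds it into the system instructions. A hypothetical reconstruction of that second helper, inferred purely from its call site:

# Hypothetical sketch of utils/user_memory.py's merge helper; the real
# implementation is not shown in this diff and may differ.
from typing import Optional


def build_instructions_with_preferences(
    instructions: Optional[str], user_preferences: str
) -> str:
    """Append extracted user preferences to the system instructions."""
    preference_block = (
        "User preferences learned from earlier conversations:\n"
        f"{user_preferences}"
    )
    if not instructions:
        return preference_block
    return f"{instructions}\n\n{preference_block}"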
16 changes: 15 additions & 1 deletion src/app/endpoints/streaming_query.py
@@ -1,4 +1,4 @@
-"""Streaming query handler using Responses API."""
+"""Streaming query handler using Responses API."""  # pylint: disable=too-many-lines

 import asyncio
 import datetime
@@ -95,6 +95,7 @@
 from utils.suid import get_suid, normalize_conversation_id
 from utils.token_counter import TokenCounter
 from utils.types import ReferencedDocument, ResponsesApiParams, TurnSummary
+from utils.user_memory import build_instructions_with_preferences, user_memory
 from utils.vector_search import build_rag_context

 logger = get_logger(__name__)
@@ -219,6 +220,19 @@ async def streaming_query_endpoint_handler(  # pylint: disable=too-many-locals
         inline_rag_context=inline_rag_context.context_text,
     )

+    # Extract user preferences from conversation history
+    user_preferences = await user_memory(
+        user_id,
+        client,
+        responses_params.model,
+        _skip_userid_check,
+        new_conversation=not query_request.conversation_id,
+    )
+    if user_preferences:
+        responses_params.instructions = build_instructions_with_preferences(
+            responses_params.instructions, user_preferences
+        )
+
     # Handle Azure token refresh if needed
     if (
         responses_params.model.startswith("azure")
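
The streaming endpoint receives the identical preference-injection block, so both response paths share one instructions-building step before the model call. Consuming the stream might look like the sketch below; the /v1/streaming_query path and the server-sent-event framing are assumptions based on the module name, not confirmed by this diff.

# Sketch: consume the streaming endpoint as server-sent events.
# Path and event framing are inferred, not confirmed by this diff.
import json

import requests

with requests.post(
    "http://localhost:8080/v1/streaming_query",
    json={"query": "Summarize my last conversation."},
    stream=True,
    timeout=300,
) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        if line and line.startswith("data: "):
            print(json.loads(line[len("data: "):]))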
20 changes: 20 additions & 0 deletions src/models/database/conversations.py
@@ -39,6 +39,26 @@ class UserConversation(Base):  # pylint: disable=too-few-public-methods
     topic_summary: Mapped[str] = mapped_column(default="")


+class UserMemory(Base):  # pylint: disable=too-few-public-methods
+    """Model for storing cached user preference extractions."""
+
+    __tablename__ = "user_memory"
+
+    # One row per user
+    user_id: Mapped[str] = mapped_column(primary_key=True)
+
+    # The extracted preferences string (empty string if none found)
+    preferences: Mapped[str] = mapped_column(default="")
+
+    # Conversation count at the time of extraction (used for cache invalidation)
+    conversation_count: Mapped[int] = mapped_column(default=0)
+
+    updated_at: Mapped[datetime] = mapped_column(
+        DateTime(timezone=True),
+        server_default=func.now(),  # pylint: disable=not-callable
+    )
+
+
 class UserTurn(Base):  # pylint: disable=too-few-public-methods
     """Model for storing turn-level metadata."""

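
The conversation_count column makes the cached extraction cheap to invalidate: preferences need re-extracting only when the user's conversation total has moved since the last run. A sketch of that freshness check, under the assumption that UserConversation carries a user_id column (only the tail of that class is visible above):

# Hypothetical freshness check implied by UserMemory.conversation_count;
# the real logic would live in the utils/user_memory.py module this
# section does not show.
from sqlalchemy import func, select
from sqlalchemy.orm import Session

from models.database.conversations import UserConversation, UserMemory


def cached_preferences_are_fresh(session: Session, user_id: str) -> bool:
    """Return True when the stored extraction still covers every conversation."""
    memory = session.get(UserMemory, user_id)
    if memory is None:
        return False  # nothing extracted for this user yet
    current = session.scalar(
        select(func.count())
        .select_from(UserConversation)
        .where(UserConversation.user_id == user_id)  # assumed column
    )
    return current == memory.conversation_count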