Skip to content

Commit cac8f47

Browse files
committed
fix: startup blockers — migrations, async checkpointer, Pydantic state compat
- DB migrations: make fault-tolerant (each step catches errors independently instead of aborting entire migration chain). Fixes pgvector missing on host PG16, SAVEPOINT-in-autocommit errors, and legacy schema mismatches. - Checkpointer: switch from sync PostgresSaver to AsyncPostgresSaver with async connection pool. Graphs use ainvoke() which requires async checkpoint methods. Setup runs via separate autocommit connection (CREATE INDEX CONCURRENTLY requires it). - State models: add _DictCompatMixin with .get() and __getitem__ to ResearchState, TradingState, SupervisorState. Node code uses dict-style access but states are Pydantic BaseModel instances. - LLM config consolidation and .env.example updates.
1 parent 100ef09 commit cac8f47

File tree

18 files changed

+242
-218
lines changed

18 files changed

+242
-218
lines changed

.env.example

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -55,15 +55,13 @@ DATA_PROVIDER_PRIORITY=alpaca,polygon,financial_datasets,alpha_vantage
5555
# azure | groq | together_ai | fireworks_ai | mistral |
5656
# ollama | custom_openai | bedrock_groq
5757
#
58-
# bedrock_groq (recommended): Sonnet for heavy-tier agents (trade_debater,
59-
# fund_manager, quant_researcher, etc.), Groq Llama for all operational
60-
# agents (monitoring, scanning, execution, health checks). Cuts cost on
61-
# 14 of 21 agents while keeping reasoning quality where it matters.
62-
LLM_PROVIDER=bedrock
58+
# groq: Qwen3-32B for all operational tiers (medium/light/bulk).
59+
# bedrock_groq: OpenAI gpt-oss-120b for heavy reasoning, Groq Qwen3-32B for all others.
60+
# No AWS Bedrock used — Sonnet is replaced by gpt-oss-120b for heavy tasks.
61+
LLM_PROVIDER=groq
6362

6463
# Fallback chain — comma-separated, tried left to right if primary is unavailable.
65-
# Example: if Bedrock creds expire mid-day, agents will automatically fall back.
66-
LLM_FALLBACK_CHAIN=anthropic,openai
64+
LLM_FALLBACK_CHAIN=openai,groq
6765

6866
# =============================================================================
6967
# Provider credentials
@@ -100,12 +98,11 @@ OPENAI_MODEL=gpt-4o
10098
# AZURE_API_VERSION=2024-02-15-preview
10199
# AZURE_DEPLOYMENT_NAME=gpt-4o
102100

103-
# --- Groq (Tier 2 — extremely fast inference, free tier available) ---
101+
# --- Groq (primary provider) ---
104102
# Used standalone (LLM_PROVIDER=groq) or in hybrid mode (LLM_PROVIDER=bedrock_groq).
105-
# Hybrid mode: heavy tier stays on Bedrock Sonnet, medium/light/bulk use Groq Llama.
106-
# Models: llama-3.3-70b-versatile (medium), llama-3.1-8b-instant (light/bulk).
103+
# Hybrid mode: heavy tier uses OpenAI gpt-oss-120b, all others use Groq Qwen3-32B.
107104
GROQ_API_KEY=
108-
# GROQ_MODEL=llama-3.3-70b-versatile
105+
GROQ_MODEL=qwen/qwen3-32b
109106

110107
# --- Together AI (Tier 2) ---
111108
# TOGETHER_API_KEY=
@@ -143,7 +140,7 @@ GROQ_API_KEY=
143140

144141
# ICs — narrow focused work, cheaper/faster models are sufficient
145142
# LLM_MODEL_IC=bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0
146-
# LLM_MODEL_IC=groq/llama-3.3-70b-versatile # budget option (~$0.005/run)
143+
# LLM_MODEL_IC=groq/qwen/qwen3-32b # Groq Qwen3-32B
147144
# LLM_MODEL_IC=gemini/gemini-2.5-flash # balanced option
148145

149146
# Pod Managers — synthesis, benefits from stronger models

config/litellm_router.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# LiteLLM Router Configuration for QuantStack
2-
# Routes by task cost/quality: Claude for reasoning, Groq for bulk/cheap, Ollama for offline.
2+
# Routes by task cost/quality: OpenAI gpt-oss-120b for reasoning, Groq Qwen3-32B for bulk/fast.
33
#
44
# Loaded by: src/quantstack/llm_router.py via LITELLM_ROUTER_CONFIG env var.
55
# Usage:
@@ -9,17 +9,17 @@
99
model_list:
1010
- model_name: "reasoning" # complex decisions: trade debate, fund-manager, hypothesis gen
1111
litellm_params:
12-
model: claude-sonnet-4-6
13-
api_key: os.environ/ANTHROPIC_API_KEY
12+
model: openai/gpt-oss-120b
13+
api_key: os.environ/OPENAI_API_KEY
1414

1515
- model_name: "fast" # bulk/cheap tasks: market-intel, screener, data validation
1616
litellm_params:
17-
model: groq/llama-3.3-70b-versatile
17+
model: groq/qwen/qwen3-32b
1818
api_key: os.environ/GROQ_API_KEY
1919

2020
- model_name: "bulk" # alias for fast — high-volume deterministic tasks
2121
litellm_params:
22-
model: groq/llama-3.3-70b-versatile
22+
model: groq/qwen/qwen3-32b
2323
api_key: os.environ/GROQ_API_KEY
2424

2525
- model_name: "local" # offline fallback — requires Ollama running locally

litellm_config.yaml

Lines changed: 16 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,76 +1,52 @@
11
# LiteLLM Proxy Configuration
22
# Model groups match tier names used by get_chat_model() — no translation needed.
3+
# Provider strategy: Groq Qwen3-32B for all operational tiers,
4+
# OpenAI gpt-oss-120b for heavy reasoning. No AWS Bedrock.
35

46
model_list:
5-
# --- heavy: Sonnet-class, complex reasoning ---
7+
# --- heavy: complex reasoning (fund-manager, trade-debater, quant-researcher) ---
68
- model_name: heavy
79
litellm_params:
8-
model: bedrock/us.anthropic.claude-sonnet-4-6
9-
aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID
10-
aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY
11-
aws_region_name: os.environ/AWS_DEFAULT_REGION
10+
model: openai/gpt-oss-120b
11+
api_key: os.environ/OPENAI_API_KEY
1212
model_info:
1313
priority: 1
1414
- model_name: heavy
1515
litellm_params:
16-
model: anthropic/claude-sonnet-4-6
17-
api_key: os.environ/ANTHROPIC_API_KEY
18-
model_info:
19-
priority: 2
20-
- model_name: heavy
21-
litellm_params:
22-
model: groq/llama-3.3-70b-versatile
16+
model: groq/qwen/qwen3-32b
2317
api_key: os.environ/GROQ_API_KEY
2418
model_info:
25-
priority: 3
19+
priority: 2
2620

27-
# --- medium: Haiku-class, structured extraction ---
21+
# --- medium: structured extraction (earnings-analyst, position-monitor, daily-planner) ---
2822
- model_name: medium
2923
litellm_params:
30-
model: bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0
31-
aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID
32-
aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY
33-
aws_region_name: os.environ/AWS_DEFAULT_REGION
24+
model: groq/qwen/qwen3-32b
25+
api_key: os.environ/GROQ_API_KEY
3426
model_info:
3527
priority: 1
3628
- model_name: medium
3729
litellm_params:
38-
model: anthropic/claude-haiku-4-5
39-
api_key: os.environ/ANTHROPIC_API_KEY
30+
model: openai/gpt-oss-120b
31+
api_key: os.environ/OPENAI_API_KEY
4032
model_info:
4133
priority: 2
4234

43-
# --- light: cheapest Haiku, simple coordination ---
35+
# --- light: simple coordination (supervisor, health-monitor) ---
4436
- model_name: light
4537
litellm_params:
46-
model: bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0
47-
aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID
48-
aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY
49-
aws_region_name: os.environ/AWS_DEFAULT_REGION
38+
model: groq/qwen/qwen3-32b
39+
api_key: os.environ/GROQ_API_KEY
5040
model_info:
5141
priority: 1
52-
- model_name: light
53-
litellm_params:
54-
model: anthropic/claude-haiku-4-5
55-
api_key: os.environ/ANTHROPIC_API_KEY
56-
model_info:
57-
priority: 2
5842

5943
# --- bulk: OPRO/TextGrad loops, cost-sensitive ---
6044
- model_name: bulk
6145
litellm_params:
62-
model: groq/llama-3.3-70b-versatile
46+
model: groq/qwen/qwen3-32b
6347
api_key: os.environ/GROQ_API_KEY
6448
model_info:
6549
priority: 1
66-
- model_name: bulk
67-
litellm_params:
68-
model: bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0
69-
aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID
70-
aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY
71-
aws_region_name: os.environ/AWS_DEFAULT_REGION
72-
model_info:
73-
priority: 2
7450

7551
# --- embedding: local ---
7652
- model_name: embedding

src/quantstack/alpha_discovery/hypothesis_agent.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
- Returns [] on timeout, Groq failure, or JSON parse failure — never raises.
1414
- All returned candidates pass schema validation before being returned.
1515
Malformed rules are dropped silently, not propagated to CandidateFilter.
16-
- Uses groq/llama-3.3-70b-versatile via LiteLLM — same model as GroqPM/SentimentCollector.
16+
- Uses groq/qwen/qwen3-32b via LiteLLM for hypothesis generation.
1717
- Temperature=0 for deterministic structured output.
1818
- max_hypotheses=5 caps token cost per call regardless of prompt response.
1919

src/quantstack/checkpointing.py

Lines changed: 55 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
"""Durable checkpoint management for LangGraph StateGraphs.
22
3-
Provides a PostgresSaver-backed checkpointer factory that enables crash
3+
Provides an AsyncPostgresSaver-backed checkpointer factory that enables crash
44
recovery. Each graph runner gets a shared checkpointer backed by a dedicated
5-
psycopg3 connection pool (separate from the application pool in db.py).
5+
psycopg3 async connection pool (separate from the application pool in db.py).
66
77
Connection budget:
88
Main pool (db.py): max 20
@@ -18,33 +18,71 @@
1818
logger = logging.getLogger(__name__)
1919

2020

21-
def create_checkpointer():
22-
"""Create a PostgresSaver backed by a dedicated psycopg3 connection pool.
21+
def _get_pg_url() -> str:
22+
return os.getenv("TRADER_PG_URL", "postgresql://localhost/quantstack")
2323

24-
Pool is sized for checkpoint operations: min_size=2, max_size=6.
25-
This is intentionally smaller than the main application pool (max_size=20)
26-
because checkpoint writes are less frequent than application queries.
2724

28-
setup() is NOT called here. Table creation is a deployment step,
29-
not a per-startup step. See setup_checkpoint_tables().
25+
def _run_checkpoint_setup() -> None:
26+
"""Create checkpoint tables using a sync autocommit connection.
27+
28+
Uses the sync PostgresSaver.MIGRATIONS list directly since
29+
CREATE INDEX CONCURRENTLY requires autocommit mode.
3030
"""
31+
import psycopg
3132
from langgraph.checkpoint.postgres import PostgresSaver
32-
from psycopg_pool import ConnectionPool
33+
from psycopg.rows import dict_row
3334

34-
pg_url = os.getenv(
35-
"TRADER_PG_URL",
36-
f"postgresql://localhost/quantstack",
37-
)
35+
pg_url = _get_pg_url()
36+
migrations = PostgresSaver.MIGRATIONS
37+
38+
with psycopg.connect(pg_url, autocommit=True, row_factory=dict_row) as conn:
39+
with conn.cursor() as cur:
40+
cur.execute(migrations[0])
41+
results = cur.execute(
42+
"SELECT v FROM checkpoint_migrations ORDER BY v DESC LIMIT 1"
43+
)
44+
row = results.fetchone()
45+
version = -1 if row is None else row["v"]
46+
for v, migration in zip(
47+
range(version + 1, len(migrations)),
48+
migrations[version + 1:],
49+
strict=False,
50+
):
51+
cur.execute(migration)
52+
cur.execute(
53+
"INSERT INTO checkpoint_migrations (v) VALUES (%s)", (v,)
54+
)
55+
logger.info("PostgresSaver checkpoint tables ready")
56+
57+
58+
async def create_checkpointer():
59+
"""Create an AsyncPostgresSaver backed by a dedicated async connection pool.
60+
61+
Pool is sized for checkpoint operations: min_size=2, max_size=6.
62+
Tables are created synchronously before the async pool is opened.
63+
"""
64+
from langgraph.checkpoint.postgres.aio import AsyncPostgresSaver
65+
from psycopg_pool import AsyncConnectionPool
66+
67+
pg_url = _get_pg_url()
68+
69+
# Ensure tables exist (sync, autocommit — safe for CREATE INDEX CONCURRENTLY)
70+
try:
71+
_run_checkpoint_setup()
72+
except Exception as exc:
73+
logger.warning("Checkpoint table setup failed (may already exist): %s", exc)
3874

39-
pool = ConnectionPool(
75+
pool = AsyncConnectionPool(
4076
conninfo=pg_url,
4177
min_size=2,
4278
max_size=6,
4379
max_lifetime=3600,
4480
max_idle=600,
81+
open=False,
4582
)
83+
await pool.open()
4684

47-
return PostgresSaver(pool)
85+
return AsyncPostgresSaver(pool)
4886

4987

5088
def setup_checkpoint_tables() -> None:
@@ -53,9 +91,7 @@ def setup_checkpoint_tables() -> None:
5391
Run once as a deployment/migration step, not on every startup.
5492
Safe to call multiple times (idempotent CREATE IF NOT EXISTS).
5593
"""
56-
checkpointer = create_checkpointer()
57-
checkpointer.setup()
58-
logger.info("PostgresSaver checkpoint tables created/verified")
94+
_run_checkpoint_setup()
5995

6096

6197
def prune_old_checkpoints(retention_hours: int = 48) -> int:

0 commit comments

Comments (0)