From 9e7dd5506a3d34c25e521ddfcf597982383cc5c0 Mon Sep 17 00:00:00 2001 From: Lucas Jin Date: Mon, 9 Mar 2026 22:36:22 -0400 Subject: [PATCH 01/13] chore: plan for deployment --- plan/CLAUDE.md | 279 +++++++++++++++++++++++++++++++++++++++++++++++++ plan/PRD.md | 205 ++++++++++++++++++++++++++++++++++++ 2 files changed, 484 insertions(+) create mode 100644 plan/CLAUDE.md create mode 100644 plan/PRD.md diff --git a/plan/CLAUDE.md b/plan/CLAUDE.md new file mode 100644 index 0000000..5e676dd --- /dev/null +++ b/plan/CLAUDE.md @@ -0,0 +1,279 @@ +# Argus — Claude Context File + +Read this file before working on any ticket. It contains the full architectural context, conventions, and key file locations for the Argus project. + +## What Is Argus + +A 3D global event intelligence platform. World events are scraped daily from multiple sources, stored in PostgreSQL with vector embeddings, and visualized on an interactive globe. An AI agent (Graph-RAG pipeline) lets users query events with persona-aware analysis. 
+ +## Tech Stack + +| Component | Technology | Notes | +|-----------|-----------|-------| +| Frontend | React 19 + TypeScript 5 + Vite 6 | SPA, no SSR | +| Globe | react-globe.gl (three.js wrapper) | Heavy bundle — lazy-load | +| Styling | Tailwind CSS 3 + CSS custom properties | Design tokens in `index.css` | +| Backend | FastAPI + Uvicorn (Python 3.11+) | Async throughout | +| Database | PostgreSQL 15+ with pgvector + pgcrypto | Extensions required | +| AI Model | Google Gemini 2.5-flash | Structured JSON output | +| Embeddings | OpenAI text-embedding-3-small (1536 dims) | Migration target: local sentence-transformers | +| Voice | ElevenLabs Scribe v1 | Optional, speech-to-text | +| Media | Cloudinary (primary), S3 fallback, placeholder SVGs | | + +## Project Structure + +``` +hackcanada/ +├── plan/ # PRD and this context file +├── frontend/ +│ ├── src/ +│ │ ├── main.tsx # Entry: nested context providers +│ │ ├── App.tsx # Bootstrap: fetch points/arcs, render overlays +│ │ ├── index.css # Design tokens, fonts +│ │ ├── api/client.ts # Typed fetch wrapper for all backend endpoints +│ │ ├── components/ +│ │ │ ├── Globe/GlobeView.tsx # 3D globe — points, arcs, tooltips, clusters +│ │ │ ├── Filters/FilterBar.tsx # Event-type filter chips +│ │ │ ├── Timeline/TimelineSlider.tsx # Date scrubber + play/pause +│ │ │ ├── Modal/EventModal.tsx # Right panel — event detail + AI analysis +│ │ │ ├── Modal/RealTimeAnalysisSection.tsx # Gemini + Google Search grounding +│ │ │ └── Agent/ # Left panel — AI query interface +│ │ │ ├── AgentPanel.tsx # Query input, voice, submit +│ │ │ ├── AgentAnswerView.tsx # Citation parsing, financial impact +│ │ │ ├── AgentNavigationOverlay.tsx # Globe camera animation +│ │ │ ├── PersonaSelector.tsx # Role + industry selection +│ │ │ └── FinancialImpactSection.tsx +│ │ ├── context/ +│ │ │ ├── AppContext.tsx # Events, arcs, filters, timeline, globe focus +│ │ │ ├── AgentContext.tsx # Agent state, highlights, navigation plan +│ │ │ └── 
UserPersonaContext.tsx # Role + industry (localStorage-persisted) +│ │ ├── types/ +│ │ │ ├── events.ts # Event, ContentPoint, ContentArc, EventDetail +│ │ │ └── agent.ts # AgentResponse, NavigationPlan, FinancialImpact +│ │ └── utils/mediaConfig.ts # Cloudinary/S3/placeholder URL resolver +│ └── package.json +│ +└── backend/ + ├── requirements.txt + ├── run_scrape.py # CLI: Polymarket + Kalshi scrape + ├── run_gdelt_scrape.py # CLI: GDELT scrape (--days, --limit flags) + ├── migrations/ + │ └── 001_init_schema.sql # Full PostgreSQL schema + └── app/ + ├── main.py # FastAPI app, CORS, router registration + ├── config.py # Env var loading, agent defaults + ├── models/ + │ ├── enums.py # EventType, RelationshipType (StrEnum) + │ ├── schemas.py # Core Pydantic models + │ └── agent_schemas.py # Agent-specific Pydantic models + ├── routers/ + │ ├── content.py # GET /content/points, /arcs, POST /{id}/confidence-score, /{id}/realtime-analysis + │ ├── agent.py # POST /agent/query + │ ├── ingestion.py # POST /ingestion/acled + │ ├── embeddings.py # POST /embeddings/backfill/content + │ └── market_signals.py # GET /market-signals (live fetch) + ├── services/ + │ ├── agent_service.py # Graph-RAG pipeline orchestration + │ ├── agent_tools.py # DB query tools (search, relate, detail, impact) + │ ├── gemini_client.py # Gemini API: synthesis, confidence, realtime analysis + │ ├── scraping_service.py # Polymarket + Kalshi + GDELT orchestrator + │ └── content_repository.py # (duplicate — also in ingestion/) + ├── repositories/ + │ └── content_repository.py # Market signal row persistence + ├── embeddings/ + │ ├── embedding_repository.py # Fetch/update embedding vectors + │ ├── embedding_backfill_service.py # Backfill missing embeddings + │ ├── openai_embedding_client.py # OpenAI API wrapper + │ └── run_embedding_backfill.py # CLI entry + ├── ingestion/ + │ ├── ingestion_service.py # ACLED pipeline: fetch -> normalize -> dedupe -> insert + │ ├── content_repository.py # 
ensure_sources, insert_content (DUPLICATE) + │ ├── db.py # asyncpg connection pool (only used by ingestion) + │ ├── dedupe_service.py # Duplicate detection + │ └── acled/ + │ ├── acled_client.py # ACLED API client + │ └── acled_normalizer.py + └── scrapers/ + ├── gdelt.py # GDELT (BigQuery primary, DOC API fallback) + ├── polymarket.py # Polymarket API + ├── kalshi.py # Kalshi API (async, rate-limited) + ├── row_format.py # Shared row normalization -> content_table shape + ├── eonet.py # UNUSED — delete + ├── eonet_db.py # UNUSED — delete + ├── social_scraper.py # UNUSED — delete + ├── reddit.py # UNUSED — delete + ├── reddit_classifier.py # UNUSED — delete + ├── reddit_db.py # UNUSED — delete + ├── Reddit Scraper/ # UNUSED — delete + ├── natural-disasters/ # UNUSED — delete + └── ryan_scrapers/ # UNUSED — delete +``` + +## Database Schema + +PostgreSQL with pgvector and pgcrypto extensions. + +### Core Tables + +**`content_table`** (primary data store — rename target: `articles`) +- `id` UUID PK (gen_random_uuid) +- `title`, `body`, `url` (UNIQUE) +- `latitude`, `longitude` (nullable floats) +- `image_url` (Cloudinary public_id), `s3_url` +- `embedding` vector(1536) — OpenAI text-embedding-3-small +- `sentiment_score` float, `market_signal` text +- `published_at` timestamptz, `event_type` text, `raw_metadata_json` JSONB +- `source_id` FK -> sources, `engagement_id` FK -> engagement +- `created_at` timestamptz + +**`engagement`** — Reddit, Polymarket, Twitter metrics per content item + +**`sources`** — name, type, base_url, trust_score + +**`entities`** — extracted entities (person, org, location, etc.) 
+ +**`content_entities`** — join table (content_item_id, entity_id, relevance_score) + +**`events`** — clustered event groups with cluster_embedding, canada_impact_summary, confidence_score + +**`event_content`** — join between events and content_table + +**`event_relationships`** — event_a_id, event_b_id, relationship_type, score, reason_codes + +### Key Indexes +- HNSW cosine index on `content_table.embedding` +- UNIQUE on `content_table.url` + +## API Endpoints + +| Method | Path | Purpose | AI Cost | +|--------|------|---------|---------| +| GET | `/content/points` | All content with lat/lng (last 31 days) | None | +| GET | `/content/arcs?threshold=0.7` | Similarity arcs via pgvector cosine | None | +| GET | `/content/{id}` | Single content item detail | None | +| POST | `/content/{id}/confidence-score` | Gemini credibility scoring (0.31-1.0) | 1 Gemini call | +| POST | `/content/{id}/realtime-analysis` | Gemini + Google Search grounding | 1 Gemini call | +| POST | `/agent/query` | Graph-RAG agent pipeline | 1 Gemini call + 1 OpenAI embed | +| GET | `/market-signals` | Live Polymarket + Kalshi fetch | None | +| POST | `/ingestion/acled` | Trigger ACLED ingestion pipeline | None | +| POST | `/embeddings/backfill/content` | Backfill missing embeddings | N OpenAI calls | +| GET | `/health` | Health check | None | + +## Agent Pipeline (Graph-RAG) + +1. **Classify query** — pattern match keywords -> query_type (event_explanation, impact_analysis, connection_discovery, entity_relevance) +2. **Seed retrieval** — keyword ILIKE search + pgvector cosine similarity +3. **Graph expansion** — 2-hop: for each seed, find 6 nearest neighbors via pgvector +4. **Context assembly** — full article bodies + financial impact heuristics +5. **Gemini synthesis** — structured JSON output with citations `[cite:UUID]`, navigation plan, financial impact +6. 
**Post-processing** — filter to globe-navigable events, strip invalid citations + +Output schema includes: answer, confidence, caution, query_type, navigation_plan, relevant_event_ids, highlight_relationships, financial_impact, reasoning_steps, cited_event_map. + +## Event Types (StrEnum) + +``` +geopolitics, trade_supply_chain, energy_commodities, +financial_markets, climate_disasters, policy_regulation +``` + +Each maps to a color in the frontend `EVENT_TYPE_COLORS` constant. + +## Data Sources + +| Source | What It Provides | Scraper File | +|--------|-----------------|--------------| +| GDELT | Global events from news (BigQuery or DOC API) | `scrapers/gdelt.py` | +| ACLED | Armed conflict events | `ingestion/acled/acled_client.py` | +| Polymarket | Prediction market events + probabilities | `scrapers/polymarket.py` | +| Kalshi | Prediction market events + volumes | `scrapers/kalshi.py` | + +## Known Issues & Tech Debt + +1. **Duplicate `content_repository.py`** — exists in both `repositories/` and `ingestion/`. Must consolidate. +2. **No shared DB pool** — `ingestion/db.py` has its own pool; other services use inline `asyncpg.connect()`. Need a single shared pool. +3. **Dead scraper code** — `eonet.py`, `reddit*.py`, `social_scraper.py`, `Reddit Scraper/`, `natural-disasters/`, `ryan_scrapers/` are all unused. +4. **No scheduled scraping** — all ingestion is manual CLI or API trigger. +5. **Expensive embeddings** — OpenAI API called per-row. Should switch to local model. +6. **No caching** — every confidence score and realtime analysis call hits Gemini. Need Redis. +7. **CORS wildcard** — `*` origin allowed in production. Must lock down. +8. **No tests** — zero test files in the repo. +9. **Print debugging** — `print()` used instead of structured logging. +10. **No migration tool** — raw SQL files, no Alembic. 
+ +## Conventions + +### Backend +- **Async everywhere** — use `async def` for all route handlers and service methods +- **asyncpg** for DB access (not SQLAlchemy ORM) +- **Pydantic v2** for request/response models +- **Raw SQL** for queries (no ORM) — parameterize all user inputs with `$1, $2` syntax +- **Environment variables** via `python-dotenv` and `os.getenv()` +- **Scraper output** normalized via `row_format.make_content_row()` before DB insert + +### Frontend +- **React 19** with function components and hooks only +- **Context API** for state (no Redux) — three providers: App, Agent, UserPersona +- **Tailwind CSS** for styling — design tokens as CSS variables in `index.css` +- **No routing library** yet — single-page, overlay-based navigation +- **Client-side filtering** — all events loaded on mount, visibility controlled by pointRadius/pointColor + +### Git +- Branch from `main` +- Conventional-ish commit messages (e.g., `feat:`, `fix:`, `chore:`) + +## Environment Variables + +### Backend (.env) +``` +DATABASE_URL=postgresql+asyncpg://user:pass@host:5432/dbname # Required +GEMINI_API_KEY=... # Required for agent +GEMINI_MODEL=gemini-2.5-flash # Optional, default shown +OPENAI_API_KEY=... # Required for embeddings (until local model migration) +ACLED_API_KEY=... # Required for ACLED ingestion +CLOUDINARY_CLOUD_NAME=... # Optional +CLOUDINARY_API_KEY=... # Optional +CLOUDINARY_API_SECRET=... # Optional +AWS_ACCESS_KEY_ID=... # Optional (S3 fallback) +AWS_SECRET_ACCESS_KEY=... # Optional +S3_BUCKET=... # Optional +AWS_REGION=... # Optional +ELEVENLABS_API_KEY=... # Optional +``` + +### Frontend (.env) +``` +VITE_API_URL=/api # or http://127.0.0.1:8000 +VITE_CLOUDINARY_CLOUD_NAME=... # Optional +VITE_ELEVENLABS_API_KEY=... 
# Optional +``` + +## Running Locally + +```bash +# Backend +cd backend && python -m venv .venv && source .venv/bin/activate +pip install -r requirements.txt +uvicorn app.main:app --reload --port 8000 + +# Frontend +cd frontend && npm install && npm run dev + +# Manual scraping +python run_scrape.py # Polymarket + Kalshi +python run_gdelt_scrape.py # GDELT (--days 14 --limit 500) +curl -X POST localhost:8000/ingestion/acled +curl -X POST localhost:8000/embeddings/backfill/content +``` + +## Working on Tickets + +When picking up a ticket from the PRD (`plan/PRD.md`): + +1. **Read the relevant source files first** — don't modify code you haven't read +2. **Check for duplicates** — the codebase has redundant implementations (see Known Issues) +3. **Keep async** — all new backend code should be async +4. **Parameterize SQL** — never string-interpolate user input into queries +5. **No new print()** — use `logging.getLogger(__name__)` +6. **Test what you build** — add tests alongside new code (once pytest is set up) +7. **Budget-conscious** — if a feature involves AI API calls, always consider caching and batching first diff --git a/plan/PRD.md b/plan/PRD.md new file mode 100644 index 0000000..1276be5 --- /dev/null +++ b/plan/PRD.md @@ -0,0 +1,205 @@ +# Argus — Product Requirements Document + +## Vision + +Argus is a 3D global event intelligence platform that aggregates world events daily, stores them in a semantically searchable database, and visualizes them on an interactive globe. Users can explore events spatially and temporally, query an AI agent for analysis, and discover connections between events through graph-based relationships. + +The platform is source-agnostic — while the hackathon prototype focused on Canada-impact framing, the production system should support any analytical lens or persona. 
+ +## Current State (MVP) + +Working prototype with: +- 3D globe rendering events as points with similarity arcs +- AI agent (Graph-RAG) with persona-aware Gemini synthesis +- Scrapers for GDELT, ACLED, Polymarket, Kalshi +- PostgreSQL + pgvector for semantic search +- Client-side filtering and timeline scrubbing + +Key gaps: no deployment config, no scheduled scraping, no caching, no tests, high AI token costs, dead code from hackathon iteration. + +## Team & Constraints + +- 4 people, part-time +- Low budget — minimize per-request AI spend +- Target: production-deployable within ~7 weeks (4 phases) + +--- + +## Phase 1: Foundation (Week 1–2) + +**Goal:** Make the app deployable and the codebase clean enough to work on confidently. + +### 1.1 Containerization + +| # | Ticket | Priority | +|---|--------|----------| +| 1 | Create backend Dockerfile (FastAPI + uvicorn, multi-stage build) | P0 | +| 2 | Create frontend Dockerfile (Vite build -> nginx static serve) | P0 | +| 3 | Create `docker-compose.yml` with services: backend, frontend, postgres+pgvector, redis | P0 | +| 4 | Add `.dockerignore` files (exclude `.env`, `node_modules`, `.venv`, `__pycache__`) | P0 | + +### 1.2 CI/CD + +| # | Ticket | Priority | +|---|--------|----------| +| 5 | Set up GitHub Actions CI: ruff lint/format, TypeScript typecheck, docker build | P0 | +| 6 | Set up GitHub Actions CD: build images, push to registry, deploy to hosting | P1 | +| 7 | Choose hosting provider (Railway, Fly.io, or small VPS) and document deploy process | P1 | + +### 1.3 Dead Code Removal + +| # | Ticket | Priority | +|---|--------|----------| +| 8 | Delete unused scrapers: `eonet.py`, `eonet_db.py`, `social_scraper.py`, `reddit.py`, `reddit_classifier.py`, `reddit_db.py`, `Reddit Scraper/`, `natural-disasters/`, `ryan_scrapers/` | P0 | +| 9 | Remove duplicate `content_repository.py` (exists in both `repositories/` and `ingestion/`) — consolidate into one | P0 | +| 10 | Audit and remove any other dead imports, unused 
functions, or commented-out code | P1 | + +### 1.4 Security Baseline + +| # | Ticket | Priority | +|---|--------|----------| +| 11 | Lock down CORS origins — remove `*` wildcard, use env-configured allowed origins | P0 | +| 12 | Add rate limiting middleware on `/agent/query` and `/content/{id}/realtime-analysis` (these burn AI tokens) | P0 | +| 13 | Validate all secrets are loaded from env vars; fail fast on startup if required vars missing | P0 | + +--- + +## Phase 2: Automation & Cost Optimization (Week 2–3) + +**Goal:** Automate daily data ingestion and drastically cut AI token spend. + +### 2.1 Scheduled Scraping + +| # | Ticket | Priority | +|---|--------|----------| +| 14 | Create unified scrape entrypoint (`run_daily_pipeline.py`) that runs all scrapers + embedding backfill in sequence | P0 | +| 15 | Add cron scheduler (APScheduler in a separate container, or cron in docker-compose) | P0 | +| 16 | Define scraping schedule: GDELT 1x/day, ACLED 1x/day, Polymarket+Kalshi 2x/day | P0 | +| 17 | Add `scrape_runs` logging table (source, status, rows_inserted, errors, duration_ms, started_at) | P1 | +| 18 | Add idempotency guards — URL-based dedup check before insert, not after | P0 | +| 19 | Add failure alerting (Discord webhook or email on scrape errors) | P2 | + +### 2.2 Embedding Cost Reduction + +| # | Ticket | Priority | +|---|--------|----------| +| 20 | **Switch embeddings to local model** — replace OpenAI `text-embedding-3-small` with `sentence-transformers` (e.g. `all-MiniLM-L6-v2`, 384 dims). Run locally in backend container. Update vector column dimension. 
| P0 | +| 21 | Batch embedding generation — process 100+ items per batch instead of one-at-a-time | P0 | +| 22 | Deduplicate content before generating embeddings (currently embeddings are generated before dedup) | P1 | + +### 2.3 AI Response Caching (Redis) + +| # | Ticket | Priority | +|---|--------|----------| +| 23 | Add Redis client utility module with connection pooling | P0 | +| 24 | Cache `/content/points` response — invalidate on new scrape run completion | P0 | +| 25 | Cache `/content/arcs` response per threshold value (TTL = until next scrape) | P1 | +| 26 | Cache Gemini confidence scores per content_id (TTL = 24h) | P0 | +| 27 | Cache Gemini realtime analysis per `(content_id, user_role)` (TTL = 6h) | P0 | +| 28 | Cache agent query results keyed on `(normalized_query_hash, persona)` (TTL = 1h) | P1 | + +### 2.4 Additional Cost Controls + +| # | Ticket | Priority | +|---|--------|----------| +| 29 | Add token usage tracking — log Gemini and OpenAI (if still used) token counts per call to a `token_usage` table | P1 | +| 30 | Pre-compute confidence scores during scrape pipeline instead of on-demand per user click | P2 | +| 31 | Replace per-request Google Search grounding with daily cached news summaries scraped during pipeline | P2 | + +--- + +## Phase 3: Schema & Code Quality (Week 3–5) + +**Goal:** Clean up the data model, modularize backend code, add test coverage. 
+ +### 3.1 Database Schema Improvements + +| # | Ticket | Priority | +|---|--------|----------| +| 32 | Set up Alembic for migration management (replace raw SQL files) | P0 | +| 33 | Rename `content_table` -> `articles` | P1 | +| 34 | Split AI-generated fields into `article_analysis` table (embedding, sentiment_score, market_signal, confidence_score) | P1 | +| 35 | Add `scrape_source` enum column to replace generic FK to `sources` table | P2 | +| 36 | Add composite index on `(event_type, published_at)` for filtered timeline queries | P1 | +| 37 | Add `last_scraped_at` column to sources for freshness tracking | P2 | +| 38 | Clean up unused columns and tables from hackathon iteration | P1 | + +### 3.2 Backend Modularization + +| # | Ticket | Priority | +|---|--------|----------| +| 39 | Create shared `db.py` module — single asyncpg pool used by all services (currently duplicated in `ingestion/db.py` and inline connects) | P0 | +| 40 | Extract `BaseScraper` ABC interface: `async fetch() -> list[NormalizedRow]`. Make GDELT, ACLED, Polymarket, Kalshi implement it. 
| P1 | +| 41 | Consolidate all Pydantic models into a single `schemas/` package | P1 | +| 42 | Replace all `print()` statements with structured `logging` (use correlation IDs per request) | P1 | +| 43 | Add input validation on all API endpoints (query length limits, coordinate bounds, UUID format) | P1 | + +### 3.3 Testing + +| # | Ticket | Priority | +|---|--------|----------| +| 44 | Set up pytest with async fixtures (asyncpg test DB, httpx AsyncClient) | P0 | +| 45 | Write tests for scraper row normalization (`row_format.py`, ACLED normalizer) | P0 | +| 46 | Write tests for deduplication logic | P1 | +| 47 | Write tests for agent query classification (`_classify_query`) | P1 | +| 48 | Write API integration tests for `/content/points`, `/agent/query` | P1 | +| 49 | Add frontend smoke tests (Vitest + React Testing Library) for Globe render, Agent query flow | P2 | +| 50 | Add pre-commit hooks: ruff lint+format, TypeScript typecheck | P0 | + +--- + +## Phase 4: Frontend Polish & Observability (Week 5–7) + +**Goal:** Make the UI production-grade and add operational visibility. 
+ +### 4.1 Frontend UX + +| # | Ticket | Priority | +|---|--------|----------| +| 51 | Add loading skeleton/spinner on initial data fetch | P0 | +| 52 | Lazy-load Globe component (three.js is ~500KB) with React.lazy + Suspense | P1 | +| 53 | Add error boundaries around Globe, Agent, and Modal components | P0 | +| 54 | Memoize expensive computations (arc filtering, point color mapping) with useMemo/useCallback | P1 | +| 55 | Add responsive layout — mobile/tablet support or graceful "desktop-only" message | P1 | +| 56 | Add URL-based routing (React Router) for shareable/bookmarkable globe state | P2 | +| 57 | Add "no results" empty states for agent queries and empty filtered views | P1 | +| 58 | Move inline styles and magic numbers to shared constants / design tokens | P2 | + +### 4.2 Observability + +| # | Ticket | Priority | +|---|--------|----------| +| 59 | Add structured logging with request correlation IDs (middleware) | P1 | +| 60 | Add simple admin dashboard page: scrape history, DB row counts, token spend summary | P2 | +| 61 | Add uptime monitoring on `/health` endpoint (UptimeRobot free tier or similar) | P1 | + +### 4.3 Security Hardening + +| # | Ticket | Priority | +|---|--------|----------| +| 62 | Audit all raw SQL for injection risks — parameterize any string-interpolated queries | P0 | +| 63 | Add API key or lightweight session auth for agent/analysis endpoints | P1 | + +--- + +## Success Criteria + +| Metric | Target | +|--------|--------| +| Daily scraping runs automatically | Yes, with logged success/failure | +| Embedding cost per scrape cycle | < $0.01 (local model) | +| Gemini API calls per unique user action | Max 1 (cached thereafter) | +| Time to deploy from commit | < 10 minutes | +| Backend test coverage on critical paths | > 70% | +| Frontend initial load time | < 3 seconds (gzipped, lazy-loaded) | +| Uptime | > 99% (monitored) | + +--- + +## Non-Goals (for now) + +- Mobile app (web-only is fine) +- User accounts / auth (public dashboard 
for now, API key auth only) +- Real-time websocket streaming (polling/refresh is sufficient) +- Multi-language support +- Custom event submission by users From 62a30fc7bdad1408d5845ccf9c6d11a32d65bd03 Mon Sep 17 00:00:00 2001 From: Lucas Jin Date: Mon, 9 Mar 2026 22:56:29 -0400 Subject: [PATCH 02/13] chore: update planning docs --- plan/CLAUDE.md | 55 ++++++++++--------- plan/PRD.md | 146 +++++++++++++++++++++++++++++-------------------- 2 files changed, 118 insertions(+), 83 deletions(-) diff --git a/plan/CLAUDE.md b/plan/CLAUDE.md index 5e676dd..fba77a9 100644 --- a/plan/CLAUDE.md +++ b/plan/CLAUDE.md @@ -2,6 +2,8 @@ Read this file before working on any ticket. It contains the full architectural context, conventions, and key file locations for the Argus project. +> **Note:** This document reflects the team's best understanding at time of writing. If the user gives instructions that conflict with what's written here, **follow the user's instructions** — they take priority. If parts of this context have become outdated or irrelevant due to changes in the codebase, use your judgement and note the discrepancy rather than blindly following stale guidance. + ## What Is Argus A 3D global event intelligence platform. World events are scraped daily from multiple sources, stored in PostgreSQL with vector embeddings, and visualized on an interactive globe. An AI agent (Graph-RAG pipeline) lets users query events with persona-aware analysis. @@ -18,7 +20,7 @@ A 3D global event intelligence platform. 
World events are scraped daily from mul | AI Model | Google Gemini 2.5-flash | Structured JSON output | | Embeddings | OpenAI text-embedding-3-small (1536 dims) | Migration target: local sentence-transformers | | Voice | ElevenLabs Scribe v1 | Optional, speech-to-text | -| Media | Cloudinary (primary), S3 fallback, placeholder SVGs | | +| Media | Placeholder SVGs only | Cloudinary and S3 removed — no longer needed | ## Project Structure @@ -50,13 +52,13 @@ hackcanada/ │ │ ├── types/ │ │ │ ├── events.ts # Event, ContentPoint, ContentArc, EventDetail │ │ │ └── agent.ts # AgentResponse, NavigationPlan, FinancialImpact -│ │ └── utils/mediaConfig.ts # Cloudinary/S3/placeholder URL resolver +│ │ └── utils/mediaConfig.ts # DEPRECATED — Cloudinary/S3 removed, delete this file (ticket #11) │ └── package.json │ └── backend/ ├── requirements.txt - ├── run_scrape.py # CLI: Polymarket + Kalshi scrape - ├── run_gdelt_scrape.py # CLI: GDELT scrape (--days, --limit flags) + ├── run_scrape.py # LEGACY CLI — will be replaced by run_daily_pipeline.py + ├── run_gdelt_scrape.py # LEGACY CLI — will be replaced by run_daily_pipeline.py ├── migrations/ │ └── 001_init_schema.sql # Full PostgreSQL schema └── app/ @@ -94,16 +96,19 @@ hackcanada/ │ ├── acled_client.py # ACLED API client │ └── acled_normalizer.py └── scrapers/ - ├── gdelt.py # GDELT (BigQuery primary, DOC API fallback) - ├── polymarket.py # Polymarket API - ├── kalshi.py # Kalshi API (async, rate-limited) - ├── row_format.py # Shared row normalization -> content_table shape + ├── row_format.py # Shared row normalization -> content_table shape (KEEP — used by new scrapers) + ├── _reference/ # Hackathon scrapers kept as design inspiration only (NOT used in production) + │ ├── gdelt.py # Reference: dual-path fetch, CAMEO mapping, Goldstein normalization + │ ├── kalshi.py # Reference: async rate limiter, cursor pagination, asyncio.gather + │ ├── polymarket.py # Reference: simple REST API pattern, tag filtering + │ └── acled/ # 
Reference: client/normalizer separation, NormalizedRecord model ├── eonet.py # UNUSED — delete ├── eonet_db.py # UNUSED — delete ├── social_scraper.py # UNUSED — delete ├── reddit.py # UNUSED — delete ├── reddit_classifier.py # UNUSED — delete ├── reddit_db.py # UNUSED — delete + ├── reddit_schema.sql # UNUSED — delete ├── Reddit Scraper/ # UNUSED — delete ├── natural-disasters/ # UNUSED — delete └── ryan_scrapers/ # UNUSED — delete @@ -119,7 +124,7 @@ PostgreSQL with pgvector and pgcrypto extensions. - `id` UUID PK (gen_random_uuid) - `title`, `body`, `url` (UNIQUE) - `latitude`, `longitude` (nullable floats) -- `image_url` (Cloudinary public_id), `s3_url` +- `image_url`, `s3_url` — DEPRECATED, to be dropped (Cloudinary/S3 no longer used) - `embedding` vector(1536) — OpenAI text-embedding-3-small - `sentiment_score` float, `market_signal` text - `published_at` timestamptz, `event_type` text, `raw_metadata_json` JSONB @@ -181,18 +186,22 @@ Each maps to a color in the frontend `EVENT_TYPE_COLORS` constant. ## Data Sources -| Source | What It Provides | Scraper File | -|--------|-----------------|--------------| -| GDELT | Global events from news (BigQuery or DOC API) | `scrapers/gdelt.py` | -| ACLED | Armed conflict events | `ingestion/acled/acled_client.py` | -| Polymarket | Prediction market events + probabilities | `scrapers/polymarket.py` | -| Kalshi | Prediction market events + volumes | `scrapers/kalshi.py` | +The hackathon prototype used the sources below. **None of the existing scraper implementations will be used directly** — new production scrapers will be written implementing a `BaseScraper` ABC. The old code is kept in `scrapers/_reference/` for design inspiration. 
+ +| Source | What It Provides | Reference File | Quality | +|--------|-----------------|----------------|---------| +| GDELT | Global events from news (BigQuery or DOC API) | `scrapers/_reference/gdelt.py` | Excellent — study for complex normalization | +| ACLED | Armed conflict events | `scrapers/_reference/acled/` | Good — study for client/normalizer separation | +| Polymarket | Prediction market events + probabilities | `scrapers/_reference/polymarket.py` | Decent — study for simple REST pattern | +| Kalshi | Prediction market events + volumes | `scrapers/_reference/kalshi.py` | Excellent — study for async rate limiting | + +Which data sources to keep, replace, or add is a product decision for Phase 2. The scraper architecture (BaseScraper ABC, row_format contract, dedup-before-embed) is what matters. ## Known Issues & Tech Debt 1. **Duplicate `content_repository.py`** — exists in both `repositories/` and `ingestion/`. Must consolidate. 2. **No shared DB pool** — `ingestion/db.py` has its own pool; other services use inline `asyncpg.connect()`. Need a single shared pool. -3. **Dead scraper code** — `eonet.py`, `reddit*.py`, `social_scraper.py`, `Reddit Scraper/`, `natural-disasters/`, `ryan_scrapers/` are all unused. +3. **Dead scraper code** — `eonet.py`, `reddit*.py`, `social_scraper.py`, `Reddit Scraper/`, `natural-disasters/`, `ryan_scrapers/` are all unused junk. The "real" scrapers (`gdelt.py`, `kalshi.py`, `polymarket.py`, `acled/`) are hackathon-quality reference code only — new production scrapers need to be written. 4. **No scheduled scraping** — all ingestion is manual CLI or API trigger. 5. **Expensive embeddings** — OpenAI API called per-row. Should switch to local model. 6. **No caching** — every confidence score and realtime analysis call hits Gemini. Need Redis. @@ -200,6 +209,7 @@ Each maps to a color in the frontend `EVENT_TYPE_COLORS` constant. 8. **No tests** — zero test files in the repo. 9. 
**Print debugging** — `print()` used instead of structured logging. 10. **No migration tool** — raw SQL files, no Alembic. +11. **Dead Cloudinary/S3 code** — Cloudinary and S3 are no longer used. `utils/mediaConfig.ts`, `@cloudinary/react`, `@cloudinary/url-gen`, `cloudinary`, `boto3` deps, `image_url`/`s3_url` DB columns, and all related env vars should be removed (ticket #11). ## Conventions @@ -210,6 +220,7 @@ Each maps to a color in the frontend `EVENT_TYPE_COLORS` constant. - **Raw SQL** for queries (no ORM) — parameterize all user inputs with `$1, $2` syntax - **Environment variables** via `python-dotenv` and `os.getenv()` - **Scraper output** normalized via `row_format.make_content_row()` before DB insert +- **New scrapers** must implement `BaseScraper` ABC — see `scrapers/_reference/` for patterns, especially `kalshi.py` (rate limiting) and `gdelt.py` (normalization) ### Frontend - **React 19** with function components and hooks only @@ -231,21 +242,15 @@ GEMINI_API_KEY=... # Required for agent GEMINI_MODEL=gemini-2.5-flash # Optional, default shown OPENAI_API_KEY=... # Required for embeddings (until local model migration) ACLED_API_KEY=... # Required for ACLED ingestion -CLOUDINARY_CLOUD_NAME=... # Optional -CLOUDINARY_API_KEY=... # Optional -CLOUDINARY_API_SECRET=... # Optional -AWS_ACCESS_KEY_ID=... # Optional (S3 fallback) -AWS_SECRET_ACCESS_KEY=... # Optional -S3_BUCKET=... # Optional -AWS_REGION=... # Optional ELEVENLABS_API_KEY=... # Optional +# NOTE: CLOUDINARY_* and AWS_*/S3_* vars are no longer needed — remove if present ``` ### Frontend (.env) ``` VITE_API_URL=/api # or http://127.0.0.1:8000 -VITE_CLOUDINARY_CLOUD_NAME=... # Optional VITE_ELEVENLABS_API_KEY=... 
# Optional +# NOTE: VITE_CLOUDINARY_CLOUD_NAME is no longer needed — remove if present ``` ## Running Locally @@ -259,7 +264,7 @@ uvicorn app.main:app --reload --port 8000 # Frontend cd frontend && npm install && npm run dev -# Manual scraping +# Manual scraping (LEGACY — will be replaced by run_daily_pipeline.py) python run_scrape.py # Polymarket + Kalshi python run_gdelt_scrape.py # GDELT (--days 14 --limit 500) curl -X POST localhost:8000/ingestion/acled diff --git a/plan/PRD.md b/plan/PRD.md index 1276be5..966ee7c 100644 --- a/plan/PRD.md +++ b/plan/PRD.md @@ -11,11 +11,11 @@ The platform is source-agnostic — while the hackathon prototype focused on Can Working prototype with: - 3D globe rendering events as points with similarity arcs - AI agent (Graph-RAG) with persona-aware Gemini synthesis -- Scrapers for GDELT, ACLED, Polymarket, Kalshi +- Hackathon-era scrapers for GDELT, ACLED, Polymarket, Kalshi (will not be used directly — new scrapers will be written, but these serve as reference) - PostgreSQL + pgvector for semantic search - Client-side filtering and timeline scrubbing -Key gaps: no deployment config, no scheduled scraping, no caching, no tests, high AI token costs, dead code from hackathon iteration. +Key gaps: no deployment config, no scheduled scraping, no caching, no tests, high AI token costs, dead code from hackathon iteration, legacy Cloudinary/S3 media code that is no longer needed. Existing scrapers need to be replaced with new, production-grade implementations — the current ones are MVP-quality reference code only. 
## Team & Constraints @@ -50,17 +50,19 @@ Key gaps: no deployment config, no scheduled scraping, no caching, no tests, hig | # | Ticket | Priority | |---|--------|----------| -| 8 | Delete unused scrapers: `eonet.py`, `eonet_db.py`, `social_scraper.py`, `reddit.py`, `reddit_classifier.py`, `reddit_db.py`, `Reddit Scraper/`, `natural-disasters/`, `ryan_scrapers/` | P0 | -| 9 | Remove duplicate `content_repository.py` (exists in both `repositories/` and `ingestion/`) — consolidate into one | P0 | -| 10 | Audit and remove any other dead imports, unused functions, or commented-out code | P1 | +| 8 | Delete junk scrapers: `eonet.py`, `eonet_db.py`, `social_scraper.py`, `reddit.py`, `reddit_classifier.py`, `reddit_db.py`, `reddit_schema.sql`, `Reddit Scraper/`, `natural-disasters/`, `ryan_scrapers/` | P0 | +| 9 | Move remaining hackathon scrapers (`gdelt.py`, `kalshi.py`, `polymarket.py`, `acled/`) into a `scrapers/_reference/` directory — these won't be used directly but are kept as design inspiration (see "Scraper Reference Guide" below) | P0 | +| 10 | Remove duplicate `content_repository.py` (exists in both `repositories/` and `ingestion/`) — consolidate into one | P0 | +| 11 | Remove all Cloudinary and S3 media code — no longer needed. 
Delete `utils/mediaConfig.ts`, remove `@cloudinary/react` and `@cloudinary/url-gen` deps from frontend, remove `cloudinary` and `boto3` deps from backend, drop `image_url` and `s3_url` columns from `content_table`, remove all `CLOUDINARY_*` and `AWS_*`/`S3_*` env vars | P0 | +| 12 | Audit and remove any other dead imports, unused functions, or commented-out code | P1 | ### 1.4 Security Baseline | # | Ticket | Priority | |---|--------|----------| -| 11 | Lock down CORS origins — remove `*` wildcard, use env-configured allowed origins | P0 | -| 12 | Add rate limiting middleware on `/agent/query` and `/content/{id}/realtime-analysis` (these burn AI tokens) | P0 | -| 13 | Validate all secrets are loaded from env vars; fail fast on startup if required vars missing | P0 | +| 13 | Lock down CORS origins — remove `*` wildcard, use env-configured allowed origins | P0 | +| 14 | Add rate limiting middleware on `/agent/query` and `/content/{id}/realtime-analysis` (these burn AI tokens) | P0 | +| 15 | Validate all secrets are loaded from env vars; fail fast on startup if required vars missing | P0 | --- @@ -72,39 +74,41 @@ Key gaps: no deployment config, no scheduled scraping, no caching, no tests, hig | # | Ticket | Priority | |---|--------|----------| -| 14 | Create unified scrape entrypoint (`run_daily_pipeline.py`) that runs all scrapers + embedding backfill in sequence | P0 | -| 15 | Add cron scheduler (APScheduler in a separate container, or cron in docker-compose) | P0 | -| 16 | Define scraping schedule: GDELT 1x/day, ACLED 1x/day, Polymarket+Kalshi 2x/day | P0 | -| 17 | Add `scrape_runs` logging table (source, status, rows_inserted, errors, duration_ms, started_at) | P1 | -| 18 | Add idempotency guards — URL-based dedup check before insert, not after | P0 | -| 19 | Add failure alerting (Discord webhook or email on scrape errors) | P2 | +| 16 | Design `BaseScraper` ABC interface: `async fetch() -> list[NormalizedRow]` with built-in rate limiting, error handling, and 
dedup. Use `kalshi.py` rate limiter and `ingestion_service.py` error patterns as reference. | P0 | +| 17 | Write new production scrapers implementing `BaseScraper` for each data source (determine which sources to keep/add based on product needs) | P0 | +| 18 | Create unified scrape entrypoint (`run_daily_pipeline.py`) that runs all scrapers + embedding backfill in sequence | P0 | +| 19 | Add cron scheduler (APScheduler in a separate container, or cron in docker-compose) | P0 | +| 20 | Define scraping schedule (e.g. 1x/day, 2x/day per source) | P0 | +| 21 | Add `scrape_runs` logging table (source, status, rows_inserted, errors, duration_ms, started_at) | P1 | +| 22 | Add idempotency guards — URL-based dedup check before insert, not after | P0 | +| 23 | Add failure alerting (Discord webhook or email on scrape errors) | P2 | ### 2.2 Embedding Cost Reduction | # | Ticket | Priority | |---|--------|----------| -| 20 | **Switch embeddings to local model** — replace OpenAI `text-embedding-3-small` with `sentence-transformers` (e.g. `all-MiniLM-L6-v2`, 384 dims). Run locally in backend container. Update vector column dimension. | P0 | -| 21 | Batch embedding generation — process 100+ items per batch instead of one-at-a-time | P0 | -| 22 | Deduplicate content before generating embeddings (currently embeddings are generated before dedup) | P1 | +| 24 | **Switch embeddings to local model** — replace OpenAI `text-embedding-3-small` with `sentence-transformers` (e.g. `all-MiniLM-L6-v2`, 384 dims). Run locally in backend container. Update vector column dimension. 
| P0 | +| 25 | Batch embedding generation — process 100+ items per batch instead of one-at-a-time | P0 | +| 26 | Deduplicate content before generating embeddings (currently embeddings are generated before dedup) | P1 | ### 2.3 AI Response Caching (Redis) | # | Ticket | Priority | |---|--------|----------| -| 23 | Add Redis client utility module with connection pooling | P0 | -| 24 | Cache `/content/points` response — invalidate on new scrape run completion | P0 | -| 25 | Cache `/content/arcs` response per threshold value (TTL = until next scrape) | P1 | -| 26 | Cache Gemini confidence scores per content_id (TTL = 24h) | P0 | -| 27 | Cache Gemini realtime analysis per `(content_id, user_role)` (TTL = 6h) | P0 | -| 28 | Cache agent query results keyed on `(normalized_query_hash, persona)` (TTL = 1h) | P1 | +| 27 | Add Redis client utility module with connection pooling | P0 | +| 28 | Cache `/content/points` response — invalidate on new scrape run completion | P0 | +| 29 | Cache `/content/arcs` response per threshold value (TTL = until next scrape) | P1 | +| 30 | Cache Gemini confidence scores per content_id (TTL = 24h) | P0 | +| 31 | Cache Gemini realtime analysis per `(content_id, user_role)` (TTL = 6h) | P0 | +| 32 | Cache agent query results keyed on `(normalized_query_hash, persona)` (TTL = 1h) | P1 | ### 2.4 Additional Cost Controls | # | Ticket | Priority | |---|--------|----------| -| 29 | Add token usage tracking — log Gemini and OpenAI (if still used) token counts per call to a `token_usage` table | P1 | -| 30 | Pre-compute confidence scores during scrape pipeline instead of on-demand per user click | P2 | -| 31 | Replace per-request Google Search grounding with daily cached news summaries scraped during pipeline | P2 | +| 33 | Add token usage tracking — log Gemini and OpenAI (if still used) token counts per call to a `token_usage` table | P1 | +| 34 | Pre-compute confidence scores during scrape pipeline instead of on-demand per user click | P2 | +| 35 | 
Replace per-request Google Search grounding with daily cached news summaries scraped during pipeline | P2 | --- @@ -116,35 +120,34 @@ Key gaps: no deployment config, no scheduled scraping, no caching, no tests, hig | # | Ticket | Priority | |---|--------|----------| -| 32 | Set up Alembic for migration management (replace raw SQL files) | P0 | -| 33 | Rename `content_table` -> `articles` | P1 | -| 34 | Split AI-generated fields into `article_analysis` table (embedding, sentiment_score, market_signal, confidence_score) | P1 | -| 35 | Add `scrape_source` enum column to replace generic FK to `sources` table | P2 | -| 36 | Add composite index on `(event_type, published_at)` for filtered timeline queries | P1 | -| 37 | Add `last_scraped_at` column to sources for freshness tracking | P2 | -| 38 | Clean up unused columns and tables from hackathon iteration | P1 | +| 36 | Set up Alembic for migration management (replace raw SQL files) | P0 | +| 37 | Rename `content_table` -> `articles` | P1 | +| 38 | Split AI-generated fields into `article_analysis` table (embedding, sentiment_score, market_signal, confidence_score) | P1 | +| 39 | Add `scrape_source` enum column to replace generic FK to `sources` table | P2 | +| 40 | Add composite index on `(event_type, published_at)` for filtered timeline queries | P1 | +| 41 | Add `last_scraped_at` column to sources for freshness tracking | P2 | +| 42 | Clean up unused columns and tables from hackathon iteration (including dropped `image_url`, `s3_url`) | P1 | ### 3.2 Backend Modularization | # | Ticket | Priority | |---|--------|----------| -| 39 | Create shared `db.py` module — single asyncpg pool used by all services (currently duplicated in `ingestion/db.py` and inline connects) | P0 | -| 40 | Extract `BaseScraper` ABC interface: `async fetch() -> list[NormalizedRow]`. Make GDELT, ACLED, Polymarket, Kalshi implement it. 
| P1 | -| 41 | Consolidate all Pydantic models into a single `schemas/` package | P1 | -| 42 | Replace all `print()` statements with structured `logging` (use correlation IDs per request) | P1 | -| 43 | Add input validation on all API endpoints (query length limits, coordinate bounds, UUID format) | P1 | +| 43 | Create shared `db.py` module — single asyncpg pool used by all services (currently duplicated in `ingestion/db.py` and inline connects) | P0 | +| 44 | Consolidate all Pydantic models into a single `schemas/` package | P1 | +| 45 | Replace all `print()` statements with structured `logging` (use correlation IDs per request) | P1 | +| 46 | Add input validation on all API endpoints (query length limits, coordinate bounds, UUID format) | P1 | ### 3.3 Testing | # | Ticket | Priority | |---|--------|----------| -| 44 | Set up pytest with async fixtures (asyncpg test DB, httpx AsyncClient) | P0 | -| 45 | Write tests for scraper row normalization (`row_format.py`, ACLED normalizer) | P0 | -| 46 | Write tests for deduplication logic | P1 | -| 47 | Write tests for agent query classification (`_classify_query`) | P1 | -| 48 | Write API integration tests for `/content/points`, `/agent/query` | P1 | -| 49 | Add frontend smoke tests (Vitest + React Testing Library) for Globe render, Agent query flow | P2 | -| 50 | Add pre-commit hooks: ruff lint+format, TypeScript typecheck | P0 | +| 47 | Set up pytest with async fixtures (asyncpg test DB, httpx AsyncClient) | P0 | +| 48 | Write tests for new scraper implementations and row normalization | P0 | +| 49 | Write tests for deduplication logic | P1 | +| 50 | Write tests for agent query classification (`_classify_query`) | P1 | +| 51 | Write API integration tests for `/content/points`, `/agent/query` | P1 | +| 52 | Add frontend smoke tests (Vitest + React Testing Library) for Globe render, Agent query flow | P2 | +| 53 | Add pre-commit hooks: ruff lint+format, TypeScript typecheck | P0 | --- @@ -156,29 +159,29 @@ Key gaps: no 
deployment config, no scheduled scraping, no caching, no tests, hig | # | Ticket | Priority | |---|--------|----------| -| 51 | Add loading skeleton/spinner on initial data fetch | P0 | -| 52 | Lazy-load Globe component (three.js is ~500KB) with React.lazy + Suspense | P1 | -| 53 | Add error boundaries around Globe, Agent, and Modal components | P0 | -| 54 | Memoize expensive computations (arc filtering, point color mapping) with useMemo/useCallback | P1 | -| 55 | Add responsive layout — mobile/tablet support or graceful "desktop-only" message | P1 | -| 56 | Add URL-based routing (React Router) for shareable/bookmarkable globe state | P2 | -| 57 | Add "no results" empty states for agent queries and empty filtered views | P1 | -| 58 | Move inline styles and magic numbers to shared constants / design tokens | P2 | +| 54 | Add loading skeleton/spinner on initial data fetch | P0 | +| 55 | Lazy-load Globe component (three.js is ~500KB) with React.lazy + Suspense | P1 | +| 56 | Add error boundaries around Globe, Agent, and Modal components | P0 | +| 57 | Memoize expensive computations (arc filtering, point color mapping) with useMemo/useCallback | P1 | +| 58 | Add responsive layout — mobile/tablet support or graceful "desktop-only" message | P1 | +| 59 | Add URL-based routing (React Router) for shareable/bookmarkable globe state | P2 | +| 60 | Add "no results" empty states for agent queries and empty filtered views | P1 | +| 61 | Move inline styles and magic numbers to shared constants / design tokens | P2 | ### 4.2 Observability | # | Ticket | Priority | |---|--------|----------| -| 59 | Add structured logging with request correlation IDs (middleware) | P1 | -| 60 | Add simple admin dashboard page: scrape history, DB row counts, token spend summary | P2 | -| 61 | Add uptime monitoring on `/health` endpoint (UptimeRobot free tier or similar) | P1 | +| 62 | Add structured logging with request correlation IDs (middleware) | P1 | +| 63 | Add simple admin dashboard page: 
scrape history, DB row counts, token spend summary | P2 | +| 64 | Add uptime monitoring on `/health` endpoint (UptimeRobot free tier or similar) | P1 | ### 4.3 Security Hardening | # | Ticket | Priority | |---|--------|----------| -| 62 | Audit all raw SQL for injection risks — parameterize any string-interpolated queries | P0 | -| 63 | Add API key or lightweight session auth for agent/analysis endpoints | P1 | +| 65 | Audit all raw SQL for injection risks — parameterize any string-interpolated queries | P0 | +| 66 | Add API key or lightweight session auth for agent/analysis endpoints | P1 | --- @@ -196,6 +199,33 @@ Key gaps: no deployment config, no scheduled scraping, no caching, no tests, hig --- +## Scraper Reference Guide + +The existing hackathon scrapers will **not** be used directly in production. They are moved to `scrapers/_reference/` as design inspiration. New scrapers should be written from scratch implementing the `BaseScraper` ABC (ticket #16). + +### Which reference files to study, and why + +| File | Quality | What to learn from it | +|------|---------|----------------------| +| `kalshi.py` | Excellent | Async rate limiter class (`_RateLimiter` with lock-based queueing, 10 req/sec), cursor-based pagination, `asyncio.gather` with `return_exceptions=True` | +| `gdelt.py` | Excellent | Dual-path fetch (BigQuery primary, DOC API fallback), complex event-type mapping via CAMEO codes, Goldstein scale normalization, title synthesis from multiple fields | +| `row_format.py` | Excellent | Schema-aligned output contract — keyword-only args prevent mistakes. 
**Copy this pattern into all new scrapers.** | +| `acled_normalizer.py` | Good | Clean normalizer pattern: type-safe `NormalizedRecord` return, graceful fallback for every nullable field | +| `ingestion_service.py` | Good | Per-record error handling with `RunSummary` tracking (malformed, duplicates, db_failures), dedup integration | +| `polymarket.py` | Decent | Simplest example — good starting point for straightforward REST APIs, tag-based filtering | +| `scraping_service.py` | Decent | Orchestration pattern: per-scraper `try`/`except`, error records appended (visibility over silent failures) | + +### Patterns to carry forward into new scrapers + +1. **Separate fetch from normalization** — client fetches raw data, normalizer produces `NormalizedRow` +2. **Use async rate limiters, not `time.sleep()`** — see `kalshi.py`'s `_RateLimiter` class +3. **All output goes through `row_format.make_content_row()`** — enforces schema contract +4. **Per-record error handling** — one bad record shouldn't abort the batch +5. **Track stats** — count inserted, skipped, failed per run for observability +6. 
**Dedup before embedding** — check URL uniqueness before generating expensive vectors + +--- + ## Non-Goals (for now) - Mobile app (web-only is fine) From 4884b0935a724aa975f03d6adad2d557afe874c1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 16 Mar 2026 22:37:07 +0000 Subject: [PATCH 03/13] Initial plan From 7afe01edd174fbbed5f53b124b0b5e553df0630e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 16 Mar 2026 22:45:34 +0000 Subject: [PATCH 04/13] Initial plan From eea6f6b893a49728a57f0d34c358e58a937bae60 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 16 Mar 2026 22:49:02 +0000 Subject: [PATCH 05/13] docs: merge PR29 planning docs and add deployment issue board Co-authored-by: Tpypan <155002057+Tpypan@users.noreply.github.com> --- plan/ISSUE_BOARD.md | 124 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 plan/ISSUE_BOARD.md diff --git a/plan/ISSUE_BOARD.md b/plan/ISSUE_BOARD.md new file mode 100644 index 0000000..d38be12 --- /dev/null +++ b/plan/ISSUE_BOARD.md @@ -0,0 +1,124 @@ +# Argus Deployment Issue Board + +This board is derived from the deployment planning work merged from PR #29 (`plan/PRD.md` and `plan/CLAUDE.md`). 
+ +## Current Status + +- Total deployment-plan tasks tracked: **66** +- Completed: **0** +- Remaining: **66** + +## Deployment Blockers (P0) + +- [ ] #1 Create backend Dockerfile (FastAPI + uvicorn, multi-stage build) +- [ ] #2 Create frontend Dockerfile (Vite build -> nginx static serve) +- [ ] #3 Create `docker-compose.yml` with backend, frontend, postgres+pgvector, redis +- [ ] #4 Add `.dockerignore` files +- [ ] #5 Set up GitHub Actions CI (ruff lint/format, TypeScript typecheck, docker build) +- [ ] #8 Delete junk scrapers (`eonet.py`, `eonet_db.py`, `social_scraper.py`, `reddit.py`, `reddit_classifier.py`, `reddit_db.py`, `reddit_schema.sql`, `Reddit Scraper/`, `natural-disasters/`, `ryan_scrapers/`) +- [ ] #9 Move remaining hackathon scrapers into `scrapers/_reference/` +- [ ] #10 Remove duplicate `content_repository.py` and consolidate +- [ ] #11 Remove Cloudinary and S3 media code, deps, env vars, DB columns +- [ ] #13 Lock down CORS origins (remove wildcard `*`, use env-configured origins) +- [ ] #14 Add rate limiting for expensive AI endpoints +- [ ] #15 Enforce required secrets via env vars and fail fast on startup +- [ ] #16 Design `BaseScraper` ABC with rate limiting/error handling/dedup +- [ ] #17 Implement new production scrapers using `BaseScraper` +- [ ] #18 Create unified pipeline entrypoint (`run_daily_pipeline.py`) +- [ ] #19 Add scrape scheduler (APScheduler/cron) +- [ ] #20 Define scrape schedule per source +- [ ] #22 Add idempotency guards (URL dedup before insert) +- [ ] #24 Switch embeddings to local `sentence-transformers` model +- [ ] #25 Batch embedding generation +- [ ] #27 Add Redis client utility with pooling +- [ ] #28 Cache `/content/points` +- [ ] #30 Cache Gemini confidence scores +- [ ] #31 Cache realtime analysis +- [ ] #36 Set up Alembic migrations +- [ ] #43 Create shared DB pool module for all services +- [ ] #47 Set up pytest with async fixtures +- [ ] #48 Add scraper/normalization tests +- [ ] #53 Add pre-commit hooks 
(ruff + TS typecheck) +- [ ] #54 Add initial loading UI state +- [ ] #56 Add frontend error boundaries +- [ ] #65 Audit and parameterize raw SQL for injection safety + +## Full Board (All PR #29 Planning Tasks) + +### Phase 1 — Foundation (Week 1–2) + +- [ ] #1 Create backend Dockerfile (P0) +- [ ] #2 Create frontend Dockerfile (P0) +- [ ] #3 Create `docker-compose.yml` with backend, frontend, postgres+pgvector, redis (P0) +- [ ] #4 Add `.dockerignore` files (P0) +- [ ] #5 Set up GitHub Actions CI (P0) +- [ ] #6 Set up GitHub Actions CD (P1) +- [ ] #7 Choose hosting provider + document deploy process (P1) +- [ ] #8 Delete junk scrapers (P0) +- [ ] #9 Move reference scrapers to `scrapers/_reference/` (P0) +- [ ] #10 Consolidate duplicate `content_repository.py` (P0) +- [ ] #11 Remove Cloudinary/S3 media code and related schema/env/deps (P0) +- [ ] #12 Audit/remove remaining dead code (P1) +- [ ] #13 Lock down CORS origins (P0) +- [ ] #14 Add endpoint rate limiting (P0) +- [ ] #15 Validate required env secrets on startup (P0) + +### Phase 2 — Automation & Cost Optimization (Week 2–3) + +- [ ] #16 Design `BaseScraper` ABC (P0) +- [ ] #17 Write production scrapers implementing `BaseScraper` (P0) +- [ ] #18 Build `run_daily_pipeline.py` (P0) +- [ ] #19 Add scheduler (P0) +- [ ] #20 Define scraping schedule (P0) +- [ ] #21 Add `scrape_runs` logging table (P1) +- [ ] #22 Add pre-insert URL dedup/idempotency guard (P0) +- [ ] #23 Add scrape failure alerting (P2) +- [ ] #24 Replace OpenAI embeddings with local sentence-transformers (P0) +- [ ] #25 Batch embedding generation (P0) +- [ ] #26 Deduplicate before embedding generation (P1) +- [ ] #27 Add Redis client utility (P0) +- [ ] #28 Cache `/content/points` (P0) +- [ ] #29 Cache `/content/arcs` by threshold (P1) +- [ ] #30 Cache Gemini confidence scores (P0) +- [ ] #31 Cache realtime analysis by `(content_id, user_role)` (P0) +- [ ] #32 Cache agent query results by normalized query + persona (P1) +- [ ] #33 Track token 
usage in DB (P1) +- [ ] #34 Precompute confidence scores in pipeline (P2) +- [ ] #35 Replace per-request grounding with cached daily summaries (P2) + +### Phase 3 — Schema & Code Quality (Week 3–5) + +- [ ] #36 Set up Alembic migration management (P0) +- [ ] #37 Rename `content_table` -> `articles` (P1) +- [ ] #38 Move AI fields to `article_analysis` table (P1) +- [ ] #39 Add `scrape_source` enum column (P2) +- [ ] #40 Add index on `(event_type, published_at)` (P1) +- [ ] #41 Add `last_scraped_at` on sources (P2) +- [ ] #42 Remove remaining unused schema artifacts (P1) +- [ ] #43 Create shared `db.py` pool module (P0) +- [ ] #44 Consolidate Pydantic models under `schemas/` (P1) +- [ ] #45 Replace `print()` with structured logging + correlation IDs (P1) +- [ ] #46 Add stronger endpoint input validation (P1) +- [ ] #47 Set up pytest with async fixtures (P0) +- [ ] #48 Add tests for new scrapers + normalization (P0) +- [ ] #49 Add deduplication tests (P1) +- [ ] #50 Add `_classify_query` tests (P1) +- [ ] #51 Add API integration tests (`/content/points`, `/agent/query`) (P1) +- [ ] #52 Add frontend smoke tests with Vitest + RTL (P2) +- [ ] #53 Add pre-commit hooks (P0) + +### Phase 4 — Frontend Polish & Observability (Week 5–7) + +- [ ] #54 Add loading skeleton/spinner on initial fetch (P0) +- [ ] #55 Lazy-load Globe with `React.lazy` + `Suspense` (P1) +- [ ] #56 Add error boundaries around Globe, Agent, Modal (P0) +- [ ] #57 Memoize expensive computations (P1) +- [ ] #58 Add responsive/mobile support or desktop-only message (P1) +- [ ] #59 Add URL-based routing for shareable state (P2) +- [ ] #60 Add empty states for agent/filter no-results (P1) +- [ ] #61 Move inline styles/magic numbers to constants/tokens (P2) +- [ ] #62 Add structured logging middleware with correlation IDs (P1) +- [ ] #63 Add admin dashboard (scrape history/row counts/token spend) (P2) +- [ ] #64 Add uptime monitoring on `/health` (P1) +- [ ] #65 Audit SQL injection risks and parameterize all 
queries (P0) +- [ ] #66 Add API key/lightweight auth for agent/analysis endpoints (P1) From 61062aa009fac7aaece5915a03b5141ede215c26 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 16 Mar 2026 22:49:30 +0000 Subject: [PATCH 06/13] docs: clarify unique task count and blocker subset in issue board Co-authored-by: Tpypan <155002057+Tpypan@users.noreply.github.com> --- plan/ISSUE_BOARD.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/plan/ISSUE_BOARD.md b/plan/ISSUE_BOARD.md index d38be12..d4efb4d 100644 --- a/plan/ISSUE_BOARD.md +++ b/plan/ISSUE_BOARD.md @@ -4,12 +4,14 @@ This board is derived from the deployment planning work merged from PR #29 (`pla ## Current Status -- Total deployment-plan tasks tracked: **66** +- Total unique deployment-plan tasks tracked: **66** - Completed: **0** - Remaining: **66** ## Deployment Blockers (P0) +> These are a prioritized subset of the full board and are duplicated below in the phase checklists. + - [ ] #1 Create backend Dockerfile (FastAPI + uvicorn, multi-stage build) - [ ] #2 Create frontend Dockerfile (Vite build -> nginx static serve) - [ ] #3 Create `docker-compose.yml` with backend, frontend, postgres+pgvector, redis From a68cfd2b9b57bfc79ef4b817bfa143e4162d95b0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 16 Mar 2026 22:50:34 +0000 Subject: [PATCH 07/13] docs: clarify embedding migration and issue-board counter behavior Co-authored-by: Tpypan <155002057+Tpypan@users.noreply.github.com> --- plan/CLAUDE.md | 6 ++++-- plan/ISSUE_BOARD.md | 2 ++ plan/PRD.md | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/plan/CLAUDE.md b/plan/CLAUDE.md index fba77a9..dc1ac5e 100644 --- a/plan/CLAUDE.md +++ b/plan/CLAUDE.md @@ -10,6 +10,8 @@ A 3D global event intelligence platform. 
World events are scraped daily from mul ## Tech Stack +This table reflects the current codebase state at the time of writing; planned migrations are called out in the notes column. + | Component | Technology | Notes | |-----------|-----------|-------| | Frontend | React 19 + TypeScript 5 + Vite 6 | SPA, no SSR | @@ -18,7 +20,7 @@ A 3D global event intelligence platform. World events are scraped daily from mul | Backend | FastAPI + Uvicorn (Python 3.11+) | Async throughout | | Database | PostgreSQL 15+ with pgvector + pgcrypto | Extensions required | | AI Model | Google Gemini 2.5-flash | Structured JSON output | -| Embeddings | OpenAI text-embedding-3-small (1536 dims) | Migration target: local sentence-transformers | +| Embeddings | OpenAI text-embedding-3-small (1536 dims) | Current state; migration target: local sentence-transformers (see PRD ticket #24) | | Voice | ElevenLabs Scribe v1 | Optional, speech-to-text | | Media | Placeholder SVGs only | Cloudinary and S3 removed — no longer needed | @@ -125,7 +127,7 @@ PostgreSQL with pgvector and pgcrypto extensions. - `title`, `body`, `url` (UNIQUE) - `latitude`, `longitude` (nullable floats) - `image_url`, `s3_url` — DEPRECATED, to be dropped (Cloudinary/S3 no longer used) -- `embedding` vector(1536) — OpenAI text-embedding-3-small +- `embedding` vector(1536) — OpenAI text-embedding-3-small (current; planned migration to 384 dims in PRD ticket #24) - `sentiment_score` float, `market_signal` text - `published_at` timestamptz, `event_type` text, `raw_metadata_json` JSONB - `source_id` FK -> sources, `engagement_id` FK -> engagement diff --git a/plan/ISSUE_BOARD.md b/plan/ISSUE_BOARD.md index d4efb4d..304098c 100644 --- a/plan/ISSUE_BOARD.md +++ b/plan/ISSUE_BOARD.md @@ -8,6 +8,8 @@ This board is derived from the deployment planning work merged from PR #29 (`pla - Completed: **0** - Remaining: **66** +> Progress counters are manual and should be updated when boxes are checked. 
+ ## Deployment Blockers (P0) > These are a prioritized subset of the full board and are duplicated below in the phase checklists. diff --git a/plan/PRD.md b/plan/PRD.md index 966ee7c..ddbb374 100644 --- a/plan/PRD.md +++ b/plan/PRD.md @@ -87,7 +87,7 @@ Key gaps: no deployment config, no scheduled scraping, no caching, no tests, hig | # | Ticket | Priority | |---|--------|----------| -| 24 | **Switch embeddings to local model** — replace OpenAI `text-embedding-3-small` with `sentence-transformers` (e.g. `all-MiniLM-L6-v2`, 384 dims). Run locally in backend container. Update vector column dimension. | P0 | +| 24 | **Switch embeddings to local model** — replace OpenAI `text-embedding-3-small` with `sentence-transformers` (e.g. `all-MiniLM-L6-v2`, 384 dims). Run locally in backend container. Update vector column dimension and include a one-time re-embedding migration/rollout plan (1536 -> 384) to avoid query downtime. | P0 | | 25 | Batch embedding generation — process 100+ items per batch instead of one-at-a-time | P0 | | 26 | Deduplicate content before generating embeddings (currently embeddings are generated before dedup) | P1 | From 2009d9737f2c007edc0be06d7682394988299986 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 16 Mar 2026 22:51:14 +0000 Subject: [PATCH 08/13] docs: add explicit embedding migration rollout notes Co-authored-by: Tpypan <155002057+Tpypan@users.noreply.github.com> --- plan/CLAUDE.md | 2 +- plan/PRD.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/plan/CLAUDE.md b/plan/CLAUDE.md index dc1ac5e..06e263a 100644 --- a/plan/CLAUDE.md +++ b/plan/CLAUDE.md @@ -20,7 +20,7 @@ This table reflects the current codebase state at the time of writing; planned m | Backend | FastAPI + Uvicorn (Python 3.11+) | Async throughout | | Database | PostgreSQL 15+ with pgvector + pgcrypto | Extensions required | | AI Model | Google Gemini 2.5-flash | Structured JSON output | -| 
Embeddings | OpenAI text-embedding-3-small (1536 dims) | Current state; migration target: local sentence-transformers (see PRD ticket #24) | +| Embeddings | OpenAI text-embedding-3-small (1536 dims) | Current state; migration target: local sentence-transformers (see PRD ticket #24 for dual-column/backfill/cutover migration) | | Voice | ElevenLabs Scribe v1 | Optional, speech-to-text | | Media | Placeholder SVGs only | Cloudinary and S3 removed — no longer needed | diff --git a/plan/PRD.md b/plan/PRD.md index ddbb374..8f0853e 100644 --- a/plan/PRD.md +++ b/plan/PRD.md @@ -87,7 +87,7 @@ Key gaps: no deployment config, no scheduled scraping, no caching, no tests, hig | # | Ticket | Priority | |---|--------|----------| -| 24 | **Switch embeddings to local model** — replace OpenAI `text-embedding-3-small` with `sentence-transformers` (e.g. `all-MiniLM-L6-v2`, 384 dims). Run locally in backend container. Update vector column dimension and include a one-time re-embedding migration/rollout plan (1536 -> 384) to avoid query downtime. | P0 | +| 24 | **Switch embeddings to local model** — replace OpenAI `text-embedding-3-small` with `sentence-transformers` (e.g. `all-MiniLM-L6-v2`, 384 dims). Run locally in backend container. Update vector column dimension and include a one-time re-embedding migration/rollout plan (1536 -> 384) to avoid query downtime (dual-column migration, batched backfill, cutover + rollback steps). 
| P0 | | 25 | Batch embedding generation — process 100+ items per batch instead of one-at-a-time | P0 | | 26 | Deduplicate content before generating embeddings (currently embeddings are generated before dedup) | P1 | From 4dd0ae699214afc821c3a00aa1984ad28b18ae13 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 16 Mar 2026 22:52:02 +0000 Subject: [PATCH 09/13] docs: add explicit embedding migration rollout checklist Co-authored-by: Tpypan <155002057+Tpypan@users.noreply.github.com> --- plan/CLAUDE.md | 4 ++-- plan/PRD.md | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/plan/CLAUDE.md b/plan/CLAUDE.md index 06e263a..e5d0dde 100644 --- a/plan/CLAUDE.md +++ b/plan/CLAUDE.md @@ -20,7 +20,7 @@ This table reflects the current codebase state at the time of writing; planned m | Backend | FastAPI + Uvicorn (Python 3.11+) | Async throughout | | Database | PostgreSQL 15+ with pgvector + pgcrypto | Extensions required | | AI Model | Google Gemini 2.5-flash | Structured JSON output | -| Embeddings | OpenAI text-embedding-3-small (1536 dims) | Current state; migration target: local sentence-transformers (see PRD ticket #24 for dual-column/backfill/cutover migration) | +| Embeddings | OpenAI text-embedding-3-small (1536 dims) | Current state; migration target: local sentence-transformers (see PRD embedding migration rollout notes) | | Voice | ElevenLabs Scribe v1 | Optional, speech-to-text | | Media | Placeholder SVGs only | Cloudinary and S3 removed — no longer needed | @@ -127,7 +127,7 @@ PostgreSQL with pgvector and pgcrypto extensions. 
- `title`, `body`, `url` (UNIQUE) - `latitude`, `longitude` (nullable floats) - `image_url`, `s3_url` — DEPRECATED, to be dropped (Cloudinary/S3 no longer used) -- `embedding` vector(1536) — OpenAI text-embedding-3-small (current; planned migration to 384 dims in PRD ticket #24) +- `embedding` vector(1536) — OpenAI text-embedding-3-small (current; planned migration to 384 dims in PRD embedding migration rollout notes) - `sentiment_score` float, `market_signal` text - `published_at` timestamptz, `event_type` text, `raw_metadata_json` JSONB - `source_id` FK -> sources, `engagement_id` FK -> engagement diff --git a/plan/PRD.md b/plan/PRD.md index 8f0853e..e51fa3b 100644 --- a/plan/PRD.md +++ b/plan/PRD.md @@ -91,6 +91,12 @@ Key gaps: no deployment config, no scheduled scraping, no caching, no tests, hig | 25 | Batch embedding generation — process 100+ items per batch instead of one-at-a-time | P0 | | 26 | Deduplicate content before generating embeddings (currently embeddings are generated before dedup) | P1 | +**Embedding migration rollout notes (for ticket #24):** +1. Add a new local-embedding column/index while keeping the current 1536-dim column online. +2. Backfill local embeddings in batches with progress logging and retry support. +3. Cut over read paths to the new column only after validation checks pass. +4. Keep a rollback window where reads can switch back to the old column if quality/performance regress. 
+ ### 2.3 AI Response Caching (Redis) | # | Ticket | Priority | From ccd696ce05cb12556489daff1e919b0dacdb8b2c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 16 Mar 2026 22:52:49 +0000 Subject: [PATCH 10/13] docs: tighten cross-references for embedding migration notes Co-authored-by: Tpypan <155002057+Tpypan@users.noreply.github.com> --- plan/CLAUDE.md | 4 ++-- plan/PRD.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/plan/CLAUDE.md b/plan/CLAUDE.md index e5d0dde..5c5c603 100644 --- a/plan/CLAUDE.md +++ b/plan/CLAUDE.md @@ -20,7 +20,7 @@ This table reflects the current codebase state at the time of writing; planned m | Backend | FastAPI + Uvicorn (Python 3.11+) | Async throughout | | Database | PostgreSQL 15+ with pgvector + pgcrypto | Extensions required | | AI Model | Google Gemini 2.5-flash | Structured JSON output | -| Embeddings | OpenAI text-embedding-3-small (1536 dims) | Current state; migration target: local sentence-transformers (see PRD embedding migration rollout notes) | +| Embeddings | OpenAI text-embedding-3-small (1536 dims) | Current state; migration target: local sentence-transformers (see PRD.md section 2.2, ticket #24 + rollout notes) | | Voice | ElevenLabs Scribe v1 | Optional, speech-to-text | | Media | Placeholder SVGs only | Cloudinary and S3 removed — no longer needed | @@ -127,7 +127,7 @@ PostgreSQL with pgvector and pgcrypto extensions. 
- `title`, `body`, `url` (UNIQUE) - `latitude`, `longitude` (nullable floats) - `image_url`, `s3_url` — DEPRECATED, to be dropped (Cloudinary/S3 no longer used) -- `embedding` vector(1536) — OpenAI text-embedding-3-small (current; planned migration to 384 dims in PRD embedding migration rollout notes) +- `embedding` vector(1536) — OpenAI text-embedding-3-small (current; planned migration to 384 dims in PRD.md section 2.2, ticket #24 + rollout notes) - `sentiment_score` float, `market_signal` text - `published_at` timestamptz, `event_type` text, `raw_metadata_json` JSONB - `source_id` FK -> sources, `engagement_id` FK -> engagement diff --git a/plan/PRD.md b/plan/PRD.md index e51fa3b..12923e4 100644 --- a/plan/PRD.md +++ b/plan/PRD.md @@ -87,7 +87,7 @@ Key gaps: no deployment config, no scheduled scraping, no caching, no tests, hig | # | Ticket | Priority | |---|--------|----------| -| 24 | **Switch embeddings to local model** — replace OpenAI `text-embedding-3-small` with `sentence-transformers` (e.g. `all-MiniLM-L6-v2`, 384 dims). Run locally in backend container. Update vector column dimension and include a one-time re-embedding migration/rollout plan (1536 -> 384) to avoid query downtime (dual-column migration, batched backfill, cutover + rollback steps). | P0 | +| 24 | **Switch embeddings to local model** — replace OpenAI `text-embedding-3-small` with `sentence-transformers` (e.g. `all-MiniLM-L6-v2`, 384 dims). Run locally in backend container. Update vector column dimension and include a one-time re-embedding migration/rollout plan (1536 -> 384) to avoid query downtime (dual-column migration, batched backfill, cutover + rollback steps; see rollout notes below). 
| P0 | | 25 | Batch embedding generation — process 100+ items per batch instead of one-at-a-time | P0 | | 26 | Deduplicate content before generating embeddings (currently embeddings are generated before dedup) | P1 | From a75e61be0ae7c6bc145c3a6a1c5fddf23e56c748 Mon Sep 17 00:00:00 2001 From: Tony Pan <155002057+Tpypan@users.noreply.github.com> Date: Mon, 16 Mar 2026 18:56:26 -0400 Subject: [PATCH 11/13] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- plan/CLAUDE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plan/CLAUDE.md b/plan/CLAUDE.md index 5c5c603..c3d2798 100644 --- a/plan/CLAUDE.md +++ b/plan/CLAUDE.md @@ -239,7 +239,7 @@ Which data sources to keep, replace, or add is a product decision for Phase 2. T ### Backend (.env) ``` -DATABASE_URL=postgresql+asyncpg://user:pass@host:5432/dbname # Required +DATABASE_URL=postgresql://user:pass@host:5432/dbname # Required GEMINI_API_KEY=... # Required for agent GEMINI_MODEL=gemini-2.5-flash # Optional, default shown OPENAI_API_KEY=... 
# Required for embeddings (until local model migration) From 23c697502cb542fa2b69bc33f813d4b0076d4ac2 Mon Sep 17 00:00:00 2001 From: Tony Pan <155002057+Tpypan@users.noreply.github.com> Date: Mon, 16 Mar 2026 18:56:33 -0400 Subject: [PATCH 12/13] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- plan/CLAUDE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plan/CLAUDE.md b/plan/CLAUDE.md index c3d2798..01394de 100644 --- a/plan/CLAUDE.md +++ b/plan/CLAUDE.md @@ -54,7 +54,7 @@ hackcanada/ │ │ ├── types/ │ │ │ ├── events.ts # Event, ContentPoint, ContentArc, EventDetail │ │ │ └── agent.ts # AgentResponse, NavigationPlan, FinancialImpact -│ │ └── utils/mediaConfig.ts # DEPRECATED — Cloudinary/S3 removed, delete this file (ticket #10) +│ │ └── utils/mediaConfig.ts # DEPRECATED — Cloudinary/S3 removed, delete this file (ticket #11) │ └── package.json │ └── backend/ From 71621a4320fb9e1ba3a51388f88018735fe9764e Mon Sep 17 00:00:00 2001 From: Tony Pan <155002057+Tpypan@users.noreply.github.com> Date: Mon, 16 Mar 2026 19:11:00 -0400 Subject: [PATCH 13/13] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- plan/CLAUDE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plan/CLAUDE.md b/plan/CLAUDE.md index 01394de..b1910b0 100644 --- a/plan/CLAUDE.md +++ b/plan/CLAUDE.md @@ -211,7 +211,7 @@ Which data sources to keep, replace, or add is a product decision for Phase 2. T 8. **No tests** — zero test files in the repo. 9. **Print debugging** — `print()` used instead of structured logging. 10. **No migration tool** — raw SQL files, no Alembic. -11. **Dead Cloudinary/S3 code** — Cloudinary and S3 are no longer used. 
`utils/mediaConfig.ts`, `@cloudinary/react`, `@cloudinary/url-gen`, `cloudinary`, `boto3` deps, `image_url`/`s3_url` DB columns, and all related env vars should be removed (ticket #10). +11. **Dead Cloudinary/S3 code** — Cloudinary and S3 are no longer used. `utils/mediaConfig.ts`, `@cloudinary/react`, `@cloudinary/url-gen`, `cloudinary`, `boto3` deps, `image_url`/`s3_url` DB columns, and all related env vars should be removed (ticket #11). ## Conventions