From 9e7dd5506a3d34c25e521ddfcf597982383cc5c0 Mon Sep 17 00:00:00 2001 From: Lucas Jin Date: Mon, 9 Mar 2026 22:36:22 -0400 Subject: [PATCH 1/5] chore: plan for deployment --- plan/CLAUDE.md | 279 +++++++++++++++++++++++++++++++++++++++++++++++++ plan/PRD.md | 205 ++++++++++++++++++++++++++++++++++++ 2 files changed, 484 insertions(+) create mode 100644 plan/CLAUDE.md create mode 100644 plan/PRD.md diff --git a/plan/CLAUDE.md b/plan/CLAUDE.md new file mode 100644 index 0000000..5e676dd --- /dev/null +++ b/plan/CLAUDE.md @@ -0,0 +1,279 @@ +# Argus — Claude Context File + +Read this file before working on any ticket. It contains the full architectural context, conventions, and key file locations for the Argus project. + +## What Is Argus + +A 3D global event intelligence platform. World events are scraped daily from multiple sources, stored in PostgreSQL with vector embeddings, and visualized on an interactive globe. An AI agent (Graph-RAG pipeline) lets users query events with persona-aware analysis. 
+ +## Tech Stack + +| Component | Technology | Notes | +|-----------|-----------|-------| +| Frontend | React 19 + TypeScript 5 + Vite 6 | SPA, no SSR | +| Globe | react-globe.gl (three.js wrapper) | Heavy bundle — lazy-load | +| Styling | Tailwind CSS 3 + CSS custom properties | Design tokens in `index.css` | +| Backend | FastAPI + Uvicorn (Python 3.11+) | Async throughout | +| Database | PostgreSQL 15+ with pgvector + pgcrypto | Extensions required | +| AI Model | Google Gemini 2.5-flash | Structured JSON output | +| Embeddings | OpenAI text-embedding-3-small (1536 dims) | Migration target: local sentence-transformers | +| Voice | ElevenLabs Scribe v1 | Optional, speech-to-text | +| Media | Cloudinary (primary), S3 fallback, placeholder SVGs | | + +## Project Structure + +``` +hackcanada/ +├── plan/ # PRD and this context file +├── frontend/ +│ ├── src/ +│ │ ├── main.tsx # Entry: nested context providers +│ │ ├── App.tsx # Bootstrap: fetch points/arcs, render overlays +│ │ ├── index.css # Design tokens, fonts +│ │ ├── api/client.ts # Typed fetch wrapper for all backend endpoints +│ │ ├── components/ +│ │ │ ├── Globe/GlobeView.tsx # 3D globe — points, arcs, tooltips, clusters +│ │ │ ├── Filters/FilterBar.tsx # Event-type filter chips +│ │ │ ├── Timeline/TimelineSlider.tsx # Date scrubber + play/pause +│ │ │ ├── Modal/EventModal.tsx # Right panel — event detail + AI analysis +│ │ │ ├── Modal/RealTimeAnalysisSection.tsx # Gemini + Google Search grounding +│ │ │ └── Agent/ # Left panel — AI query interface +│ │ │ ├── AgentPanel.tsx # Query input, voice, submit +│ │ │ ├── AgentAnswerView.tsx # Citation parsing, financial impact +│ │ │ ├── AgentNavigationOverlay.tsx # Globe camera animation +│ │ │ ├── PersonaSelector.tsx # Role + industry selection +│ │ │ └── FinancialImpactSection.tsx +│ │ ├── context/ +│ │ │ ├── AppContext.tsx # Events, arcs, filters, timeline, globe focus +│ │ │ ├── AgentContext.tsx # Agent state, highlights, navigation plan +│ │ │ └── 
UserPersonaContext.tsx # Role + industry (localStorage-persisted) +│ │ ├── types/ +│ │ │ ├── events.ts # Event, ContentPoint, ContentArc, EventDetail +│ │ │ └── agent.ts # AgentResponse, NavigationPlan, FinancialImpact +│ │ └── utils/mediaConfig.ts # Cloudinary/S3/placeholder URL resolver +│ └── package.json +│ +└── backend/ + ├── requirements.txt + ├── run_scrape.py # CLI: Polymarket + Kalshi scrape + ├── run_gdelt_scrape.py # CLI: GDELT scrape (--days, --limit flags) + ├── migrations/ + │ └── 001_init_schema.sql # Full PostgreSQL schema + └── app/ + ├── main.py # FastAPI app, CORS, router registration + ├── config.py # Env var loading, agent defaults + ├── models/ + │ ├── enums.py # EventType, RelationshipType (StrEnum) + │ ├── schemas.py # Core Pydantic models + │ └── agent_schemas.py # Agent-specific Pydantic models + ├── routers/ + │ ├── content.py # GET /content/points, /arcs, POST /{id}/confidence-score, /{id}/realtime-analysis + │ ├── agent.py # POST /agent/query + │ ├── ingestion.py # POST /ingestion/acled + │ ├── embeddings.py # POST /embeddings/backfill/content + │ └── market_signals.py # GET /market-signals (live fetch) + ├── services/ + │ ├── agent_service.py # Graph-RAG pipeline orchestration + │ ├── agent_tools.py # DB query tools (search, relate, detail, impact) + │ ├── gemini_client.py # Gemini API: synthesis, confidence, realtime analysis + │ ├── scraping_service.py # Polymarket + Kalshi + GDELT orchestrator + │ └── content_repository.py # (duplicate — also in ingestion/) + ├── repositories/ + │ └── content_repository.py # Market signal row persistence + ├── embeddings/ + │ ├── embedding_repository.py # Fetch/update embedding vectors + │ ├── embedding_backfill_service.py # Backfill missing embeddings + │ ├── openai_embedding_client.py # OpenAI API wrapper + │ └── run_embedding_backfill.py # CLI entry + ├── ingestion/ + │ ├── ingestion_service.py # ACLED pipeline: fetch -> normalize -> dedupe -> insert + │ ├── content_repository.py # 
ensure_sources, insert_content (DUPLICATE) + │ ├── db.py # asyncpg connection pool (only used by ingestion) + │ ├── dedupe_service.py # Duplicate detection + │ └── acled/ + │ ├── acled_client.py # ACLED API client + │ └── acled_normalizer.py + └── scrapers/ + ├── gdelt.py # GDELT (BigQuery primary, DOC API fallback) + ├── polymarket.py # Polymarket API + ├── kalshi.py # Kalshi API (async, rate-limited) + ├── row_format.py # Shared row normalization -> content_table shape + ├── eonet.py # UNUSED — delete + ├── eonet_db.py # UNUSED — delete + ├── social_scraper.py # UNUSED — delete + ├── reddit.py # UNUSED — delete + ├── reddit_classifier.py # UNUSED — delete + ├── reddit_db.py # UNUSED — delete + ├── Reddit Scraper/ # UNUSED — delete + ├── natural-disasters/ # UNUSED — delete + └── ryan_scrapers/ # UNUSED — delete +``` + +## Database Schema + +PostgreSQL with pgvector and pgcrypto extensions. + +### Core Tables + +**`content_table`** (primary data store — rename target: `articles`) +- `id` UUID PK (gen_random_uuid) +- `title`, `body`, `url` (UNIQUE) +- `latitude`, `longitude` (nullable floats) +- `image_url` (Cloudinary public_id), `s3_url` +- `embedding` vector(1536) — OpenAI text-embedding-3-small +- `sentiment_score` float, `market_signal` text +- `published_at` timestamptz, `event_type` text, `raw_metadata_json` JSONB +- `source_id` FK -> sources, `engagement_id` FK -> engagement +- `created_at` timestamptz + +**`engagement`** — Reddit, Polymarket, Twitter metrics per content item + +**`sources`** — name, type, base_url, trust_score + +**`entities`** — extracted entities (person, org, location, etc.) 
+ +**`content_entities`** — join table (content_item_id, entity_id, relevance_score) + +**`events`** — clustered event groups with cluster_embedding, canada_impact_summary, confidence_score + +**`event_content`** — join between events and content_table + +**`event_relationships`** — event_a_id, event_b_id, relationship_type, score, reason_codes + +### Key Indexes +- HNSW cosine index on `content_table.embedding` +- UNIQUE on `content_table.url` + +## API Endpoints + +| Method | Path | Purpose | AI Cost | +|--------|------|---------|---------| +| GET | `/content/points` | All content with lat/lng (last 31 days) | None | +| GET | `/content/arcs?threshold=0.7` | Similarity arcs via pgvector cosine | None | +| GET | `/content/{id}` | Single content item detail | None | +| POST | `/content/{id}/confidence-score` | Gemini credibility scoring (0.31-1.0) | 1 Gemini call | +| POST | `/content/{id}/realtime-analysis` | Gemini + Google Search grounding | 1 Gemini call | +| POST | `/agent/query` | Graph-RAG agent pipeline | 1 Gemini call + 1 OpenAI embed | +| GET | `/market-signals` | Live Polymarket + Kalshi fetch | None | +| POST | `/ingestion/acled` | Trigger ACLED ingestion pipeline | None | +| POST | `/embeddings/backfill/content` | Backfill missing embeddings | N OpenAI calls | +| GET | `/health` | Health check | None | + +## Agent Pipeline (Graph-RAG) + +1. **Classify query** — pattern match keywords -> query_type (event_explanation, impact_analysis, connection_discovery, entity_relevance) +2. **Seed retrieval** — keyword ILIKE search + pgvector cosine similarity +3. **Graph expansion** — 2-hop: for each seed, find 6 nearest neighbors via pgvector +4. **Context assembly** — full article bodies + financial impact heuristics +5. **Gemini synthesis** — structured JSON output with citations `[cite:UUID]`, navigation plan, financial impact +6. 
**Post-processing** — filter to globe-navigable events, strip invalid citations + +Output schema includes: answer, confidence, caution, query_type, navigation_plan, relevant_event_ids, highlight_relationships, financial_impact, reasoning_steps, cited_event_map. + +## Event Types (StrEnum) + +``` +geopolitics, trade_supply_chain, energy_commodities, +financial_markets, climate_disasters, policy_regulation +``` + +Each maps to a color in the frontend `EVENT_TYPE_COLORS` constant. + +## Data Sources + +| Source | What It Provides | Scraper File | +|--------|-----------------|--------------| +| GDELT | Global events from news (BigQuery or DOC API) | `scrapers/gdelt.py` | +| ACLED | Armed conflict events | `ingestion/acled/acled_client.py` | +| Polymarket | Prediction market events + probabilities | `scrapers/polymarket.py` | +| Kalshi | Prediction market events + volumes | `scrapers/kalshi.py` | + +## Known Issues & Tech Debt + +1. **Duplicate `content_repository.py`** — exists in both `repositories/` and `ingestion/`. Must consolidate. +2. **No shared DB pool** — `ingestion/db.py` has its own pool; other services use inline `asyncpg.connect()`. Need a single shared pool. +3. **Dead scraper code** — `eonet.py`, `reddit*.py`, `social_scraper.py`, `Reddit Scraper/`, `natural-disasters/`, `ryan_scrapers/` are all unused. +4. **No scheduled scraping** — all ingestion is manual CLI or API trigger. +5. **Expensive embeddings** — OpenAI API called per-row. Should switch to local model. +6. **No caching** — every confidence score and realtime analysis call hits Gemini. Need Redis. +7. **CORS wildcard** — `*` origin allowed in production. Must lock down. +8. **No tests** — zero test files in the repo. +9. **Print debugging** — `print()` used instead of structured logging. +10. **No migration tool** — raw SQL files, no Alembic. 
+ +## Conventions + +### Backend +- **Async everywhere** — use `async def` for all route handlers and service methods +- **asyncpg** for DB access (not SQLAlchemy ORM) +- **Pydantic v2** for request/response models +- **Raw SQL** for queries (no ORM) — parameterize all user inputs with `$1, $2` syntax +- **Environment variables** via `python-dotenv` and `os.getenv()` +- **Scraper output** normalized via `row_format.make_content_row()` before DB insert + +### Frontend +- **React 19** with function components and hooks only +- **Context API** for state (no Redux) — three providers: App, Agent, UserPersona +- **Tailwind CSS** for styling — design tokens as CSS variables in `index.css` +- **No routing library** yet — single-page, overlay-based navigation +- **Client-side filtering** — all events loaded on mount, visibility controlled by pointRadius/pointColor + +### Git +- Branch from `main` +- Conventional-ish commit messages (e.g., `feat:`, `fix:`, `chore:`) + +## Environment Variables + +### Backend (.env) +``` +DATABASE_URL=postgresql+asyncpg://user:pass@host:5432/dbname # Required +GEMINI_API_KEY=... # Required for agent +GEMINI_MODEL=gemini-2.5-flash # Optional, default shown +OPENAI_API_KEY=... # Required for embeddings (until local model migration) +ACLED_API_KEY=... # Required for ACLED ingestion +CLOUDINARY_CLOUD_NAME=... # Optional +CLOUDINARY_API_KEY=... # Optional +CLOUDINARY_API_SECRET=... # Optional +AWS_ACCESS_KEY_ID=... # Optional (S3 fallback) +AWS_SECRET_ACCESS_KEY=... # Optional +S3_BUCKET=... # Optional +AWS_REGION=... # Optional +ELEVENLABS_API_KEY=... # Optional +``` + +### Frontend (.env) +``` +VITE_API_URL=/api # or http://127.0.0.1:8000 +VITE_CLOUDINARY_CLOUD_NAME=... # Optional +VITE_ELEVENLABS_API_KEY=... 
# Optional +``` + +## Running Locally + +```bash +# Backend +cd backend && python -m venv .venv && source .venv/bin/activate +pip install -r requirements.txt +uvicorn app.main:app --reload --port 8000 + +# Frontend +cd frontend && npm install && npm run dev + +# Manual scraping +python run_scrape.py # Polymarket + Kalshi +python run_gdelt_scrape.py # GDELT (--days 14 --limit 500) +curl -X POST localhost:8000/ingestion/acled +curl -X POST localhost:8000/embeddings/backfill/content +``` + +## Working on Tickets + +When picking up a ticket from the PRD (`plan/PRD.md`): + +1. **Read the relevant source files first** — don't modify code you haven't read +2. **Check for duplicates** — the codebase has redundant implementations (see Known Issues) +3. **Keep async** — all new backend code should be async +4. **Parameterize SQL** — never string-interpolate user input into queries +5. **No new print()** — use `logging.getLogger(__name__)` +6. **Test what you build** — add tests alongside new code (once pytest is set up) +7. **Budget-conscious** — if a feature involves AI API calls, always consider caching and batching first diff --git a/plan/PRD.md b/plan/PRD.md new file mode 100644 index 0000000..1276be5 --- /dev/null +++ b/plan/PRD.md @@ -0,0 +1,205 @@ +# Argus — Product Requirements Document + +## Vision + +Argus is a 3D global event intelligence platform that aggregates world events daily, stores them in a semantically searchable database, and visualizes them on an interactive globe. Users can explore events spatially and temporally, query an AI agent for analysis, and discover connections between events through graph-based relationships. + +The platform is source-agnostic — while the hackathon prototype focused on Canada-impact framing, the production system should support any analytical lens or persona. 
+ +## Current State (MVP) + +Working prototype with: +- 3D globe rendering events as points with similarity arcs +- AI agent (Graph-RAG) with persona-aware Gemini synthesis +- Scrapers for GDELT, ACLED, Polymarket, Kalshi +- PostgreSQL + pgvector for semantic search +- Client-side filtering and timeline scrubbing + +Key gaps: no deployment config, no scheduled scraping, no caching, no tests, high AI token costs, dead code from hackathon iteration. + +## Team & Constraints + +- 4 people, part-time +- Low budget — minimize per-request AI spend +- Target: production-deployable within ~7 weeks (4 phases) + +--- + +## Phase 1: Foundation (Week 1–2) + +**Goal:** Make the app deployable and the codebase clean enough to work on confidently. + +### 1.1 Containerization + +| # | Ticket | Priority | +|---|--------|----------| +| 1 | Create backend Dockerfile (FastAPI + uvicorn, multi-stage build) | P0 | +| 2 | Create frontend Dockerfile (Vite build -> nginx static serve) | P0 | +| 3 | Create `docker-compose.yml` with services: backend, frontend, postgres+pgvector, redis | P0 | +| 4 | Add `.dockerignore` files (exclude `.env`, `node_modules`, `.venv`, `__pycache__`) | P0 | + +### 1.2 CI/CD + +| # | Ticket | Priority | +|---|--------|----------| +| 5 | Set up GitHub Actions CI: ruff lint/format, TypeScript typecheck, docker build | P0 | +| 6 | Set up GitHub Actions CD: build images, push to registry, deploy to hosting | P1 | +| 7 | Choose hosting provider (Railway, Fly.io, or small VPS) and document deploy process | P1 | + +### 1.3 Dead Code Removal + +| # | Ticket | Priority | +|---|--------|----------| +| 8 | Delete unused scrapers: `eonet.py`, `eonet_db.py`, `social_scraper.py`, `reddit.py`, `reddit_classifier.py`, `reddit_db.py`, `Reddit Scraper/`, `natural-disasters/`, `ryan_scrapers/` | P0 | +| 9 | Remove duplicate `content_repository.py` (exists in both `repositories/` and `ingestion/`) — consolidate into one | P0 | +| 10 | Audit and remove any other dead imports, unused 
functions, or commented-out code | P1 | + +### 1.4 Security Baseline + +| # | Ticket | Priority | +|---|--------|----------| +| 11 | Lock down CORS origins — remove `*` wildcard, use env-configured allowed origins | P0 | +| 12 | Add rate limiting middleware on `/agent/query` and `/content/{id}/realtime-analysis` (these burn AI tokens) | P0 | +| 13 | Validate all secrets are loaded from env vars; fail fast on startup if required vars missing | P0 | + +--- + +## Phase 2: Automation & Cost Optimization (Week 2–3) + +**Goal:** Automate daily data ingestion and drastically cut AI token spend. + +### 2.1 Scheduled Scraping + +| # | Ticket | Priority | +|---|--------|----------| +| 14 | Create unified scrape entrypoint (`run_daily_pipeline.py`) that runs all scrapers + embedding backfill in sequence | P0 | +| 15 | Add cron scheduler (APScheduler in a separate container, or cron in docker-compose) | P0 | +| 16 | Define scraping schedule: GDELT 1x/day, ACLED 1x/day, Polymarket+Kalshi 2x/day | P0 | +| 17 | Add `scrape_runs` logging table (source, status, rows_inserted, errors, duration_ms, started_at) | P1 | +| 18 | Add idempotency guards — URL-based dedup check before insert, not after | P0 | +| 19 | Add failure alerting (Discord webhook or email on scrape errors) | P2 | + +### 2.2 Embedding Cost Reduction + +| # | Ticket | Priority | +|---|--------|----------| +| 20 | **Switch embeddings to local model** — replace OpenAI `text-embedding-3-small` with `sentence-transformers` (e.g. `all-MiniLM-L6-v2`, 384 dims). Run locally in backend container. Update vector column dimension. 
| P0 | +| 21 | Batch embedding generation — process 100+ items per batch instead of one-at-a-time | P0 | +| 22 | Deduplicate content before generating embeddings (currently embeddings are generated before dedup) | P1 | + +### 2.3 AI Response Caching (Redis) + +| # | Ticket | Priority | +|---|--------|----------| +| 23 | Add Redis client utility module with connection pooling | P0 | +| 24 | Cache `/content/points` response — invalidate on new scrape run completion | P0 | +| 25 | Cache `/content/arcs` response per threshold value (TTL = until next scrape) | P1 | +| 26 | Cache Gemini confidence scores per content_id (TTL = 24h) | P0 | +| 27 | Cache Gemini realtime analysis per `(content_id, user_role)` (TTL = 6h) | P0 | +| 28 | Cache agent query results keyed on `(normalized_query_hash, persona)` (TTL = 1h) | P1 | + +### 2.4 Additional Cost Controls + +| # | Ticket | Priority | +|---|--------|----------| +| 29 | Add token usage tracking — log Gemini and OpenAI (if still used) token counts per call to a `token_usage` table | P1 | +| 30 | Pre-compute confidence scores during scrape pipeline instead of on-demand per user click | P2 | +| 31 | Replace per-request Google Search grounding with daily cached news summaries scraped during pipeline | P2 | + +--- + +## Phase 3: Schema & Code Quality (Week 3–5) + +**Goal:** Clean up the data model, modularize backend code, add test coverage. 
+ +### 3.1 Database Schema Improvements + +| # | Ticket | Priority | +|---|--------|----------| +| 32 | Set up Alembic for migration management (replace raw SQL files) | P0 | +| 33 | Rename `content_table` -> `articles` | P1 | +| 34 | Split AI-generated fields into `article_analysis` table (embedding, sentiment_score, market_signal, confidence_score) | P1 | +| 35 | Add `scrape_source` enum column to replace generic FK to `sources` table | P2 | +| 36 | Add composite index on `(event_type, published_at)` for filtered timeline queries | P1 | +| 37 | Add `last_scraped_at` column to sources for freshness tracking | P2 | +| 38 | Clean up unused columns and tables from hackathon iteration | P1 | + +### 3.2 Backend Modularization + +| # | Ticket | Priority | +|---|--------|----------| +| 39 | Create shared `db.py` module — single asyncpg pool used by all services (currently duplicated in `ingestion/db.py` and inline connects) | P0 | +| 40 | Extract `BaseScraper` ABC interface: `async fetch() -> list[NormalizedRow]`. Make GDELT, ACLED, Polymarket, Kalshi implement it. 
| P1 | +| 41 | Consolidate all Pydantic models into a single `schemas/` package | P1 | +| 42 | Replace all `print()` statements with structured `logging` (use correlation IDs per request) | P1 | +| 43 | Add input validation on all API endpoints (query length limits, coordinate bounds, UUID format) | P1 | + +### 3.3 Testing + +| # | Ticket | Priority | +|---|--------|----------| +| 44 | Set up pytest with async fixtures (asyncpg test DB, httpx AsyncClient) | P0 | +| 45 | Write tests for scraper row normalization (`row_format.py`, ACLED normalizer) | P0 | +| 46 | Write tests for deduplication logic | P1 | +| 47 | Write tests for agent query classification (`_classify_query`) | P1 | +| 48 | Write API integration tests for `/content/points`, `/agent/query` | P1 | +| 49 | Add frontend smoke tests (Vitest + React Testing Library) for Globe render, Agent query flow | P2 | +| 50 | Add pre-commit hooks: ruff lint+format, TypeScript typecheck | P0 | + +--- + +## Phase 4: Frontend Polish & Observability (Week 5–7) + +**Goal:** Make the UI production-grade and add operational visibility. 
+ +### 4.1 Frontend UX + +| # | Ticket | Priority | +|---|--------|----------| +| 51 | Add loading skeleton/spinner on initial data fetch | P0 | +| 52 | Lazy-load Globe component (three.js is ~500KB) with React.lazy + Suspense | P1 | +| 53 | Add error boundaries around Globe, Agent, and Modal components | P0 | +| 54 | Memoize expensive computations (arc filtering, point color mapping) with useMemo/useCallback | P1 | +| 55 | Add responsive layout — mobile/tablet support or graceful "desktop-only" message | P1 | +| 56 | Add URL-based routing (React Router) for shareable/bookmarkable globe state | P2 | +| 57 | Add "no results" empty states for agent queries and empty filtered views | P1 | +| 58 | Move inline styles and magic numbers to shared constants / design tokens | P2 | + +### 4.2 Observability + +| # | Ticket | Priority | +|---|--------|----------| +| 59 | Add structured logging with request correlation IDs (middleware) | P1 | +| 60 | Add simple admin dashboard page: scrape history, DB row counts, token spend summary | P2 | +| 61 | Add uptime monitoring on `/health` endpoint (UptimeRobot free tier or similar) | P1 | + +### 4.3 Security Hardening + +| # | Ticket | Priority | +|---|--------|----------| +| 62 | Audit all raw SQL for injection risks — parameterize any string-interpolated queries | P0 | +| 63 | Add API key or lightweight session auth for agent/analysis endpoints | P1 | + +--- + +## Success Criteria + +| Metric | Target | +|--------|--------| +| Daily scraping runs automatically | Yes, with logged success/failure | +| Embedding cost per scrape cycle | < $0.01 (local model) | +| Gemini API calls per unique user action | Max 1 (cached thereafter) | +| Time to deploy from commit | < 10 minutes | +| Backend test coverage on critical paths | > 70% | +| Frontend initial load time | < 3 seconds (gzipped, lazy-loaded) | +| Uptime | > 99% (monitored) | + +--- + +## Non-Goals (for now) + +- Mobile app (web-only is fine) +- User accounts / auth (public dashboard 
for now, API key auth only) +- Real-time websocket streaming (polling/refresh is sufficient) +- Multi-language support +- Custom event submission by users From 62a30fc7bdad1408d5845ccf9c6d11a32d65bd03 Mon Sep 17 00:00:00 2001 From: Lucas Jin Date: Mon, 9 Mar 2026 22:56:29 -0400 Subject: [PATCH 2/5] chore: update planning docs --- plan/CLAUDE.md | 55 ++++++++++--------- plan/PRD.md | 146 +++++++++++++++++++++++++++++-------------------- 2 files changed, 118 insertions(+), 83 deletions(-) diff --git a/plan/CLAUDE.md b/plan/CLAUDE.md index 5e676dd..fba77a9 100644 --- a/plan/CLAUDE.md +++ b/plan/CLAUDE.md @@ -2,6 +2,8 @@ Read this file before working on any ticket. It contains the full architectural context, conventions, and key file locations for the Argus project. +> **Note:** This document reflects the team's best understanding at time of writing. If the user gives instructions that conflict with what's written here, **follow the user's instructions** — they take priority. If parts of this context have become outdated or irrelevant due to changes in the codebase, use your judgement and note the discrepancy rather than blindly following stale guidance. + ## What Is Argus A 3D global event intelligence platform. World events are scraped daily from multiple sources, stored in PostgreSQL with vector embeddings, and visualized on an interactive globe. An AI agent (Graph-RAG pipeline) lets users query events with persona-aware analysis. @@ -18,7 +20,7 @@ A 3D global event intelligence platform. 
World events are scraped daily from mul
 | AI Model | Google Gemini 2.5-flash | Structured JSON output |
 | Embeddings | OpenAI text-embedding-3-small (1536 dims) | Migration target: local sentence-transformers |
 | Voice | ElevenLabs Scribe v1 | Optional, speech-to-text |
-| Media | Cloudinary (primary), S3 fallback, placeholder SVGs | |
+| Media | Placeholder SVGs only | Cloudinary and S3 removed — no longer needed |
 
 ## Project Structure
 
@@ -50,13 +52,13 @@ hackcanada/
 │   │   ├── types/
 │   │   │   ├── events.ts        # Event, ContentPoint, ContentArc, EventDetail
 │   │   │   └── agent.ts         # AgentResponse, NavigationPlan, FinancialImpact
-│   │   └── utils/mediaConfig.ts # Cloudinary/S3/placeholder URL resolver
+│   │   └── utils/mediaConfig.ts # DEPRECATED — Cloudinary/S3 removed, delete this file (ticket #11)
 │   └── package.json
 │
 └── backend/
     ├── requirements.txt
-    ├── run_scrape.py            # CLI: Polymarket + Kalshi scrape
-    ├── run_gdelt_scrape.py      # CLI: GDELT scrape (--days, --limit flags)
+    ├── run_scrape.py            # LEGACY CLI — will be replaced by run_daily_pipeline.py
+    ├── run_gdelt_scrape.py      # LEGACY CLI — will be replaced by run_daily_pipeline.py
     ├── migrations/
     │   └── 001_init_schema.sql  # Full PostgreSQL schema
     └── app/
@@ -94,16 +96,19 @@ hackcanada/
         │       ├── acled_client.py      # ACLED API client
         │       └── acled_normalizer.py
         └── scrapers/
-            ├── gdelt.py                 # GDELT (BigQuery primary, DOC API fallback)
-            ├── polymarket.py            # Polymarket API
-            ├── kalshi.py                # Kalshi API (async, rate-limited)
-            ├── row_format.py            # Shared row normalization -> content_table shape
+            ├── row_format.py            # Shared row normalization -> content_table shape (KEEP — used by new scrapers)
+            ├── _reference/              # Hackathon scrapers kept as design inspiration only (NOT used in production)
+            │   ├── gdelt.py             # Reference: dual-path fetch, CAMEO mapping, Goldstein normalization
+            │   ├── kalshi.py            # Reference: async rate limiter, cursor pagination, asyncio.gather
+            │   ├── polymarket.py        # Reference: simple REST API pattern, tag filtering
+            │   └── acled/               # 
Reference: client/normalizer separation, NormalizedRecord model ├── eonet.py # UNUSED — delete ├── eonet_db.py # UNUSED — delete ├── social_scraper.py # UNUSED — delete ├── reddit.py # UNUSED — delete ├── reddit_classifier.py # UNUSED — delete ├── reddit_db.py # UNUSED — delete + ├── reddit_schema.sql # UNUSED — delete ├── Reddit Scraper/ # UNUSED — delete ├── natural-disasters/ # UNUSED — delete └── ryan_scrapers/ # UNUSED — delete @@ -119,7 +124,7 @@ PostgreSQL with pgvector and pgcrypto extensions. - `id` UUID PK (gen_random_uuid) - `title`, `body`, `url` (UNIQUE) - `latitude`, `longitude` (nullable floats) -- `image_url` (Cloudinary public_id), `s3_url` +- `image_url`, `s3_url` — DEPRECATED, to be dropped (Cloudinary/S3 no longer used) - `embedding` vector(1536) — OpenAI text-embedding-3-small - `sentiment_score` float, `market_signal` text - `published_at` timestamptz, `event_type` text, `raw_metadata_json` JSONB @@ -181,18 +186,22 @@ Each maps to a color in the frontend `EVENT_TYPE_COLORS` constant. ## Data Sources -| Source | What It Provides | Scraper File | -|--------|-----------------|--------------| -| GDELT | Global events from news (BigQuery or DOC API) | `scrapers/gdelt.py` | -| ACLED | Armed conflict events | `ingestion/acled/acled_client.py` | -| Polymarket | Prediction market events + probabilities | `scrapers/polymarket.py` | -| Kalshi | Prediction market events + volumes | `scrapers/kalshi.py` | +The hackathon prototype used the sources below. **None of the existing scraper implementations will be used directly** — new production scrapers will be written implementing a `BaseScraper` ABC. The old code is kept in `scrapers/_reference/` for design inspiration. 
+ +| Source | What It Provides | Reference File | Quality | +|--------|-----------------|----------------|---------| +| GDELT | Global events from news (BigQuery or DOC API) | `scrapers/_reference/gdelt.py` | Excellent — study for complex normalization | +| ACLED | Armed conflict events | `scrapers/_reference/acled/` | Good — study for client/normalizer separation | +| Polymarket | Prediction market events + probabilities | `scrapers/_reference/polymarket.py` | Decent — study for simple REST pattern | +| Kalshi | Prediction market events + volumes | `scrapers/_reference/kalshi.py` | Excellent — study for async rate limiting | + +Which data sources to keep, replace, or add is a product decision for Phase 2. The scraper architecture (BaseScraper ABC, row_format contract, dedup-before-embed) is what matters. ## Known Issues & Tech Debt 1. **Duplicate `content_repository.py`** — exists in both `repositories/` and `ingestion/`. Must consolidate. 2. **No shared DB pool** — `ingestion/db.py` has its own pool; other services use inline `asyncpg.connect()`. Need a single shared pool. -3. **Dead scraper code** — `eonet.py`, `reddit*.py`, `social_scraper.py`, `Reddit Scraper/`, `natural-disasters/`, `ryan_scrapers/` are all unused. +3. **Dead scraper code** — `eonet.py`, `reddit*.py`, `social_scraper.py`, `Reddit Scraper/`, `natural-disasters/`, `ryan_scrapers/` are all unused junk. The "real" scrapers (`gdelt.py`, `kalshi.py`, `polymarket.py`, `acled/`) are hackathon-quality reference code only — new production scrapers need to be written. 4. **No scheduled scraping** — all ingestion is manual CLI or API trigger. 5. **Expensive embeddings** — OpenAI API called per-row. Should switch to local model. 6. **No caching** — every confidence score and realtime analysis call hits Gemini. Need Redis. @@ -200,6 +209,7 @@ Each maps to a color in the frontend `EVENT_TYPE_COLORS` constant. 8. **No tests** — zero test files in the repo. 9. 
**Print debugging** — `print()` used instead of structured logging.
 10. **No migration tool** — raw SQL files, no Alembic.
+11. **Dead Cloudinary/S3 code** — Cloudinary and S3 are no longer used. `utils/mediaConfig.ts`, `@cloudinary/react`, `@cloudinary/url-gen`, `cloudinary`, `boto3` deps, `image_url`/`s3_url` DB columns, and all related env vars should be removed (ticket #11).
 
 ## Conventions
 
@@ -210,6 +220,7 @@ Each maps to a color in the frontend `EVENT_TYPE_COLORS` constant.
 - **Raw SQL** for queries (no ORM) — parameterize all user inputs with `$1, $2` syntax
 - **Environment variables** via `python-dotenv` and `os.getenv()`
 - **Scraper output** normalized via `row_format.make_content_row()` before DB insert
+- **New scrapers** must implement `BaseScraper` ABC — see `scrapers/_reference/` for patterns, especially `kalshi.py` (rate limiting) and `gdelt.py` (normalization)
 
 ### Frontend
 - **React 19** with function components and hooks only
@@ -231,21 +242,15 @@ GEMINI_API_KEY=... # Required for agent
 GEMINI_MODEL=gemini-2.5-flash # Optional, default shown
 OPENAI_API_KEY=... # Required for embeddings (until local model migration)
 ACLED_API_KEY=... # Required for ACLED ingestion
-CLOUDINARY_CLOUD_NAME=... # Optional
-CLOUDINARY_API_KEY=... # Optional
-CLOUDINARY_API_SECRET=... # Optional
-AWS_ACCESS_KEY_ID=... # Optional (S3 fallback)
-AWS_SECRET_ACCESS_KEY=... # Optional
-S3_BUCKET=... # Optional
-AWS_REGION=... # Optional
 ELEVENLABS_API_KEY=... # Optional
+# NOTE: CLOUDINARY_* and AWS_*/S3_* vars are no longer needed — remove if present
 ```
 
 ### Frontend (.env)
 ```
 VITE_API_URL=/api # or http://127.0.0.1:8000
-VITE_CLOUDINARY_CLOUD_NAME=... # Optional
 VITE_ELEVENLABS_API_KEY=... 
# Optional +# NOTE: VITE_CLOUDINARY_CLOUD_NAME is no longer needed — remove if present ``` ## Running Locally @@ -259,7 +264,7 @@ uvicorn app.main:app --reload --port 8000 # Frontend cd frontend && npm install && npm run dev -# Manual scraping +# Manual scraping (LEGACY — will be replaced by run_daily_pipeline.py) python run_scrape.py # Polymarket + Kalshi python run_gdelt_scrape.py # GDELT (--days 14 --limit 500) curl -X POST localhost:8000/ingestion/acled diff --git a/plan/PRD.md b/plan/PRD.md index 1276be5..966ee7c 100644 --- a/plan/PRD.md +++ b/plan/PRD.md @@ -11,11 +11,11 @@ The platform is source-agnostic — while the hackathon prototype focused on Can Working prototype with: - 3D globe rendering events as points with similarity arcs - AI agent (Graph-RAG) with persona-aware Gemini synthesis -- Scrapers for GDELT, ACLED, Polymarket, Kalshi +- Hackathon-era scrapers for GDELT, ACLED, Polymarket, Kalshi (will not be used directly — new scrapers will be written, but these serve as reference) - PostgreSQL + pgvector for semantic search - Client-side filtering and timeline scrubbing -Key gaps: no deployment config, no scheduled scraping, no caching, no tests, high AI token costs, dead code from hackathon iteration. +Key gaps: no deployment config, no scheduled scraping, no caching, no tests, high AI token costs, dead code from hackathon iteration, legacy Cloudinary/S3 media code that is no longer needed. Existing scrapers need to be replaced with new, production-grade implementations — the current ones are MVP-quality reference code only. 
## Team & Constraints @@ -50,17 +50,19 @@ Key gaps: no deployment config, no scheduled scraping, no caching, no tests, hig | # | Ticket | Priority | |---|--------|----------| -| 8 | Delete unused scrapers: `eonet.py`, `eonet_db.py`, `social_scraper.py`, `reddit.py`, `reddit_classifier.py`, `reddit_db.py`, `Reddit Scraper/`, `natural-disasters/`, `ryan_scrapers/` | P0 | -| 9 | Remove duplicate `content_repository.py` (exists in both `repositories/` and `ingestion/`) — consolidate into one | P0 | -| 10 | Audit and remove any other dead imports, unused functions, or commented-out code | P1 | +| 8 | Delete junk scrapers: `eonet.py`, `eonet_db.py`, `social_scraper.py`, `reddit.py`, `reddit_classifier.py`, `reddit_db.py`, `reddit_schema.sql`, `Reddit Scraper/`, `natural-disasters/`, `ryan_scrapers/` | P0 | +| 9 | Move remaining hackathon scrapers (`gdelt.py`, `kalshi.py`, `polymarket.py`, `acled/`) into a `scrapers/_reference/` directory — these won't be used directly but are kept as design inspiration (see "Scraper Reference Guide" below) | P0 | +| 10 | Remove duplicate `content_repository.py` (exists in both `repositories/` and `ingestion/`) — consolidate into one | P0 | +| 11 | Remove all Cloudinary and S3 media code — no longer needed. 
Delete `utils/mediaConfig.ts`, remove `@cloudinary/react` and `@cloudinary/url-gen` deps from frontend, remove `cloudinary` and `boto3` deps from backend, drop `image_url` and `s3_url` columns from `content_table`, remove all `CLOUDINARY_*` and `AWS_*`/`S3_*` env vars | P0 | +| 12 | Audit and remove any other dead imports, unused functions, or commented-out code | P1 | ### 1.4 Security Baseline | # | Ticket | Priority | |---|--------|----------| -| 11 | Lock down CORS origins — remove `*` wildcard, use env-configured allowed origins | P0 | -| 12 | Add rate limiting middleware on `/agent/query` and `/content/{id}/realtime-analysis` (these burn AI tokens) | P0 | -| 13 | Validate all secrets are loaded from env vars; fail fast on startup if required vars missing | P0 | +| 13 | Lock down CORS origins — remove `*` wildcard, use env-configured allowed origins | P0 | +| 14 | Add rate limiting middleware on `/agent/query` and `/content/{id}/realtime-analysis` (these burn AI tokens) | P0 | +| 15 | Validate all secrets are loaded from env vars; fail fast on startup if required vars missing | P0 | --- @@ -72,39 +74,41 @@ Key gaps: no deployment config, no scheduled scraping, no caching, no tests, hig | # | Ticket | Priority | |---|--------|----------| -| 14 | Create unified scrape entrypoint (`run_daily_pipeline.py`) that runs all scrapers + embedding backfill in sequence | P0 | -| 15 | Add cron scheduler (APScheduler in a separate container, or cron in docker-compose) | P0 | -| 16 | Define scraping schedule: GDELT 1x/day, ACLED 1x/day, Polymarket+Kalshi 2x/day | P0 | -| 17 | Add `scrape_runs` logging table (source, status, rows_inserted, errors, duration_ms, started_at) | P1 | -| 18 | Add idempotency guards — URL-based dedup check before insert, not after | P0 | -| 19 | Add failure alerting (Discord webhook or email on scrape errors) | P2 | +| 16 | Design `BaseScraper` ABC interface: `async fetch() -> list[NormalizedRow]` with built-in rate limiting, error handling, and 
dedup. Use `kalshi.py` rate limiter and `ingestion_service.py` error patterns as reference. | P0 | +| 17 | Write new production scrapers implementing `BaseScraper` for each data source (determine which sources to keep/add based on product needs) | P0 | +| 18 | Create unified scrape entrypoint (`run_daily_pipeline.py`) that runs all scrapers + embedding backfill in sequence | P0 | +| 19 | Add cron scheduler (APScheduler in a separate container, or cron in docker-compose) | P0 | +| 20 | Define scraping schedule (e.g. 1x/day, 2x/day per source) | P0 | +| 21 | Add `scrape_runs` logging table (source, status, rows_inserted, errors, duration_ms, started_at) | P1 | +| 22 | Add idempotency guards — URL-based dedup check before insert, not after | P0 | +| 23 | Add failure alerting (Discord webhook or email on scrape errors) | P2 | ### 2.2 Embedding Cost Reduction | # | Ticket | Priority | |---|--------|----------| -| 20 | **Switch embeddings to local model** — replace OpenAI `text-embedding-3-small` with `sentence-transformers` (e.g. `all-MiniLM-L6-v2`, 384 dims). Run locally in backend container. Update vector column dimension. | P0 | -| 21 | Batch embedding generation — process 100+ items per batch instead of one-at-a-time | P0 | -| 22 | Deduplicate content before generating embeddings (currently embeddings are generated before dedup) | P1 | +| 24 | **Switch embeddings to local model** — replace OpenAI `text-embedding-3-small` with `sentence-transformers` (e.g. `all-MiniLM-L6-v2`, 384 dims). Run locally in backend container. Update vector column dimension. 
| P0 | +| 25 | Batch embedding generation — process 100+ items per batch instead of one-at-a-time | P0 | +| 26 | Deduplicate content before generating embeddings (currently embeddings are generated before dedup) | P1 | ### 2.3 AI Response Caching (Redis) | # | Ticket | Priority | |---|--------|----------| -| 23 | Add Redis client utility module with connection pooling | P0 | -| 24 | Cache `/content/points` response — invalidate on new scrape run completion | P0 | -| 25 | Cache `/content/arcs` response per threshold value (TTL = until next scrape) | P1 | -| 26 | Cache Gemini confidence scores per content_id (TTL = 24h) | P0 | -| 27 | Cache Gemini realtime analysis per `(content_id, user_role)` (TTL = 6h) | P0 | -| 28 | Cache agent query results keyed on `(normalized_query_hash, persona)` (TTL = 1h) | P1 | +| 27 | Add Redis client utility module with connection pooling | P0 | +| 28 | Cache `/content/points` response — invalidate on new scrape run completion | P0 | +| 29 | Cache `/content/arcs` response per threshold value (TTL = until next scrape) | P1 | +| 30 | Cache Gemini confidence scores per content_id (TTL = 24h) | P0 | +| 31 | Cache Gemini realtime analysis per `(content_id, user_role)` (TTL = 6h) | P0 | +| 32 | Cache agent query results keyed on `(normalized_query_hash, persona)` (TTL = 1h) | P1 | ### 2.4 Additional Cost Controls | # | Ticket | Priority | |---|--------|----------| -| 29 | Add token usage tracking — log Gemini and OpenAI (if still used) token counts per call to a `token_usage` table | P1 | -| 30 | Pre-compute confidence scores during scrape pipeline instead of on-demand per user click | P2 | -| 31 | Replace per-request Google Search grounding with daily cached news summaries scraped during pipeline | P2 | +| 33 | Add token usage tracking — log Gemini and OpenAI (if still used) token counts per call to a `token_usage` table | P1 | +| 34 | Pre-compute confidence scores during scrape pipeline instead of on-demand per user click | P2 | +| 35 | 
Replace per-request Google Search grounding with daily cached news summaries scraped during pipeline | P2 | --- @@ -116,35 +120,34 @@ Key gaps: no deployment config, no scheduled scraping, no caching, no tests, hig | # | Ticket | Priority | |---|--------|----------| -| 32 | Set up Alembic for migration management (replace raw SQL files) | P0 | -| 33 | Rename `content_table` -> `articles` | P1 | -| 34 | Split AI-generated fields into `article_analysis` table (embedding, sentiment_score, market_signal, confidence_score) | P1 | -| 35 | Add `scrape_source` enum column to replace generic FK to `sources` table | P2 | -| 36 | Add composite index on `(event_type, published_at)` for filtered timeline queries | P1 | -| 37 | Add `last_scraped_at` column to sources for freshness tracking | P2 | -| 38 | Clean up unused columns and tables from hackathon iteration | P1 | +| 36 | Set up Alembic for migration management (replace raw SQL files) | P0 | +| 37 | Rename `content_table` -> `articles` | P1 | +| 38 | Split AI-generated fields into `article_analysis` table (embedding, sentiment_score, market_signal, confidence_score) | P1 | +| 39 | Add `scrape_source` enum column to replace generic FK to `sources` table | P2 | +| 40 | Add composite index on `(event_type, published_at)` for filtered timeline queries | P1 | +| 41 | Add `last_scraped_at` column to sources for freshness tracking | P2 | +| 42 | Clean up unused columns and tables from hackathon iteration (including dropped `image_url`, `s3_url`) | P1 | ### 3.2 Backend Modularization | # | Ticket | Priority | |---|--------|----------| -| 39 | Create shared `db.py` module — single asyncpg pool used by all services (currently duplicated in `ingestion/db.py` and inline connects) | P0 | -| 40 | Extract `BaseScraper` ABC interface: `async fetch() -> list[NormalizedRow]`. Make GDELT, ACLED, Polymarket, Kalshi implement it. 
| P1 | -| 41 | Consolidate all Pydantic models into a single `schemas/` package | P1 | -| 42 | Replace all `print()` statements with structured `logging` (use correlation IDs per request) | P1 | -| 43 | Add input validation on all API endpoints (query length limits, coordinate bounds, UUID format) | P1 | +| 43 | Create shared `db.py` module — single asyncpg pool used by all services (currently duplicated in `ingestion/db.py` and inline connects) | P0 | +| 44 | Consolidate all Pydantic models into a single `schemas/` package | P1 | +| 45 | Replace all `print()` statements with structured `logging` (use correlation IDs per request) | P1 | +| 46 | Add input validation on all API endpoints (query length limits, coordinate bounds, UUID format) | P1 | ### 3.3 Testing | # | Ticket | Priority | |---|--------|----------| -| 44 | Set up pytest with async fixtures (asyncpg test DB, httpx AsyncClient) | P0 | -| 45 | Write tests for scraper row normalization (`row_format.py`, ACLED normalizer) | P0 | -| 46 | Write tests for deduplication logic | P1 | -| 47 | Write tests for agent query classification (`_classify_query`) | P1 | -| 48 | Write API integration tests for `/content/points`, `/agent/query` | P1 | -| 49 | Add frontend smoke tests (Vitest + React Testing Library) for Globe render, Agent query flow | P2 | -| 50 | Add pre-commit hooks: ruff lint+format, TypeScript typecheck | P0 | +| 47 | Set up pytest with async fixtures (asyncpg test DB, httpx AsyncClient) | P0 | +| 48 | Write tests for new scraper implementations and row normalization | P0 | +| 49 | Write tests for deduplication logic | P1 | +| 50 | Write tests for agent query classification (`_classify_query`) | P1 | +| 51 | Write API integration tests for `/content/points`, `/agent/query` | P1 | +| 52 | Add frontend smoke tests (Vitest + React Testing Library) for Globe render, Agent query flow | P2 | +| 53 | Add pre-commit hooks: ruff lint+format, TypeScript typecheck | P0 | --- @@ -156,29 +159,29 @@ Key gaps: no 
deployment config, no scheduled scraping, no caching, no tests, hig | # | Ticket | Priority | |---|--------|----------| -| 51 | Add loading skeleton/spinner on initial data fetch | P0 | -| 52 | Lazy-load Globe component (three.js is ~500KB) with React.lazy + Suspense | P1 | -| 53 | Add error boundaries around Globe, Agent, and Modal components | P0 | -| 54 | Memoize expensive computations (arc filtering, point color mapping) with useMemo/useCallback | P1 | -| 55 | Add responsive layout — mobile/tablet support or graceful "desktop-only" message | P1 | -| 56 | Add URL-based routing (React Router) for shareable/bookmarkable globe state | P2 | -| 57 | Add "no results" empty states for agent queries and empty filtered views | P1 | -| 58 | Move inline styles and magic numbers to shared constants / design tokens | P2 | +| 54 | Add loading skeleton/spinner on initial data fetch | P0 | +| 55 | Lazy-load Globe component (three.js is ~500KB) with React.lazy + Suspense | P1 | +| 56 | Add error boundaries around Globe, Agent, and Modal components | P0 | +| 57 | Memoize expensive computations (arc filtering, point color mapping) with useMemo/useCallback | P1 | +| 58 | Add responsive layout — mobile/tablet support or graceful "desktop-only" message | P1 | +| 59 | Add URL-based routing (React Router) for shareable/bookmarkable globe state | P2 | +| 60 | Add "no results" empty states for agent queries and empty filtered views | P1 | +| 61 | Move inline styles and magic numbers to shared constants / design tokens | P2 | ### 4.2 Observability | # | Ticket | Priority | |---|--------|----------| -| 59 | Add structured logging with request correlation IDs (middleware) | P1 | -| 60 | Add simple admin dashboard page: scrape history, DB row counts, token spend summary | P2 | -| 61 | Add uptime monitoring on `/health` endpoint (UptimeRobot free tier or similar) | P1 | +| 62 | Add structured logging with request correlation IDs (middleware) | P1 | +| 63 | Add simple admin dashboard page: 
scrape history, DB row counts, token spend summary | P2 | +| 64 | Add uptime monitoring on `/health` endpoint (UptimeRobot free tier or similar) | P1 | ### 4.3 Security Hardening | # | Ticket | Priority | |---|--------|----------| -| 62 | Audit all raw SQL for injection risks — parameterize any string-interpolated queries | P0 | -| 63 | Add API key or lightweight session auth for agent/analysis endpoints | P1 | +| 65 | Audit all raw SQL for injection risks — parameterize any string-interpolated queries | P0 | +| 66 | Add API key or lightweight session auth for agent/analysis endpoints | P1 | --- @@ -196,6 +199,33 @@ Key gaps: no deployment config, no scheduled scraping, no caching, no tests, hig --- +## Scraper Reference Guide + +The existing hackathon scrapers will **not** be used directly in production. They are moved to `scrapers/_reference/` as design inspiration. New scrapers should be written from scratch implementing the `BaseScraper` ABC (ticket #16). + +### Which reference files to study, and why + +| File | Quality | What to learn from it | +|------|---------|----------------------| +| `kalshi.py` | Excellent | Async rate limiter class (`_RateLimiter` with lock-based queueing, 10 req/sec), cursor-based pagination, `asyncio.gather` with `return_exceptions=True` | +| `gdelt.py` | Excellent | Dual-path fetch (BigQuery primary, DOC API fallback), complex event-type mapping via CAMEO codes, Goldstein scale normalization, title synthesis from multiple fields | +| `row_format.py` | Excellent | Schema-aligned output contract — keyword-only args prevent mistakes. 
**Copy this pattern into all new scrapers.** | +| `acled_normalizer.py` | Good | Clean normalizer pattern: type-safe `NormalizedRecord` return, graceful fallback for every nullable field | +| `ingestion_service.py` | Good | Per-record error handling with `RunSummary` tracking (malformed, duplicates, db_failures), dedup integration | +| `polymarket.py` | Decent | Simplest example — good starting point for straightforward REST APIs, tag-based filtering | +| `scraping_service.py` | Decent | Orchestration pattern: per-scraper try-catch, error records appended (visibility over silent failures) | + +### Patterns to carry forward into new scrapers + +1. **Separate fetch from normalization** — client fetches raw data, normalizer produces `NormalizedRow` +2. **Use async rate limiters, not `time.sleep()`** — see `kalshi.py`'s `_RateLimiter` class +3. **All output goes through `row_format.make_content_row()`** — enforces schema contract +4. **Per-record error handling** — one bad record shouldn't abort the batch +5. **Track stats** — count inserted, skipped, failed per run for observability +6. 
**Dedup before embedding** — check URL uniqueness before generating expensive vectors + +--- + ## Non-Goals (for now) - Mobile app (web-only is fine) From 4884b0935a724aa975f03d6adad2d557afe874c1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 16 Mar 2026 22:37:07 +0000 Subject: [PATCH 3/5] Initial plan From 5439e83c3807fe9d79e89f90e7b95e7fb0a31374 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 16 Mar 2026 23:17:19 +0000 Subject: [PATCH 4/5] Initial plan From 571e49c29ce9e072bdcdf5f66ad7881040a1d94a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 16 Mar 2026 23:21:53 +0000 Subject: [PATCH 5/5] chore: add comprehensive deployment issue board with all tasks Co-authored-by: Tpypan <155002057+Tpypan@users.noreply.github.com> --- plan/ISSUE_BOARD.md | 185 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 plan/ISSUE_BOARD.md diff --git a/plan/ISSUE_BOARD.md b/plan/ISSUE_BOARD.md new file mode 100644 index 0000000..d9ab4b8 --- /dev/null +++ b/plan/ISSUE_BOARD.md @@ -0,0 +1,185 @@ +# Argus — Deployment Issue Board + +All tasks required before this project can be deployed to production, extracted from the PRD and PR review discussions. Organized by phase and priority. 
+
+**Priority key:** P0 = must-have for deployment · P1 = high value · P2 = nice to have
+
+---
+
+## Phase 1 — Foundation (Week 1–2)
+
+### 1.1 Containerization
+
+- [ ] **#1** `[P0]` Create backend Dockerfile (FastAPI + uvicorn, multi-stage build)
+- [ ] **#2** `[P0]` Create frontend Dockerfile (Vite build → nginx static serve)
+- [ ] **#3** `[P0]` Create `docker-compose.yml` with services: backend, frontend, postgres+pgvector, redis
+- [ ] **#4** `[P0]` Add `.dockerignore` files (exclude `.env`, `node_modules`, `.venv`, `__pycache__`)
+
+### 1.2 CI/CD
+
+- [ ] **#5** `[P0]` Set up GitHub Actions CI: ruff lint/format, TypeScript typecheck, docker build
+- [ ] **#6** `[P1]` Set up GitHub Actions CD: build images, push to registry, deploy to hosting
+- [ ] **#7** `[P1]` Choose hosting provider (Railway, Fly.io, or small VPS) and document deploy process
+
+### 1.3 Dead Code Removal
+
+- [ ] **#8** `[P0]` Delete junk scrapers: `eonet.py`, `eonet_db.py`, `social_scraper.py`, `reddit.py`, `reddit_classifier.py`, `reddit_db.py`, `reddit_schema.sql`, `Reddit Scraper/`, `natural-disasters/`, `ryan_scrapers/`
+- [ ] **#9** `[P0]` Move remaining hackathon scrapers (`gdelt.py`, `kalshi.py`, `polymarket.py`, `acled/`) into a `scrapers/_reference/` directory — kept as design inspiration only
+- [ ] **#10** `[P0]` Remove duplicate `content_repository.py` (exists in both `repositories/` and `ingestion/`) — consolidate into one
+- [ ] **#11** `[P0]` Remove all Cloudinary media code — delete `utils/mediaConfig.ts`, remove `@cloudinary/react` and `@cloudinary/url-gen` deps from frontend, remove the `cloudinary` dep from backend, and remove all `CLOUDINARY_*` env vars. **Note (from review):** team confirmed S3 *will* be used for media storage, so keep `boto3` and the `AWS_*`/`S3_*` env vars, keep `image_url` (repurposed as the S3 URL column), and drop only `s3_url` as a duplicate column.
+- [ ] **#12** `[P1]` Audit and remove any other dead imports, unused functions, or commented-out code + +### 1.4 Security Baseline + +- [ ] **#13** `[P0]` Lock down CORS origins — remove `*` wildcard, use env-configured allowed origins +- [ ] **#14** `[P0]` Add rate limiting middleware on `/agent/query` and `/content/{id}/realtime-analysis` (these burn AI tokens) +- [ ] **#15** `[P0]` Validate all secrets are loaded from env vars; fail fast on startup if required vars are missing + +--- + +## Phase 2 — Automation & Cost Optimization (Week 2–3) + +### 2.1 Scheduled Scraping + +- [ ] **#16** `[P0]` Design `BaseScraper` ABC interface: `async fetch() -> list[NormalizedRow]` with built-in rate limiting, error handling, and dedup. Use `kalshi.py` rate limiter and `ingestion_service.py` error patterns as reference. +- [ ] **#17** `[P0]` Write new production scrapers implementing `BaseScraper` for each data source (determine which sources to keep/add based on product needs) +- [ ] **#18** `[P0]` Create unified scrape entrypoint (`run_daily_pipeline.py`) that runs all scrapers + embedding backfill in sequence +- [ ] **#19** `[P0]` Add cron scheduler (APScheduler in a separate container, or cron in docker-compose) +- [ ] **#20** `[P0]` Define scraping schedule (e.g. 1×/day, 2×/day per source) +- [ ] **#21** `[P1]` Add `scrape_runs` logging table (source, status, rows_inserted, errors, duration_ms, started_at) +- [ ] **#22** `[P0]` Add idempotency guards — URL-based dedup check before insert, not after +- [ ] **#23** `[P2]` Add failure alerting (Discord webhook or email on scrape errors) +- [ ] **#24 (extra)** `[P0]` Add a locking mechanism to `run_daily_pipeline.py` so that if one cron run hangs, the next scheduled run does not spawn a zombie process. Options include a PostgreSQL advisory lock (`pg_try_advisory_lock`) for simplicity or a Redis-based distributed lock. See discussion on PR #29. 
+ +### 2.2 Embedding Cost Reduction + +- [ ] **#25** `[P0]` Switch embeddings to local model — replace OpenAI `text-embedding-3-small` with `sentence-transformers` (e.g. `all-MiniLM-L6-v2`, 384 dims). Run locally in backend container. Update vector column dimension. +- [ ] **#26** `[P0]` Batch embedding generation — process 100+ items per batch instead of one-at-a-time +- [ ] **#27** `[P1]` Deduplicate content before generating embeddings (currently embeddings are generated before dedup) + +### 2.3 AI Response Caching (Redis) + +- [ ] **#28** `[P0]` Add Redis client utility module with connection pooling +- [ ] **#29** `[P0]` Cache `/content/points` response — invalidate on new scrape run completion +- [ ] **#30** `[P1]` Cache `/content/arcs` response per threshold value (TTL = until next scrape) +- [ ] **#31** `[P0]` Cache Gemini confidence scores per content_id (TTL = 24h) +- [ ] **#32** `[P0]` Cache Gemini realtime analysis per `(content_id, user_role)` (TTL = 6h) +- [ ] **#33** `[P1]` Cache agent query results keyed on `(normalized_query_hash, persona)` (TTL = 1h) + +### 2.4 Additional Cost Controls + +- [ ] **#34** `[P1]` Add token usage tracking — log Gemini and OpenAI (if still used) token counts per call to a `token_usage` table +- [ ] **#35** `[P2]` Pre-compute confidence scores during scrape pipeline instead of on-demand per user click +- [ ] **#36** `[P2]` Replace per-request Google Search grounding with daily cached news summaries scraped during pipeline + +### 2.5 Globe Performance (from PR review) + +- [ ] **#37 (extra)** `[P1]` Implement server-side viewport filtering for `/content/points` (e.g. `?bbox=west,south,east,north&zoom=level`) so only visible points are returned, preventing huge JSON payloads that lag low-end machines. Combine nearby points into clusters when zoom is low; show individual points when zoomed in. 
+ +--- + +## Phase 3 — Schema & Code Quality (Week 3–5) + +### 3.1 Database Schema Improvements + +- [ ] **#38** `[P0]` Set up Alembic for migration management (replace raw SQL files) +- [ ] **#39** `[P1]` Rename `content_table` → `articles` +- [ ] **#40** `[P1]` Split AI-generated fields into `article_analysis` table (embedding, sentiment_score, market_signal, confidence_score) +- [ ] **#41** `[P2]` Add `scrape_source` enum column to replace generic FK to `sources` table +- [ ] **#42** `[P1]` Add composite index on `(event_type, published_at)` for filtered timeline queries +- [ ] **#43** `[P2]` Add `last_scraped_at` column to sources for freshness tracking +- [ ] **#44** `[P1]` Clean up unused columns and tables from hackathon iteration (including dropped `s3_url`) + +### 3.2 Backend Modularization + +- [ ] **#45** `[P0]` Create shared `db.py` module — single asyncpg pool used by all services (currently duplicated in `ingestion/db.py` and inline connects). **Important (from review):** explicitly set `max_size` on the connection pool (default asyncpg max is 10) to avoid "too many connections" errors under load. Document the chosen value and the reasoning in the module. +- [ ] **#46** `[P1]` Consolidate all Pydantic models into a single `schemas/` package +- [ ] **#47** `[P1]` Replace all `print()` statements with structured `logging` (use correlation IDs per request) +- [ ] **#48** `[P1]` Add input validation on all API endpoints (query length limits, coordinate bounds, UUID format) +- [ ] **#49** `[P0]` Migrate all sync psycopg2 route handlers to async (`content.py` currently uses `def get_content_points()` / `def get_content_arcs()` with `psycopg2.connect()`). All new code must use asyncpg. 
+ +### 3.3 Testing + +- [ ] **#50** `[P0]` Set up pytest with async fixtures (asyncpg test DB, httpx AsyncClient) +- [ ] **#51** `[P0]` Write tests for new scraper implementations and row normalization +- [ ] **#52** `[P1]` Write tests for deduplication logic +- [ ] **#53** `[P1]` Write tests for agent query classification (`_classify_query`) +- [ ] **#54** `[P1]` Write API integration tests for `/content/points`, `/agent/query` +- [ ] **#55** `[P2]` Add frontend smoke tests (Vitest + React Testing Library) for Globe render, Agent query flow +- [ ] **#56** `[P0]` Add pre-commit hooks: ruff lint+format, TypeScript typecheck + +--- + +## Phase 4 — Frontend Polish & Observability (Week 5–7) + +### 4.1 Frontend UX + +- [ ] **#57** `[P0]` Add loading skeleton/spinner on initial data fetch +- [ ] **#58** `[P1]` Lazy-load Globe component (three.js is ~500 KB) with `React.lazy` + `Suspense` +- [ ] **#59** `[P0]` Add error boundaries around Globe, Agent, and Modal components +- [ ] **#60** `[P1]` Memoize expensive computations (arc filtering, point color mapping) with `useMemo`/`useCallback` +- [ ] **#61** `[P1]` Add responsive layout — mobile/tablet support or graceful "desktop-only" message +- [ ] **#62** `[P2]` Add URL-based routing (React Router) for shareable/bookmarkable globe state +- [ ] **#63** `[P1]` Add "no results" empty states for agent queries and empty filtered views +- [ ] **#64** `[P2]` Move inline styles and magic numbers to shared constants / design tokens + +### 4.2 Observability + +- [ ] **#65** `[P1]` Add structured logging with request correlation IDs (middleware) +- [ ] **#66** `[P2]` Add simple admin dashboard page: scrape history, DB row counts, token spend summary +- [ ] **#67** `[P1]` Add uptime monitoring on `/health` endpoint (UptimeRobot free tier or similar) + +### 4.3 Security Hardening + +- [ ] **#68** `[P0]` Audit all raw SQL for injection risks — parameterize any string-interpolated queries +- [ ] **#69** `[P1]` Add API key or lightweight 
session auth for agent/analysis endpoints
+
+---
+
+## Environment & Configuration
+
+- [ ] **#70** `[P0]` Set `DATABASE_URL` to `postgresql://user:pass@host:5432/dbname` (no `+asyncpg` suffix) — both psycopg2 and asyncpg consume it as a standard libpq DSN. Update all documentation and `.env.example` to reflect this.
+- [ ] **#71** `[P0]` Add all required environment variables to `.env.example` with comments: `DATABASE_URL`, `GEMINI_API_KEY`, `GEMINI_MODEL`, `OPENAI_API_KEY` (until local embeddings), `ACLED_API_KEY`, `ELEVENLABS_API_KEY` (optional), and the `AWS_*`/`S3_*` vars for media storage. Remove any `CLOUDINARY_*` examples; keep the `AWS_*`/`S3_*` entries, since the review confirmed S3 will be used for media (see #11).
+
+---
+
+## P0 Summary — Must Complete Before First Deploy
+
+The following P0 items are the minimum required to go live:
+
+| # | Task |
+|---|------|
+| 1 | Backend Dockerfile |
+| 2 | Frontend Dockerfile |
+| 3 | docker-compose.yml |
+| 4 | .dockerignore files |
+| 5 | GitHub Actions CI |
+| 8 | Delete junk scrapers |
+| 9 | Move hackathon scrapers into scrapers/_reference/ |
+| 10 | Consolidate duplicate content_repository.py |
+| 11 | Remove Cloudinary media code (S3 and image_url retained) |
+| 13 | Lock down CORS origins |
+| 14 | Rate limiting on AI endpoints |
+| 15 | Fail-fast on missing env vars |
+| 16 | BaseScraper ABC interface |
+| 17 | New production scrapers |
+| 18 | run_daily_pipeline.py entrypoint |
+| 19 | Cron scheduler |
+| 20 | Define scraping schedule |
+| 22 | URL-based dedup before insert |
+| 24 | Pipeline locking mechanism |
+| 25 | Switch to local embeddings (sentence-transformers) |
+| 26 | Batch embedding generation |
+| 28 | Redis client utility module |
+| 29 | Cache /content/points |
+| 31 | Cache Gemini confidence scores |
+| 32 | Cache Gemini realtime analysis |
+| 38 | Set up Alembic migrations |
+| 45 | Shared asyncpg DB pool with explicit max_size |
+| 49 | Migrate sync psycopg2 routes to async |
+| 50 | pytest async fixtures |
+| 51 | Scraper + normalization tests |
+| 56 | Pre-commit hooks (ruff + TS typecheck) |
+| 57 | Loading skeleton on initial fetch |
+| 59 | Error boundaries (Globe, Agent, Modal) |
+| 68 | Audit raw SQL for injection risks |
+| 70 | Fix DATABASE_URL format in docs | +| 71 | Update .env.example |