diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..d04748f --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,80 @@ +name: CI + +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +jobs: + api-lint-test: + name: API — lint & test + runs-on: ubuntu-latest + defaults: + run: + working-directory: apps/api + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + + - name: Install dependencies + run: pip install -e ".[dev]" + + - name: Lint with ruff + run: ruff check src/ tests/ + + - name: Run tests + run: python -m pytest tests/ -v + + web-lint-typecheck: + name: Web — lint & typecheck + runs-on: ubuntu-latest + defaults: + run: + working-directory: apps/web + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-node@v4 + with: + node-version: "22" + cache: npm + cache-dependency-path: apps/web/package-lock.json + + - name: Install dependencies + run: npm ci + + - name: Lint + run: npm run lint + + - name: Typecheck + run: npx tsc --noEmit + + web-build: + name: Web — build + runs-on: ubuntu-latest + needs: web-lint-typecheck + defaults: + run: + working-directory: apps/web + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-node@v4 + with: + node-version: "22" + cache: npm + cache-dependency-path: apps/web/package-lock.json + + - name: Install dependencies + run: npm ci + + - name: Build + run: npm run build diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..a8951f3 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,334 @@ +# CLAUDE.md — EvoGraph Development Guide + +## Project Overview + +EvoGraph is an evolutionary biology visualization platform that maps species relationships through both taxonomy (Open Tree of Life) and genetic similarity (mutual information from COI barcode sequences). 
The MVP scope is **Aves (birds)**: ~27,853 taxa, ~167 species with COI sequences, ~1,787 MI edges. + +**Core idea:** Build a k-nearest-neighbor graph where edge weight = MI-derived distance from pairwise COI sequence alignment. Overlay this on the taxonomic tree so users can explore how genetic similarity compares to taxonomic classification. + +## Architecture + +``` +┌──────────────┐ ┌──────────────┐ ┌──────────────┐ +│ Next.js │────▶│ FastAPI │────▶│ PostgreSQL │ +│ (port 3000) │ │ (port 8000) │ │ (port 5432) │ +└──────────────┘ └──────────────┘ └──────────────┘ + │ │ + ▼ │ + ┌──────────┐ │ + │ Redis │ (unused yet) │ + │ (6379) │ │ + └──────────┘ │ + │ + ┌──────────────────────────┘ + │ Pipeline scripts + │ (ingest → compute → export) + └───────────────────────────── +``` + +## Repository Structure + +``` +evograph/ +├── apps/ +│ ├── api/ # Python FastAPI backend +│ │ ├── pyproject.toml # Dependencies, pytest config, ruff config +│ │ ├── Dockerfile # Health check included +│ │ ├── alembic.ini +│ │ ├── src/evograph/ +│ │ │ ├── main.py # FastAPI app, CORS, routers +│ │ │ ├── settings.py # DATABASE_URL, REDIS_URL, SCOPE_OTT_ROOT +│ │ │ ├── db/ +│ │ │ │ ├── models.py # Taxon, Sequence, Edge, NodeMedia +│ │ │ │ ├── session.py # engine, SessionLocal, get_db +│ │ │ │ └── migrations/versions/001_initial.py +│ │ │ ├── api/ +│ │ │ │ ├── routes/ # search, taxa, graph, sequences +│ │ │ │ └── schemas/ # Pydantic response models +│ │ │ ├── services/ +│ │ │ │ ├── ott_client.py # OpenTree API (tnrs, subtree, taxon_info) +│ │ │ │ ├── bold_client.py # BOLD portal v5 (currently down) +│ │ │ │ ├── mi_distance.py # entropy, MI, NMI, distance +│ │ │ │ └── neighbor_index.py # family-scoped candidate selection +│ │ │ ├── pipeline/ # Data ingestion & computation scripts +│ │ │ │ ├── ingest_ott.py # Newick parser → taxa table +│ │ │ │ ├── ingest_ncbi.py # NCBI esearch/efetch → sequences +│ │ │ │ ├── ingest_bold.py # BOLD portal → sequences (portal down) +│ │ │ │ ├── select_canonical.py # 
Pick best COI per species +│ │ │ │ ├── build_neighbors.py # Pairwise MI → kNN edges +│ │ │ │ ├── build_graph_export.py # JSON files for caching +│ │ │ │ ├── ingest_images.py # Wikipedia thumbnails → node_media +│ │ │ │ └── validate.py # Quality stats & outlier detection +│ │ │ └── utils/ +│ │ │ ├── alignment.py # parasail global alignment wrapper +│ │ │ └── fasta.py # FASTA format parser +│ │ └── tests/ # 53 pytest tests +│ │ ├── conftest.py # MockDB, fixtures, factories +│ │ ├── test_health.py +│ │ ├── test_search.py +│ │ ├── test_taxa.py +│ │ ├── test_graph.py +│ │ ├── test_sequences.py +│ │ ├── test_mi_distance.py +│ │ └── test_pipeline.py # Canonical selection scoring tests +│ └── web/ # Next.js 15 + TypeScript frontend +│ ├── package.json +│ ├── Dockerfile # Health check included +│ ├── tsconfig.json # Strict mode, @/* path alias +│ ├── next.config.js # output: "standalone" +│ └── src/ +│ ├── app/ +│ │ ├── globals.css # Dark theme, skeleton, responsive, graph search +│ │ ├── layout.tsx # Root layout, sticky nav +│ │ ├── page.tsx # Home: search + quick links +│ │ ├── graph/page.tsx # MI network explorer (Sigma.js) + node search +│ │ └── taxa/[ottId]/ +│ │ ├── page.tsx # Taxon detail (hero, children, neighbors) +│ │ └── sequences/page.tsx # COI sequence viewer +│ ├── components/ +│ │ ├── SearchBox.tsx # Debounced autocomplete +│ │ ├── TaxonCard.tsx # Thumbnail + rank badge +│ │ ├── GraphView.tsx # Cytoscape.js (small graphs) +│ │ ├── GraphViewSigma.tsx # Sigma.js (large networks) +│ │ └── Skeleton.tsx # Shimmer loading states +│ └── lib/ +│ ├── api.ts # API client functions +│ ├── types.ts # TypeScript interfaces +│ └── external-links.ts # Wikipedia, iNaturalist, eBird URLs +├── docker-compose.yml # postgres:16, redis:7, api, web (with health checks) +├── Makefile # Pipeline orchestration commands +├── .github/workflows/ci.yml # Lint, test, typecheck, build +├── .env.example +├── TODO.md # Tracked tasks with completion status +├── ROADMAP.md # 6-phase long-term 
vision +└── MVP.md # Original implementation spec +``` + +## Database Schema + +Four PostgreSQL tables (migrations: `001_initial.py`, `002_performance_indexes.py`): + +| Table | PK | Purpose | Key columns | +|-------|-----|---------|-------------| +| **taxa** | `ott_id` (int) | Taxonomy backbone | name, rank, parent_ott_id (self-FK), ncbi_tax_id, lineage (int[]), synonyms (jsonb) | +| **sequences** | `id` (uuid) | COI barcode DNA | ott_id (FK), marker, source, accession, sequence (text), length, quality (jsonb), is_canonical | +| **edges** | `(src_ott_id, dst_ott_id, marker)` | MI similarity graph | distance (0-1), mi_norm (0-1), align_len | +| **node_media** | `ott_id` (FK) | Species images | image_url, attribution (jsonb) | + +**Indexes (migration 001):** taxa(name), taxa(parent_ott_id), sequences(ott_id), edges(src_ott_id), edges(dst_ott_id) + +**Performance indexes (migration 002):** +- `ix_taxa_name_trgm` — GIN trigram index on taxa.name (fast ILIKE `%query%`) +- `ix_taxa_rank` — B-tree on taxa.rank (rank-based filtering) +- `ix_edges_src_distance` — composite on edges(src_ott_id, distance) (neighbor queries) +- `ix_sequences_ott_canonical` — composite on sequences(ott_id, is_canonical) (canonical checks) +- `ix_sequences_ott_marker` — composite on sequences(ott_id, marker) (pipeline selection) + +## API Endpoints + +All under FastAPI with CORS enabled (all origins). 
+ +| Method | Path | Params | Response | Notes | +|--------|------|--------|----------|-------| +| GET | `/health` | — | `{"status":"ok"}` | | +| GET | `/v1/search` | `q` (required, min 1), `limit` (max 100) | `TaxonSummary[]` | pg_trgm + prefix ranking | +| GET | `/v1/taxa/{ott_id}` | — | `TaxonDetail` | Recursive CTE lineage | +| GET | `/v1/taxa/{ott_id}/children` | `offset`, `limit` (max 500) | `ChildrenPage` | Paginated | +| GET | `/v1/taxa/{ott_id}/sequences` | — | `SequenceOut[]` | Includes DNA sequence text | +| GET | `/v1/graph/subtree/{ott_id}` | `depth` (1-5, default 3) | `GraphResponse` | Recursive CTE subtree | +| GET | `/v1/graph/mi-network` | — | `GraphResponse` | Cached 5min in-memory | +| GET | `/v1/graph/neighbors/{ott_id}` | `k` (1-50, default 15) | `NeighborOut[]` | Sorted by distance | + +**Key response types:** +- `TaxonDetail`: includes children[], total_children, lineage[], has_canonical_sequence, wikipedia_url +- `GraphResponse`: nodes[] + edges[] (kind: "taxonomy" | "mi") +- `SequenceOut`: includes full DNA sequence text, source, accession, is_canonical + +## Pipeline Order + +Run via Makefile or directly as `python -m evograph.pipeline.<step>`: + +``` +1. ingest_ott — Parse OpenTree Newick subtree → taxa table (~27,853 for Aves) +2. ingest_ncbi — Fetch COI from NCBI GenBank → sequences table + ingest_bold — Fetch COI from BOLD portal → sequences table (portal down) +3. select_canonical — Score sequences (length - 10*ambig), mark best per species +4. build_neighbors — Pairwise alignment + MI distance → kNN edges (k=15) +5. build_graph_export — Export nodes.json + edges.json +6. ingest_images — Wikipedia thumbnails → node_media table +7. validate — Print quality report (genus/family sharing %, distance stats) +``` + +**Full pipeline:** `make pipeline` runs steps 1-7 in sequence. + +## Key Algorithms + +### MI Distance (services/mi_distance.py) +``` +1.
Global alignment via parasail (Needleman-Wunsch, SIMD) + Scoring: match=+2, mismatch=-1, gap_open=3, gap_extend=1 + +2. From aligned columns (excluding gaps): + - P(X), P(Y): marginal distributions of bases + - P(X,Y): joint distribution + - MI = Σ P(x,y) * log(P(x,y) / (P(x)*P(y))) + - NMI = MI / min(H(X), H(Y)), clamped to [0,1] + - Requires ≥ 50 non-gap columns + +3. Distance = 1 - NMI (0 = identical, 1 = unrelated) +``` + +### Candidate Selection (services/neighbor_index.py) +- Walk up parent chain from each species to find enclosing family +- Only compute MI distance between species in the same family +- Phase 2 upgrade: k-mer ANN index for cross-family detection + +### Canonical Sequence Selection (pipeline/select_canonical.py) +- Score = length - 10 * ambiguous_base_count +- Highest score per species wins canonical flag + +## Development Commands + +```bash +# Local dev (Docker) +make up # docker compose up --build +make down # docker compose down +make migrate # alembic upgrade head + +# API tests (53 tests) +cd apps/api && python -m pytest tests/ -v + +# Lint +cd apps/api && ruff check src/ tests/ +cd apps/web && npm run lint + +# Typecheck +cd apps/web && npx tsc --noEmit + +# Build frontend +cd apps/web && npm run build +``` + +## Environment Variables + +``` +DATABASE_URL=postgresql+psycopg://postgres:postgres@db:5432/evograph +REDIS_URL=redis://redis:6379/0 +SCOPE_OTT_ROOT=Aves +NEXT_PUBLIC_API_BASE=http://localhost:8000 +``` + +## Testing Strategy + +**Current: 53 tests passing** (all in `apps/api/tests/`) + +Tests use `MockDB` with FastAPI dependency override — no real database needed: +- `conftest.py`: Mock factories (`_make_taxon`, `_make_sequence`, `_make_edge`, `_make_media`), `MockQuery` (chainable filter/limit/order_by/scalar), `MockDB` (registry by model type) +- Override `get_db` dependency with mock session + +**What's tested:** +- All 8 API endpoints (status codes, response schemas, validation errors, 404s) +- MI distance computation 
(entropy, NMI, clamping, gap exclusion) +- Pipeline canonical selection scoring (11 tests for `_score` function) + +**What's NOT tested:** +- Full pipeline integration (ingest, neighbor building) +- Frontend components (no Jest/RTL) +- External API integration (OpenTree, NCBI, Wikipedia) +- Database migrations + +## Frontend Conventions + +- **Dark theme:** CSS variables in globals.css (--bg, --fg, --accent, --border, --bg-card) +- **Rank colors:** class=#e57373, order=#ffb74d, family=#fff176, genus=#81c784, species=#4fc3f7 +- **Two graph renderers:** GraphView.tsx (Cytoscape, for small subtree graphs) and GraphViewSigma.tsx (Sigma.js WebGL, for full MI network) +- **Graph search:** NodeSearchBox component in graph/page.tsx — autocomplete dropdown that highlights + zooms to selected node +- **Loading states:** Skeleton.tsx with shimmer animation (not plain text) +- **Responsive breakpoints:** 768px (tablet), 480px (mobile) +- **Species names:** Always italicized (e.g. `<em>`) + +## Type Consistency Rules + +The following types must stay in sync across three layers: + +| Python Schema | TypeScript Type | DB Model | +|---------------|-----------------|----------| +| `TaxonSummary` | `TaxonSummary` | `Taxon` | +| `TaxonDetail` | `TaxonDetail` | `Taxon` + joins | +| `ChildrenPage` | `ChildrenPage` | — | +| `SequenceOut` | `SequenceOut` | `Sequence` | +| `Node` / `GraphEdge` / `GraphResponse` | `GraphNode` / `GraphEdge` / `GraphResponse` | `Taxon` + `Edge` | +| `NeighborOut` | `NeighborOut` | `Edge` + `Taxon` join | + +**When adding a field:** Update every step in the chain: schema → route mapping → TypeScript type → API client → UI usage.
+ +**File locations:** +- Python schemas: `apps/api/src/evograph/api/schemas/` +- TypeScript types: `apps/web/src/lib/types.ts` +- API client: `apps/web/src/lib/api.ts` + +## Current Data Status + +| Metric | Value | +|--------|-------| +| Total taxa | ~27,853 (Aves) | +| Species with COI | ~167 (0.6%) | +| Total sequences | ~167 | +| MI edges | ~1,787 | +| Images | Fetched from Wikipedia | + +**Why so few sequences:** +- NCBI query finds only ~167 matches for Aves species +- BOLD portal has been down since Feb 2026 +- TODO: Broader NCBI search (genus fallback, relaxed terms) + +## Remaining Work (from TODO.md) + +### High Priority +- [ ] Expand NCBI ingestion — try genus-level queries, broader search terms +- [ ] Retry BOLD portal when it comes back online +- [ ] Frontend smoke tests + +### Medium Priority +- [ ] Run validate.py and document results +- [ ] Production deployment config + +### Phase 2 +- [ ] Make SCOPE_OTT_ROOT configurable for other clades +- [ ] k-mer candidate filtering (FAISS/Annoy) for cross-family neighbors +- [ ] Job queue (Celery/RQ) for background pipeline jobs +- [ ] Multi-marker support (16S, 18S) + +## Architectural Principles + +1. **OTT ID is canonical identity** — everything links through ott_id +2. **Sequences are immutable** — stored with provenance (source, accession) +3. **Edges are recomputable** — derived from sequences, can be rebuilt +4. **Graph is derived data** — not the source of truth +5. 
**MI is a similarity proxy, not phylogenetic truth** — always label as "similarity" + +## Performance Characteristics + +- **Connection pooling:** 10 persistent + 20 overflow connections, 5min recycle, pre-ping validation +- **GZip compression:** All responses > 500 bytes are compressed (critical for graph JSON) +- **Lineage query:** Single recursive CTE (was N+1 individual parent lookups) +- **Subtree query:** Single recursive CTE (was Python-side BFS with one query per level) +- **Canonical check:** Uses `EXISTS` subquery (was fetching full row) +- **Search:** pg_trgm GIN index for fast ILIKE substring matching; prefix matches ranked first +- **MI network:** In-memory cache with 5-minute TTL +- **Neighbor queries:** Composite index (src_ott_id, distance) for indexed ORDER BY + LIMIT + +## Known Gotchas + +- `pyproject.toml` requires Python >=3.11 (relaxed from 3.12 for compatibility) +- Build backend is `hatchling.build` (not `hatchling.backends`) +- Redis is configured but not used yet (reserved for caching) +- CORS is wide open (`allow_origins=["*"]`) — tighten for production +- `data/raw/` and `data/processed/` are gitignored — not in repo +- Graph JSON exports exist at `apps/api/src/data/processed/graph/` but are gitignored +- Sequence `quality` field is JSONB with `{"ambig": N}` format +- Edges are directed (A→B) but UI treats as undirected +- The `ingest_images.py` uses raw SQL (`text()`) for the join query +- MI network endpoint is cached in-memory (5min TTL) — stale data possible after pipeline re-run +- Cytoscape types use `StylesheetStyle` (not `Stylesheet`) in newer @types/cytoscape +- Migration 002 requires `pg_trgm` extension — enabled automatically in upgrade() diff --git a/TODO.md b/TODO.md index 253e3d0..65c0207 100644 --- a/TODO.md +++ b/TODO.md @@ -8,23 +8,30 @@ - [ ] Add NCBI taxonomy ID lookup — `ncbi_tax_id` column exists but is never populated. 
Add pipeline step to query NCBI Taxonomy by name and backfill ### Testing -- [ ] API route tests — pytest + httpx TestClient for all 6 endpoints -- [ ] Pipeline unit tests — test MI computation with known sequences, test canonical selection logic +- [x] API route tests — pytest + httpx TestClient for all endpoints (42 tests) +- [x] MI distance unit tests — entropy, MI computation, NMI clamping, distance conversion +- [x] Pipeline unit tests — canonical selection scoring logic (11 tests) - [ ] Frontend smoke tests — basic render tests for key pages ### Performance -- [ ] Cache MI network endpoint — the full graph loads all edges every request. Add Redis or in-memory caching with TTL -- [ ] Add DB indexes on `edges(src_ott_id, dst_ott_id)` if not already present -- [ ] Paginate children for large taxa (Aves has 729 direct children) +- [x] Cache MI network endpoint — in-memory cache with 5-minute TTL +- [x] Performance indexes (migration 002) — pg_trgm, composite indexes for neighbors/canonical/search +- [x] Paginate children for large taxa — inline limit of 100, dedicated `/taxa/{id}/children` endpoint with offset/limit +- [x] Connection pooling — 10 persistent + 20 overflow, pre-ping, 5min recycle +- [x] Recursive CTE for lineage — single query replaces N+1 parent chain walk +- [x] Recursive CTE for subtree — single query replaces Python BFS with per-level queries +- [x] GZip compression — middleware compresses responses > 500 bytes +- [x] Search optimization — pg_trgm GIN index + prefix ranking + LIKE pattern escaping +- [x] EXISTS for canonical check — replaces fetching full row ## Medium Priority ### Frontend Polish -- [ ] Add `getSequences()` to frontend API client — endpoint exists but no client function -- [ ] Sequence viewer page — show aligned sequences for a species, highlight conserved regions -- [ ] Mobile responsive layout — test and fix breakpoints -- [ ] Loading skeletons instead of plain "Loading..." 
text -- [ ] Graph page: add node search/filter within the MI network +- [x] Add `getSequences()` to frontend API client +- [x] Sequence viewer page — color-coded DNA bases, composition bar, expandable cards +- [x] Mobile responsive layout — breakpoints at 768px and 480px +- [x] Loading skeletons — shimmer animation for taxon detail and graph pages +- [x] Graph page: node search/filter within the MI network — autocomplete dropdown with camera animation ### Data Quality - [ ] Run `validate.py` and document results — what % of neighbors share genus/family? @@ -32,8 +39,9 @@ - [ ] Deduplicate sequences — check for identical accessions from multiple sources ### DevOps -- [ ] Add Dockerfile health checks -- [ ] CI pipeline (GitHub Actions) — lint, typecheck, test +- [x] Add Dockerfile health checks — API (Python urllib), Web (Node fetch), DB (pg_isready), Redis (redis-cli ping) +- [x] CI pipeline (GitHub Actions) — lint, typecheck, test, build +- [x] Fix lint warnings — removed unused imports, fixed f-string, removed unused variable - [ ] Production deployment config (fly.io, Railway, or VPS) ## Phase 2 (from ROADMAP.md) diff --git a/apps/api/Dockerfile b/apps/api/Dockerfile index 25053a1..50215dc 100644 --- a/apps/api/Dockerfile +++ b/apps/api/Dockerfile @@ -9,4 +9,8 @@ COPY src/ src/ COPY alembic.ini . 
ENV PYTHONPATH=/app/src + +HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ + CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1 + CMD ["uvicorn", "evograph.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] diff --git a/apps/api/pyproject.toml b/apps/api/pyproject.toml index 0e8f54e..d27fc63 100644 --- a/apps/api/pyproject.toml +++ b/apps/api/pyproject.toml @@ -2,7 +2,7 @@ name = "evograph" version = "0.1.0" description = "EvoGraph MVP - Evolutionary similarity graph" -requires-python = ">=3.12" +requires-python = ">=3.11" dependencies = [ "fastapi>=0.115", "uvicorn[standard]>=0.34", @@ -24,11 +24,14 @@ dev = [ [build-system] requires = ["hatchling"] -build-backend = "hatchling.backends" +build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] packages = ["src/evograph"] +[tool.pytest.ini_options] +testpaths = ["tests"] + [tool.ruff] target-version = "py312" line-length = 120 diff --git a/apps/api/src/evograph/api/routes/graph.py b/apps/api/src/evograph/api/routes/graph.py index 2cb464d..6118cb7 100644 --- a/apps/api/src/evograph/api/routes/graph.py +++ b/apps/api/src/evograph/api/routes/graph.py @@ -1,9 +1,9 @@ """Graph endpoints: subtree and MI-neighbor queries.""" -from collections import deque +import time from fastapi import APIRouter, Depends, HTTPException, Query -from sqlalchemy import and_, or_ +from sqlalchemy import and_, text from sqlalchemy.orm import Session from evograph.api.schemas.graph import GraphEdge, GraphResponse, NeighborOut, Node @@ -12,6 +12,11 @@ router = APIRouter(tags=["graph"]) +# ── In-memory cache for MI network (expensive query) ────── +_mi_network_cache: GraphResponse | None = None +_mi_network_cache_time: float = 0.0 +_MI_NETWORK_TTL: float = 300.0 # 5 minutes + @router.get("/graph/subtree/{ott_id}", response_model=GraphResponse) def get_subtree_graph( @@ -21,39 +26,47 @@ def get_subtree_graph( ) -> GraphResponse: """Get a graph 
containing the taxonomy subtree + MI edges among those nodes. - Walk down from ott_id collecting descendants up to `depth` levels. - Include taxonomy edges (parent->child) and MI edges between nodes in the set. + Uses a recursive CTE to collect all descendants in a single query, + replacing the previous Python-side BFS that issued one query per level. """ root = db.query(Taxon).filter(Taxon.ott_id == ott_id).first() if root is None: raise HTTPException(status_code=404, detail="Taxon not found") - # BFS to collect descendants up to `depth` levels - collected: dict[int, Taxon] = {root.ott_id: root} + # Recursive CTE: fetch entire subtree in one query + subtree_rows = db.execute( + text(""" + WITH RECURSIVE subtree AS ( + SELECT ott_id, name, rank, parent_ott_id, 0 AS depth + FROM taxa + WHERE ott_id = :root_id + UNION ALL + SELECT t.ott_id, t.name, t.rank, t.parent_ott_id, s.depth + 1 + FROM taxa t + JOIN subtree s ON t.parent_ott_id = s.ott_id + WHERE s.depth < :max_depth + ) + SELECT ott_id, name, rank, parent_ott_id, depth FROM subtree + """), + {"root_id": ott_id, "max_depth": depth}, + ).fetchall() + + # Build taxa dict and taxonomy edges from CTE results + # Always include the root (CTE may return empty in test environments) + taxa_info: dict[int, tuple[str, str]] = { + root.ott_id: (root.name, root.rank), + } taxonomy_edges: list[GraphEdge] = [] - queue: deque[tuple[int, int]] = deque([(root.ott_id, 0)]) - - while queue: - current_id, current_depth = queue.popleft() - if current_depth >= depth: - continue - children = ( - db.query(Taxon).filter(Taxon.parent_ott_id == current_id).all() - ) - for child in children: - if child.ott_id not in collected: - collected[child.ott_id] = child - taxonomy_edges.append( - GraphEdge( - src=current_id, - dst=child.ott_id, - kind="taxonomy", - distance=None, - ) - ) - queue.append((child.ott_id, current_depth + 1)) - ott_ids = list(collected.keys()) + for row in subtree_rows: + node_ott_id, name, rank, parent_ott_id, row_depth 
= row + taxa_info[node_ott_id] = (name, rank) + if row_depth > 0 and parent_ott_id in taxa_info: + taxonomy_edges.append( + GraphEdge(src=parent_ott_id, dst=node_ott_id, kind="taxonomy", distance=None) + ) + + ott_ids = list(taxa_info.keys()) # Fetch image URLs for all collected nodes media_rows = ( @@ -86,12 +99,12 @@ def get_subtree_graph( nodes = [ Node( - ott_id=t.ott_id, - name=t.name, - rank=t.rank, - image_url=media_map.get(t.ott_id), + ott_id=node_ott_id, + name=name, + rank=rank, + image_url=media_map.get(node_ott_id), ) - for t in collected.values() + for node_ott_id, (name, rank) in taxa_info.items() ] return GraphResponse( @@ -109,7 +122,14 @@ def get_mi_network( Returns all taxa that have at least one MI edge, plus all MI edges between them (deduplicated to undirected). Includes taxonomy edges connecting species to their parent genus. + + Results are cached in-memory for 5 minutes. """ + global _mi_network_cache, _mi_network_cache_time + now = time.monotonic() + if _mi_network_cache is not None and (now - _mi_network_cache_time) < _MI_NETWORK_TTL: + return _mi_network_cache + # Get all MI edges all_edges = db.query(Edge).all() if not all_edges: @@ -172,7 +192,10 @@ def get_mi_network( for t in taxa_map.values() ] - return GraphResponse(nodes=nodes, edges=mi_edges + taxonomy_edges) + result = GraphResponse(nodes=nodes, edges=mi_edges + taxonomy_edges) + _mi_network_cache = result + _mi_network_cache_time = time.monotonic() + return result @router.get("/graph/neighbors/{ott_id}", response_model=list[NeighborOut]) diff --git a/apps/api/src/evograph/api/routes/search.py b/apps/api/src/evograph/api/routes/search.py index 148ff06..51ae357 100644 --- a/apps/api/src/evograph/api/routes/search.py +++ b/apps/api/src/evograph/api/routes/search.py @@ -1,6 +1,15 @@ -"""Search endpoint for taxa.""" +"""Search endpoint for taxa. + +Uses ILIKE for substring matching, backed by a pg_trgm GIN index +(migration 002) so matches can use an index scan instead of a sequential scan.
+ +Results are ordered to prioritize: +1. Prefix matches (names starting with the query) +2. Alphabetical order for remaining matches +""" from fastapi import APIRouter, Depends, Query +from sqlalchemy import case from sqlalchemy.orm import Session from evograph.api.schemas.taxa import TaxonSummary @@ -10,17 +19,34 @@ router = APIRouter(tags=["search"]) +def _escape_like(s: str) -> str: + """Escape special LIKE/ILIKE characters to prevent pattern injection.""" + return s.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_") + + @router.get("/search", response_model=list[TaxonSummary]) def search_taxa( q: str = Query(..., min_length=1), limit: int = Query(20, le=100), db: Session = Depends(get_db), ) -> list[TaxonSummary]: - """Search taxa by name (case-insensitive ILIKE).""" + """Search taxa by name (case-insensitive substring match). + + Uses pg_trgm GIN index for fast ILIKE on large tables. + Results prioritize prefix matches over substring matches. + """ + escaped = _escape_like(q) + + # Prefix matches rank first (sort_key=0), substring matches second (sort_key=1) + prefix_case = case( + (Taxon.name.ilike(f"{escaped}%"), 0), + else_=1, + ) + rows = ( db.query(Taxon) - .filter(Taxon.name.ilike(f"%{q}%")) - .order_by(Taxon.name) + .filter(Taxon.name.ilike(f"%{escaped}%")) + .order_by(prefix_case, Taxon.name) .limit(limit) .all() ) diff --git a/apps/api/src/evograph/api/routes/sequences.py b/apps/api/src/evograph/api/routes/sequences.py index fa4e76f..587f902 100644 --- a/apps/api/src/evograph/api/routes/sequences.py +++ b/apps/api/src/evograph/api/routes/sequences.py @@ -33,6 +33,7 @@ def get_sequences( marker=s.marker, source=s.source, accession=s.accession, + sequence=s.sequence, length=s.length, is_canonical=s.is_canonical, retrieved_at=s.retrieved_at, diff --git a/apps/api/src/evograph/api/routes/taxa.py b/apps/api/src/evograph/api/routes/taxa.py index 0c5968a..6f9fe9f 100644 --- a/apps/api/src/evograph/api/routes/taxa.py +++ 
b/apps/api/src/evograph/api/routes/taxa.py @@ -1,31 +1,72 @@ -"""Taxon detail endpoint.""" +"""Taxon detail endpoint with paginated children.""" -from fastapi import APIRouter, Depends, HTTPException -from sqlalchemy import func +from fastapi import APIRouter, Depends, HTTPException, Query +from sqlalchemy import func, text from sqlalchemy.orm import Session -from evograph.api.schemas.taxa import TaxonDetail, TaxonSummary +from evograph.api.schemas.taxa import ChildrenPage, TaxonDetail, TaxonSummary from evograph.db.models import NodeMedia, Sequence, Taxon from evograph.db.session import get_db router = APIRouter(tags=["taxa"]) +_INLINE_CHILDREN_LIMIT = 100 + + +def _fetch_lineage(db: Session, ott_id: int) -> list[TaxonSummary]: + """Fetch full lineage (root → ... → parent) using a recursive CTE. + + Single SQL query replaces N+1 individual parent lookups. + """ + result = db.execute( + text(""" + WITH RECURSIVE ancestors AS ( + SELECT ott_id, name, rank, parent_ott_id, 0 AS depth + FROM taxa + WHERE ott_id = (SELECT parent_ott_id FROM taxa WHERE ott_id = :ott_id) + UNION ALL + SELECT t.ott_id, t.name, t.rank, t.parent_ott_id, a.depth + 1 + FROM taxa t + JOIN ancestors a ON t.ott_id = a.parent_ott_id + WHERE a.depth < 20 + ) + SELECT ott_id, name, rank FROM ancestors ORDER BY depth DESC + """), + {"ott_id": ott_id}, + ).fetchall() + return [ + TaxonSummary(ott_id=row[0], name=row[1], rank=row[2]) + for row in result + ] + @router.get("/taxa/{ott_id}", response_model=TaxonDetail) def get_taxon( ott_id: int, db: Session = Depends(get_db), ) -> TaxonDetail: - """Get taxon detail with children, lineage, and canonical sequence availability.""" + """Get taxon detail with children, lineage, and canonical sequence availability. + + For taxa with more than 100 children, only the first 100 are returned inline. + Use GET /taxa/{ott_id}/children?offset=... for paginated access. 
+ """ taxon = db.query(Taxon).filter(Taxon.ott_id == ott_id).first() if taxon is None: raise HTTPException(status_code=404, detail="Taxon not found") - # Get children + # Count total children + total_children = ( + db.query(func.count(Taxon.ott_id)) + .filter(Taxon.parent_ott_id == ott_id) + .scalar() + ) or 0 + + # Get children (limited for inline display) children = ( db.query(Taxon) .filter(Taxon.parent_ott_id == ott_id) .order_by(Taxon.name) + .limit(_INLINE_CHILDREN_LIMIT) .all() ) @@ -51,31 +92,17 @@ def get_taxon( ) child_images = {ott: url for ott, url in media_rows} - has_canonical = ( + # EXISTS check is faster than fetching a full row + has_canonical = db.query( db.query(Sequence) .filter(Sequence.ott_id == ott_id, Sequence.is_canonical.is_(True)) - .first() - is not None - ) + .exists() + ).scalar() media = db.query(NodeMedia).filter(NodeMedia.ott_id == ott_id).first() - # Build lineage by walking up the parent chain - lineage: list[TaxonSummary] = [] - current = taxon - seen = {ott_id} - while current.parent_ott_id and current.parent_ott_id not in seen: - seen.add(current.parent_ott_id) - parent = db.query(Taxon).filter(Taxon.ott_id == current.parent_ott_id).first() - if parent is None: - break - lineage.append(TaxonSummary( - ott_id=parent.ott_id, - name=parent.name, - rank=parent.rank, - )) - current = parent - lineage.reverse() # root → ... 
→ parent + # Build lineage with single recursive CTE query (replaces N+1 parent walk) + lineage = _fetch_lineage(db, ott_id) parent_name = None if lineage: @@ -101,8 +128,73 @@ def get_taxon( ) for c in children ], + total_children=total_children, has_canonical_sequence=has_canonical, image_url=media.image_url if media else None, lineage=lineage, wikipedia_url=wikipedia_url, ) + + +@router.get("/taxa/{ott_id}/children", response_model=ChildrenPage) +def get_children( + ott_id: int, + offset: int = Query(0, ge=0), + limit: int = Query(100, ge=1, le=500), + db: Session = Depends(get_db), +) -> ChildrenPage: + """Paginated children for a taxon.""" + taxon = db.query(Taxon).filter(Taxon.ott_id == ott_id).first() + if taxon is None: + raise HTTPException(status_code=404, detail="Taxon not found") + + total = ( + db.query(func.count(Taxon.ott_id)) + .filter(Taxon.parent_ott_id == ott_id) + .scalar() + ) or 0 + + children = ( + db.query(Taxon) + .filter(Taxon.parent_ott_id == ott_id) + .order_by(Taxon.name) + .offset(offset) + .limit(limit) + .all() + ) + + child_ids = [c.ott_id for c in children] + child_counts: dict[int, int] = {} + child_images: dict[int, str] = {} + + if child_ids: + counts = ( + db.query(Taxon.parent_ott_id, func.count(Taxon.ott_id)) + .filter(Taxon.parent_ott_id.in_(child_ids)) + .group_by(Taxon.parent_ott_id) + .all() + ) + child_counts = {parent_id: cnt for parent_id, cnt in counts} + + media_rows = ( + db.query(NodeMedia.ott_id, NodeMedia.image_url) + .filter(NodeMedia.ott_id.in_(child_ids)) + .all() + ) + child_images = {ott: url for ott, url in media_rows} + + return ChildrenPage( + items=[ + TaxonSummary( + ott_id=c.ott_id, + name=c.name, + rank=c.rank, + child_count=child_counts.get(c.ott_id, 0), + image_url=child_images.get(c.ott_id), + ) + for c in children + ], + total=total, + offset=offset, + limit=limit, + ) diff --git a/apps/api/src/evograph/api/schemas/sequence.py b/apps/api/src/evograph/api/schemas/sequence.py index b5a810f..9c0ad50 
100644 --- a/apps/api/src/evograph/api/schemas/sequence.py +++ b/apps/api/src/evograph/api/schemas/sequence.py @@ -7,6 +7,7 @@ class SequenceOut(BaseModel): marker: str source: str accession: str + sequence: str length: int is_canonical: bool retrieved_at: datetime | None = None diff --git a/apps/api/src/evograph/api/schemas/taxa.py b/apps/api/src/evograph/api/schemas/taxa.py index 95ae39d..992391f 100644 --- a/apps/api/src/evograph/api/schemas/taxa.py +++ b/apps/api/src/evograph/api/schemas/taxa.py @@ -17,7 +17,15 @@ class TaxonDetail(BaseModel): parent_name: str | None = None ncbi_tax_id: int | None = None children: list[TaxonSummary] = [] + total_children: int = 0 has_canonical_sequence: bool = False image_url: str | None = None lineage: list[TaxonSummary] = [] wikipedia_url: str | None = None + + +class ChildrenPage(BaseModel): + items: list[TaxonSummary] + total: int + offset: int + limit: int diff --git a/apps/api/src/evograph/db/migrations/versions/002_performance_indexes.py b/apps/api/src/evograph/db/migrations/versions/002_performance_indexes.py new file mode 100644 index 0000000..cb60838 --- /dev/null +++ b/apps/api/src/evograph/db/migrations/versions/002_performance_indexes.py @@ -0,0 +1,67 @@ +"""Add performance indexes for search, neighbors, and canonical lookups. 
+ +Revision ID: 002 +Revises: 001 +Create Date: 2026-03-01 + +Indexes added: +- pg_trgm GIN index on taxa.name for fast ILIKE substring search +- Composite index on edges(src_ott_id, distance) for neighbor queries +- Composite index on sequences(ott_id, is_canonical) for canonical checks +- Index on taxa(rank) for rank-based filtering at scale +- Index on sequences(ott_id, marker, is_canonical) covering canonical selection +""" + +from alembic import op + +revision = "002" +down_revision = "001" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # Enable pg_trgm extension for trigram similarity search + op.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm") + + # GIN trigram index on taxa.name — enables fast ILIKE '%query%' search + # Without this, ILIKE does a full sequential scan on large tables + op.execute( + "CREATE INDEX ix_taxa_name_trgm ON taxa USING gin (name gin_trgm_ops)" + ) + + # Composite index for neighbor queries: + # SELECT ... FROM edges WHERE src_ott_id = ? ORDER BY distance LIMIT k + op.create_index( + "ix_edges_src_distance", + "edges", + ["src_ott_id", "distance"], + ) + + # Composite index for canonical sequence checks: + # SELECT ... FROM sequences WHERE ott_id = ? AND is_canonical = true + op.create_index( + "ix_sequences_ott_canonical", + "sequences", + ["ott_id", "is_canonical"], + ) + + # Covering index for canonical selection pipeline: + # SELECT ... FROM sequences WHERE ott_id = ? 
AND marker = 'COI' + op.create_index( + "ix_sequences_ott_marker", + "sequences", + ["ott_id", "marker"], + ) + + # Rank index for filtering/grouping by taxonomic rank + op.create_index("ix_taxa_rank", "taxa", ["rank"]) + + +def downgrade() -> None: + op.drop_index("ix_taxa_rank") + op.drop_index("ix_sequences_ott_marker") + op.drop_index("ix_sequences_ott_canonical") + op.drop_index("ix_edges_src_distance") + op.execute("DROP INDEX IF EXISTS ix_taxa_name_trgm") + op.execute("DROP EXTENSION IF EXISTS pg_trgm") diff --git a/apps/api/src/evograph/db/models.py b/apps/api/src/evograph/db/models.py index 9b28941..bf99e0c 100644 --- a/apps/api/src/evograph/db/models.py +++ b/apps/api/src/evograph/db/models.py @@ -10,6 +10,7 @@ DateTime, Double, ForeignKey, + Index, Integer, PrimaryKeyConstraint, Text, @@ -28,7 +29,7 @@ class Taxon(Base): ott_id: Mapped[int] = mapped_column(Integer, primary_key=True) name: Mapped[str] = mapped_column(Text, index=True) - rank: Mapped[str] = mapped_column(Text) + rank: Mapped[str] = mapped_column(Text, index=True) parent_ott_id: Mapped[int | None] = mapped_column( Integer, ForeignKey("taxa.ott_id"), nullable=True, index=True ) @@ -79,6 +80,7 @@ class Edge(Base): __tablename__ = "edges" __table_args__ = ( PrimaryKeyConstraint("src_ott_id", "dst_ott_id", "marker"), + Index("ix_edges_src_distance", "src_ott_id", "distance"), ) src_ott_id: Mapped[int] = mapped_column( diff --git a/apps/api/src/evograph/db/session.py b/apps/api/src/evograph/db/session.py index ebbf74c..1f79d7b 100644 --- a/apps/api/src/evograph/db/session.py +++ b/apps/api/src/evograph/db/session.py @@ -1,4 +1,4 @@ -"""SQLAlchemy engine and session factory.""" +"""SQLAlchemy engine and session factory with connection pooling.""" from collections.abc import Generator @@ -7,7 +7,16 @@ from evograph.settings import settings -engine = create_engine(settings.database_url) +engine = create_engine( + settings.database_url, + # Connection pool configuration for production readiness + 
pool_size=10, # Maintain 10 persistent connections + max_overflow=20, # Allow up to 20 additional connections under load + pool_recycle=300, # Recycle connections after 5 minutes (avoid stale connections) + pool_pre_ping=True, # Verify connections are alive before using them + pool_timeout=30, # Wait up to 30s for a connection from the pool + echo=False, # Set True for SQL debugging +) SessionLocal = sessionmaker(bind=engine, autocommit=False, autoflush=False) diff --git a/apps/api/src/evograph/main.py b/apps/api/src/evograph/main.py index 3ad4408..1f37113 100644 --- a/apps/api/src/evograph/main.py +++ b/apps/api/src/evograph/main.py @@ -1,9 +1,14 @@ from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware -from evograph.api.routes import taxa, graph, search, sequences +from fastapi.middleware.gzip import GZipMiddleware + +from evograph.api.routes import graph, search, sequences, taxa app = FastAPI(title="EvoGraph MVP", version="0.1.0") +# GZip responses > 500 bytes — critical for graph endpoints with large JSON payloads +app.add_middleware(GZipMiddleware, minimum_size=500) + app.add_middleware( CORSMiddleware, allow_origins=["*"], @@ -17,6 +22,7 @@ app.include_router(graph.router, prefix="/v1") app.include_router(sequences.router, prefix="/v1") + @app.get("/health") def health(): return {"status": "ok"} diff --git a/apps/api/src/evograph/pipeline/ingest_images.py b/apps/api/src/evograph/pipeline/ingest_images.py index 5ae1f59..d0e365d 100644 --- a/apps/api/src/evograph/pipeline/ingest_images.py +++ b/apps/api/src/evograph/pipeline/ingest_images.py @@ -15,7 +15,7 @@ import httpx from sqlalchemy import select, text -from evograph.db.models import NodeMedia, Taxon +from evograph.db.models import NodeMedia from evograph.db.session import engine logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s") diff --git a/apps/api/src/evograph/pipeline/validate.py b/apps/api/src/evograph/pipeline/validate.py index 40c7e3d..def1d10 100644 
--- a/apps/api/src/evograph/pipeline/validate.py +++ b/apps/api/src/evograph/pipeline/validate.py @@ -130,7 +130,7 @@ def validate() -> None: if len(distances) >= 2: print(f" StdDev: {statistics.stdev(distances):.4f}") - print(f"\nOutliers:") + print("\nOutliers:") print( f" Distance < 0.05 across families: {len(outliers_low)}" ) diff --git a/apps/api/src/evograph/services/mi_distance.py b/apps/api/src/evograph/services/mi_distance.py index df00662..6e6e2fa 100644 --- a/apps/api/src/evograph/services/mi_distance.py +++ b/apps/api/src/evograph/services/mi_distance.py @@ -51,9 +51,6 @@ def mi_from_alignment(aln: AlignmentResult) -> tuple[float, float, int]: y_counts: Counter[str] = Counter(y for _, y in columns) # Empirical distributions. - p_xy: dict[str, float] = { - f"{x},{y}": count / n for (x, y), count in joint_counts.items() - } p_x: dict[str, float] = {x: count / n for x, count in x_counts.items()} p_y: dict[str, float] = {y: count / n for y, count in y_counts.items()} diff --git a/apps/api/tests/__init__.py b/apps/api/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/apps/api/tests/conftest.py b/apps/api/tests/conftest.py new file mode 100644 index 0000000..77379a6 --- /dev/null +++ b/apps/api/tests/conftest.py @@ -0,0 +1,186 @@ +"""Shared test fixtures for EvoGraph API tests.""" + +import uuid +from datetime import datetime, timezone +from unittest.mock import MagicMock + +import pytest +from fastapi.testclient import TestClient + +from evograph.db.models import Edge, NodeMedia, Sequence, Taxon +from evograph.db.session import get_db +from evograph.main import app + + +def _make_taxon(ott_id: int, name: str, rank: str, parent_ott_id: int | None = None, **kw) -> Taxon: + t = MagicMock(spec=Taxon) + t.ott_id = ott_id + t.name = name + t.rank = rank + t.parent_ott_id = parent_ott_id + t.ncbi_tax_id = kw.get("ncbi_tax_id") + t.bold_tax_id = kw.get("bold_tax_id") + t.synonyms = kw.get("synonyms") + t.lineage = kw.get("lineage") + return 
t + + +def _make_sequence(ott_id: int, **kw) -> Sequence: + s = MagicMock(spec=Sequence) + s.id = kw.get("id", uuid.uuid4()) + s.ott_id = ott_id + s.marker = kw.get("marker", "COI") + s.source = kw.get("source", "NCBI") + s.accession = kw.get("accession", "NC_000001") + s.sequence = kw.get("sequence", "ATCGATCG") + s.length = kw.get("length", 658) + s.quality = kw.get("quality") + s.is_canonical = kw.get("is_canonical", False) + s.retrieved_at = kw.get("retrieved_at", datetime(2024, 1, 1, tzinfo=timezone.utc)) + return s + + +def _make_edge(src: int, dst: int, **kw) -> Edge: + e = MagicMock(spec=Edge) + e.src_ott_id = src + e.dst_ott_id = dst + e.marker = kw.get("marker", "COI") + e.distance = kw.get("distance", 0.15) + e.mi_norm = kw.get("mi_norm", 0.85) + e.align_len = kw.get("align_len", 600) + e.created_at = kw.get("created_at", datetime(2024, 1, 1, tzinfo=timezone.utc)) + return e + + +def _make_media(ott_id: int, image_url: str) -> NodeMedia: + m = MagicMock(spec=NodeMedia) + m.ott_id = ott_id + m.image_url = image_url + m.attribution = None + return m + + +# ── Sample data ───────────────────────────────────────── +AVES = _make_taxon(81461, "Aves", "class") +PASSERIFORMES = _make_taxon(1041547, "Passeriformes", "order", parent_ott_id=81461) +CORVIDAE = _make_taxon(187411, "Corvidae", "family", parent_ott_id=1041547) +CORVUS = _make_taxon(369568, "Corvus", "genus", parent_ott_id=187411) +CORVUS_CORAX = _make_taxon(700118, "Corvus corax", "species", parent_ott_id=369568) +CORVUS_CORONE = _make_taxon(893498, "Corvus corone", "species", parent_ott_id=369568) + + +@pytest.fixture() +def sample_taxa(): + return { + "aves": AVES, + "passeriformes": PASSERIFORMES, + "corvidae": CORVIDAE, + "corvus": CORVUS, + "corvus_corax": CORVUS_CORAX, + "corvus_corone": CORVUS_CORONE, + } + + +class MockExistsClause: + """Wraps a boolean for EXISTS subquery simulation.""" + pass + + +class MockQuery: + """Chainable mock for SQLAlchemy query calls.""" + + def __init__(self, 
results=None): + self._results = results or [] + + def filter(self, *args, **kwargs): + return self + + def order_by(self, *args, **kwargs): + return self + + def limit(self, *args, **kwargs): + return self + + def offset(self, *args, **kwargs): + return self + + def join(self, *args, **kwargs): + return self + + def group_by(self, *args, **kwargs): + return self + + def exists(self): + """Return an exists clause marker for use in outer query.""" + return MockExistsClause() + + def scalar(self): + # When called on an EXISTS wrapper query, return False + if self._results and isinstance(self._results[0], MockExistsClause): + return False + return 0 + + def all(self): + return self._results + + def first(self): + return self._results[0] if self._results else None + + +class MockExecuteResult: + """Mock result for db.execute() calls (used by recursive CTEs).""" + + def __init__(self, rows=None): + self._rows = rows or [] + + def fetchall(self): + return self._rows + + def scalars(self): + return self + + def all(self): + return [row[0] if isinstance(row, tuple) else row for row in self._rows] + + +class MockDB: + """Mock database session that dispatches query() by model type.""" + + def __init__(self): + self._registry: dict[type | tuple, list] = {} + + def set(self, key, results): + """Register results for a given model or model-tuple.""" + self._registry[key] = results + return self + + def query(self, *models): + # For EXISTS wrapper: db.query(exists_clause) + if len(models) == 1 and isinstance(models[0], MockExistsClause): + return MockQuery([models[0]]) + # For multi-model queries like (Edge, Taxon), use a tuple key + key = models[0] if len(models) == 1 else models + results = self._registry.get(key, []) + return MockQuery(results) + + def execute(self, *args, **kwargs): + """Handle raw SQL execute calls (recursive CTEs). 
Returns empty results.""" + return MockExecuteResult([]) + + +@pytest.fixture() +def mock_db(): + return MockDB() + + +@pytest.fixture() +def client(mock_db): + """FastAPI TestClient with the get_db dependency overridden.""" + + def _override_get_db(): + yield mock_db + + app.dependency_overrides[get_db] = _override_get_db + with TestClient(app) as c: + yield c + app.dependency_overrides.clear() diff --git a/apps/api/tests/test_graph.py b/apps/api/tests/test_graph.py new file mode 100644 index 0000000..bbeab39 --- /dev/null +++ b/apps/api/tests/test_graph.py @@ -0,0 +1,161 @@ +"""Tests for the /v1/graph/* endpoints.""" + +from evograph.db.models import Edge, NodeMedia, Taxon +from tests.conftest import _make_edge, _make_taxon + + +class TestSubtreeGraph: + def test_subtree_returns_graph(self, client, mock_db): + corvidae = _make_taxon(187411, "Corvidae", "family") + corvus = _make_taxon(369568, "Corvus", "genus", parent_ott_id=187411) + + mock_db.set(Taxon, [corvidae, corvus]) + mock_db.set(Edge, []) + mock_db.set(NodeMedia, []) + + resp = client.get("/v1/graph/subtree/187411", params={"depth": 1}) + assert resp.status_code == 200 + + data = resp.json() + assert "nodes" in data + assert "edges" in data + + def test_subtree_not_found(self, client, mock_db): + mock_db.set(Taxon, []) + + resp = client.get("/v1/graph/subtree/999999") + assert resp.status_code == 404 + + def test_subtree_depth_validation(self, client, mock_db): + taxon = _make_taxon(187411, "Corvidae", "family") + mock_db.set(Taxon, [taxon]) + mock_db.set(Edge, []) + mock_db.set(NodeMedia, []) + + # Depth 0 should fail validation (min is 1) + resp = client.get("/v1/graph/subtree/187411", params={"depth": 0}) + assert resp.status_code == 422 + + # Depth 6 should fail validation (max is 5) + resp = client.get("/v1/graph/subtree/187411", params={"depth": 6}) + assert resp.status_code == 422 + + def test_subtree_graph_schema(self, client, mock_db): + taxon = _make_taxon(187411, "Corvidae", "family") + 
mock_db.set(Taxon, [taxon]) + mock_db.set(Edge, []) + mock_db.set(NodeMedia, []) + + resp = client.get("/v1/graph/subtree/187411") + data = resp.json() + + # Root node must be present + assert any(n["ott_id"] == 187411 for n in data["nodes"]) + for node in data["nodes"]: + assert "ott_id" in node + assert "name" in node + assert "rank" in node + + +class TestMiNetwork: + def test_mi_network_empty(self, client, mock_db): + mock_db.set(Edge, []) + + resp = client.get("/v1/graph/mi-network") + assert resp.status_code == 200 + + data = resp.json() + assert data["nodes"] == [] + assert data["edges"] == [] + + def test_mi_network_returns_edges_and_nodes(self, client, mock_db): + edge = _make_edge(700118, 893498, distance=0.15, mi_norm=0.85) + corax = _make_taxon(700118, "Corvus corax", "species", parent_ott_id=369568) + corone = _make_taxon(893498, "Corvus corone", "species", parent_ott_id=369568) + + mock_db.set(Edge, [edge]) + mock_db.set(Taxon, [corax, corone]) + mock_db.set(NodeMedia, []) + + resp = client.get("/v1/graph/mi-network") + assert resp.status_code == 200 + + data = resp.json() + assert len(data["nodes"]) >= 2 + assert len(data["edges"]) >= 1 + + def test_mi_network_edge_schema(self, client, mock_db): + edge = _make_edge(700118, 893498) + corax = _make_taxon(700118, "Corvus corax", "species") + corone = _make_taxon(893498, "Corvus corone", "species") + + mock_db.set(Edge, [edge]) + mock_db.set(Taxon, [corax, corone]) + mock_db.set(NodeMedia, []) + + resp = client.get("/v1/graph/mi-network") + mi_edges = [e for e in resp.json()["edges"] if e["kind"] == "mi"] + assert len(mi_edges) >= 1 + for e in mi_edges: + assert "src" in e + assert "dst" in e + assert "distance" in e + + +class TestNeighbors: + def test_neighbors_returns_sorted(self, client, mock_db): + taxon = _make_taxon(700118, "Corvus corax", "species") + corone = _make_taxon(893498, "Corvus corone", "species") + edge = _make_edge(700118, 893498, distance=0.15, mi_norm=0.85) + + mock_db.set(Taxon, 
[taxon]) + mock_db.set((Edge, Taxon), [(edge, corone)]) + + resp = client.get("/v1/graph/neighbors/700118") + assert resp.status_code == 200 + + data = resp.json() + assert len(data) == 1 + assert data[0]["ott_id"] == 893498 + assert data[0]["distance"] == 0.15 + assert data[0]["mi_norm"] == 0.85 + + def test_neighbors_not_found(self, client, mock_db): + mock_db.set(Taxon, []) + + resp = client.get("/v1/graph/neighbors/999999") + assert resp.status_code == 404 + + def test_neighbors_k_validation(self, client, mock_db): + taxon = _make_taxon(700118, "Corvus corax", "species") + mock_db.set(Taxon, [taxon]) + mock_db.set((Edge, Taxon), []) + + # k=0 should fail (min is 1) + resp = client.get("/v1/graph/neighbors/700118", params={"k": 0}) + assert resp.status_code == 422 + + # k=51 should fail (max is 50) + resp = client.get("/v1/graph/neighbors/700118", params={"k": 51}) + assert resp.status_code == 422 + + def test_neighbors_empty_result(self, client, mock_db): + taxon = _make_taxon(700118, "Corvus corax", "species") + mock_db.set(Taxon, [taxon]) + mock_db.set((Edge, Taxon), []) + + resp = client.get("/v1/graph/neighbors/700118") + assert resp.status_code == 200 + assert resp.json() == [] + + def test_neighbor_schema(self, client, mock_db): + taxon = _make_taxon(700118, "Corvus corax", "species") + corone = _make_taxon(893498, "Corvus corone", "species") + edge = _make_edge(700118, 893498, distance=0.2, mi_norm=0.8) + + mock_db.set(Taxon, [taxon]) + mock_db.set((Edge, Taxon), [(edge, corone)]) + + resp = client.get("/v1/graph/neighbors/700118") + item = resp.json()[0] + assert set(item.keys()) == {"ott_id", "name", "rank", "distance", "mi_norm"} diff --git a/apps/api/tests/test_health.py b/apps/api/tests/test_health.py new file mode 100644 index 0000000..4e6e018 --- /dev/null +++ b/apps/api/tests/test_health.py @@ -0,0 +1,13 @@ +"""Tests for the health check endpoint.""" + +from fastapi.testclient import TestClient + +from evograph.main import app + +client = 
TestClient(app) + + +def test_health(): + resp = client.get("/health") + assert resp.status_code == 200 + assert resp.json() == {"status": "ok"} diff --git a/apps/api/tests/test_mi_distance.py b/apps/api/tests/test_mi_distance.py new file mode 100644 index 0000000..a8ce61e --- /dev/null +++ b/apps/api/tests/test_mi_distance.py @@ -0,0 +1,93 @@ +"""Tests for MI distance computation.""" + +import math + +from evograph.services.mi_distance import ( + distance_from_nmi, + entropy, + mi_from_alignment, +) +from evograph.utils.alignment import AlignmentResult + + +class TestEntropy: + def test_uniform_binary(self): + """H of a fair coin = ln(2).""" + p = {"A": 0.5, "B": 0.5} + h = entropy(p) + assert abs(h - math.log(2)) < 0.01 + + def test_certain_outcome(self): + """H of a certain outcome ~ 0.""" + p = {"A": 1.0} + h = entropy(p) + assert h < 0.01 + + def test_uniform_4(self): + """H of uniform distribution over 4 bases = ln(4).""" + p = {"A": 0.25, "C": 0.25, "G": 0.25, "T": 0.25} + h = entropy(p) + assert abs(h - math.log(4)) < 0.01 + + +class TestMiFromAlignment: + def test_identical_sequences(self): + """Identical sequences should have high MI (NMI close to 1).""" + seq = "ATCGATCGATCGATCG" * 10 # 160 bases, well above _MIN_COLUMNS + aln = AlignmentResult(a=seq, b=seq) + raw_mi, nmi, n = mi_from_alignment(aln) + assert n == 160 + assert nmi > 0.95 + assert raw_mi > 0 + + def test_too_few_columns(self): + """Fewer than 50 non-gap columns → zero MI.""" + seq = "ATCG" * 10 # 40 bases + aln = AlignmentResult(a=seq, b=seq) + raw_mi, nmi, n = mi_from_alignment(aln) + assert n == 40 + assert nmi == 0.0 + assert raw_mi == 0.0 + + def test_gaps_are_excluded(self): + """Gap columns should be excluded from MI computation.""" + a = "A" * 60 + "-" * 20 + b = "A" * 60 + "T" * 20 + aln = AlignmentResult(a=a, b=b) + _, _, n = mi_from_alignment(aln) + assert n == 60 # only non-gap columns counted + + def test_unrelated_sequences_low_mi(self): + """Sequences with no consistent 
pattern should have low MI.""" + # Alternating patterns that create independent distributions + import random + random.seed(42) + bases = "ACGT" + a = "".join(random.choice(bases) for _ in range(200)) + b = "".join(random.choice(bases) for _ in range(200)) + aln = AlignmentResult(a=a, b=b) + _, nmi, n = mi_from_alignment(aln) + assert n == 200 + assert nmi < 0.3 # should be low for random sequences + + def test_nmi_clamped_to_unit(self): + """NMI should always be in [0, 1].""" + seq = "ATCG" * 50 + aln = AlignmentResult(a=seq, b=seq) + _, nmi, _ = mi_from_alignment(aln) + assert 0.0 <= nmi <= 1.0 + + +class TestDistanceFromNmi: + def test_identical_distance_zero(self): + assert distance_from_nmi(1.0) == 0.0 + + def test_unrelated_distance_one(self): + assert distance_from_nmi(0.0) == 1.0 + + def test_middle(self): + assert abs(distance_from_nmi(0.5) - 0.5) < 0.001 + + def test_clamped(self): + assert distance_from_nmi(1.5) == 0.0 + assert distance_from_nmi(-0.5) == 1.0 diff --git a/apps/api/tests/test_pipeline.py b/apps/api/tests/test_pipeline.py new file mode 100644 index 0000000..a84a366 --- /dev/null +++ b/apps/api/tests/test_pipeline.py @@ -0,0 +1,87 @@ +"""Tests for pipeline logic — canonical selection scoring.""" + +from unittest.mock import MagicMock + +from evograph.db.models import Sequence +from evograph.pipeline.select_canonical import _score + + +def _make_seq(length: int, quality: dict | None = None) -> Sequence: + """Create a mock Sequence with the fields _score() needs.""" + s = MagicMock(spec=Sequence) + s.length = length + s.quality = quality + return s + + +class TestScore: + """Unit tests for the _score function used in canonical selection.""" + + def test_score_no_quality(self): + """Sequence with no quality dict should score as just its length.""" + seq = _make_seq(658) + assert _score(seq) == 658 + + def test_score_none_quality(self): + """Explicit None quality should score as just length.""" + seq = _make_seq(500, quality=None) + assert 
_score(seq) == 500 + + def test_score_empty_quality(self): + """Empty quality dict should score as just length (0 ambig).""" + seq = _make_seq(600, quality={}) + assert _score(seq) == 600 + + def test_score_with_ambig(self): + """Score should subtract 10 * ambig from length.""" + seq = _make_seq(658, quality={"ambig": 5}) + assert _score(seq) == 658 - 50 # 608 + + def test_score_zero_ambig(self): + """Zero ambig should give same score as no ambig.""" + seq = _make_seq(658, quality={"ambig": 0}) + assert _score(seq) == 658 + + def test_score_high_ambig_can_be_negative(self): + """Very high ambig count can produce a negative score.""" + seq = _make_seq(100, quality={"ambig": 20}) + assert _score(seq) == 100 - 200 # -100 + + def test_score_comparison_longer_wins(self): + """Longer sequence with no ambiguity beats shorter one.""" + long = _make_seq(800) + short = _make_seq(400) + assert _score(long) > _score(short) + + def test_score_comparison_ambig_penalized(self): + """Longer sequence with many ambiguous bases can lose to shorter clean one.""" + long_ambig = _make_seq(700, quality={"ambig": 50}) # 700 - 500 = 200 + short_clean = _make_seq(500) # 500 + assert _score(short_clean) > _score(long_ambig) + + def test_canonical_selection_picks_best(self): + """Simulate the max() selection from select_canonical.""" + seqs = [ + _make_seq(400, quality={"ambig": 2}), # 380 + _make_seq(658, quality={"ambig": 0}), # 658 (best) + _make_seq(600, quality={"ambig": 10}), # 500 + ] + best = max(seqs, key=lambda s: _score(s)) + assert best.length == 658 + + def test_canonical_selection_with_ties(self): + """When scores are equal, max() returns the first one found.""" + seqs = [ + _make_seq(500), # 500 + _make_seq(510, quality={"ambig": 1}), # 500 (tie) + _make_seq(400), # 400 + ] + best = max(seqs, key=lambda s: _score(s)) + # Both first and second score 500; max() returns the first + assert best.length == 500 + + def test_quality_non_dict_treated_as_no_quality(self): + """If quality 
is not a dict (e.g. a string), ambig defaults to 0.""" + seq = _make_seq(658, quality="invalid") + # isinstance check fails, so ambig = 0 + assert _score(seq) == 658 diff --git a/apps/api/tests/test_search.py b/apps/api/tests/test_search.py new file mode 100644 index 0000000..972ad60 --- /dev/null +++ b/apps/api/tests/test_search.py @@ -0,0 +1,50 @@ +"""Tests for the /v1/search endpoint.""" + +from evograph.db.models import Taxon +from tests.conftest import _make_taxon + + +class TestSearchTaxa: + def test_search_returns_matching_taxa(self, client, mock_db): + mock_db.set(Taxon, [ + _make_taxon(187411, "Corvidae", "family"), + _make_taxon(369568, "Corvus", "genus"), + ]) + + resp = client.get("/v1/search", params={"q": "corv"}) + assert resp.status_code == 200 + + data = resp.json() + assert len(data) == 2 + assert data[0]["name"] == "Corvidae" + assert data[0]["ott_id"] == 187411 + assert data[0]["rank"] == "family" + assert data[1]["name"] == "Corvus" + + def test_search_returns_empty_for_no_match(self, client, mock_db): + mock_db.set(Taxon, []) + + resp = client.get("/v1/search", params={"q": "zzzzzzz"}) + assert resp.status_code == 200 + assert resp.json() == [] + + def test_search_requires_query(self, client): + resp = client.get("/v1/search") + assert resp.status_code == 422 # validation error + + def test_search_respects_limit_param(self, client, mock_db): + mock_db.set(Taxon, [_make_taxon(1, "Corvus", "genus")]) + + resp = client.get("/v1/search", params={"q": "corv", "limit": 1}) + assert resp.status_code == 200 + + def test_search_rejects_empty_query(self, client): + resp = client.get("/v1/search", params={"q": ""}) + assert resp.status_code == 422 + + def test_search_fields_match_schema(self, client, mock_db): + mock_db.set(Taxon, [_make_taxon(100, "Falco", "genus")]) + + resp = client.get("/v1/search", params={"q": "falco"}) + item = resp.json()[0] + assert set(item.keys()) >= {"ott_id", "name", "rank"} diff --git a/apps/api/tests/test_sequences.py 
b/apps/api/tests/test_sequences.py new file mode 100644 index 0000000..e3ea024 --- /dev/null +++ b/apps/api/tests/test_sequences.py @@ -0,0 +1,51 @@ +"""Tests for the /v1/taxa/{ott_id}/sequences endpoint.""" + +from evograph.db.models import Sequence, Taxon +from tests.conftest import _make_sequence, _make_taxon + + +class TestGetSequences: + def test_sequences_returns_list(self, client, mock_db): + taxon = _make_taxon(700118, "Corvus corax", "species") + seq = _make_sequence(700118, accession="NC_001", length=658, is_canonical=True) + + mock_db.set(Taxon, [taxon]) + mock_db.set(Sequence, [seq]) + + resp = client.get("/v1/taxa/700118/sequences") + assert resp.status_code == 200 + + data = resp.json() + assert len(data) == 1 + assert data[0]["ott_id"] == 700118 + assert data[0]["marker"] == "COI" + assert data[0]["source"] == "NCBI" + assert data[0]["accession"] == "NC_001" + assert data[0]["length"] == 658 + assert data[0]["is_canonical"] is True + + def test_sequences_taxon_not_found(self, client, mock_db): + mock_db.set(Taxon, []) + + resp = client.get("/v1/taxa/999999/sequences") + assert resp.status_code == 404 + + def test_sequences_empty(self, client, mock_db): + taxon = _make_taxon(700118, "Corvus corax", "species") + mock_db.set(Taxon, [taxon]) + mock_db.set(Sequence, []) + + resp = client.get("/v1/taxa/700118/sequences") + assert resp.status_code == 200 + assert resp.json() == [] + + def test_sequence_schema_fields(self, client, mock_db): + taxon = _make_taxon(700118, "Corvus corax", "species") + seq = _make_sequence(700118) + mock_db.set(Taxon, [taxon]) + mock_db.set(Sequence, [seq]) + + resp = client.get("/v1/taxa/700118/sequences") + item = resp.json()[0] + expected_keys = {"id", "ott_id", "marker", "source", "accession", "sequence", "length", "is_canonical", "retrieved_at"} + assert expected_keys == set(item.keys()) diff --git a/apps/api/tests/test_taxa.py b/apps/api/tests/test_taxa.py new file mode 100644 index 0000000..c4b4426 --- /dev/null +++ 
b/apps/api/tests/test_taxa.py @@ -0,0 +1,92 @@ +"""Tests for the /v1/taxa/{ott_id} endpoint.""" + +from evograph.db.models import NodeMedia, Sequence, Taxon +from tests.conftest import _make_taxon + + +class TestGetTaxon: + def test_taxon_detail_basic(self, client, mock_db): + corvidae = _make_taxon(187411, "Corvidae", "family", parent_ott_id=1041547) + corvus = _make_taxon(369568, "Corvus", "genus", parent_ott_id=187411) + parent = _make_taxon(1041547, "Passeriformes", "order") + + # First query: get the taxon itself + # Second query: get children + # Third+ queries: batch counts, images, canonical check, media, lineage walk + mock_db.set(Taxon, [corvidae, corvus, parent]) + mock_db.set(Sequence, []) + mock_db.set(NodeMedia, []) + + resp = client.get("/v1/taxa/187411") + assert resp.status_code == 200 + + data = resp.json() + assert data["ott_id"] == 187411 + assert data["name"] == "Corvidae" + assert data["rank"] == "family" + + def test_taxon_not_found(self, client, mock_db): + mock_db.set(Taxon, []) + + resp = client.get("/v1/taxa/999999") + assert resp.status_code == 404 + + def test_taxon_includes_wikipedia_url(self, client, mock_db): + taxon = _make_taxon(700118, "Corvus corax", "species") + mock_db.set(Taxon, [taxon]) + mock_db.set(Sequence, []) + mock_db.set(NodeMedia, []) + + resp = client.get("/v1/taxa/700118") + data = resp.json() + assert data["wikipedia_url"] == "https://en.wikipedia.org/wiki/Corvus_corax" + + def test_taxon_response_fields(self, client, mock_db): + taxon = _make_taxon(81461, "Aves", "class") + mock_db.set(Taxon, [taxon]) + mock_db.set(Sequence, []) + mock_db.set(NodeMedia, []) + + resp = client.get("/v1/taxa/81461") + data = resp.json() + expected_keys = { + "ott_id", "name", "rank", "parent_ott_id", "parent_name", + "ncbi_tax_id", "children", "total_children", + "has_canonical_sequence", "image_url", "lineage", "wikipedia_url", + } + assert expected_keys <= set(data.keys()) + + +class TestGetChildren: + def test_children_basic(self, 
client, mock_db): + parent = _make_taxon(187411, "Corvidae", "family") + corvus = _make_taxon(369568, "Corvus", "genus", parent_ott_id=187411) + + mock_db.set(Taxon, [parent, corvus]) + mock_db.set(NodeMedia, []) + + resp = client.get("/v1/taxa/187411/children") + assert resp.status_code == 200 + + data = resp.json() + assert "items" in data + assert "total" in data + assert "offset" in data + assert "limit" in data + + def test_children_not_found(self, client, mock_db): + mock_db.set(Taxon, []) + + resp = client.get("/v1/taxa/999999/children") + assert resp.status_code == 404 + + def test_children_pagination_params(self, client, mock_db): + parent = _make_taxon(187411, "Corvidae", "family") + mock_db.set(Taxon, [parent]) + mock_db.set(NodeMedia, []) + + resp = client.get("/v1/taxa/187411/children", params={"offset": 10, "limit": 50}) + assert resp.status_code == 200 + data = resp.json() + assert data["offset"] == 10 + assert data["limit"] == 50 diff --git a/apps/web/Dockerfile b/apps/web/Dockerfile index d2d3257..949aeb2 100644 --- a/apps/web/Dockerfile +++ b/apps/web/Dockerfile @@ -7,4 +7,7 @@ RUN npm install COPY . . 
+HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \ + CMD node -e "fetch('http://localhost:3000').then(r => { if (!r.ok) process.exit(1) }).catch(() => process.exit(1))" || exit 1 + CMD ["npm", "run", "dev"] diff --git a/apps/web/src/app/globals.css b/apps/web/src/app/globals.css index 641b6f9..34a5ea3 100644 --- a/apps/web/src/app/globals.css +++ b/apps/web/src/app/globals.css @@ -83,6 +83,24 @@ a:hover { 50% { opacity: 0.4; } } +/* ── Skeletons ─────────────────────────────────────── */ + +.skeleton { + background: linear-gradient( + 90deg, + var(--bg-card) 25%, + #1e1e1e 50%, + var(--bg-card) 75% + ); + background-size: 200% 100%; + animation: shimmer 1.5s ease-in-out infinite; +} + +@keyframes shimmer { + 0% { background-position: 200% 0; } + 100% { background-position: -200% 0; } +} + .error { color: #ef5350; padding: 1rem; @@ -326,16 +344,94 @@ input:focus { } @media (max-width: 768px) { + .container { + padding: 0 1rem; + } + .detail-grid { grid-template-columns: 1fr; } + .hero-section { flex-direction: column; } + .hero-image-wrap { width: 120px; height: 120px; } + + .hero-title { + font-size: 1.35rem; + } + + .stats-bar { + font-size: 0.8rem; + gap: 0.35rem; + padding: 0.5rem 0.75rem; + } + + .breadcrumbs { + font-size: 0.78rem; + } + + .external-links { + gap: 0.35rem; + } + + .ext-link { + font-size: 0.75rem; + padding: 0.25rem 0.5rem; + } + + .graph-legend { + font-size: 0.6rem; + gap: 0.4rem; + padding: 0.4rem 0.6rem; + } + + .graph-legend-hint { + display: none; + } + + .neighbor-card-content { + padding: 0.5rem 0.65rem; + font-size: 0.9rem; + } + + .sequence-viewer { + font-size: 0.7rem; + } + + .sequence-offset { + width: 40px; + font-size: 0.6rem; + } + + .graph-search-input { + width: 150px; + font-size: 0.75rem; + } +} + +@media (max-width: 480px) { + .hero-image-wrap { + width: 80px; + height: 80px; + } + + .hero-title { + font-size: 1.15rem; + } + + .stats-bar { + flex-direction: column; + align-items: flex-start; + } + + 
.stats-sep { + display: none; + } } /* ── Collapsible sections ─────────────────────────── */ @@ -470,3 +566,162 @@ input:focus { font-variant-numeric: tabular-nums; white-space: nowrap; } + +/* ── Sequence viewer ─────────────────────────────── */ + +.sequence-card { + padding: 0 !important; + overflow: hidden; +} + +.sequence-card-header { + display: flex; + align-items: center; + justify-content: space-between; + width: 100%; + padding: 0.75rem 1rem; + border: none; + background: transparent; + color: inherit; + cursor: pointer; + font-size: 0.9rem; + transition: background 0.15s; +} + +.sequence-card-header:hover { + background: rgba(255, 255, 255, 0.03); +} + +.sequence-card-body { + border-top: 1px solid var(--border); + padding: 1rem; +} + +.sequence-viewer { + font-family: "SF Mono", "Fira Code", "Cascadia Code", monospace; + font-size: 0.8rem; + line-height: 1.6; + overflow-x: auto; + padding: 0.75rem 0; +} + +.sequence-line { + display: flex; + gap: 1rem; + white-space: nowrap; +} + +.sequence-offset { + display: inline-block; + width: 50px; + text-align: right; + color: #555; + font-size: 0.7rem; + flex-shrink: 0; + user-select: none; +} + +.sequence-bases { + letter-spacing: 0.05em; +} + +.sequence-chunk { + margin-right: 0.5em; +} + +/* ── Composition bar ─────────────────────────────── */ + +.composition-section { + margin-bottom: 1rem; +} + +.composition-bar { + display: flex; + height: 8px; + border-radius: 4px; + overflow: hidden; + margin-bottom: 0.5rem; +} + +.composition-legend { + display: flex; + flex-wrap: wrap; + gap: 0.75rem; + font-size: 0.8rem; +} + +.composition-item { + display: flex; + align-items: center; + gap: 0.3rem; +} + +.composition-dot { + width: 8px; + height: 8px; + border-radius: 50%; + flex-shrink: 0; +} + +/* ── Graph search ─────────────────────────────────── */ + +.graph-search-box { + position: relative; +} + +.graph-search-input { + width: 200px; + padding: 0.35rem 0.65rem; + border: 1px solid var(--border); + 
border-radius: var(--radius); + background: var(--bg-input); + color: var(--fg); + font-size: 0.8rem; + font-family: inherit; +} + +.graph-search-input::placeholder { + color: #666; +} + +.graph-search-dropdown { + position: absolute; + top: 100%; + left: 0; + right: 0; + margin-top: 4px; + background: var(--bg-card); + border: 1px solid var(--border); + border-radius: var(--radius); + max-height: 280px; + overflow-y: auto; + z-index: 100; + box-shadow: 0 8px 24px rgba(0, 0, 0, 0.5); +} + +.graph-search-item { + display: flex; + align-items: center; + justify-content: space-between; + width: 100%; + padding: 0.4rem 0.65rem; + border: none; + background: transparent; + color: var(--fg); + font-size: 0.8rem; + cursor: pointer; + text-align: left; + font-family: inherit; +} + +.graph-search-item:hover { + background: rgba(79, 195, 247, 0.1); +} + +.graph-search-rank { + font-size: 0.7rem; + color: #666; + text-transform: capitalize; + margin-left: 0.5rem; + flex-shrink: 0; +} diff --git a/apps/web/src/app/graph/page.tsx b/apps/web/src/app/graph/page.tsx index 0e47664..b7d17b8 100644 --- a/apps/web/src/app/graph/page.tsx +++ b/apps/web/src/app/graph/page.tsx @@ -1,19 +1,89 @@ "use client"; import dynamic from "next/dynamic"; -import { useEffect, useState } from "react"; +import { useEffect, useMemo, useRef, useState } from "react"; import { getMiNetwork } from "@/lib/api"; -import type { GraphResponse } from "@/lib/types"; +import type { GraphResponse, GraphNode } from "@/lib/types"; +import { GraphPageSkeleton } from "@/components/Skeleton"; const GraphViewSigma = dynamic( () => import("@/components/GraphViewSigma"), { ssr: false } ); +function NodeSearchBox({ + nodes, + onSelect, +}: { + nodes: GraphNode[]; + onSelect: (ottId: number | null) => void; +}) { + const [query, setQuery] = useState(""); + const [open, setOpen] = useState(false); + const ref = useRef(null); + + const matches = useMemo(() => { + if (query.length < 2) return []; + const q = 
query.toLowerCase(); + return nodes + .filter((n) => n.name.toLowerCase().includes(q)) + .slice(0, 12); + }, [query, nodes]); + + // Close dropdown on outside click + useEffect(() => { + function handleClick(e: MouseEvent) { + if (ref.current && !ref.current.contains(e.target as Node)) { + setOpen(false); + } + } + document.addEventListener("mousedown", handleClick); + return () => document.removeEventListener("mousedown", handleClick); + }, []); + + return ( +
+ { + setQuery(e.target.value); + setOpen(true); + if (e.target.value.length < 2) onSelect(null); + }} + onFocus={() => setOpen(true)} + className="graph-search-input" + /> + {open && matches.length > 0 && ( +
+ {matches.map((n) => ( + + ))} +
+ )} +
+ ); +} + export default function GraphPage() { const [graph, setGraph] = useState(null); const [error, setError] = useState(null); const [loading, setLoading] = useState(true); + const [highlightedNode, setHighlightedNode] = useState(null); useEffect(() => { getMiNetwork() @@ -35,26 +105,30 @@ export default function GraphPage() { MI Similarity Network -
-

+

+

Species with COI barcodes connected by mutual information similarity. Closer species have thicker, brighter edges. Hover to highlight, click to view details.

{graph && !loading && ( - - {speciesCount} species / {miCount} MI edges - +
+ + + {speciesCount} species / {miCount} MI edges + +
)}
{loading ? ( -
Loading graph...
+ ) : graph ? ( ) : null}
diff --git a/apps/web/src/app/taxa/[ottId]/page.tsx b/apps/web/src/app/taxa/[ottId]/page.tsx index 48347eb..a1271a0 100644 --- a/apps/web/src/app/taxa/[ottId]/page.tsx +++ b/apps/web/src/app/taxa/[ottId]/page.tsx @@ -4,10 +4,11 @@ import { useEffect, useState } from "react"; import { useParams } from "next/navigation"; import Link from "next/link"; import dynamic from "next/dynamic"; -import { getTaxon, getNeighbors, getSubtreeGraph } from "@/lib/api"; +import { getTaxon, getNeighbors, getSubtreeGraph, getChildren } from "@/lib/api"; import { wikipediaUrl, inaturalistUrl, ebirdUrl } from "@/lib/external-links"; import type { TaxonDetail, TaxonSummary, NeighborOut, GraphResponse } from "@/lib/types"; import TaxonCard from "@/components/TaxonCard"; +import { TaxonDetailSkeleton } from "@/components/Skeleton"; const GraphView = dynamic(() => import("@/components/GraphView"), { ssr: false }); @@ -127,9 +128,11 @@ export default function TaxonDetailPage() { const ottId = Number(params.ottId); const [taxon, setTaxon] = useState(null); + const [allChildren, setAllChildren] = useState([]); const [neighbors, setNeighbors] = useState([]); const [graph, setGraph] = useState(null); const [error, setError] = useState(null); + const [loadingMore, setLoadingMore] = useState(false); useEffect(() => { if (isNaN(ottId)) { @@ -138,6 +141,7 @@ export default function TaxonDetailPage() { } setTaxon(null); + setAllChildren([]); setNeighbors([]); setGraph(null); setError(null); @@ -149,19 +153,32 @@ export default function TaxonDetailPage() { ]) .then(([t, n, g]) => { setTaxon(t); + setAllChildren(t.children); setNeighbors(n); setGraph(g); }) .catch((err: Error) => setError(err.message)); }, [ottId]); + const loadMoreChildren = () => { + if (!taxon || loadingMore) return; + setLoadingMore(true); + getChildren(ottId, allChildren.length, 100) + .then((page) => { + setAllChildren((prev) => [...prev, ...page.items]); + }) + .catch(() => {}) + .finally(() => setLoadingMore(false)); + }; + if 
(error) return
Error: {error}
; - if (!taxon) return
Loading taxon...
; + if (!taxon) return ; const isSpecies = taxon.rank === "species" || taxon.rank === "subspecies"; const hasMiEdges = graph?.edges.some((e) => e.kind === "mi") ?? false; const showGraph = hasMiEdges && neighbors.length > 0; - const grouped = groupByRank(taxon.children); + const grouped = groupByRank(allChildren); + const hasMoreChildren = allChildren.length < taxon.total_children; const neighborMaxDist = neighbors.length > 0 ? Math.max(...neighbors.map((n) => n.distance)) * 1.1 // 10% headroom : 1; @@ -218,12 +235,17 @@ export default function TaxonDetailPage() { eBird + {taxon.has_canonical_sequence && ( + + COI Sequences + + )} {/* Stats bar */} - {taxon.children.length > 0 && } + {taxon.total_children > 0 && } {/* Empty state */} {grouped.length === 0 && neighbors.length === 0 && ( @@ -240,7 +262,14 @@ export default function TaxonDetailPage() { {/* Children grouped by rank */} {grouped.length > 0 && (
-

Children

+

+ Children + {hasMoreChildren && ( + + ({allChildren.length} of {taxon.total_children}) + + )} +

{grouped.map(([rank, items]) => ( ))} + {hasMoreChildren && ( + + )}
)} diff --git a/apps/web/src/app/taxa/[ottId]/sequences/page.tsx b/apps/web/src/app/taxa/[ottId]/sequences/page.tsx new file mode 100644 index 0000000..29b998d --- /dev/null +++ b/apps/web/src/app/taxa/[ottId]/sequences/page.tsx @@ -0,0 +1,233 @@ +"use client"; + +import { useEffect, useState } from "react"; +import { useParams } from "next/navigation"; +import Link from "next/link"; +import { getTaxon, getSequences } from "@/lib/api"; +import type { TaxonDetail, SequenceOut } from "@/lib/types"; +import { SkeletonLine } from "@/components/Skeleton"; + +// ── DNA base colors ───────────────────────────────── +const BASE_COLORS: Record = { + A: "#4fc3f7", // blue + T: "#ef5350", // red + C: "#66bb6a", // green + G: "#ffa726", // orange + N: "#888", +}; + +function colorForBase(base: string): string { + return BASE_COLORS[base.toUpperCase()] ?? "#888"; +} + +// ── Sequence display ──────────────────────────────── +function SequenceViewer({ sequence }: { sequence: string }) { + const chunkSize = 10; + const lineSize = 60; // bases per line + const lines: string[] = []; + for (let i = 0; i < sequence.length; i += lineSize) { + lines.push(sequence.slice(i, i + lineSize)); + } + + return ( +
+ {lines.map((line, lineIdx) => { + const offset = lineIdx * lineSize; + const chunks: string[] = []; + for (let i = 0; i < line.length; i += chunkSize) { + chunks.push(line.slice(i, i + chunkSize)); + } + return ( +
+ {offset + 1} + + {chunks.map((chunk, ci) => ( + + {[...chunk].map((base, bi) => ( + + {base} + + ))} + + ))} + +
+ ); + })} +
+ ); +} + +// ── Composition bar ───────────────────────────────── +function CompositionBar({ sequence }: { sequence: string }) { + const counts: Record = { A: 0, T: 0, C: 0, G: 0, N: 0 }; + for (const base of sequence.toUpperCase()) { + if (base in counts) counts[base]++; + else counts["N"]++; + } + const total = sequence.length; + + return ( +
+
+ {(["A", "T", "C", "G", "N"] as const).map((base) => { + const pct = (counts[base] / total) * 100; + if (pct === 0) return null; + return ( +
+ ); + })} +
+
+ {(["A", "T", "C", "G"] as const).map((base) => ( + + + {base} + + {counts[base]} ({((counts[base] / total) * 100).toFixed(1)}%) + + + ))} + {counts["N"] > 0 && ( + + + N + + {counts["N"]} ({((counts["N"] / total) * 100).toFixed(1)}%) + + + )} +
+
+ ); +} + +// ── Main page ─────────────────────────────────────── +export default function SequencesPage() { + const params = useParams<{ ottId: string }>(); + const ottId = Number(params.ottId); + + const [taxon, setTaxon] = useState(null); + const [sequences, setSequences] = useState([]); + const [error, setError] = useState(null); + const [loading, setLoading] = useState(true); + const [expanded, setExpanded] = useState(null); + + useEffect(() => { + if (isNaN(ottId)) { + setError("Invalid taxon ID"); + setLoading(false); + return; + } + + Promise.all([getTaxon(ottId), getSequences(ottId)]) + .then(([t, s]) => { + setTaxon(t); + setSequences(s); + // Auto-expand canonical sequence + const canonical = s.find((seq) => seq.is_canonical); + if (canonical) setExpanded(canonical.id); + }) + .catch((err: Error) => setError(err.message)) + .finally(() => setLoading(false)); + }, [ottId]); + + if (error) return
Error: {error}
; + if (loading) { + return ( +
+ +
+ + +
+
+ ); + } + + return ( +
+ {/* Breadcrumb */} + + +

+ COI Sequences +

+ {taxon && ( +

+ + {taxon.name} + + {" "}— {sequences.length} sequence{sequences.length !== 1 ? "s" : ""} +

+ )} + + {sequences.length === 0 ? ( +
+ No COI sequences found for this taxon. +
+ ) : ( +
+ {sequences.map((seq) => { + const isExpanded = expanded === seq.id; + return ( +
+ + + {isExpanded && ( +
+ + +
+ )} +
+ ); + })} +
+ )} +
+ ); +} diff --git a/apps/web/src/components/GraphView.tsx b/apps/web/src/components/GraphView.tsx index 56af297..a484eb1 100644 --- a/apps/web/src/components/GraphView.tsx +++ b/apps/web/src/components/GraphView.tsx @@ -72,7 +72,7 @@ export default function GraphView({ const hasMiEdges = graph.edges.some((e) => e.kind === "mi"); // Rank-specific selectors for node coloring via stylesheet - const rankStyles: cytoscape.Stylesheet[] = Object.entries(RANK_COLOR).map( + const rankStyles: cytoscape.StylesheetStyle[] = Object.entries(RANK_COLOR).map( ([rank, color]) => ({ selector: `node[rank="${rank}"]`, style: { @@ -84,7 +84,7 @@ export default function GraphView({ ); // Always-visible labels for higher-rank nodes - const labelStyles: cytoscape.Stylesheet[] = [ + const labelStyles: cytoscape.StylesheetStyle[] = [ { selector: 'node[rank="class"], node[rank="order"], node[rank="family"]', style: { diff --git a/apps/web/src/components/Skeleton.tsx b/apps/web/src/components/Skeleton.tsx new file mode 100644 index 0000000..625e2db --- /dev/null +++ b/apps/web/src/components/Skeleton.tsx @@ -0,0 +1,114 @@ +"use client"; + +/** + * Reusable skeleton primitives for loading states. + */ + +export function SkeletonLine({ + width = "100%", + height = "1rem", +}: { + width?: string; + height?: string; +}) { + return ( +
+ ); +} + +export function SkeletonCircle({ size = 44 }: { size?: number }) { + return ( +
+ ); +} + +export function SkeletonCard() { + return ( +
+ +
+ + +
+
+ ); +} + +/** Full-page loading skeleton for taxon detail. */ +export function TaxonDetailSkeleton() { + return ( +
+ {/* Breadcrumb skeleton */} +
+ + + +
+ + {/* Hero section skeleton */} +
+
+
+ +
+ + +
+
+ + + +
+
+
+ + {/* Stats bar skeleton */} +
+ +
+ + {/* Grid skeleton */} +
+
+ + {Array.from({ length: 4 }).map((_, i) => ( + + ))} +
+
+ +
+
+
+
+ ); +} + +/** Loading skeleton for the graph page. */ +export function GraphPageSkeleton() { + return ( +
+ +
+ + +
+
+
+ ); +} diff --git a/apps/web/src/lib/api.ts b/apps/web/src/lib/api.ts index 78d1897..e378619 100644 --- a/apps/web/src/lib/api.ts +++ b/apps/web/src/lib/api.ts @@ -1,4 +1,4 @@ -import type { TaxonSummary, TaxonDetail, GraphResponse, NeighborOut, SequenceOut } from "./types"; +import type { TaxonSummary, TaxonDetail, ChildrenPage, GraphResponse, NeighborOut, SequenceOut } from "./types"; const API_BASE = process.env.NEXT_PUBLIC_API_BASE || "http://localhost:8000"; @@ -32,6 +32,10 @@ export function getNeighbors(ottId: number, k = 15) { return getJSON(`/v1/graph/neighbors/${ottId}?k=${k}`); } +export function getChildren(ottId: number, offset = 0, limit = 100) { + return getJSON(`/v1/taxa/${ottId}/children?offset=${offset}&limit=${limit}`); +} + export function getSequences(ottId: number) { return getJSON(`/v1/taxa/${ottId}/sequences`); } diff --git a/apps/web/src/lib/types.ts b/apps/web/src/lib/types.ts index 582a091..41c66d8 100644 --- a/apps/web/src/lib/types.ts +++ b/apps/web/src/lib/types.ts @@ -11,18 +11,27 @@ export interface TaxonDetail extends TaxonSummary { parent_name: string | null; ncbi_tax_id: number | null; children: TaxonSummary[]; + total_children: number; has_canonical_sequence: boolean; image_url: string | null; lineage: TaxonSummary[]; wikipedia_url: string | null; } +export interface ChildrenPage { + items: TaxonSummary[]; + total: number; + offset: number; + limit: number; +} + export interface SequenceOut { id: string; ott_id: number; marker: string; source: string; accession: string; + sequence: string; length: number; is_canonical: boolean; retrieved_at: string | null; diff --git a/docker-compose.yml b/docker-compose.yml index 3f2011d..6dd1938 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,10 +8,20 @@ services: POSTGRES_DB: evograph ports: ["5432:5432"] volumes: ["pgdata:/var/lib/postgresql/data"] + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres"] + interval: 10s + timeout: 5s + retries: 5 redis: image: redis:7 
ports: ["6379:6379"] + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 5s + retries: 5 api: build: ./apps/api @@ -19,7 +29,11 @@ services: DATABASE_URL: postgresql+psycopg://postgres:postgres@db:5432/evograph REDIS_URL: redis://redis:6379/0 ports: ["8000:8000"] - depends_on: [db, redis] + depends_on: + db: + condition: service_healthy + redis: + condition: service_healthy volumes: - ./apps/api/src:/app/src - ./data:/app/data