80 changes: 80 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,80 @@
name: CI

on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]

jobs:
  api-lint-test:
    name: API — lint & test
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: apps/api

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
          cache: pip

      - name: Install dependencies
        run: pip install -e ".[dev]"

      - name: Lint with ruff
        run: ruff check src/ tests/

      - name: Run tests
        run: python -m pytest tests/ -v

  web-lint-typecheck:
    name: Web — lint & typecheck
    runs-on: ubuntu-latest
    defaults:
      run:
        working-directory: apps/web

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          node-version: "22"
          cache: npm
          cache-dependency-path: apps/web/package-lock.json

      - name: Install dependencies
        run: npm ci

      - name: Lint
        run: npm run lint

      - name: Typecheck
        run: npx tsc --noEmit

  web-build:
    name: Web — build
    runs-on: ubuntu-latest
    needs: web-lint-typecheck
    defaults:
      run:
        working-directory: apps/web

    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          node-version: "22"
          cache: npm
          cache-dependency-path: apps/web/package-lock.json

      - name: Install dependencies
        run: npm ci

      - name: Build
        run: npm run build
334 changes: 334 additions & 0 deletions CLAUDE.md

Large diffs are not rendered by default.

32 changes: 20 additions & 12 deletions TODO.md
@@ -8,32 +8,40 @@
- [ ] Add NCBI taxonomy ID lookup — `ncbi_tax_id` column exists but is never populated. Add pipeline step to query NCBI Taxonomy by name and backfill

### Testing
- [ ] API route tests — pytest + httpx TestClient for all 6 endpoints
- [ ] Pipeline unit tests — test MI computation with known sequences, test canonical selection logic
- [x] API route tests — pytest + httpx TestClient for all endpoints (42 tests)
- [x] MI distance unit tests — entropy, MI computation, NMI clamping, distance conversion
- [x] Pipeline unit tests — canonical selection scoring logic (11 tests)
- [ ] Frontend smoke tests — basic render tests for key pages

### Performance
- [ ] Cache MI network endpoint — the full graph loads all edges every request. Add Redis or in-memory caching with TTL
- [ ] Add DB indexes on `edges(src_ott_id, dst_ott_id)` if not already present
- [ ] Paginate children for large taxa (Aves has 729 direct children)
- [x] Cache MI network endpoint — in-memory cache with 5-minute TTL
- [x] Performance indexes (migration 002) — pg_trgm, composite indexes for neighbors/canonical/search
- [x] Paginate children for large taxa — inline limit of 100, dedicated `/taxa/{id}/children` endpoint with offset/limit
- [x] Connection pooling — 10 persistent + 20 overflow, pre-ping, 5min recycle
- [x] Recursive CTE for lineage — single query replaces N+1 parent chain walk
- [x] Recursive CTE for subtree — single query replaces Python BFS with per-level queries
- [x] GZip compression — middleware compresses responses > 500 bytes
- [x] Search optimization — pg_trgm GIN index + prefix ranking + LIKE pattern escaping
- [x] EXISTS for canonical check — replaces fetching full row
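
The two recursive-CTE items above (lineage and subtree) share one idea: collapse an N+1 loop into a single depth-limited query. A minimal self-contained sketch, using sqlite3 as a stand-in for Postgres and a toy `taxa` table (the row data here is illustrative, not from the real dataset):

```python
import sqlite3

# Toy taxonomy: root -> two children -> one grandchild.
conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE taxa (ott_id INTEGER PRIMARY KEY, name TEXT, parent_ott_id INTEGER)"
)
conn.executemany(
    "INSERT INTO taxa VALUES (?, ?, ?)",
    [(1, "Aves", None), (2, "Corvus", 1), (3, "Turdus", 1), (4, "Corvus corax", 2)],
)

# Depth-limited subtree in one query, replacing a per-level BFS loop.
rows = conn.execute(
    """
    WITH RECURSIVE subtree AS (
        SELECT ott_id, name, parent_ott_id, 0 AS depth
        FROM taxa WHERE ott_id = :root_id
        UNION ALL
        SELECT t.ott_id, t.name, t.parent_ott_id, s.depth + 1
        FROM taxa t JOIN subtree s ON t.parent_ott_id = s.ott_id
        WHERE s.depth < :max_depth
    )
    SELECT ott_id, depth FROM subtree ORDER BY depth, ott_id
    """,
    {"root_id": 1, "max_depth": 1},
).fetchall()
print(rows)  # → [(1, 0), (2, 1), (3, 1)] — depth 1 stops before the grandchild
```

The `WHERE s.depth < :max_depth` guard in the recursive arm is what bounds the walk, mirroring the `depth` query parameter on the subtree endpoint.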

## Medium Priority

### Frontend Polish
- [ ] Add `getSequences()` to frontend API client — endpoint exists but no client function
- [ ] Sequence viewer page — show aligned sequences for a species, highlight conserved regions
- [ ] Mobile responsive layout — test and fix breakpoints
- [ ] Loading skeletons instead of plain "Loading..." text
- [ ] Graph page: add node search/filter within the MI network
- [x] Add `getSequences()` to frontend API client
- [x] Sequence viewer page — color-coded DNA bases, composition bar, expandable cards
- [x] Mobile responsive layout — breakpoints at 768px and 480px
- [x] Loading skeletons — shimmer animation for taxon detail and graph pages
- [x] Graph page: node search/filter within the MI network — autocomplete dropdown with camera animation

### Data Quality
- [ ] Run `validate.py` and document results — what % of neighbors share genus/family?
- [ ] Flag taxonomic outliers — species whose MI neighbors are in different families
- [ ] Deduplicate sequences — check for identical accessions from multiple sources

### DevOps
- [ ] Add Dockerfile health checks
- [ ] CI pipeline (GitHub Actions) — lint, typecheck, test
- [x] Add Dockerfile health checks — API (Python urllib), Web (Node fetch), DB (pg_isready), Redis (redis-cli ping)
- [x] CI pipeline (GitHub Actions) — lint, typecheck, test, build
- [x] Fix lint warnings — removed unused imports, fixed f-string, removed unused variable
- [ ] Production deployment config (fly.io, Railway, or VPS)

## Phase 2 (from ROADMAP.md)
4 changes: 4 additions & 0 deletions apps/api/Dockerfile
@@ -9,4 +9,8 @@ COPY src/ src/
COPY alembic.ini .

ENV PYTHONPATH=/app/src

HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1

CMD ["uvicorn", "evograph.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
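
The HEALTHCHECK above shells out to a one-line urllib probe. A rough local sketch of the same probe logic, pointed at a stub `/health` server instead of the real uvicorn app on port 8000 (the stub handler is illustrative only):

```python
import http.server
import threading
import urllib.request

# Minimal stand-in for the API's /health endpoint, so the probe used in
# the Dockerfile HEALTHCHECK can be exercised without the real container.
class _HealthHandler(http.server.BaseHTTPRequestHandler):
    def do_GET(self):
        if self.path == "/health":
            self.send_response(200)
            self.end_headers()
            self.wfile.write(b"ok")
        else:
            self.send_response(404)
            self.end_headers()

    def log_message(self, *args):  # silence per-request logging
        pass

server = http.server.HTTPServer(("127.0.0.1", 0), _HealthHandler)
port = server.server_address[1]
threading.Thread(target=server.serve_forever, daemon=True).start()

# Same shape as the HEALTHCHECK command: urlopen raises on failure,
# which is what makes `|| exit 1` mark the container unhealthy.
status = urllib.request.urlopen(f"http://127.0.0.1:{port}/health").status
server.shutdown()
server.server_close()
print(status)  # → 200
```

Using urllib keeps the image free of a curl/wget dependency, which is why the probe is written in Python rather than shell.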
7 changes: 5 additions & 2 deletions apps/api/pyproject.toml
@@ -2,7 +2,7 @@
name = "evograph"
version = "0.1.0"
description = "EvoGraph MVP - Evolutionary similarity graph"
requires-python = ">=3.12"
requires-python = ">=3.11"
dependencies = [
"fastapi>=0.115",
"uvicorn[standard]>=0.34",
@@ -24,11 +24,14 @@ dev = [

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.backends"
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/evograph"]

[tool.pytest.ini_options]
testpaths = ["tests"]

[tool.ruff]
target-version = "py312"
line-length = 120
91 changes: 57 additions & 34 deletions apps/api/src/evograph/api/routes/graph.py
@@ -1,9 +1,9 @@
"""Graph endpoints: subtree and MI-neighbor queries."""

from collections import deque
import time

from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy import and_, or_
from sqlalchemy import and_, text
from sqlalchemy.orm import Session

from evograph.api.schemas.graph import GraphEdge, GraphResponse, NeighborOut, Node
@@ -12,6 +12,11 @@

router = APIRouter(tags=["graph"])

# ── In-memory cache for MI network (expensive query) ──────
_mi_network_cache: GraphResponse | None = None
_mi_network_cache_time: float = 0.0
_MI_NETWORK_TTL: float = 300.0 # 5 minutes


@router.get("/graph/subtree/{ott_id}", response_model=GraphResponse)
def get_subtree_graph(
@@ -21,39 +26,47 @@ def get_subtree_graph(
) -> GraphResponse:
"""Get a graph containing the taxonomy subtree + MI edges among those nodes.

Walk down from ott_id collecting descendants up to `depth` levels.
Include taxonomy edges (parent->child) and MI edges between nodes in the set.
Uses a recursive CTE to collect all descendants in a single query,
replacing the previous Python-side BFS that issued one query per level.
"""
root = db.query(Taxon).filter(Taxon.ott_id == ott_id).first()
if root is None:
raise HTTPException(status_code=404, detail="Taxon not found")

# BFS to collect descendants up to `depth` levels
collected: dict[int, Taxon] = {root.ott_id: root}
# Recursive CTE: fetch entire subtree in one query
subtree_rows = db.execute(
text("""
WITH RECURSIVE subtree AS (
SELECT ott_id, name, rank, parent_ott_id, 0 AS depth
FROM taxa
WHERE ott_id = :root_id
UNION ALL
SELECT t.ott_id, t.name, t.rank, t.parent_ott_id, s.depth + 1
FROM taxa t
JOIN subtree s ON t.parent_ott_id = s.ott_id
WHERE s.depth < :max_depth
)
SELECT ott_id, name, rank, parent_ott_id, depth FROM subtree
"""),
{"root_id": ott_id, "max_depth": depth},
).fetchall()

# Build taxa dict and taxonomy edges from CTE results
# Always include the root (CTE may return empty in test environments)
taxa_info: dict[int, tuple[str, str]] = {
root.ott_id: (root.name, root.rank),
}
taxonomy_edges: list[GraphEdge] = []
queue: deque[tuple[int, int]] = deque([(root.ott_id, 0)])

while queue:
current_id, current_depth = queue.popleft()
if current_depth >= depth:
continue
children = (
db.query(Taxon).filter(Taxon.parent_ott_id == current_id).all()
)
for child in children:
if child.ott_id not in collected:
collected[child.ott_id] = child
taxonomy_edges.append(
GraphEdge(
src=current_id,
dst=child.ott_id,
kind="taxonomy",
distance=None,
)
)
queue.append((child.ott_id, current_depth + 1))

ott_ids = list(collected.keys())
for row in subtree_rows:
node_ott_id, name, rank, parent_ott_id, row_depth = row
taxa_info[node_ott_id] = (name, rank)
if row_depth > 0 and parent_ott_id in taxa_info:
taxonomy_edges.append(
GraphEdge(src=parent_ott_id, dst=node_ott_id, kind="taxonomy", distance=None)
)

ott_ids = list(taxa_info.keys())

# Fetch image URLs for all collected nodes
media_rows = (
@@ -86,12 +99,12 @@ def get_subtree_graph(

nodes = [
Node(
ott_id=t.ott_id,
name=t.name,
rank=t.rank,
image_url=media_map.get(t.ott_id),
ott_id=node_ott_id,
name=name,
rank=rank,
image_url=media_map.get(node_ott_id),
)
for t in collected.values()
for node_ott_id, (name, rank) in taxa_info.items()
]

return GraphResponse(
Expand All @@ -109,7 +122,14 @@ def get_mi_network(
Returns all taxa that have at least one MI edge, plus all MI edges
between them (deduplicated to undirected). Includes taxonomy edges
connecting species to their parent genus.

Results are cached in-memory for 5 minutes.
"""
global _mi_network_cache, _mi_network_cache_time
now = time.monotonic()
if _mi_network_cache is not None and (now - _mi_network_cache_time) < _MI_NETWORK_TTL:
return _mi_network_cache

# Get all MI edges
all_edges = db.query(Edge).all()
if not all_edges:
@@ -172,7 +192,10 @@ def get_mi_network(
for t in taxa_map.values()
]

return GraphResponse(nodes=nodes, edges=mi_edges + taxonomy_edges)
result = GraphResponse(nodes=nodes, edges=mi_edges + taxonomy_edges)
_mi_network_cache = result
_mi_network_cache_time = time.monotonic()
return result


@router.get("/graph/neighbors/{ott_id}", response_model=list[NeighborOut])
34 changes: 30 additions & 4 deletions apps/api/src/evograph/api/routes/search.py
@@ -1,6 +1,15 @@
"""Search endpoint for taxa."""
"""Search endpoint for taxa.

Uses ILIKE for substring matching, backed by a pg_trgm GIN index
(migration 002) so lookups use an index scan instead of a sequential scan.

Results are ordered to prioritize:
1. Prefix matches (names starting with the query)
2. Alphabetical order for remaining matches
"""

from fastapi import APIRouter, Depends, Query
from sqlalchemy import case
from sqlalchemy.orm import Session

from evograph.api.schemas.taxa import TaxonSummary
@@ -10,17 +19,34 @@
router = APIRouter(tags=["search"])


def _escape_like(s: str) -> str:
"""Escape special LIKE/ILIKE characters to prevent pattern injection."""
return s.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")


@router.get("/search", response_model=list[TaxonSummary])
def search_taxa(
q: str = Query(..., min_length=1),
limit: int = Query(20, le=100),
db: Session = Depends(get_db),
) -> list[TaxonSummary]:
"""Search taxa by name (case-insensitive ILIKE)."""
"""Search taxa by name (case-insensitive substring match).

Uses pg_trgm GIN index for fast ILIKE on large tables.
Results prioritize prefix matches over substring matches.
"""
escaped = _escape_like(q)

# Prefix matches rank first (sort_key=0), substring matches second (sort_key=1)
prefix_case = case(
(Taxon.name.ilike(f"{escaped}%"), 0),
else_=1,
)

rows = (
db.query(Taxon)
.filter(Taxon.name.ilike(f"%{q}%"))
.order_by(Taxon.name)
.filter(Taxon.name.ilike(f"%{escaped}%"))
.order_by(prefix_case, Taxon.name)
.limit(limit)
.all()
)
1 change: 1 addition & 0 deletions apps/api/src/evograph/api/routes/sequences.py
@@ -33,6 +33,7 @@ def get_sequences(
marker=s.marker,
source=s.source,
accession=s.accession,
sequence=s.sequence,
length=s.length,
is_canonical=s.is_canonical,
retrieved_at=s.retrieved_at,