Merged
6 changes: 6 additions & 0 deletions .env.example
@@ -2,3 +2,9 @@ DATABASE_URL=postgresql+psycopg://postgres:postgres@db:5432/evograph
REDIS_URL=redis://redis:6379/0
SCOPE_OTT_ROOT=Aves
NEXT_PUBLIC_API_BASE=http://localhost:8000

# Production settings
# POSTGRES_PASSWORD=changeme # Required for docker-compose.prod.yml
# CORS_ORIGINS=["https://evograph.example.com"]
# LOG_LEVEL=info # debug, info, warning, error, critical
# LOG_FORMAT=json # text (dev) or json (production)
3 changes: 3 additions & 0 deletions .github/workflows/ci.yml
@@ -56,6 +56,9 @@ jobs:
- name: Typecheck
run: npx tsc --noEmit

- name: Test
run: npm test

web-build:
name: Web — build
runs-on: ubuntu-latest
144 changes: 107 additions & 37 deletions CLAUDE.md

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion Makefile
@@ -1,11 +1,17 @@
.PHONY: up down migrate ingest_ott ingest_ncbi ingest_bold canonical neighbors export images validate pipeline
.PHONY: up down up-prod down-prod migrate ingest_ott ingest_ncbi ingest_bold canonical neighbors export images validate pipeline

up:
docker compose up --build

down:
docker compose down

up-prod:
docker compose -f docker-compose.yml -f docker-compose.prod.yml up --build -d

down-prod:
docker compose -f docker-compose.yml -f docker-compose.prod.yml down

migrate:
docker compose exec api alembic upgrade head

Expand Down
12 changes: 6 additions & 6 deletions TODO.md
@@ -3,15 +3,15 @@
## High Priority

### Sequence Coverage
- [ ] Expand NCBI ingestion — current query finds only 167/18,805 species. Try broader search terms, search by genus when species fails, increase `--per-species` limit
- [x] Expand NCBI ingestion — broader COI gene search terms (COI/COX1/COXI/CO1 + title variants), genus-level fallback, `--skip-existing` flag
- [ ] Retry BOLD portal — `portal.boldsystems.org` has been down since Feb 2026. Check periodically; when it returns, `ingest_bold.py` is ready
- [ ] Add NCBI taxonomy ID lookup — `ncbi_tax_id` column exists but is never populated. Add pipeline step to query NCBI Taxonomy by name and backfill
- [x] Add NCBI taxonomy ID lookup — `backfill_ncbi_tax_id.py` queries NCBI Taxonomy API by scientific name and updates ncbi_tax_id column

### Testing
- [x] API route tests — pytest + httpx TestClient for all endpoints (42 tests)
- [x] MI distance unit tests — entropy, MI computation, NMI clamping, distance conversion
- [x] Pipeline unit tests — canonical selection scoring logic (11 tests)
- [ ] Frontend smoke tests — basic render tests for key pages
- [x] Frontend smoke tests — Jest + React Testing Library, 58 tests across 8 suites (pages, components, API client, utilities)

### Performance
- [x] Cache MI network endpoint — in-memory cache with 5-minute TTL
@@ -35,8 +35,8 @@

### Data Quality
- [ ] Run `validate.py` and document results — what % of neighbors share genus/family?
- [ ] Flag taxonomic outliers — species whose MI neighbors are in different families
- [ ] Deduplicate sequences — check for identical accessions from multiple sources
- [x] Flag taxonomic outliers — `validate.py` now returns structured `ValidationReport` with `OutlierRecord` objects (cross-family close, within-genus distant), JSON export via `--output`
- [x] Deduplicate sequences — `dedup_sequences.py` removes duplicate accessions, keeping longest per (ott_id, accession, marker)

### DevOps
- [x] Add Dockerfile health checks — API (Python urllib), Web (Node fetch), DB (pg_isready), Redis (redis-cli ping)
@@ -47,7 +47,7 @@
## Phase 2 (from ROADMAP.md)

### Scale Across Animalia
- [ ] Make `SCOPE_OTT_ROOT` configurable — support Mammalia, Chordata, etc.
- [x] Make `SCOPE_OTT_ROOT` configurable — env var in docker-compose, `--scope` CLI arg, exposed in `/health` endpoint
- [ ] k-mer candidate filtering — replace family-scoped search with ANN index (FAISS/Annoy) for cross-family neighbor detection
- [ ] Job queue — replace one-shot scripts with Celery/RQ for background pipeline jobs
- [ ] Precompute subtree graph exports for common entry points
13 changes: 12 additions & 1 deletion apps/api/Dockerfile
@@ -1,4 +1,4 @@
FROM python:3.12-slim
FROM python:3.12-slim AS base

WORKDIR /app

@@ -13,4 +13,15 @@ ENV PYTHONPATH=/app/src
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1

# ── Development (default for docker-compose) ──────────
FROM base AS dev
CMD ["uvicorn", "evograph.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]

# ── Production ─────────────────────────────────────────
FROM base AS prod

RUN groupadd --gid 1000 appuser && \
useradd --uid 1000 --gid appuser --no-create-home appuser
USER appuser

CMD ["uvicorn", "evograph.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4", "--log-level", "warning"]
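The `HEALTHCHECK` in the base stage shells out to a one-line urllib probe. The same check can be kept as a small standalone script, which is easier to read and reuse; this is an equivalent sketch, not the Dockerfile's literal command, and the URL and timeout are assumptions:

```python
import sys
import urllib.error
import urllib.request

def probe(url: str = "http://localhost:8000/health", timeout: float = 5.0) -> int:
    """Return 0 on a 2xx response, 1 otherwise (Docker HEALTHCHECK exit semantics)."""
    try:
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            return 0 if 200 <= resp.status < 300 else 1
    except (urllib.error.URLError, OSError):
        return 1

if __name__ == "__main__":
    sys.exit(probe())
```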
6 changes: 5 additions & 1 deletion apps/api/src/evograph/api/routes/graph.py
@@ -2,7 +2,7 @@

import time

from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi import APIRouter, Depends, HTTPException, Query, Response
from sqlalchemy import and_, text
from sqlalchemy.orm import Session

@@ -115,6 +115,7 @@ def get_subtree_graph(

@router.get("/graph/mi-network", response_model=GraphResponse)
def get_mi_network(
response: Response,
db: Session = Depends(get_db),
) -> GraphResponse:
"""Get the full MI similarity network: all species with MI edges.
@@ -128,6 +129,7 @@ def get_mi_network(
global _mi_network_cache, _mi_network_cache_time
now = time.monotonic()
if _mi_network_cache is not None and (now - _mi_network_cache_time) < _MI_NETWORK_TTL:
response.headers["Cache-Control"] = "public, max-age=300"
return _mi_network_cache

# Get all MI edges
@@ -195,6 +197,8 @@ def get_mi_network(
result = GraphResponse(nodes=nodes, edges=mi_edges + taxonomy_edges)
_mi_network_cache = result
_mi_network_cache_time = time.monotonic()

response.headers["Cache-Control"] = "public, max-age=300"
return result


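The module-level cache this route relies on is a plain monotonic-clock TTL pattern. Distilled into a self-contained sketch (the names `get_network` and `build` are illustrative, not the route's actual globals):

```python
import time

_TTL = 300.0  # seconds, matching the 5-minute Cache-Control max-age
_cache = None
_cache_time = 0.0

def get_network(build):
    """Return the cached value while fresh; otherwise rebuild via `build()`."""
    global _cache, _cache_time
    now = time.monotonic()
    if _cache is not None and (now - _cache_time) < _TTL:
        return _cache
    _cache = build()
    _cache_time = time.monotonic()
    return _cache
```

Using `time.monotonic()` rather than `time.time()` keeps the TTL immune to wall-clock adjustments, which matters for long-lived API processes.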
26 changes: 18 additions & 8 deletions apps/api/src/evograph/api/routes/search.py
@@ -9,10 +9,10 @@
"""

from fastapi import APIRouter, Depends, Query
from sqlalchemy import case
from sqlalchemy import case, func
from sqlalchemy.orm import Session

from evograph.api.schemas.taxa import TaxonSummary
from evograph.api.schemas.taxa import SearchPage, TaxonSummary
from evograph.db.models import Taxon
from evograph.db.session import get_db

@@ -24,19 +24,25 @@ def _escape_like(s: str) -> str:
return s.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")


@router.get("/search", response_model=list[TaxonSummary])
@router.get("/search", response_model=SearchPage)
def search_taxa(
q: str = Query(..., min_length=1),
limit: int = Query(20, le=100),
db: Session = Depends(get_db),
) -> list[TaxonSummary]:
) -> SearchPage:
"""Search taxa by name (case-insensitive substring match).

Uses pg_trgm GIN index for fast ILIKE on large tables.
Results prioritize prefix matches over substring matches.
Returns paginated response with total count.
"""
escaped = _escape_like(q)

base_filter = Taxon.name.ilike(f"%{escaped}%")

# Total matching count (for client to know if there are more results)
total = db.query(func.count(Taxon.ott_id)).filter(base_filter).scalar() or 0

# Prefix matches rank first (sort_key=0), substring matches second (sort_key=1)
prefix_case = case(
(Taxon.name.ilike(f"{escaped}%"), 0),
@@ -45,11 +51,15 @@ def search_taxa(

rows = (
db.query(Taxon)
.filter(Taxon.name.ilike(f"%{escaped}%"))
.filter(base_filter)
.order_by(prefix_case, Taxon.name)
.limit(limit)
.all()
)
return [
TaxonSummary(ott_id=t.ott_id, name=t.name, rank=t.rank) for t in rows
]
return SearchPage(
items=[
TaxonSummary(ott_id=t.ott_id, name=t.name, rank=t.rank) for t in rows
],
total=total,
limit=limit,
)
54 changes: 35 additions & 19 deletions apps/api/src/evograph/api/routes/sequences.py
@@ -1,42 +1,58 @@
"""Sequence endpoints for a taxon."""

from fastapi import APIRouter, Depends, HTTPException
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy import func
from sqlalchemy.orm import Session

from evograph.api.schemas.sequence import SequenceOut
from evograph.api.schemas.sequence import SequenceOut, SequencePage
from evograph.db.models import Sequence, Taxon
from evograph.db.session import get_db

router = APIRouter(tags=["sequences"])


@router.get("/taxa/{ott_id}/sequences", response_model=list[SequenceOut])
@router.get("/taxa/{ott_id}/sequences", response_model=SequencePage)
def get_sequences(
ott_id: int,
offset: int = Query(0, ge=0),
limit: int = Query(50, ge=1, le=200),
db: Session = Depends(get_db),
) -> list[SequenceOut]:
"""Get all sequences for a taxon."""
) -> SequencePage:
"""Get paginated sequences for a taxon."""
taxon = db.query(Taxon).filter(Taxon.ott_id == ott_id).first()
if taxon is None:
raise HTTPException(status_code=404, detail="Taxon not found")

total = (
db.query(func.count(Sequence.id))
.filter(Sequence.ott_id == ott_id)
.scalar()
) or 0

rows = (
db.query(Sequence)
.filter(Sequence.ott_id == ott_id)
.order_by(Sequence.is_canonical.desc(), Sequence.retrieved_at.desc())
.offset(offset)
.limit(limit)
.all()
)
return [
SequenceOut(
id=str(s.id),
ott_id=s.ott_id,
marker=s.marker,
source=s.source,
accession=s.accession,
sequence=s.sequence,
length=s.length,
is_canonical=s.is_canonical,
retrieved_at=s.retrieved_at,
)
for s in rows
]
return SequencePage(
items=[
SequenceOut(
id=str(s.id),
ott_id=s.ott_id,
marker=s.marker,
source=s.source,
accession=s.accession,
sequence=s.sequence,
length=s.length,
is_canonical=s.is_canonical,
retrieved_at=s.retrieved_at,
)
for s in rows
],
total=total,
offset=offset,
limit=limit,
)
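A client can walk the offset/limit contract introduced here (`items`, `total`, `offset`, `limit`) until the collection is exhausted. A sketch, where `fetch_page` is a hypothetical stand-in for the HTTP call to `/taxa/{ott_id}/sequences`:

```python
def iter_all(fetch_page, limit: int = 50):
    """Yield every item, advancing offset one page at a time until total is reached."""
    offset = 0
    while True:
        page = fetch_page(offset=offset, limit=limit)
        yield from page["items"]
        offset += len(page["items"])
        # Stop on the last page, or immediately if the server returns nothing.
        if offset >= page["total"] or not page["items"]:
            return
```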
87 changes: 87 additions & 0 deletions apps/api/src/evograph/api/routes/stats.py
@@ -0,0 +1,87 @@
"""Stats endpoint for observability — database counts and data quality overview."""

from fastapi import APIRouter, Depends
from fastapi.responses import JSONResponse
from sqlalchemy import func
from sqlalchemy.orm import Session

from evograph.db.models import Edge, Sequence, Taxon
from evograph.db.session import get_db

router = APIRouter(tags=["stats"])


@router.get("/stats")
def get_stats(db: Session = Depends(get_db)):
"""Return summary statistics about the database contents.

Includes:
- Total taxa count and breakdown by rank
- Total sequences and breakdown by source/marker
- Total MI edges with distance summary
- Species with/without sequences
"""
# Taxa by rank
rank_counts = (
db.query(Taxon.rank, func.count())
.group_by(Taxon.rank)
.order_by(func.count().desc())
.all()
)

total_taxa = sum(c for _, c in rank_counts)

# Sequences by source
seq_by_source = (
db.query(Sequence.source, func.count())
.group_by(Sequence.source)
.all()
)
total_sequences = sum(c for _, c in seq_by_source)

# Species with at least one sequence
species_with_seqs = (
db.query(func.count(func.distinct(Sequence.ott_id)))
.scalar()
) or 0

total_species = next(
(c for r, c in rank_counts if r == "species"), 0
)

# Edge stats
total_edges = db.query(func.count()).select_from(Edge).scalar() or 0
distance_stats = None
if total_edges > 0:
row = db.query(
func.min(Edge.distance),
func.max(Edge.distance),
func.avg(Edge.distance),
).one()
distance_stats = {
"min": round(float(row[0]), 4) if row[0] is not None else None,
"max": round(float(row[1]), 4) if row[1] is not None else None,
"avg": round(float(row[2]), 4) if row[2] is not None else None,
}

data = {
"taxa": {
"total": total_taxa,
"by_rank": {rank: count for rank, count in rank_counts},
},
"sequences": {
"total": total_sequences,
"by_source": {source: count for source, count in seq_by_source},
"species_with_sequences": species_with_seqs,
"species_total": total_species,
"coverage_pct": (
round(100.0 * species_with_seqs / total_species, 1)
if total_species > 0 else 0.0
),
},
"edges": {
"total": total_edges,
"distance": distance_stats,
},
}
return JSONResponse(content=data, headers={"Cache-Control": "public, max-age=60"})
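The coverage figure at the end of the payload is a guarded percentage. Extracted as a standalone helper (the function name is illustrative; the endpoint computes this inline):

```python
def coverage_pct(species_with_seqs: int, total_species: int) -> float:
    """Percent of species with at least one sequence, 0.0 when there are no species."""
    if total_species <= 0:
        return 0.0
    return round(100.0 * species_with_seqs / total_species, 1)
```

With the counts mentioned in TODO.md (167 of 18,805 species), this reports 0.9, which makes the low-coverage problem visible at a glance in `/stats`.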
11 changes: 10 additions & 1 deletion apps/api/src/evograph/api/schemas/sequence.py
@@ -1,6 +1,8 @@
from pydantic import BaseModel
from datetime import datetime

from pydantic import BaseModel


class SequenceOut(BaseModel):
id: str
ott_id: int
@@ -11,3 +13,10 @@ class SequenceOut(BaseModel):
length: int
is_canonical: bool
retrieved_at: datetime | None = None


class SequencePage(BaseModel):
items: list[SequenceOut]
total: int
offset: int
limit: int
6 changes: 6 additions & 0 deletions apps/api/src/evograph/api/schemas/taxa.py
@@ -29,3 +29,9 @@ class ChildrenPage(BaseModel):
total: int
offset: int
limit: int


class SearchPage(BaseModel):
items: list[TaxonSummary]
total: int
limit: int