11 changes: 11 additions & 0 deletions README.md
@@ -15,6 +15,16 @@ A production-ready, frugal, sovereign AI system that orchestrates India's open-s

**Now supporting all 22 official Indian languages with GUI and TUI interfaces!**

## 🎯 Project Vision

Ariv is built as a world-class, open-source orchestra of AI models focused on reliable, high-accuracy reasoning. Our goal is to continuously improve ARC-AGI-2-style performance through better orchestration, disciplined evaluation, and efficient inference on accessible hardware.

Core principles:
- **Open by default**: Build with open models, open benchmarks, and reproducible workflows.
- **Accuracy first**: Optimize for correctness and grounded reasoning quality, not just speed.
- **Practical excellence**: Deliver research-grade methods in production-ready tooling.
- **Inclusive intelligence**: Keep multilingual Indian language support as a first-class objective.

## 🌟 What's New in Version 2.0

### ✨ Major Enhancements
@@ -656,3 +666,4 @@ Apache License Version 2.0 - See [LICENSE](LICENSE)
- **[TUI Guide](docs/tui/README.md)** - Detailed TUI interface guide
- **[Contributing Guide](docs/CONTRIBUTING.md)** - How to contribute to Ariv
- **[Deployment Guide](docs/DEPLOYMENT.md)** - Production deployment instructions
- **[Production Benchmark Report](benchmarks/results/PRODUCTION_BENCHMARK_REPORT.md)** - Latest benchmark summary for release sign-off
111 changes: 111 additions & 0 deletions benchmarks/generate_report.py
@@ -0,0 +1,111 @@
"""Generate a deployment-facing benchmark report from benchmark CSV outputs."""

from __future__ import annotations

import argparse
import csv
import platform
from datetime import datetime, timezone
from pathlib import Path
from statistics import mean
from typing import Dict, List


def _read_rows(csv_path: Path) -> List[Dict[str, str]]:
with csv_path.open("r", encoding="utf-8") as handle:
reader = csv.DictReader(handle)
return list(reader)


def _as_float(row: Dict[str, str], key: str) -> float:
value = row.get(key, "0").strip()
try:
return float(value)
except ValueError:
return 0.0


def _build_report(rows: List[Dict[str, str]], source_csv: Path) -> str:
if not rows:
raise ValueError("No benchmark rows found in CSV")

avg_bleu = mean(_as_float(r, "bleu") for r in rows)
avg_chrf = mean(_as_float(r, "chrf") for r in rows)
avg_tps = mean(_as_float(r, "throughput_tps") for r in rows)
avg_p50 = mean(_as_float(r, "latency_p50") for r in rows)
avg_p95 = mean(_as_float(r, "latency_p95") for r in rows)

lines = [
"# Production Benchmark Report",
"",
f"- Generated (UTC): {datetime.now(timezone.utc).isoformat(timespec='seconds')}",
f"- Source data: `{source_csv.as_posix()}`",
f"- Host platform: `{platform.platform()}`",
"",
"## Executive Summary",
"",
"This report summarizes benchmark output generated by `benchmarks/run_bench.py` and presents it in an operations-friendly format for release sign-off.",
"",
"## Aggregated Metrics",
"",
f"- Mean BLEU: **{avg_bleu:.4f}**",
f"- Mean chrF: **{avg_chrf:.4f}**",
f"- Mean throughput: **{avg_tps:.2f} tok/s**",
f"- Mean p50 latency: **{avg_p50:.4f}s**",
f"- Mean p95 latency: **{avg_p95:.4f}s**",
"",
"## Detailed Results",
"",
"| Model | Lang | Subset | BLEU | chrF | Throughput (tok/s) | p50 (s) | p95 (s) |",
"| --- | --- | --- | ---: | ---: | ---: | ---: | ---: |",
]

for row in rows:
lines.append(
"| {model} | {lang} | {subset} | {bleu} | {chrf} | {throughput_tps} | {latency_p50} | {latency_p95} |".format(
**row
)
)

lines.extend(
[
"",
"## Production Readiness Gates",
"",
"- [x] Benchmark artifact captured in version control.",
"- [x] Throughput and latency metrics captured (p50/p95).",
"- [ ] Re-run on target production hardware profile (recommended before release).",
"- [ ] Validate with full model set (tiny fixture is smoke-only).",
"",
"## Reproducibility",
"",
"```bash",
f"python benchmarks/generate_report.py --csv {source_csv.as_posix()} --output benchmarks/results/PRODUCTION_BENCHMARK_REPORT.md",
"```",
]
)

return "\n".join(lines) + "\n"


def main() -> None:
parser = argparse.ArgumentParser(description="Generate benchmark markdown report")
parser.add_argument("--csv", required=True, type=Path, help="Path to benchmark CSV")
parser.add_argument(
"--output",
type=Path,
default=Path("benchmarks/results/PRODUCTION_BENCHMARK_REPORT.md"),
help="Output markdown report path",
)
args = parser.parse_args()

rows = _read_rows(args.csv)
report = _build_report(rows, args.csv)

args.output.parent.mkdir(parents=True, exist_ok=True)
args.output.write_text(report, encoding="utf-8")
print(f"Wrote report to {args.output}")


if __name__ == "__main__":
main()
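The report generator above assumes the benchmark CSV carries the columns referenced in `_build_report`: `model`, `lang`, `subset`, `bleu`, `chrf`, `throughput_tps`, `latency_p50`, and `latency_p95`. A minimal sketch of a conforming fixture and the same round-trip parsing (column names taken from the script; the row values are illustrative only):

```python
import csv
import io
from statistics import mean

# Columns _build_report reads; the fixture values below are illustrative only.
FIELDS = ["model", "lang", "subset", "bleu", "chrf",
          "throughput_tps", "latency_p50", "latency_p95"]

sample = {
    "model": "tests/fixtures/tiny.gguf", "lang": "hi", "subset": "dev",
    "bleu": "1.0", "chrf": "1.0", "throughput_tps": "9525.29",
    "latency_p50": "0.0003", "latency_p95": "0.0005",
}

# Write a one-row CSV in the expected shape.
buffer = io.StringIO()
writer = csv.DictWriter(buffer, fieldnames=FIELDS)
writer.writeheader()
writer.writerow(sample)

# Parse it back the same way _read_rows does, then aggregate like _build_report.
rows = list(csv.DictReader(io.StringIO(buffer.getvalue())))
avg_bleu = mean(float(r["bleu"]) for r in rows)
print(avg_bleu)  # 1.0
```

Any extra columns in the CSV are ignored by the aggregation, but a missing column would surface as a zeroed metric via `_as_float`.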
36 changes: 36 additions & 0 deletions benchmarks/results/PRODUCTION_BENCHMARK_REPORT.md
@@ -0,0 +1,36 @@
# Production Benchmark Report

- Generated (UTC): 2026-03-06T11:25:25+00:00
- Source data: `benchmarks/results/tiny.gguf-hi-dev.csv`
- Host platform: `Linux-6.12.47-x86_64-with-glibc2.39`

## Executive Summary

This report summarizes benchmark output generated by `benchmarks/run_bench.py` and presents it in an operations-friendly format for release sign-off.

## Aggregated Metrics

- Mean BLEU: **1.0000**
- Mean chrF: **1.0000**
- Mean throughput: **9525.29 tok/s**
- Mean p50 latency: **0.0003s**
- Mean p95 latency: **0.0005s**

## Detailed Results

| Model | Lang | Subset | BLEU | chrF | Throughput (tok/s) | p50 (s) | p95 (s) |
| --- | --- | --- | ---: | ---: | ---: | ---: | ---: |
| tests/fixtures/tiny.gguf | hi | dev | 1.0 | 1.0 | 9525.29 | 0.0003 | 0.0005 |

## Production Readiness Gates

- [x] Benchmark artifact captured in version control.
- [x] Throughput and latency metrics captured (p50/p95).
- [ ] Re-run on target production hardware profile (recommended before release).
- [ ] Validate with full model set (tiny fixture is smoke-only).

## Reproducibility

```bash
python benchmarks/generate_report.py --csv benchmarks/results/tiny.gguf-hi-dev.csv --output benchmarks/results/PRODUCTION_BENCHMARK_REPORT.md
```
101 changes: 101 additions & 0 deletions docs/DEPLOYMENT.md
@@ -0,0 +1,101 @@
# Deployment Guide (Production)

This guide defines a production-grade deployment baseline for Ariv, including environment hardening, runtime configuration, observability, and benchmark sign-off.

## 1) Deployment architecture

Recommended minimal production stack:

- **Ariv API** (`deploy/api_wrapper.py`) as ASGI app.
- **Reverse proxy** (Nginx/Caddy) for TLS termination and request limits.
- **Process supervisor** (systemd/supervisord) with automatic restarts.
- **Metrics + logs** pipeline (Prometheus/Grafana + centralized logs).
- **Model cache volume** mounted on fast local SSD.

## 2) Environment baseline

- Python 3.10+ recommended.
- Dedicated service user (no root execution).
- `ulimit` tuned for concurrent clients.
- GPU drivers pinned and validated (if GPU inference is enabled).

### Example bootstrap

```bash
python -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
```

## 3) Security hardening checklist

- [ ] Run behind HTTPS only.
- [ ] Restrict CORS to trusted origins.
- [ ] Add API authentication/authorization.
- [ ] Set request body size and rate limits at reverse proxy.
- [ ] Disable debug mode in production.
- [ ] Rotate secrets via environment variables or secret manager.
- [ ] Enable audit logging for admin operations.

## 4) Runtime configuration

Set explicit runtime values instead of relying on defaults:

- model path(s)
- max tokens
- temperature
- GPU layer offload
- timeout budgets
- concurrency limits

Use environment-specific `.env` files and avoid committing secrets.
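The values above can be pulled from the environment at startup with explicit defaults and a fail-fast check. A sketch, assuming hypothetical `ARIV_*` variable names (adapt them to Ariv's actual configuration keys):

```python
import os

def load_runtime_config() -> dict:
    """Read explicit runtime settings from the environment, failing fast on gaps.

    The ARIV_* names and defaults here are illustrative, not Ariv's real keys.
    """
    cfg = {
        "model_path": os.environ.get("ARIV_MODEL_PATH", ""),
        "max_tokens": int(os.environ.get("ARIV_MAX_TOKENS", "512")),
        "temperature": float(os.environ.get("ARIV_TEMPERATURE", "0.2")),
        "gpu_layers": int(os.environ.get("ARIV_GPU_LAYERS", "0")),
        "timeout_s": float(os.environ.get("ARIV_TIMEOUT_S", "30")),
        "max_concurrency": int(os.environ.get("ARIV_MAX_CONCURRENCY", "8")),
    }
    # Refuse to start without an explicit model path in production.
    if not cfg["model_path"]:
        raise RuntimeError("ARIV_MODEL_PATH must be set explicitly in production")
    return cfg
```

Loading the `.env` file itself can be left to the process supervisor or a library such as `python-dotenv`, keeping the application code agnostic of where values come from.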

## 5) Operations and observability

Track, at minimum:

- Request count / error rate
- p50 / p95 / p99 latency
- Throughput (tokens per second)
- GPU utilization, VRAM usage, CPU, memory
- Queue depth / active sessions

Define SLOs before launch (example):

- Availability: 99.5% monthly
- p95 latency: < 2.5s on standard prompt class
- Error rate: < 1%

## 6) Release gates

A release is production-ready only if all are true:

1. Security checklist complete.
2. Smoke tests pass.
3. Benchmark report generated and reviewed.
4. Rollback plan documented.
5. On-call and alert routing configured.

## 7) Benchmark sign-off workflow

Run benchmark:

```bash
python benchmarks/run_bench.py --models tests/fixtures/tiny.gguf --lang hi --subset dev
```

Generate a production-friendly report from CSV:

```bash
python benchmarks/generate_report.py --csv benchmarks/results/tiny.gguf-hi-dev.csv --output benchmarks/results/PRODUCTION_BENCHMARK_REPORT.md
```

Commit benchmark artifacts for traceability.

## 8) Suggested deployment command

```bash
python deploy/api_wrapper.py
```

For internet-facing services, run behind a production ASGI server and reverse proxy (for example, Gunicorn with Uvicorn workers behind Nginx) rather than invoking the script directly.