diff --git a/README.md b/README.md index 64210bd..41baf04 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,16 @@ A production-ready, frugal, sovereign AI system that orchestrates India's open-s **Now supporting all 22 official Indian languages with GUI and TUI interfaces!** +## 🎯 Project Vision + +Ariv is built as a world-class open-source AI model orchestra focused on reliable, high-accuracy reasoning. Our goal is to continuously improve ARC-AGI-2 style performance through better orchestration, evaluation discipline, and efficient inference on accessible hardware. + +Core principles: +- **Open by default**: Build with open models, open benchmarks, and reproducible workflows. +- **Accuracy first**: Optimize for correctness and grounded reasoning quality, not just speed. +- **Practical excellence**: Deliver research-grade methods in production-ready tooling. +- **Inclusive intelligence**: Keep multilingual Indian language support as a first-class objective. + ## 🌟 What's New in Version 2.0 ### ✨ Major Enhancements @@ -656,3 +666,4 @@ Apache License Version 2.0 - See [LICENSE](LICENSE) - **[TUI Guide](docs/tui/README.md)** - Detailed TUI interface guide - **[Contributing Guide](docs/CONTRIBUTING.md)** - How to contribute to Ariv - **[Deployment Guide](docs/DEPLOYMENT.md)** - Production deployment instructions +- **[Production Benchmark Report](benchmarks/results/PRODUCTION_BENCHMARK_REPORT.md)** - Latest benchmark summary for release sign-off diff --git a/benchmarks/generate_report.py b/benchmarks/generate_report.py new file mode 100644 index 0000000..011f5e4 --- /dev/null +++ b/benchmarks/generate_report.py @@ -0,0 +1,111 @@ +"""Generate a deployment-facing benchmark report from benchmark CSV outputs.""" + +from __future__ import annotations + +import argparse +import csv +import platform +from datetime import datetime, timezone +from pathlib import Path +from statistics import mean +from typing import Dict, List + + +def _read_rows(csv_path: Path) -> 
List[Dict[str, str]]: + with csv_path.open("r", encoding="utf-8") as handle: + reader = csv.DictReader(handle) + return list(reader) + + + def _as_float(row: Dict[str, str], key: str) -> float: + value = (row.get(key) or "0").strip() + try: + return float(value) + except ValueError: + return 0.0 + + + def _build_report(rows: List[Dict[str, str]], source_csv: Path) -> str: + if not rows: + raise ValueError("No benchmark rows found in CSV") + + avg_bleu = mean(_as_float(r, "bleu") for r in rows) + avg_chrf = mean(_as_float(r, "chrf") for r in rows) + avg_tps = mean(_as_float(r, "throughput_tps") for r in rows) + avg_p50 = mean(_as_float(r, "latency_p50") for r in rows) + avg_p95 = mean(_as_float(r, "latency_p95") for r in rows) + + lines = [ + "# Production Benchmark Report", + "", + f"- Generated (UTC): {datetime.now(timezone.utc).isoformat(timespec='seconds')}", + f"- Source data: `{source_csv.as_posix()}`", + f"- Host platform: `{platform.platform()}`", + "", + "## Executive Summary", + "", + "This report summarizes benchmark output generated by `benchmarks/run_bench.py` and presents it in an operations-friendly format for release sign-off.", + "", + "## Aggregated Metrics", + "", + f"- Mean BLEU: **{avg_bleu:.4f}**", + f"- Mean chrF: **{avg_chrf:.4f}**", + f"- Mean throughput: **{avg_tps:.2f} tok/s**", + f"- Mean p50 latency: **{avg_p50:.4f}s**", + f"- Mean p95 latency: **{avg_p95:.4f}s**", + "", + "## Detailed Results", + "", + "| Model | Lang | Subset | BLEU | chrF | Throughput (tok/s) | p50 (s) | p95 (s) |", + "| --- | --- | --- | ---: | ---: | ---: | ---: | ---: |", + ] + + for row in rows: + lines.append( + "| {model} | {lang} | {subset} | {bleu} | {chrf} | {throughput_tps} | {latency_p50} | {latency_p95} |".format( + **{k: row.get(k, "n/a") for k in ("model", "lang", "subset", "bleu", "chrf", "throughput_tps", "latency_p50", "latency_p95")} + ) + ) + + lines.extend( + [ + "", + "## Production Readiness Gates", + "", + "- [x] Benchmark artifact captured in version control.", + "- [x] Throughput and latency metrics captured (p50/p95).", + "- [ ] Re-run on target production hardware 
profile (recommended before release).", + "- [ ] Validate with full model set (tiny fixture is smoke-only).", + "", + "## Reproducibility", + "", + "```bash", + f"python benchmarks/generate_report.py --csv {source_csv.as_posix()} --output benchmarks/results/PRODUCTION_BENCHMARK_REPORT.md", + "```", + ] + ) + + return "\n".join(lines) + "\n" + + +def main() -> None: + parser = argparse.ArgumentParser(description="Generate benchmark markdown report") + parser.add_argument("--csv", required=True, type=Path, help="Path to benchmark CSV") + parser.add_argument( + "--output", + type=Path, + default=Path("benchmarks/results/PRODUCTION_BENCHMARK_REPORT.md"), + help="Output markdown report path", + ) + args = parser.parse_args() + + rows = _read_rows(args.csv) + report = _build_report(rows, args.csv) + + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(report, encoding="utf-8") + print(f"Wrote report to {args.output}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/results/PRODUCTION_BENCHMARK_REPORT.md b/benchmarks/results/PRODUCTION_BENCHMARK_REPORT.md new file mode 100644 index 0000000..f9d7d6f --- /dev/null +++ b/benchmarks/results/PRODUCTION_BENCHMARK_REPORT.md @@ -0,0 +1,36 @@ +# Production Benchmark Report + +- Generated (UTC): 2026-03-06T11:25:25+00:00 +- Source data: `benchmarks/results/tiny.gguf-hi-dev.csv` +- Host platform: `Linux-6.12.47-x86_64-with-glibc2.39` + +## Executive Summary + +This report summarizes benchmark output generated by `benchmarks/run_bench.py` and presents it in an operations-friendly format for release sign-off. 
+ +## Aggregated Metrics + +- Mean BLEU: **1.0000** +- Mean chrF: **1.0000** +- Mean throughput: **9525.29 tok/s** +- Mean p50 latency: **0.0003s** +- Mean p95 latency: **0.0005s** + +## Detailed Results + +| Model | Lang | Subset | BLEU | chrF | Throughput (tok/s) | p50 (s) | p95 (s) | +| --- | --- | --- | ---: | ---: | ---: | ---: | ---: | +| tests/fixtures/tiny.gguf | hi | dev | 1.0 | 1.0 | 9525.29 | 0.0003 | 0.0005 | + +## Production Readiness Gates + +- [x] Benchmark artifact captured in version control. +- [x] Throughput and latency metrics captured (p50/p95). +- [ ] Re-run on target production hardware profile (recommended before release). +- [ ] Validate with full model set (tiny fixture is smoke-only). + +## Reproducibility + +```bash +python benchmarks/generate_report.py --csv benchmarks/results/tiny.gguf-hi-dev.csv --output benchmarks/results/PRODUCTION_BENCHMARK_REPORT.md +``` diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md new file mode 100644 index 0000000..4889e5e --- /dev/null +++ b/docs/DEPLOYMENT.md @@ -0,0 +1,101 @@ +# Deployment Guide (Production) + +This guide defines a production-grade deployment baseline for Ariv, including environment hardening, runtime configuration, observability, and benchmark sign-off. + +## 1) Deployment architecture + +Recommended minimal production stack: + +- **Ariv API** (`deploy/api_wrapper.py`) as ASGI app. +- **Reverse proxy** (Nginx/Caddy) for TLS termination and request limits. +- **Process supervisor** (systemd/supervisord) with automatic restarts. +- **Metrics + logs** pipeline (Prometheus/Grafana + centralized logs). +- **Model cache volume** mounted on fast local SSD. + +## 2) Environment baseline + +- Python 3.10+ recommended. +- Dedicated service user (no root execution). +- `ulimit` tuned for concurrent clients. +- GPU drivers pinned and validated (if GPU inference is enabled). 
+ +### Example bootstrap + +```bash +python -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +``` + +## 3) Security hardening checklist + +- [ ] Run behind HTTPS only. +- [ ] Restrict CORS to trusted origins. +- [ ] Add API authentication/authorization. +- [ ] Set request body size and rate limits at reverse proxy. +- [ ] Disable debug mode in production. +- [ ] Rotate secrets via environment variables or secret manager. +- [ ] Enable audit logging for admin operations. + +## 4) Runtime configuration + +Set explicit runtime values instead of relying on defaults: + +- model path(s) +- max tokens +- temperature +- GPU layer offload +- timeout budgets +- concurrency limits + +Use environment-specific `.env` files and avoid committing secrets. + +## 5) Operations and observability + +Track, at minimum: + +- Request count / error rate +- p50 / p95 / p99 latency +- Throughput (tokens per second) +- GPU utilization, VRAM usage, CPU, memory +- Queue depth / active sessions + +Define SLOs before launch (example): + +- Availability: 99.5% monthly +- p95 latency: < 2.5s on standard prompt class +- Error rate: < 1% + +## 6) Release gates + +A release is production-ready only if all are true: + +1. Security checklist complete. +2. Smoke tests pass. +3. Benchmark report generated and reviewed. +4. Rollback plan documented. +5. On-call and alert routing configured. + +## 7) Benchmark sign-off workflow + +Run benchmark: + +```bash +python benchmarks/run_bench.py --models tests/fixtures/tiny.gguf --lang hi --subset dev +``` + +Generate a production-friendly report from CSV: + +```bash +python benchmarks/generate_report.py --csv benchmarks/results/tiny.gguf-hi-dev.csv --output benchmarks/results/PRODUCTION_BENCHMARK_REPORT.md +``` + +Commit benchmark artifacts for traceability. 
+ +## 8) Suggested deployment command + +```bash +python deploy/api_wrapper.py +``` + +For internet-facing services, prefer running with a production ASGI server and proxy (e.g., gunicorn/uvicorn workers + Nginx).