-
Notifications
You must be signed in to change notification settings - Fork 0
Production Deployment
Complete guide for deploying Inferno in production environments with enterprise-grade reliability, security, and performance.
- Hardware: Meets System Requirements for your workload
- Security: Authentication and TLS configured
- Monitoring: Metrics and logging setup complete
- Backup: Data backup and recovery procedures tested
- Load Testing: Performance validated under expected load
- Documentation: Runbooks and incident response procedures ready
Create docker-compose.prod.yml:
version: '3.8'
services:
inferno:
image: inferno:latest
container_name: inferno-prod
restart: unless-stopped
deploy:
resources:
limits:
memory: 16G
cpus: '8'
reservations:
memory: 8G
cpus: '4'
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
ports:
- "8080:8080"
- "9090:9090" # Metrics port
volumes:
- inferno_models:/data/models:ro
- inferno_cache:/data/cache
- inferno_logs:/data/logs
- ./config/production.toml:/etc/inferno/inferno.toml:ro
- ./ssl:/etc/ssl/inferno:ro
environment:
- RUST_LOG=info
- INFERNO_CONFIG=/etc/inferno/inferno.toml
- INFERNO_SSL_CERT=/etc/ssl/inferno/cert.pem
- INFERNO_SSL_KEY=/etc/ssl/inferno/key.pem
networks:
- inferno-network
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8080/health"]  # plain HTTP: TLS terminates at nginx, which proxies http:// to this container
interval: 30s
timeout: 10s
retries: 3
start_period: 60s
# Load balancer with SSL termination
nginx:
image: nginx:alpine
container_name: inferno-nginx
restart: unless-stopped
ports:
- "443:443"
- "80:80"
volumes:
- ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
- ./ssl:/etc/ssl/nginx:ro
- nginx_logs:/var/log/nginx
depends_on:
- inferno
networks:
- inferno-network
# Monitoring stack
prometheus:
image: prom/prometheus:latest
container_name: inferno-prometheus
restart: unless-stopped
ports:
- "9091:9090"
volumes:
- ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=30d'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
networks:
- inferno-network
grafana:
image: grafana/grafana:latest
container_name: inferno-grafana
restart: unless-stopped
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD}
- GF_SECURITY_DISABLE_GRAVATAR=true
- GF_ANALYTICS_REPORTING_ENABLED=false
- GF_ANALYTICS_CHECK_FOR_UPDATES=false
volumes:
- grafana_data:/var/lib/grafana
- ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards
- ./monitoring/grafana/provisioning:/etc/grafana/provisioning
networks:
- inferno-network
# Redis for caching and session management
redis:
image: redis:alpine
container_name: inferno-redis
restart: unless-stopped
command: redis-server --requirepass ${REDIS_PASSWORD}
volumes:
- redis_data:/data
networks:
- inferno-network
networks:
inferno-network:
driver: bridge
volumes:
inferno_models:
driver: local
driver_opts:
type: none
o: bind
device: /data/inferno/models
inferno_cache:
driver: local
inferno_logs:
driver: local
nginx_logs:
driver: local
prometheus_data:
driver: local
grafana_data:
driver: local
redis_data:
driver: local

Create config/production.toml:
# Production configuration for Inferno
models_dir = "/data/models"
cache_dir = "/data/cache"
log_level = "info"
log_format = "json"
[server]
bind_address = "0.0.0.0"
port = 8080
max_concurrent_requests = 1000
request_timeout_seconds = 300
enable_cors = true
cors_origins = ["https://yourdomain.com"]
[security]
enable_auth = true
jwt_secret = "${JWT_SECRET}"
api_key_header = "X-API-Key"
rate_limit_requests_per_minute = 100
enable_ip_filtering = true
allowed_ips = ["10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16"]
[backend_config]
gpu_enabled = true
context_size = 4096
batch_size = 32
memory_map = true
gpu_memory_fraction = 0.8
[cache]
enabled = true
compression = "zstd"
max_size_gb = 50
ttl_hours = 24
[observability]
metrics_enabled = true
metrics_port = 9090
tracing_enabled = true
tracing_endpoint = "http://jaeger:14268/api/traces"
[audit]
enabled = true
log_file = "/data/logs/audit.log"
encryption_enabled = true
encryption_key = "${AUDIT_ENCRYPTION_KEY}"
compression = "gzip"
[distributed]
enabled = false # Enable for multi-node deployment
node_id = "prod-node-1"
cluster_nodes = ["prod-node-1:8080", "prod-node-2:8080"]

Create nginx/nginx.conf:
events {
worker_connections 1024;
}
http {
upstream inferno_backend {
least_conn;
server inferno:8080 max_fails=3 fail_timeout=30s;
# Add more servers for load balancing
# server inferno-2:8080 max_fails=3 fail_timeout=30s;
}
# Rate limiting
limit_req_zone $binary_remote_addr zone=api:10m rate=10r/s;
limit_req_zone $binary_remote_addr zone=auth:10m rate=1r/s;
# SSL configuration
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers HIGH:!aNULL:!MD5;
ssl_prefer_server_ciphers on;
# Gzip compression
gzip on;
gzip_vary on;
gzip_min_length 1024;
gzip_types text/plain application/json;
server {
listen 80;
server_name your-domain.com;
return 301 https://$server_name$request_uri;
}
server {
listen 443 ssl http2;
server_name your-domain.com;
ssl_certificate /etc/ssl/nginx/cert.pem;
ssl_certificate_key /etc/ssl/nginx/key.pem;
# Security headers
add_header Strict-Transport-Security "max-age=31536000; includeSubDomains";
add_header X-Frame-Options DENY;
add_header X-Content-Type-Options nosniff;
add_header X-XSS-Protection "1; mode=block";
# Health check endpoint (no rate limiting)
location /health {
proxy_pass http://inferno_backend;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
# Authentication endpoints (strict rate limiting)
location /auth/ {
limit_req zone=auth burst=5 nodelay;
proxy_pass http://inferno_backend;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
# API endpoints (moderate rate limiting)
location /v1/ {
limit_req zone=api burst=20 nodelay;
proxy_pass http://inferno_backend;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
proxy_read_timeout 300s;
proxy_send_timeout 300s;
}
# WebSocket support
location /ws/ {
proxy_pass http://inferno_backend;
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
# Metrics endpoint (internal only)
location /metrics {
allow 10.0.0.0/8;
allow 172.16.0.0/12;
allow 192.168.0.0/16;
deny all;
proxy_pass http://inferno:9090;  # nginx forbids appending a port to an upstream group name; target the metrics port on the service directly
}
}
}

apiVersion: v1
kind: Namespace
metadata:
name: inferno-production
labels:
environment: production
app: inferno
---
apiVersion: v1
kind: ResourceQuota
metadata:
name: inferno-quota
namespace: inferno-production
spec:
hard:
requests.cpu: "20"
requests.memory: 64Gi
limits.cpu: "40"
limits.memory: 128Gi
nvidia.com/gpu: "4"

apiVersion: v1
kind: ConfigMap
metadata:
name: inferno-config
namespace: inferno-production
data:
inferno.toml: |
models_dir = "/data/models"
cache_dir = "/data/cache"
log_level = "info"
log_format = "json"
[server]
bind_address = "0.0.0.0"
port = 8080
max_concurrent_requests = 1000
[security]
enable_auth = true
rate_limit_requests_per_minute = 100
[backend_config]
gpu_enabled = true
context_size = 4096
batch_size = 32
[cache]
enabled = true
compression = "zstd"
max_size_gb = 50
[observability]
metrics_enabled = true
metrics_port = 9090

apiVersion: v1
kind: Secret
metadata:
name: inferno-secrets
namespace: inferno-production
type: Opaque
data:
jwt-secret: <base64-encoded-jwt-secret>
audit-encryption-key: <base64-encoded-encryption-key>
api-key: <base64-encoded-api-key>
---
apiVersion: v1
kind: Secret
metadata:
name: inferno-tls
namespace: inferno-production
type: kubernetes.io/tls
data:
tls.crt: <base64-encoded-certificate>
tls.key: <base64-encoded-private-key>

apiVersion: apps/v1
kind: Deployment
metadata:
name: inferno-deployment
namespace: inferno-production
labels:
app: inferno
version: v1.0.0
spec:
replicas: 3
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
maxUnavailable: 0
selector:
matchLabels:
app: inferno
template:
metadata:
labels:
app: inferno
version: v1.0.0
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9090"
prometheus.io/path: "/metrics"
spec:
serviceAccountName: inferno-service-account
securityContext:
runAsNonRoot: true
runAsUser: 1000
fsGroup: 1000
containers:
- name: inferno
image: inferno:v1.0.0
imagePullPolicy: IfNotPresent
ports:
- containerPort: 8080
name: http
protocol: TCP
- containerPort: 9090
name: metrics
protocol: TCP
env:
- name: RUST_LOG
value: "info"
- name: INFERNO_CONFIG
value: "/etc/inferno/inferno.toml"
- name: JWT_SECRET
valueFrom:
secretKeyRef:
name: inferno-secrets
key: jwt-secret
- name: AUDIT_ENCRYPTION_KEY
valueFrom:
secretKeyRef:
name: inferno-secrets
key: audit-encryption-key
resources:
requests:
memory: "8Gi"
cpu: "2000m"
nvidia.com/gpu: 1
limits:
memory: "16Gi"
cpu: "4000m"
nvidia.com/gpu: 1
volumeMounts:
- name: config
mountPath: /etc/inferno
readOnly: true
- name: models
mountPath: /data/models
readOnly: true
- name: cache
mountPath: /data/cache
- name: logs
mountPath: /data/logs
livenessProbe:
httpGet:
path: /health
port: 8080
scheme: HTTP
initialDelaySeconds: 60
periodSeconds: 30
timeoutSeconds: 10
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 8080
scheme: HTTP
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 3
startupProbe:
httpGet:
path: /health
port: 8080
scheme: HTTP
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 12
volumes:
- name: config
configMap:
name: inferno-config
- name: models
persistentVolumeClaim:
claimName: inferno-models-pvc
- name: cache
persistentVolumeClaim:
claimName: inferno-cache-pvc
- name: logs
persistentVolumeClaim:
claimName: inferno-logs-pvc
nodeSelector:
workload-type: gpu-inference
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app
operator: In
values:
- inferno
topologyKey: kubernetes.io/hostname

apiVersion: v1
kind: Service
metadata:
name: inferno-service
namespace: inferno-production
labels:
app: inferno
spec:
type: ClusterIP
ports:
- port: 8080
targetPort: 8080
protocol: TCP
name: http
- port: 9090
targetPort: 9090
protocol: TCP
name: metrics
selector:
app: inferno
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: inferno-ingress
namespace: inferno-production
annotations:
kubernetes.io/ingress.class: nginx
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/rate-limit: "100"
nginx.ingress.kubernetes.io/rate-limit-window: "1m"
cert-manager.io/cluster-issuer: "letsencrypt-prod"
spec:
tls:
- hosts:
- inferno.yourdomain.com
secretName: inferno-tls
rules:
- host: inferno.yourdomain.com
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: inferno-service
port:
number: 8080

apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: inferno-hpa
namespace: inferno-production
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: inferno-deployment
minReplicas: 3
maxReplicas: 10
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: 80
- type: Pods
pods:
metric:
name: inferno_requests_per_second
target:
type: AverageValue
averageValue: "50"

# Generate production SSL certificates (use Let's Encrypt in production)
openssl req -x509 -newkey rsa:4096 -keyout key.pem -out cert.pem -days 365 -nodes \
-subj "/C=US/ST=State/L=City/O=Organization/CN=inferno.yourdomain.com"
# Set proper permissions
chmod 600 key.pem
chmod 644 cert.pem

# Initialize authentication system
inferno auth init --admin-user admin --admin-email admin@company.com
# Create API keys for services
inferno auth create-api-key --name "monitoring-service" --permissions "metrics:read"
inferno auth create-api-key --name "application" --permissions "inference:read,inference:write"
# Set up JWT configuration
inferno auth config --jwt-expiry 24h --refresh-token-expiry 7d

# Linux iptables rules
iptables -A INPUT -i lo -j ACCEPT # Loopback traffic
iptables -A INPUT -m conntrack --ctstate ESTABLISHED,RELATED -j ACCEPT # Return traffic for outbound connections
iptables -A INPUT -p tcp --dport 22 -j ACCEPT # SSH
iptables -A INPUT -p tcp --dport 443 -j ACCEPT # HTTPS
iptables -A INPUT -p tcp --dport 80 -j ACCEPT # HTTP (redirect to HTTPS)
iptables -A INPUT -p tcp --dport 8080 -s 10.0.0.0/8 -j ACCEPT # Internal API
iptables -A INPUT -p tcp --dport 9090 -s 10.0.0.0/8 -j ACCEPT # Internal metrics
iptables -A INPUT -j DROP # Drop all other traffic
# Save rules
iptables-save > /etc/iptables/rules.v4

Create monitoring/prometheus.yml:
global:
scrape_interval: 15s
evaluation_interval: 15s
rule_files:
- "inferno_rules.yml"
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
scrape_configs:
- job_name: 'inferno'
static_configs:
- targets: ['inferno:9090']
scrape_interval: 10s
metrics_path: /metrics
- job_name: 'nginx'
static_configs:
- targets: ['nginx:9113']
- job_name: 'node-exporter'
static_configs:
- targets: ['node-exporter:9100']
- job_name: 'cadvisor'
static_configs:
- targets: ['cadvisor:8080']

Create monitoring/inferno_rules.yml:
groups:
- name: inferno
rules:
- alert: InfernoDown
expr: up{job="inferno"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Inferno instance is down"
description: "Inferno instance {{ $labels.instance }} has been down for more than 1 minute."
- alert: HighErrorRate
expr: rate(inferno_requests_total{status=~"5.."}[5m]) > 0.1
for: 2m
labels:
severity: warning
annotations:
summary: "High error rate on Inferno"
description: "Error rate is {{ $value }} errors per second on {{ $labels.instance }}."
- alert: HighLatency
expr: histogram_quantile(0.95, rate(inferno_request_duration_seconds_bucket[5m])) > 30
for: 5m
labels:
severity: warning
annotations:
summary: "High latency on Inferno"
description: "95th percentile latency is {{ $value }}s on {{ $labels.instance }}."
- alert: GPUMemoryHigh
expr: inferno_gpu_memory_used_bytes / inferno_gpu_memory_total_bytes > 0.9
for: 2m
labels:
severity: warning
annotations:
summary: "GPU memory usage high"
description: "GPU memory usage is {{ $value | humanizePercentage }} on {{ $labels.instance }}."
- alert: ModelLoadFailure
expr: increase(inferno_model_load_failures_total[5m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: "Model load failures detected"
description: "{{ $value }} model load failures in the last 5 minutes on {{ $labels.instance }}."

Create monitoring/grafana/dashboards/inferno-overview.json:
{
"dashboard": {
"id": null,
"title": "Inferno Overview",
"tags": ["inferno", "ai", "ml"],
"timezone": "browser",
"panels": [
{
"id": 1,
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "rate(inferno_requests_total[5m])",
"legendFormat": "{{instance}} - {{method}}"
}
],
"yAxes": [
{
"label": "Requests/sec"
}
]
},
{
"id": 2,
"title": "Response Time",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, rate(inferno_request_duration_seconds_bucket[5m]))",
"legendFormat": "95th percentile"
},
{
"expr": "histogram_quantile(0.50, rate(inferno_request_duration_seconds_bucket[5m]))",
"legendFormat": "50th percentile"
}
],
"yAxes": [
{
"label": "Seconds"
}
]
},
{
"id": 3,
"title": "GPU Utilization",
"type": "graph",
"targets": [
{
"expr": "inferno_gpu_utilization_percent",
"legendFormat": "GPU {{gpu_id}}"
}
],
"yAxes": [
{
"label": "Percentage",
"max": 100
}
]
},
{
"id": 4,
"title": "Active Models",
"type": "stat",
"targets": [
{
"expr": "inferno_loaded_models",
"legendFormat": "Loaded Models"
}
]
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "5s"
}
}

Create scripts/backup.sh:
#!/bin/bash
# Production backup script for Inferno.
# Archives models, configuration, cache metadata, audit logs, and (optionally)
# an external Postgres database into a dated directory, prunes backups older
# than RETENTION_DAYS, and optionally syncs the new backup to S3.
set -euo pipefail

BACKUP_DIR="/backup/inferno"
RETENTION_DAYS=30
DATE=$(date +%Y%m%d_%H%M%S)

# Create backup directory
mkdir -p "$BACKUP_DIR/$DATE"
echo "Starting Inferno backup at $(date)"

# Backup models
echo "Backing up models..."
tar -czf "$BACKUP_DIR/$DATE/models.tar.gz" -C /data/inferno models/

# Backup configuration
echo "Backing up configuration..."
cp -r /etc/inferno "$BACKUP_DIR/$DATE/config"

# Backup cache metadata (not the full cache)
echo "Backing up cache metadata..."
find /data/inferno/cache -name "*.meta" -exec cp {} "$BACKUP_DIR/$DATE/" \;

# Backup audit logs
echo "Backing up audit logs..."
tar -czf "$BACKUP_DIR/$DATE/audit_logs.tar.gz" -C /data/inferno logs/

# Backup database (if using external database)
if [[ -n "${DATABASE_URL:-}" ]]; then
    echo "Backing up database..."
    pg_dump "$DATABASE_URL" | gzip > "$BACKUP_DIR/$DATE/database.sql.gz"
fi

# Create manifest
echo "Creating backup manifest..."
cat > "$BACKUP_DIR/$DATE/manifest.json" << EOF
{
  "timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
  "version": "$(inferno --version)",
  "backup_type": "full",
  "files": [
    "models.tar.gz",
    "config/",
    "audit_logs.tar.gz"
    $([ -f "$BACKUP_DIR/$DATE/database.sql.gz" ] && echo ', "database.sql.gz"' || echo '')
  ]
}
EOF

# Clean up old backups.
# -mindepth 1 -maxdepth 1 restricts matches to the dated top-level directories,
# and -prune stops find from descending into a directory it is about to delete
# (the original form emitted "No such file or directory" errors mid-walk and,
# under `set -e`, could abort the script before the S3 upload).
echo "Cleaning up old backups..."
find "$BACKUP_DIR" -mindepth 1 -maxdepth 1 -type d -name "[0-9]*_[0-9]*" \
    -mtime +"$RETENTION_DAYS" -prune -exec rm -rf {} +

# Upload to cloud storage (optional)
if [[ -n "${S3_BACKUP_BUCKET:-}" ]]; then
    echo "Uploading to S3..."
    aws s3 sync "$BACKUP_DIR/$DATE" "s3://$S3_BACKUP_BUCKET/inferno/$DATE/"
fi

echo "Backup completed successfully at $(date)"
echo "Backup location: $BACKUP_DIR/$DATE"

Create scripts/restore.sh:
#!/bin/bash
# Production restore script for Inferno.
# Usage: restore.sh [BACKUP_DATE|latest]
# Restores models, configuration, audit logs, and (optionally) the database
# from a backup created by backup.sh, validates the config, and restarts the
# service.
set -euo pipefail

BACKUP_DIR="/backup/inferno"
RESTORE_DATE="${1:-latest}"

# Resolve "latest" to the most recently modified dated backup directory.
if [[ "$RESTORE_DATE" == "latest" ]]; then
    RESTORE_PATH=$(ls -td "$BACKUP_DIR"/*/ | head -1)
else
    RESTORE_PATH="$BACKUP_DIR/$RESTORE_DATE"
fi

if [[ ! -d "$RESTORE_PATH" ]]; then
    echo "Error: Backup directory $RESTORE_PATH not found"
    exit 1
fi

echo "Starting restore from $RESTORE_PATH at $(date)"

# Stop Inferno service (systemd deployment first, docker-compose fallback)
echo "Stopping Inferno service..."
systemctl stop inferno || docker-compose down

# Restore models
echo "Restoring models..."
tar -xzf "$RESTORE_PATH/models.tar.gz" -C /data/inferno/

# Restore configuration
echo "Restoring configuration..."
cp -r "$RESTORE_PATH/config"/* /etc/inferno/

# Restore audit logs
echo "Restoring audit logs..."
tar -xzf "$RESTORE_PATH/audit_logs.tar.gz" -C /data/inferno/

# Restore database (if a dump exists). ${VAR:?msg} fails with an explicit
# message instead of tripping `set -u` with an opaque "unbound variable"
# error when DATABASE_URL was never exported.
if [[ -f "$RESTORE_PATH/database.sql.gz" ]]; then
    echo "Restoring database..."
    gunzip -c "$RESTORE_PATH/database.sql.gz" \
        | psql "${DATABASE_URL:?DATABASE_URL must be set to restore the database}"
fi

# Verify restore
echo "Verifying restore..."
inferno validate-config /etc/inferno/inferno.toml

# Start service
echo "Starting Inferno service..."
systemctl start inferno || docker-compose up -d

echo "Restore completed successfully at $(date)"

Create scripts/production-tune.sh:
#!/bin/bash
# Production performance tuning script.
# Applies kernel, Docker, and Inferno-level optimizations. Safe to re-run:
# kernel settings are written to a dedicated sysctl.d file instead of being
# appended to /etc/sysctl.conf, which duplicated every entry on each
# invocation of the original script.
set -euo pipefail

echo "Applying production performance optimizations..."

# Linux kernel parameters (idempotent: overwrites one dedicated drop-in file)
echo "Tuning kernel parameters..."
cat > /etc/sysctl.d/99-inferno.conf << EOF
# Network optimization
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
net.ipv4.tcp_rmem = 4096 87380 134217728
net.ipv4.tcp_wmem = 4096 65536 134217728
net.core.netdev_max_backlog = 5000

# File system optimization
fs.file-max = 65536
vm.swappiness = 1
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5

# GPU memory optimization
vm.overcommit_memory = 1
EOF
# `sysctl --system` loads /etc/sysctl.d/*.conf; plain `sysctl -p` only reads
# /etc/sysctl.conf and would ignore the drop-in file above.
sysctl --system

# Docker optimizations
echo "Configuring Docker for production..."
# Preserve any existing daemon configuration before overwriting it.
if [[ -f /etc/docker/daemon.json ]]; then
    cp /etc/docker/daemon.json "/etc/docker/daemon.json.bak.$(date +%s)"
fi
cat > /etc/docker/daemon.json << EOF
{
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "100m",
    "max-file": "5"
  },
  "storage-driver": "overlay2",
  "default-runtime": "nvidia",
  "runtimes": {
    "nvidia": {
      "path": "nvidia-container-runtime",
      "runtimeArgs": []
    }
  }
}
EOF
systemctl restart docker

# Inferno-specific optimizations
echo "Optimizing Inferno configuration..."
inferno optimize --profile production --hardware auto-detect

echo "Performance tuning completed!"

Create .github/workflows/deploy-production.yml:
name: Deploy to Production
on:
push:
tags:
- 'v*'
env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}
jobs:
build-and-deploy:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Log in to Container Registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
tags: |
type=ref,event=branch
type=ref,event=pr
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
- name: Build and push Docker image
uses: docker/build-push-action@v5
with:
context: .
platforms: linux/amd64,linux/arm64
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max
- name: Deploy to production
uses: appleboy/ssh-action@v1.0.0
with:
host: ${{ secrets.PROD_HOST }}
username: ${{ secrets.PROD_USER }}
key: ${{ secrets.PROD_SSH_KEY }}
script: |
cd /opt/inferno
docker-compose pull
docker-compose up -d --no-deps inferno
docker image prune -f
- name: Health check
run: |
sleep 30
curl -f ${{ secrets.PROD_URL }}/health || exit 1
- name: Notify deployment
uses: 8398a7/action-slack@v3
if: always()
with:
status: ${{ job.status }}
channel: '#deployments'
webhook_url: ${{ secrets.SLACK_WEBHOOK }}

Create terraform/main.tf:
provider "aws" {
region = var.aws_region
}
# VPC and networking
resource "aws_vpc" "inferno_vpc" {
cidr_block = "10.0.0.0/16"
enable_dns_hostnames = true
enable_dns_support = true
tags = {
Name = "inferno-production"
}
}
resource "aws_subnet" "inferno_subnet" {
count = 2
vpc_id = aws_vpc.inferno_vpc.id
cidr_block = "10.0.${count.index + 1}.0/24"
availability_zone = data.aws_availability_zones.available.names[count.index]
tags = {
Name = "inferno-subnet-${count.index + 1}"
}
}
# Security groups
resource "aws_security_group" "inferno_sg" {
name_prefix = "inferno-production"
vpc_id = aws_vpc.inferno_vpc.id
ingress {
from_port = 443
to_port = 443
protocol = "tcp"
cidr_blocks = ["0.0.0.0/0"]
}
ingress {
from_port = 22
to_port = 22
protocol = "tcp"
cidr_blocks = [var.admin_cidr]
}
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
}
# Launch template for GPU instances
resource "aws_launch_template" "inferno_template" {
name_prefix = "inferno-production"
image_id = data.aws_ami.gpu_optimized.id
instance_type = var.instance_type
vpc_security_group_ids = [aws_security_group.inferno_sg.id]
user_data = base64encode(templatefile("${path.module}/user_data.sh", {
inferno_version = var.inferno_version
}))
tag_specifications {
resource_type = "instance"
tags = {
Name = "inferno-production"
}
}
}
# Auto Scaling Group
resource "aws_autoscaling_group" "inferno_asg" {
name = "inferno-production"
vpc_zone_identifier = aws_subnet.inferno_subnet[*].id
target_group_arns = [aws_lb_target_group.inferno_tg.arn]
health_check_type = "ELB"
health_check_grace_period = 300
min_size = var.min_instances
max_size = var.max_instances
desired_capacity = var.desired_instances
launch_template {
id = aws_launch_template.inferno_template.id
version = "$Latest"
}
tag {
key = "Name"
value = "inferno-production"
propagate_at_launch = true
}
}
# Application Load Balancer
resource "aws_lb" "inferno_alb" {
name = "inferno-production"
internal = false
load_balancer_type = "application"
security_groups = [aws_security_group.inferno_sg.id]
subnets = aws_subnet.inferno_subnet[*].id
}
resource "aws_lb_target_group" "inferno_tg" {
name = "inferno-production"
port = 8080
protocol = "HTTP"
vpc_id = aws_vpc.inferno_vpc.id
health_check {
enabled = true
healthy_threshold = 2
unhealthy_threshold = 2
timeout = 10
interval = 30
path = "/health"
matcher = "200"
}
}
resource "aws_lb_listener" "inferno_listener" {
load_balancer_arn = aws_lb.inferno_alb.arn
port = "443"
protocol = "HTTPS"
ssl_policy = "ELBSecurityPolicy-TLS-1-2-2017-01"
certificate_arn = aws_acm_certificate.inferno_cert.arn
default_action {
type = "forward"
target_group_arn = aws_lb_target_group.inferno_tg.arn
}
}

Create load-tests/production-load-test.yml:
config:
target: 'https://inferno.yourdomain.com'
phases:
- duration: 60
arrivalRate: 1
name: "Warm up"
- duration: 120
arrivalRate: 5
rampTo: 50
name: "Ramp up load"
- duration: 300
arrivalRate: 50
name: "Sustained load"
payload:
path: './test-prompts.csv'
fields:
- 'prompt'
processor: './functions.js'
scenarios:
- name: "Chat completion"
weight: 70
flow:
- post:
url: '/v1/chat/completions'
headers:
Authorization: 'Bearer {{ $processEnvironment.API_KEY }}'
Content-Type: 'application/json'
json:
model: 'llama-2-7b-chat'
messages:
- role: 'user'
content: '{{ prompt }}'
max_tokens: 500
temperature: 0.7
capture:
- json: '$.choices[0].message.content'
as: 'response'
- name: "Streaming chat"
weight: 20
flow:
- post:
url: '/v1/chat/completions'
headers:
Authorization: 'Bearer {{ $processEnvironment.API_KEY }}'
Content-Type: 'application/json'
json:
model: 'llama-2-7b-chat'
messages:
- role: 'user'
content: '{{ prompt }}'
stream: true
max_tokens: 200
- name: "Health check"
weight: 10
flow:
- get:
url: '/health'

Create load-tests/k6-production-test.js:
import http from 'k6/http';
import { check, sleep } from 'k6';
import { Rate } from 'k6/metrics';
const errorRate = new Rate('errors');
export let options = {
stages: [
{ duration: '2m', target: 10 }, // Ramp up
{ duration: '5m', target: 50 }, // Stay at 50 users
{ duration: '2m', target: 100 }, // Ramp to 100 users
{ duration: '5m', target: 100 }, // Stay at 100 users
{ duration: '2m', target: 0 }, // Ramp down
],
thresholds: {
http_req_duration: ['p(95)<30000'], // 95% of requests under 30s
http_req_failed: ['rate<0.01'], // Error rate under 1%
errors: ['rate<0.01'],
},
};
const API_KEY = __ENV.API_KEY || 'test-key';
const BASE_URL = __ENV.BASE_URL || 'https://inferno.yourdomain.com';
const prompts = [
"Explain quantum computing in simple terms",
"Write a Python function to sort a list",
"What are the benefits of renewable energy?",
"Describe the water cycle",
"How does machine learning work?"
];
export default function () {
const prompt = prompts[Math.floor(Math.random() * prompts.length)];
const payload = JSON.stringify({
model: 'llama-2-7b-chat',
messages: [
{ role: 'user', content: prompt }
],
max_tokens: 500,
temperature: 0.7
});
const params = {
headers: {
'Authorization': `Bearer ${API_KEY}`,
'Content-Type': 'application/json',
},
timeout: '60s',
};
const response = http.post(`${BASE_URL}/v1/chat/completions`, payload, params);
const success = check(response, {
'status is 200': (r) => r.status === 200,
'response has content': (r) => {
try {
const body = JSON.parse(r.body);
return body.choices && body.choices.length > 0;
} catch (e) {
return false;
}
},
'response time OK': (r) => r.timings.duration < 30000,
});
errorRate.add(!success);
sleep(1);
}- Load testing completed with expected traffic patterns
- Security audit performed and issues resolved
- Backup and recovery procedures tested
- Monitoring and alerting configured
- SSL certificates installed and tested
- Authentication and authorization working
- Resource limits and quotas configured
- Documentation updated with production procedures
- Blue-green deployment strategy implemented
- Health checks configured and working
- Rolling updates configured with zero downtime
- Database migrations (if applicable) tested
- Feature flags configured for gradual rollout
- Rollback procedures tested and documented
- Smoke tests passing
- Metrics and logs flowing correctly
- Performance baselines established
- On-call rotation configured
- Incident response procedures documented
- Disaster recovery plan tested
Production Deployment Guide updated for Inferno v1.0.0. Need deployment assistance? Contact maintainer for specialized enterprise installation assistance (information and pricing available).