diff --git a/docker-compose.loadtest.yml b/docker-compose.loadtest.yml new file mode 100644 index 0000000..92e9841 --- /dev/null +++ b/docker-compose.loadtest.yml @@ -0,0 +1,131 @@ +version: '3.8' + +services: + # HAProxy load balancer with consistent hashing + haproxy: + image: haproxy:2.8-alpine + ports: + - "8080:8080" + - "8404:8404" # Stats page + volumes: + - ./loadtest/haproxy.cfg:/usr/local/etc/haproxy/haproxy.cfg:ro + depends_on: + - goblet-1 + - goblet-2 + - goblet-3 + networks: + - goblet-net + + # Goblet instance 1 - shard A + goblet-1: + build: + context: . + dockerfile: Dockerfile + environment: + - GOBLET_PORT=8080 + - GOBLET_CACHE_ROOT=/cache + - GOBLET_INSTANCE_ID=1 + volumes: + - cache-1:/cache + networks: + - goblet-net + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8080/healthz"] + interval: 10s + timeout: 5s + retries: 3 + + # Goblet instance 2 - shard B + goblet-2: + build: + context: . + dockerfile: Dockerfile + environment: + - GOBLET_PORT=8080 + - GOBLET_CACHE_ROOT=/cache + - GOBLET_INSTANCE_ID=2 + volumes: + - cache-2:/cache + networks: + - goblet-net + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8080/healthz"] + interval: 10s + timeout: 5s + retries: 3 + + # Goblet instance 3 - shard C + goblet-3: + build: + context: . 
+ dockerfile: Dockerfile + environment: + - GOBLET_PORT=8080 + - GOBLET_CACHE_ROOT=/cache + - GOBLET_INSTANCE_ID=3 + volumes: + - cache-3:/cache + networks: + - goblet-net + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8080/healthz"] + interval: 10s + timeout: 5s + retries: 3 + + # Prometheus for metrics collection + prometheus: + image: prom/prometheus:latest + ports: + - "9090:9090" + volumes: + - ./loadtest/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + networks: + - goblet-net + + # Grafana for metrics visualization + grafana: + image: grafana/grafana:latest + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_USERS_ALLOW_SIGN_UP=false + volumes: + - grafana-data:/var/lib/grafana + - ./loadtest/grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml:ro + depends_on: + - prometheus + networks: + - goblet-net + + # Load test generator using k6 + k6: + image: grafana/k6:latest + profiles: + - loadtest + volumes: + - ./loadtest/k6-script.js:/scripts/test.js:ro + command: run /scripts/test.js + environment: + - K6_PROMETHEUS_RW_SERVER_URL=http://prometheus:9090/api/v1/write + - TARGET_URL=http://haproxy:8080 + depends_on: + - haproxy + networks: + - goblet-net + +networks: + goblet-net: + driver: bridge + +volumes: + cache-1: + cache-2: + cache-3: + prometheus-data: + grafana-data: diff --git a/docs/operations/load-testing.md b/docs/operations/load-testing.md new file mode 100644 index 0000000..cf32b2f --- /dev/null +++ b/docs/operations/load-testing.md @@ -0,0 +1,535 @@ +# Load Testing + +This guide explains how to load test Goblet to validate performance and capacity before production deployment. 
+ +## Overview + +Load testing helps you: +- Validate deployment capacity +- Identify performance bottlenecks +- Tune cache sizes and resource limits +- Establish baseline metrics +- Test failure scenarios + +## Quick Start + +### Using Docker Compose + +The fastest way to run load tests: + +```bash +cd loadtest + +# Start test environment (3 Goblet instances + monitoring) +make start + +# Run Python-based load test +make loadtest-python + +# View results +open http://localhost:8404 # HAProxy stats +open http://localhost:9090 # Prometheus +open http://localhost:3000 # Grafana + +# Cleanup +make stop +``` + +## Test Environment Architecture + + + +``` +┌──────────────┐ +│ HAProxy │ Load balancer with consistent hashing +│ (port 8080)│ +└───────┬──────┘ + │ + ┌───┴────┬────────┐ + │ │ │ +┌───▼───┐ ┌──▼───┐ ┌──▼───┐ +│Goblet │ │Goblet│ │Goblet│ +│ -1 │ │ -2 │ │ -3 │ +└───┬───┘ └──┬───┘ └──┬───┘ + │ │ │ +┌───▼────────▼────────▼───┐ +│ Prometheus │ +│ (port 9090) │ +└────────┬─────────────────┘ + │ +┌────────▼─────────────────┐ +│ Grafana │ +│ (port 3000) │ +└──────────────────────────┘ +``` + +## Test Tools + +### Python Load Test + +Flexible, easy to customize: + +```bash +python3 loadtest/loadtest.py \ + --url http://localhost:8080 \ + --workers 20 \ + --requests 100 \ + --repos github.com/kubernetes/kubernetes \ + github.com/golang/go +``` + +**Options:** +- `--url`: Target URL +- `--workers`: Concurrent workers +- `--requests`: Requests per worker +- `--think-time`: Delay between requests (ms) +- `--repos`: Repository list to test +- `--output`: JSON output file + +**Output:** +``` +=== Load Test Summary === + +Total Requests: 2000 +Successful: 1995 +Failed: 5 +Success Rate: 99.75% +Total Duration: 45.23s +Requests/sec: 44.21 + +Response Times (ms): + Min: 12.34 + Max: 456.78 + Mean: 89.45 + Median: 67.89 + P95: 234.56 + P99: 389.12 +``` + +### k6 Load Test + +Advanced load testing with gradual ramp-up: + +```bash +# Run k6 test +docker-compose -f docker-compose.loadtest.yml --profile loadtest
up k6 +``` + +**Test stages:** +- Ramp to 10 VUs (2 min) +- Stay at 10 VUs (5 min) +- Ramp to 50 VUs (2 min) +- Stay at 50 VUs (5 min) +- Ramp to 100 VUs (2 min) +- Stay at 100 VUs (5 min) +- Ramp down (2 min) + +## Test Scenarios + +### Scenario 1: Cache Warm-up + +Test cache efficiency after warm-up period: + +```bash +# Phase 1: Populate cache +python3 loadtest.py --workers 5 --requests 50 + +# Phase 2: Test cache hits +python3 loadtest.py --workers 20 --requests 200 + +# Expected: >80% cache hit rate in Phase 2 +``` + +### Scenario 2: Cold Start + +Test behavior with empty cache: + +```bash +# Clear caches +docker-compose down -v +docker-compose up -d + +# Run test +python3 loadtest.py --workers 10 --requests 100 + +# Expected: Higher latency, all cache misses initially +``` + +### Scenario 3: High Concurrency + +Test maximum concurrent requests: + +```bash +python3 loadtest.py \ + --workers 100 \ + --requests 50 \ + --think-time 0 + +# Monitor: CPU, memory, connection count +``` + +### Scenario 4: Repository Diversity + +Test with many different repositories: + +```bash +python3 loadtest.py \ + --workers 20 \ + --requests 100 \ + --repos $(cat popular-repos.txt) + +# Tests cache distribution and eviction +``` + +### Scenario 5: Sustained Load + +Test stability over time: + +```bash +# Run for 1 hour +python3 loadtest.py \ + --workers 10 \ + --requests 3600 \ + --think-time 1000 + +# Monitor: memory leaks, cache growth, error rates +``` + +## Interpreting Results + +### Key Metrics + +**Success Rate:** +- Target: > 99% +- Warning: < 99% +- Critical: < 95% + +**Response Time (P95):** +- Excellent: < 100ms +- Good: 100-500ms +- Acceptable: 500-1000ms +- Poor: > 1000ms + +**Cache Hit Rate:** +- Excellent: > 90% +- Good: 80-90% +- Acceptable: 70-80% +- Poor: < 70% + +**Throughput:** +- Single instance: 500-1000 req/sec +- Per sidecar: 50-100 req/sec (sufficient for most workloads) + +### Performance Baselines + +**Cached requests (hit):** +``` +Min: 5-10ms (memory 
access) +P50: 10-20ms (disk read) +P95: 50-100ms (cold disk cache) +P99: 100-200ms (contention) +Max: 500ms+ (GC pauses) +``` + +**Cache miss (fetch from upstream):** +``` +Min: 100ms (small repo, fast network) +P50: 500ms (typical) +P95: 2000ms (large repo) +P99: 5000ms (very large repo) +Max: 30000ms (timeout) +``` + +## Capacity Planning + +### Single Instance Capacity + +Based on typical workloads: + +| Metric | Value | +|--------|-------| +| Max requests/sec | 500-1000 | +| Concurrent connections | 1000 | +| Cache size | 100GB-1TB | +| CPU (sustained) | 2-4 cores | +| Memory | 4-8GB | + +### Sidecar Capacity + +Per-pod capacity: + +| Metric | Value | +|--------|-------| +| Requests/hour | 100-1000 | +| Peak requests/sec | 10-50 | +| Cache size | 1-10GB | +| CPU | 250m-1 core | +| Memory | 512MB-2GB | + +### Scaling Formula + +``` +Required pods = (Peak requests/sec) / (Requests per pod/sec) + +Example: +- Peak traffic: 1000 req/sec +- Per pod capacity: 10 req/sec +- Required pods: 100 + +With 50% buffer: 150 pods +``` + +## Monitoring During Tests + +### HAProxy Stats + +```bash +open http://localhost:8404 + +# Key metrics: +# - Request distribution across instances +# - Health check status +# - Error rates per backend +``` + +### Prometheus Queries + +```promql +# Cache hit rate +rate(cache_hits_total[5m]) / rate(requests_total[5m]) + +# Request latency (P95) +histogram_quantile(0.95, rate(request_duration_seconds_bucket[5m])) + +# Error rate +rate(errors_total[5m]) / rate(requests_total[5m]) + +# Requests per second +rate(requests_total[5m]) +``` + +### System Metrics + +```bash +# CPU usage +docker stats goblet-1 goblet-2 goblet-3 + +# Disk I/O +docker exec goblet-1 iostat -x 1 + +# Network +docker exec goblet-1 iftop -i eth0 +``` + +## Troubleshooting + +### High Latency + +**Symptoms:** P95 > 1000ms + +**Diagnosis:** +```bash +# Check cache hit rate +curl http://localhost:8080/metrics | grep cache_hit_rate + +# Check disk I/O +docker exec goblet-1 iostat 
-x + +# Check network latency to upstream +docker exec goblet-1 ping -c 10 github.com +``` + +**Solutions:** +- Increase cache size +- Use faster storage (SSD) +- Add more instances +- Pre-warm cache + +### High Error Rate + +**Symptoms:** Errors > 5% + +**Diagnosis:** +```bash +# Check logs +docker-compose logs goblet-1 | grep ERROR + +# Check upstream connectivity +docker exec goblet-1 curl -I https://github.com +``` + +**Solutions:** +- Verify upstream connectivity +- Check authentication +- Increase timeout values +- Review rate limiting + +### Uneven Load Distribution + +**Symptoms:** One instance much busier than others + +**Diagnosis:** +```bash +# Check HAProxy distribution +curl http://localhost:8404 | grep -A 20 goblet_shards +``` + +**Solutions:** +- Verify consistent hashing configured +- Check if specific repos dominate traffic +- Review routing algorithm + +### Memory Growth + +**Symptoms:** Memory usage increases over time + +**Diagnosis:** +```bash +# Monitor memory over time +watch -n 5 'docker stats --no-stream goblet-1' + +# Check cache size +docker exec goblet-1 du -sh /cache +``` + +**Solutions:** +- Set cache size limits +- Enable LRU eviction +- Increase memory limits +- Review for memory leaks + +## Best Practices + +### Before Testing + +1. **Define objectives:** + - What are you testing? + - What metrics matter? + - What's the success criteria? + +2. **Prepare environment:** + - Clean state (clear caches if needed) + - Monitoring configured + - Baseline metrics captured + +3. **Plan test scenarios:** + - Realistic traffic patterns + - Representative repository mix + - Appropriate duration + +### During Testing + +1. **Monitor actively:** + - Watch dashboards + - Check logs for errors + - Note any anomalies + +2. **Document observations:** + - Screenshot metrics + - Record configuration + - Note any changes made + +3. 
**Adjust gradually:** + - Change one variable at a time + - Allow time to stabilize + - Compare with baseline + +### After Testing + +1. **Analyze results:** + - Compare against targets + - Identify bottlenecks + - Document findings + +2. **Save data:** + - Export metrics + - Save logs + - Archive configurations + +3. **Create action items:** + - Performance improvements needed + - Configuration changes + - Scaling requirements + +## Example Test Plan + +### Objective +Validate Goblet can handle 1M requests/month with sidecar pattern. + +### Setup +- 10 pods with sidecars +- 1GB cache per pod +- Representative repo mix + +### Test Phases + +**Phase 1: Baseline (30 min)** +```bash +# Light load to warm up cache +python3 loadtest.py --workers 5 --requests 100 +``` +*Expected: Establish baseline latency and hit rate* + +**Phase 2: Normal Load (1 hour)** +```bash +# Simulate average daily traffic +python3 loadtest.py --workers 10 --requests 1000 +``` +*Expected: P95 < 500ms, hit rate > 80%* + +**Phase 3: Peak Load (30 min)** +```bash +# Simulate 10x peak +python3 loadtest.py --workers 100 --requests 100 +``` +*Expected: P95 < 1000ms, no errors* + +**Phase 4: Sustained Peak (2 hours)** +```bash +# Validate stability at peak +python3 loadtest.py --workers 50 --requests 2000 +``` +*Expected: Stable performance, no memory leaks* + +### Success Criteria +- ✅ Success rate > 99% +- ✅ P95 latency < 500ms (normal), < 1000ms (peak) +- ✅ Cache hit rate > 80% +- ✅ No memory leaks +- ✅ No errors under sustained load + +## Summary + +**Quick Reference:** + +```bash +# Start environment +cd loadtest && make start + +# Run test +make loadtest-python + +# View stats +open http://localhost:8404 + +# Cleanup +make stop +``` + +**Key Takeaways:** + +1. Start with warm-up phase +2. Test realistic scenarios +3. Monitor actively +4. Document everything +5. 
Plan for peak + buffer + +**Next Steps:** + +- Run baseline tests in dev +- Validate capacity planning +- Test failure scenarios +- Move to staging +- Production rollout with monitoring + +For detailed test scripts, see [`loadtest/`](../../loadtest/) directory. diff --git a/loadtest/Makefile b/loadtest/Makefile new file mode 100644 index 0000000..0da9e79 --- /dev/null +++ b/loadtest/Makefile @@ -0,0 +1,107 @@ +.PHONY: help start stop restart status logs clean loadtest-python loadtest-k6 stats + +# Default target +help: + @echo "Goblet Load Test Harness" + @echo "" + @echo "Available targets:" + @echo " start - Start load test environment (HAProxy + 3 Goblet instances + monitoring)" + @echo " stop - Stop all containers" + @echo " restart - Restart all containers" + @echo " status - Show container status" + @echo " logs - Tail logs from all containers" + @echo " stats - Show HAProxy stats" + @echo " metrics - Show Prometheus metrics from Goblet instances" + @echo " loadtest-python - Run Python-based load test" + @echo " loadtest-k6 - Run k6-based load test" + @echo " clean - Stop and remove all containers and volumes" + @echo "" + @echo "Monitoring URLs:" + @echo " HAProxy Stats: http://localhost:8404" + @echo " Prometheus: http://localhost:9090" + @echo " Grafana: http://localhost:3000 (admin/admin)" + @echo " Goblet API: http://localhost:8080" + +# Start the load test environment +start: + @echo "Starting load test environment..." + docker-compose -f ../docker-compose.loadtest.yml up -d + @echo "" + @echo "Waiting for services to be ready..." + @sleep 5 + @echo "" + @echo "Services started!" + @echo " HAProxy: http://localhost:8080" + @echo " Stats: http://localhost:8404" + @echo " Prometheus: http://localhost:9090" + @echo " Grafana: http://localhost:3000" + +# Stop all containers +stop: + @echo "Stopping load test environment..." 
+ docker-compose -f ../docker-compose.loadtest.yml down + +# Restart all containers +restart: + @echo "Restarting load test environment..." + docker-compose -f ../docker-compose.loadtest.yml restart + +# Show container status +status: + @echo "Container Status:" + @echo "" + docker-compose -f ../docker-compose.loadtest.yml ps + +# Tail logs from all containers +logs: + docker-compose -f ../docker-compose.loadtest.yml logs -f + +# Show HAProxy stats +stats: + @echo "HAProxy Statistics:" + @echo "===================" + @echo "" + @curl -s http://localhost:8404 | grep -A 20 "goblet_shards" || echo "HAProxy not responding. Is it running? Try: make start" + @echo "" + @echo "Full stats available at: http://localhost:8404" + +# Show Prometheus metrics from Goblet instances +metrics: + @echo "Goblet Instance Metrics:" + @echo "========================" + @echo "" + @echo "Instance 1:" + @curl -s http://localhost:8080/metrics 2>/dev/null | grep -E "^goblet_" | head -n 10 || echo "Not responding" + @echo "" + @echo "Open Prometheus for detailed metrics: http://localhost:9090" + +# Run Python-based load test +loadtest-python: + @echo "Running Python load test..." + @echo "" + python3 loadtest.py \ + --url http://localhost:8080 \ + --workers 20 \ + --requests 50 \ + --think-time 100 \ + --repos github.com/kubernetes/kubernetes github.com/golang/go \ + --output results-$$(date +%Y%m%d-%H%M%S).json + +# Run k6-based load test +loadtest-k6: + @echo "Running k6 load test..." + @echo "" + docker-compose -f ../docker-compose.loadtest.yml --profile loadtest up k6 + +# Clean everything +clean: + @echo "Cleaning up load test environment..." + @echo "This will remove all containers and volumes!" + @read -p "Are you sure? 
[y/N] " -n 1 -r; \ + echo; \ + if [[ $$REPLY =~ ^[Yy]$$ ]]; then \ + docker-compose -f ../docker-compose.loadtest.yml down -v; \ + echo "Cleanup complete!"; \ + else \ + echo "Cleanup cancelled."; \ + fi diff --git a/loadtest/README.md b/loadtest/README.md new file mode 100644 index 0000000..0dcfc33 --- /dev/null +++ b/loadtest/README.md @@ -0,0 +1,511 @@ +# Goblet Load Testing & Deployment Patterns + +This directory contains load testing infrastructure and deployment patterns for scaling Goblet in production environments. + +## ⚠️ CRITICAL SECURITY NOTICE + +**Before deploying Goblet with private repositories, read the [Security Isolation Guide](../docs/security/isolation-strategies.md)** + +Goblet's default configuration is **UNSAFE for multi-tenant deployments with private repositories**. Users can access each other's cached private repos. See [Security](#security-considerations) section below. + +## Table of Contents + +1. [Security Considerations](#security-considerations) +2. [Architecture Overview](#architecture-overview) +3. [Load Testing Setup](#load-testing-setup) +4. [Deployment Patterns](#deployment-patterns) +5. [Scaling Considerations](#scaling-considerations) +6. 
[Sidecar Pattern for Terraform](#sidecar-pattern-for-terraform) + +--- + +## Security Considerations + +### The Problem + +**Default cache key:** `/cache/{host}/{repo-path}` - NO user/tenant identifier +**Risk:** User A's private repos accessible to User B + +### Solutions (Pick One) + +| Pattern | Security | Storage | Complexity | Use Case | +|---------|----------|---------|------------|----------| +| **Sidecar** (Recommended) | ✅ Perfect | Medium | Low | Terraform, CI/CD | +| **User-Scoped** | ✅ Perfect | High | Medium | Risk scanning | +| **Tenant-Scoped** | ✅ Good | Medium | Medium | Terraform Cloud | +| **Network Isolation** | ✅ Perfect | Low | High | Compliance | +| ❌ **Default (None)** | ❌ UNSAFE | Low | Low | Public repos only | + +**Quick Fix:** Use sidecar pattern (one instance per pod). See [`kubernetes-sidecar-deployment.yaml`](./kubernetes-sidecar-deployment.yaml) + +**Detailed Guide:** See [Security Isolation Strategies](../docs/security/isolation-strategies.md) + +**Architecture:** See [Design Decisions](../docs/architecture/design-decisions.md) + +--- + +## Architecture Overview + +### Stateful vs Stateless + +**Goblet is a STATEFUL caching proxy** with the following characteristics: + +- **File-based cache**: Bare Git repositories stored on local disk +- **In-process state**: `sync.Map` for repository management with per-repo mutexes +- **Single-writer assumption**: Git operations expect exclusive access to repositories +- **No distributed coordination**: No distributed locks or leader election + +### Scaling Implications + +❌ **NOT SAFE**: Multiple instances sharing the same cache directory +- Git operations will race and corrupt repositories +- In-memory locks are process-local + +✅ **SAFE**: +- Single instance per cache directory +- Multiple instances with repository sharding +- Sidecar pattern (one cache per application pod) + +--- + +## Load Testing Setup + +### Prerequisites + +- Docker and Docker Compose +- Python 3.8+ (for Python-based load 
test) +- OR k6 (for JavaScript-based load test) + +### Quick Start + +1. **Start the load test environment:** + + ```bash + docker-compose -f docker-compose.loadtest.yml up -d + ``` + + This starts: + - 3 Goblet instances (goblet-1, goblet-2, goblet-3) + - HAProxy load balancer with consistent hashing (port 8080) + - Prometheus metrics collector (port 9090) + - Grafana dashboard (port 3000) + +2. **View HAProxy stats:** + + ```bash + open http://localhost:8404 + ``` + +3. **Run Python load test:** + + ```bash + python3 loadtest/loadtest.py \ + --url http://localhost:8080 \ + --workers 20 \ + --requests 100 \ + --repos github.com/kubernetes/kubernetes github.com/golang/go + ``` + +4. **Run k6 load test:** + + ```bash + docker-compose -f docker-compose.loadtest.yml --profile loadtest up k6 + ``` + +5. **View Grafana dashboards:** + + ```bash + open http://localhost:3000 + # Login: admin/admin + ``` + +### Load Test Scripts + +#### Python Script (`loadtest.py`) + +Flexible, easy-to-customize load test script: + +```bash +python3 loadtest/loadtest.py \ + --url http://localhost:8080 \ + --workers 50 \ + --requests 200 \ + --think-time 50 \ + --repos github.com/user/repo1 github.com/user/repo2 \ + --output results.json +``` + +**Options:** +- `--url`: Target URL (default: http://localhost:8080) +- `--workers`: Number of concurrent workers (default: 10) +- `--requests`: Requests per worker (default: 100) +- `--think-time`: Delay between requests in ms (default: 100) +- `--repos`: List of repository paths to test +- `--output`: JSON output file for results + +#### k6 Script (`k6-script.js`) + +Advanced load testing with gradual ramp-up: + +```javascript +// Stages defined in k6-script.js: +// - Ramp up to 10 VUs over 2 minutes +// - Stay at 10 VUs for 5 minutes +// - Ramp up to 50 VUs over 2 minutes +// - Stay at 50 VUs for 5 minutes +// - Ramp up to 100 VUs over 2 minutes +// - Stay at 100 VUs for 5 minutes +// - Ramp down over 2 minutes +``` + +Customize repositories in 
`k6-script.js` line 22. + +--- + +## Deployment Patterns + +### Pattern 1: Repository Sharding with HAProxy + +**Use case**: Centralized cache with horizontal scaling + +**Architecture:** +``` + HAProxy (consistent hashing on URL) + | + +---------------+---------------+ + | | | + Goblet-1 Goblet-2 Goblet-3 + (repos A-H) (repos I-P) (repos Q-Z) + | | | + Cache Dir 1 Cache Dir 2 Cache Dir 3 +``` + +**Implementation:** + +```yaml +# See docker-compose.loadtest.yml +# HAProxy uses: balance uri whole +``` + +**Pros:** +- True horizontal scaling +- Linear throughput increase +- Each instance caches a subset of repos + +**Cons:** +- Cache efficiency reduced (each instance has partial cache) +- Need sticky routing per repository +- Adds load balancer complexity + +### Pattern 2: Sidecar Pattern (Recommended for Terraform) + +**Use case**: Large-scale deployments with millions of requests per month + +**Architecture:** +``` +Kubernetes Pod + | + +-- Terraform Agent Container + | (git -> http://localhost:8080) + | + +-- Goblet Sidecar Container + (port 8080, cache: /cache) + | + +-- EmptyDir Volume (10Gi) +``` + +**Implementation:** + +See `kubernetes-sidecar-deployment.yaml` + +**Benefits:** +- ✅ Zero network latency (localhost) +- ✅ Pod-scoped cache lifecycle +- ✅ Natural workload partitioning +- ✅ No coordination needed +- ✅ Scales linearly with pod count +- ✅ Perfect for Terraform Cloud Agents + +**Configuration:** + +```yaml +# In Terraform agent container: +env: + - name: HTTP_PROXY + value: "http://localhost:8080" + # OR + - name: GIT_CONFIG_KEY_0 + value: "http.proxy" + - name: GIT_CONFIG_VALUE_0 + value: "http://localhost:8080" +``` + +### Pattern 3: Regional Instances + +**Use case**: Multi-region deployments with geo-distributed teams + +**Architecture:** +``` +US-EAST Region EU-WEST Region APAC Region + | | | +Goblet Instance Goblet Instance Goblet Instance +(10GB cache) (10GB cache) (10GB cache) +``` + +**Pros:** +- Low latency for regional users +- Independent 
failure domains +- Simple deployment model + +**Cons:** +- Cache duplication across regions +- Higher storage costs + +--- + +## Scaling Considerations + +### When to Scale + +**Vertical Scaling (increase instance size):** +- CPU bound: Many concurrent requests, protocol parsing +- Memory bound: Large number of cached repositories +- Disk I/O bound: Frequent cache misses, large repos + +**Horizontal Scaling (add instances):** +- Request rate exceeds single instance capacity (~1000 req/s) +- Need high availability / redundancy +- Regional distribution required +- Workload naturally partitioned (e.g., per-tenant) + +### Metrics to Monitor + +1. **Request Rate**: requests/sec per instance +2. **Cache Hit Rate**: % of requests served from cache +3. **Response Latency**: p50, p95, p99 latencies +4. **Disk Usage**: cache directory size +5. **Git Fetch Duration**: time to fetch from upstream +6. **Error Rate**: failed requests / total requests + +### Capacity Planning + +**Single Instance Capacity (estimated):** +- **Request Rate**: 500-1000 req/s (depends on cache hit rate) +- **Concurrent Connections**: 1000+ +- **Cached Repositories**: 100-1000 (depends on size) +- **Disk I/O**: ~100 MB/s sustained + +**For millions of requests/month:** +``` +1,000,000 requests/month = ~0.4 requests/sec average +With peak factor 10x = ~4 requests/sec peak +Single instance: SUFFICIENT for average load +Sidecar pattern: BETTER for peak handling + resilience +``` + +### Recommended Architecture for Terraform Cloud Scale + +**Deployment:** +- 100 Terraform Agent pods +- Each pod with Goblet sidecar +- 10GB cache per pod +- HPA (Horizontal Pod Autoscaler): 100-500 pods + +**Expected Performance:** +- 1M requests/month = ~10K requests/pod/month +- Avg: 0.004 req/sec per pod (trivial) +- Peak (10x): 0.04 req/sec per pod (trivial) +- **Cache hit rate**: 80-95% (after warm-up) + +**Benefits:** +- No shared state = no coordination overhead +- Linear scaling with pod count +- Cache warm-up 
happens naturally per pod +- Failed pods don't affect others +- Rolling updates are safe + +--- + +## Sidecar Pattern for Terraform + +### Why Sidecar for Terraform Agents? + +1. **Workload Isolation**: Each Terraform run is independent +2. **Cache Locality**: Terraform runs often use same repos +3. **No Network Overhead**: Localhost communication +4. **Natural Partitioning**: No need for distributed coordination +5. **Pod Lifecycle**: Cache created/destroyed with pod + +### Deployment Steps + +1. **Build Goblet container image:** + + ```bash + docker build -t goblet:latest . + docker tag goblet:latest your-registry/goblet:v1.0.0 + docker push your-registry/goblet:v1.0.0 + ``` + +2. **Deploy to Kubernetes:** + + ```bash + kubectl create namespace terraform-agents + kubectl apply -f loadtest/kubernetes-sidecar-deployment.yaml + ``` + +3. **Verify deployment:** + + ```bash + kubectl get pods -n terraform-agents + kubectl logs -n terraform-agents -c goblet-cache + ``` + +4. **Monitor with Prometheus:** + + ```bash + kubectl port-forward -n terraform-agents svc/terraform-agent-metrics 8080:8080 + curl http://localhost:8080/metrics + ``` + +### Configuration Tips + +**Cache Size:** +```yaml +volumes: + - name: git-cache + emptyDir: + sizeLimit: 10Gi # Adjust based on repo sizes +``` + +**Resource Allocation:** +```yaml +resources: + requests: + cpu: "500m" # Increase for cache-heavy workloads + memory: "1Gi" # Increase for many repos + limits: + cpu: "1" + memory: "2Gi" +``` + +**Autoscaling:** +```yaml +minReplicas: 10 # Baseline capacity +maxReplicas: 100 # Peak capacity +``` + +### Testing Sidecar Deployment + +```bash +# Port forward to a pod +kubectl port-forward -n terraform-agents 8080:8080 + +# Test from your local machine +python3 loadtest/loadtest.py \ + --url http://localhost:8080 \ + --workers 5 \ + --requests 50 +``` + +--- + +## Troubleshooting + +### Load Balancer Issues + +**Problem**: Requests not evenly distributed + +**Check HAProxy stats:** +```bash 
+curl http://localhost:8404 +``` + +**Solution**: Verify consistent hashing is working: +```bash +# Same repo should always go to same backend +for i in {1..10}; do + curl -v http://localhost:8080/github.com/kubernetes/kubernetes/info/refs \ + 2>&1 | grep "X-Served-By" +done +``` + +### Cache Corruption + +**Problem**: Git errors, repository corruption + +**Likely cause**: Multiple instances sharing same cache directory + +**Solution**: +1. Stop all instances +2. Clear cache: `rm -rf /cache/*` +3. Ensure proper sharding/sidecar deployment +4. Restart with isolated caches + +### High Memory Usage + +**Problem**: Goblet using excessive memory + +**Likely cause**: Many large repositories cached + +**Solution**: +1. Reduce cache size with LRU eviction (future enhancement) +2. Increase sizeLimit for emptyDir volume +3. Partition repositories across more instances + +### Slow Response Times + +**Problem**: High p95/p99 latencies + +**Diagnosis**: +```bash +# Check metrics +curl http://localhost:8080/metrics | grep git_fetch + +# Check upstream latency +curl http://localhost:8080/metrics | grep upstream_duration +``` + +**Solutions**: +- Increase worker pool size +- Add more instances (sharding) +- Optimize upstream connectivity +- Add backup storage for cold starts + +--- + +## Future Enhancements + +### Distributed Coordination + +To enable true shared-cache multi-instance deployment: + +1. **Distributed locks** (Redis, etcd) +2. **Leader election** per repository +3. **Cache coherency protocol** +4. **Shared metadata store** + +### Cache Management + +1. **LRU eviction** for size-bounded cache +2. **Metrics-based warming** (pre-fetch popular repos) +3. **Tiered storage** (hot/cold separation) +4. 
**Cache replication** for HA + +--- + +## Related Documentation + +- [Goblet README](../README.md) +- [Offline Mode Documentation](../testing/TEST_COVERAGE.md) +- [Docker Compose Configuration](../docker-compose.loadtest.yml) +- [Kubernetes Deployment](./kubernetes-sidecar-deployment.yaml) + +--- + +## Questions & Support + +For issues or questions about load testing: +1. Check HAProxy stats: http://localhost:8404 +2. Check Prometheus metrics: http://localhost:9090 +3. Check Grafana dashboards: http://localhost:3000 +4. Review container logs: `docker-compose logs -f goblet-1` diff --git a/loadtest/grafana-datasources.yml b/loadtest/grafana-datasources.yml new file mode 100644 index 0000000..bb009bb --- /dev/null +++ b/loadtest/grafana-datasources.yml @@ -0,0 +1,9 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: false diff --git a/loadtest/haproxy.cfg b/loadtest/haproxy.cfg new file mode 100644 index 0000000..3fae24c --- /dev/null +++ b/loadtest/haproxy.cfg @@ -0,0 +1,59 @@ +global + log stdout format raw local0 + maxconn 4096 + stats socket /var/run/haproxy.sock mode 600 level admin + stats timeout 2m + +defaults + log global + mode http + option httplog + option dontlognull + timeout connect 5s + timeout client 60s + timeout server 60s + timeout http-request 10s + timeout http-keep-alive 2s + +# Stats page +frontend stats + bind *:8404 + stats enable + stats uri / + stats refresh 5s + stats show-legends + stats show-node + +# Main frontend - accepts Git protocol requests +frontend git_proxy + bind *:8080 + default_backend goblet_shards + + # Capture the request path for routing + http-request set-header X-Request-Path %[path] + + # Log backend selection + http-request capture path len 256 + http-response set-header X-Served-By %[srv_name] + +# Backend with consistent hashing by URL path +# This ensures the same repository always goes to the same instance +backend 
goblet_shards + balance uri whole + hash-type consistent + + # Health checks + option httpchk GET /healthz + http-check expect status 200 + + # Servers (Goblet instances) + server goblet-1 goblet-1:8080 check inter 5s fall 3 rise 2 + server goblet-2 goblet-2:8080 check inter 5s fall 3 rise 2 + server goblet-3 goblet-3:8080 check inter 5s fall 3 rise 2 + + # Connection tuning for Git operations + timeout server 300s + timeout connect 10s + + # Retry policy - don't retry on same server to avoid corruption + retries 0 diff --git a/loadtest/k6-script.js b/loadtest/k6-script.js new file mode 100644 index 0000000..06e7a32 --- /dev/null +++ b/loadtest/k6-script.js @@ -0,0 +1,171 @@ +import http from 'k6/http'; +import { check, sleep } from 'k6'; +import { Rate, Trend, Counter } from 'k6/metrics'; + +// Custom metrics +const errorRate = new Rate('errors'); +const cacheHitRate = new Rate('cache_hits'); +const requestDuration = new Trend('request_duration'); +const requestCounter = new Counter('requests_total'); + +// Load test configuration +export const options = { + stages: [ + { duration: '2m', target: 10 }, // Ramp up to 10 VUs + { duration: '5m', target: 10 }, // Stay at 10 VUs + { duration: '2m', target: 50 }, // Ramp up to 50 VUs + { duration: '5m', target: 50 }, // Stay at 50 VUs + { duration: '2m', target: 100 }, // Ramp up to 100 VUs + { duration: '5m', target: 100 }, // Stay at 100 VUs + { duration: '2m', target: 0 }, // Ramp down + ], + thresholds: { + 'http_req_duration': ['p(95)<5000'], // 95% of requests should be below 5s + 'errors': ['rate<0.1'], // Error rate should be below 10% + 'http_req_failed': ['rate<0.05'], // Failed requests below 5% + }, +}; + +// Simulated repository list (adjust to match your test repos) +const repositories = [ + 'github.com/kubernetes/kubernetes', + 'github.com/golang/go', + 'github.com/torvalds/linux', + 'github.com/facebook/react', + 'github.com/microsoft/vscode', + 'github.com/hashicorp/terraform', + 
'github.com/nodejs/node', + 'github.com/rust-lang/rust', + 'github.com/apache/spark', + 'github.com/tensorflow/tensorflow', +]; + +// Git protocol v2 ls-refs command +function createLsRefsRequest() { + return '0014command=ls-refs\n' + + '0001' + + '0009peel\n' + + '000csymrefs\n' + + '000bunborn\n' + + '0014ref-prefix refs/\n' + + '0000'; +} + +// Git protocol v2 fetch command (minimal) +function createFetchRequest(wantRef) { + return '0011command=fetch\n' + + '0001' + + '000cthin-pack\n' + + '000cofs-delta\n' + + `00${(32 + wantRef.length).toString(16).padStart(2, '0')}want ${wantRef}\n` + + '00000009done\n' + + '0000'; +} + +// Select random repository +function getRandomRepo() { + return repositories[Math.floor(Math.random() * repositories.length)]; +} + +export default function () { + const targetUrl = __ENV.TARGET_URL || 'http://localhost:8080'; + const repo = getRandomRepo(); + const repoUrl = `${targetUrl}/${repo}/git-upload-pack`; + + // Scenario 1: ls-refs request (80% of requests) + if (Math.random() < 0.8) { + const lsRefsPayload = createLsRefsRequest(); + + const params = { + headers: { + 'Content-Type': 'application/x-git-upload-pack-request', + 'Git-Protocol': 'version=2', + 'Accept': 'application/x-git-upload-pack-result', + }, + timeout: '60s', + }; + + const start = Date.now(); + const response = http.post(repoUrl, lsRefsPayload, params); + const duration = Date.now() - start; + + requestCounter.add(1); + requestDuration.add(duration); + + const success = check(response, { + 'ls-refs status is 200': (r) => r.status === 200, + 'ls-refs has body': (r) => r.body.length > 0, + 'ls-refs is valid': (r) => r.body.includes('refs/'), + }); + + errorRate.add(!success); + + // Check if served from cache (custom header from HAProxy) + if (response.headers['X-Served-By']) { + console.log(`Repo ${repo} served by ${response.headers['X-Served-By']}`); + } + } + // Scenario 2: fetch request (20% of requests) + else { + // First, get refs with ls-refs + const 
lsRefsPayload = createLsRefsRequest(); + const params = { + headers: { + 'Content-Type': 'application/x-git-upload-pack-request', + 'Git-Protocol': 'version=2', + 'Accept': 'application/x-git-upload-pack-result', + }, + timeout: '60s', + }; + + const lsRefsResponse = http.post(repoUrl, lsRefsPayload, params); + + if (lsRefsResponse.status === 200) { + // Parse a ref from response (simplified - assumes valid format) + const refMatch = lsRefsResponse.body.match(/([0-9a-f]{40})\s+refs\/heads\/\w+/); + + if (refMatch && refMatch[1]) { + const wantRef = refMatch[1]; + const fetchPayload = createFetchRequest(wantRef); + + const start = Date.now(); + const fetchResponse = http.post(repoUrl, fetchPayload, params); + const duration = Date.now() - start; + + requestCounter.add(1); + requestDuration.add(duration); + + const success = check(fetchResponse, { + 'fetch status is 200': (r) => r.status === 200, + 'fetch has pack data': (r) => r.body.length > 0, + }); + + errorRate.add(!success); + } + } + } + + // Think time between requests (simulates real user behavior) + sleep(Math.random() * 3 + 1); // 1-4 seconds +} + +export function handleSummary(data) { + return { + 'stdout': textSummary(data, { indent: ' ', enableColors: true }), + '/tmp/k6-summary.json': JSON.stringify(data), + }; +} + +function textSummary(data, options) { + const indent = options.indent || ''; + const enableColors = options.enableColors || false; + + let summary = '\n' + indent + '=== Load Test Summary ===\n\n'; + + summary += indent + `Requests: ${data.metrics.requests_total.values.count}\n`; + summary += indent + `Errors: ${(data.metrics.errors.values.rate * 100).toFixed(2)}%\n`; + summary += indent + `Request Duration (p95): ${data.metrics.request_duration.values['p(95)']}ms\n`; + summary += indent + `HTTP Req Duration (p95): ${data.metrics.http_req_duration.values['p(95)']}ms\n`; + + return summary; +} diff --git a/loadtest/kubernetes-sidecar-deployment.yaml 
b/loadtest/kubernetes-sidecar-deployment.yaml new file mode 100644 index 0000000..25308ab --- /dev/null +++ b/loadtest/kubernetes-sidecar-deployment.yaml @@ -0,0 +1,204 @@ +--- +# ConfigMap for Goblet sidecar configuration +apiVersion: v1 +kind: ConfigMap +metadata: + name: goblet-sidecar-config + namespace: terraform-agents +data: + # Basic configuration - customize based on your auth needs + goblet.env: | + GOBLET_PORT=8080 + GOBLET_CACHE_ROOT=/cache + GOBLET_LOG_LEVEL=info + +--- +# Terraform Agent with Goblet Sidecar +apiVersion: apps/v1 +kind: Deployment +metadata: + name: terraform-agent + namespace: terraform-agents + labels: + app: terraform-agent +spec: + replicas: 10 # Scale to handle load + selector: + matchLabels: + app: terraform-agent + template: + metadata: + labels: + app: terraform-agent + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + spec: + # Shared cache volume between sidecar and main container + volumes: + - name: git-cache + emptyDir: + sizeLimit: 10Gi # Adjust based on your repo sizes + - name: goblet-config + configMap: + name: goblet-sidecar-config + + containers: + # Main Terraform Agent Container + - name: terraform-agent + image: your-terraform-agent:latest + env: + # Configure git to use local proxy + - name: HTTP_PROXY + value: "http://localhost:8080" + - name: HTTPS_PROXY + value: "http://localhost:8080" + # Alternatively, configure git directly + - name: GIT_CONFIG_COUNT + value: "1" + - name: GIT_CONFIG_KEY_0 + value: "http.proxy" + - name: GIT_CONFIG_VALUE_0 + value: "http://localhost:8080" + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "2" + memory: "4Gi" + + # Goblet Sidecar Container + - name: goblet-cache + image: goblet:latest # Build from your Dockerfile + ports: + - containerPort: 8080 + name: http + protocol: TCP + - containerPort: 8080 + name: metrics + protocol: TCP + envFrom: + - configMapRef: + name: goblet-sidecar-config + 
volumeMounts:
+ - name: git-cache
+ mountPath: /cache
+ livenessProbe:
+ httpGet:
+ path: /healthz
+ port: 8080
+ initialDelaySeconds: 10
+ periodSeconds: 30
+ readinessProbe:
+ httpGet:
+ path: /healthz
+ port: 8080
+ initialDelaySeconds: 5
+ periodSeconds: 10
+ resources:
+ requests:
+ cpu: "500m"
+ memory: "1Gi"
+ limits:
+ cpu: "1"
+ memory: "2Gi"
+ # Security context
+ securityContext:
+ runAsNonRoot: true
+ runAsUser: 1000
+ allowPrivilegeEscalation: false
+ readOnlyRootFilesystem: false # Git needs to write to cache
+
+ # NOTE(review): removed the former wait-for-goblet init container. Init
+ # containers must run to completion BEFORE any regular container starts,
+ # so an init container polling the goblet-cache sidecar's /healthz could
+ # never succeed and left the pod stuck in Init indefinitely. On
+ # Kubernetes >= 1.28 run goblet-cache as a native sidecar (an
+ # initContainers entry with restartPolicy: Always); on older clusters,
+ # wait/retry for http://localhost:8080/healthz inside the
+ # terraform-agent entrypoint instead.
+
+---
+# Service for metrics scraping (optional)
+apiVersion: v1
+kind: Service
+metadata:
+ name: terraform-agent-metrics
+ namespace: terraform-agents
+ labels:
+ app: terraform-agent
+spec:
+ clusterIP: None # Headless service for pod-level metrics
+ selector:
+ app: terraform-agent
+ ports:
+ - name: metrics
+ port: 8080
+ targetPort: 8080
+ protocol: TCP
+
+---
+# ServiceMonitor for Prometheus Operator (optional)
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+ name: goblet-sidecar
+ namespace: terraform-agents
+ labels:
+ app: terraform-agent
+spec:
+ selector:
+ matchLabels:
+ app: terraform-agent
+ endpoints:
+ - port: metrics
+ path: /metrics
+ interval: 30s
+
+---
+# PodDisruptionBudget to ensure availability during rolling updates
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+ name: terraform-agent-pdb
+ namespace: terraform-agents
+spec:
+ minAvailable: 50%
+ selector:
+ matchLabels:
+ app: terraform-agent
+
+---
+# HorizontalPodAutoscaler for dynamic scaling
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+ name: terraform-agent-hpa
+ namespace: terraform-agents
+spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: terraform-agent + minReplicas: 10 + maxReplicas: 100 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 diff --git a/loadtest/kubernetes-sidecar-secure.yaml b/loadtest/kubernetes-sidecar-secure.yaml new file mode 100644 index 0000000..171dfae --- /dev/null +++ b/loadtest/kubernetes-sidecar-secure.yaml @@ -0,0 +1,303 @@ +--- +# SECURE Terraform Agent Deployment with Tenant Isolation +# This configuration ensures proper isolation for multi-tenant scenarios + +apiVersion: v1 +kind: ConfigMap +metadata: + name: goblet-secure-config + namespace: terraform-agents +data: + isolation.json: | + { + "mode": "tenant", + "tenant_header_key": "X-TFC-Workspace-ID", + "hash_identifiers": false + } + +--- +# ServiceAccount with minimal permissions +apiVersion: v1 +kind: ServiceAccount +metadata: + name: terraform-agent + namespace: terraform-agents + +--- +# NetworkPolicy to restrict traffic +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: terraform-agent-netpol + namespace: terraform-agents +spec: + podSelector: + matchLabels: + app: terraform-agent + policyTypes: + - Ingress + - Egress + ingress: + # Only allow metrics scraping from Prometheus + - from: + - namespaceSelector: + matchLabels: + name: monitoring + ports: + - protocol: TCP + port: 8080 + egress: + # Allow DNS + - to: + - namespaceSelector: {} + ports: + - protocol: UDP + port: 53 + # Allow HTTPS to GitHub/upstream + - to: + - namespaceSelector: {} + ports: + - protocol: TCP + port: 443 + # Allow localhost (sidecar communication) + - to: + - podSelector: + matchLabels: + app: terraform-agent + ports: + - protocol: TCP + port: 8080 + +--- +# Deployment with Security Hardening +apiVersion: apps/v1 +kind: Deployment +metadata: + name: terraform-agent-secure + 
namespace: terraform-agents + labels: + app: terraform-agent + security: hardened +spec: + replicas: 10 + selector: + matchLabels: + app: terraform-agent + template: + metadata: + labels: + app: terraform-agent + security: hardened + annotations: + # Security annotations + seccomp.security.alpha.kubernetes.io/pod: "runtime/default" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + spec: + serviceAccountName: terraform-agent + + # Security context for pod + securityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + + volumes: + - name: git-cache + emptyDir: + sizeLimit: 10Gi + # Use memory-backed emptyDir for sensitive data (optional) + # medium: "Memory" # Uncomment for in-memory cache + - name: goblet-config + configMap: + name: goblet-secure-config + # Optional: Encrypted volume for cache + # - name: encrypted-cache + # persistentVolumeClaim: + # claimName: encrypted-cache-pvc + + containers: + # Main Terraform Agent Container + - name: terraform-agent + image: your-terraform-agent:latest + env: + # Git proxy configuration + - name: HTTP_PROXY + value: "http://localhost:8080" + - name: HTTPS_PROXY + value: "http://localhost:8080" + + # Terraform Cloud workspace ID (for tenant isolation) + - name: TFC_WORKSPACE_ID + value: "ws-example123" # Should come from pod labels or injection + + # Pass workspace ID to Goblet via custom header + - name: GIT_CONFIG_COUNT + value: "2" + - name: GIT_CONFIG_KEY_0 + value: "http.proxy" + - name: GIT_CONFIG_VALUE_0 + value: "http://localhost:8080" + - name: GIT_CONFIG_KEY_1 + value: "http.extraHeader" + - name: GIT_CONFIG_VALUE_1 + value: "X-TFC-Workspace-ID: $(TFC_WORKSPACE_ID)" + + # Security context for container + securityContext: + runAsNonRoot: true + runAsUser: 1000 + allowPrivilegeEscalation: false + readOnlyRootFilesystem: false + capabilities: + drop: + - ALL + + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: 
+ cpu: "2" + memory: "4Gi" + + # Goblet Sidecar Container (Secure Configuration) + - name: goblet-cache + image: goblet:latest + ports: + - containerPort: 8080 + name: http + protocol: TCP + env: + - name: GOBLET_PORT + value: "8080" + - name: GOBLET_CACHE_ROOT + value: "/cache" + - name: GOBLET_LOG_LEVEL + value: "info" + + # CRITICAL: Isolation mode configuration + - name: GOBLET_ISOLATION_MODE + value: "tenant" + - name: GOBLET_TENANT_HEADER + value: "X-TFC-Workspace-ID" + + # Optional: Enable audit logging + - name: GOBLET_AUDIT_LOG + value: "true" + - name: GOBLET_AUDIT_LOG_PATH + value: "/cache/audit.log" + + volumeMounts: + - name: git-cache + mountPath: /cache + - name: goblet-config + mountPath: /etc/goblet + readOnly: true + + # Security context + securityContext: + runAsNonRoot: true + runAsUser: 1000 + allowPrivilegeEscalation: false + readOnlyRootFilesystem: false # Git needs write access to cache + capabilities: + drop: + - ALL + + # Probes + livenessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 30 + readinessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 10 + + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1" + memory: "2Gi" + +--- +# PodSecurityPolicy (if using PSP) +apiVersion: policy/v1beta1 +kind: PodSecurityPolicy +metadata: + name: terraform-agent-psp + annotations: + seccomp.security.alpha.kubernetes.io/allowedProfileNames: 'runtime/default' + seccomp.security.alpha.kubernetes.io/defaultProfileName: 'runtime/default' +spec: + privileged: false + allowPrivilegeEscalation: false + requiredDropCapabilities: + - ALL + volumes: + - 'configMap' + - 'emptyDir' + - 'projected' + - 'secret' + - 'downwardAPI' + - 'persistentVolumeClaim' + hostNetwork: false + hostIPC: false + hostPID: false + runAsUser: + rule: 'MustRunAsNonRoot' + seLinux: + rule: 'RunAsAny' + supplementalGroups: + rule: 'RunAsAny' + fsGroup: + rule: 'RunAsAny' + 
readOnlyRootFilesystem: false + +--- +# ResourceQuota per namespace (tenant isolation at cluster level) +apiVersion: v1 +kind: ResourceQuota +metadata: + name: terraform-agent-quota + namespace: terraform-agents +spec: + hard: + requests.cpu: "100" + requests.memory: 200Gi + persistentvolumeclaims: "100" + pods: "100" + +--- +# LimitRange to prevent resource exhaustion +apiVersion: v1 +kind: LimitRange +metadata: + name: terraform-agent-limits + namespace: terraform-agents +spec: + limits: + - max: + cpu: "4" + memory: "8Gi" + min: + cpu: "100m" + memory: "128Mi" + type: Container + - max: + cpu: "8" + memory: "16Gi" + min: + cpu: "100m" + memory: "128Mi" + type: Pod diff --git a/loadtest/loadtest.py b/loadtest/loadtest.py new file mode 100644 index 0000000..dba4e48 --- /dev/null +++ b/loadtest/loadtest.py @@ -0,0 +1,369 @@ +#!/usr/bin/env python3 +""" +Load test harness for Goblet Git caching proxy. + +This script simulates multiple concurrent Git clients making requests +to the proxy, testing cache efficiency, throughput, and stability. +""" + +import argparse +import concurrent.futures +import hashlib +import json +import random +import statistics +import sys +import time +from dataclasses import dataclass +from typing import List, Dict, Tuple +import requests +from urllib.parse import urljoin + + +@dataclass +class TestResult: + """Results from a single test request.""" + success: bool + duration_ms: float + repo: str + operation: str + served_by: str = "" + error: str = "" + + +class GitProtocolV2Client: + """Simple Git protocol v2 client for testing.""" + + def __init__(self, base_url: str, timeout: int = 60): + self.base_url = base_url + self.timeout = timeout + self.session = requests.Session() + + def ls_refs(self, repo_path: str) -> Tuple[bool, float, str, str]: + """ + Execute ls-refs command. 
+ Returns: (success, duration_ms, served_by, error) + """ + url = urljoin(self.base_url, f"/{repo_path}/git-upload-pack") + + # Git protocol v2 ls-refs payload + payload = ( + b"0014command=ls-refs\n" + b"0001" + b"0009peel\n" + b"000csymrefs\n" + b"000bunborn\n" + b"0014ref-prefix refs/\n" + b"0000" + ) + + headers = { + "Content-Type": "application/x-git-upload-pack-request", + "Git-Protocol": "version=2", + "Accept": "application/x-git-upload-pack-result", + } + + start = time.time() + try: + response = self.session.post( + url, data=payload, headers=headers, timeout=self.timeout + ) + duration_ms = (time.time() - start) * 1000 + + if response.status_code != 200: + return False, duration_ms, "", f"HTTP {response.status_code}" + + if len(response.content) == 0: + return False, duration_ms, "", "Empty response" + + served_by = response.headers.get("X-Served-By", "") + return True, duration_ms, served_by, "" + + except Exception as e: + duration_ms = (time.time() - start) * 1000 + return False, duration_ms, "", str(e) + + def fetch(self, repo_path: str, want_ref: str) -> Tuple[bool, float, str, str]: + """ + Execute fetch command. 
+ Returns: (success, duration_ms, served_by, error) + """ + url = urljoin(self.base_url, f"/{repo_path}/git-upload-pack") + + # Git protocol v2 fetch payload + want_line = f"want {want_ref}\n".encode() + payload = ( + b"0011command=fetch\n" + b"0001" + b"000cthin-pack\n" + b"000cofs-delta\n" + + f"{len(want_line) + 4:04x}".encode() + + want_line + + b"00000009done\n" + b"0000" + ) + + headers = { + "Content-Type": "application/x-git-upload-pack-request", + "Git-Protocol": "version=2", + "Accept": "application/x-git-upload-pack-result", + } + + start = time.time() + try: + response = self.session.post( + url, data=payload, headers=headers, timeout=self.timeout + ) + duration_ms = (time.time() - start) * 1000 + + if response.status_code != 200: + return False, duration_ms, "", f"HTTP {response.status_code}" + + served_by = response.headers.get("X-Served-By", "") + return True, duration_ms, served_by, "" + + except Exception as e: + duration_ms = (time.time() - start) * 1000 + return False, duration_ms, "", str(e) + + +class LoadTestRunner: + """Orchestrates load testing.""" + + def __init__( + self, + target_url: str, + repositories: List[str], + num_workers: int = 10, + requests_per_worker: int = 100, + think_time_ms: int = 100, + ): + self.target_url = target_url + self.repositories = repositories + self.num_workers = num_workers + self.requests_per_worker = requests_per_worker + self.think_time_ms = think_time_ms + self.results: List[TestResult] = [] + + def worker_task(self, worker_id: int) -> List[TestResult]: + """Worker function that executes requests.""" + client = GitProtocolV2Client(self.target_url) + results = [] + + for i in range(self.requests_per_worker): + # Select random repository + repo = random.choice(self.repositories) + + # 80% ls-refs, 20% fetch + if random.random() < 0.8: + success, duration, served_by, error = client.ls_refs(repo) + result = TestResult( + success=success, + duration_ms=duration, + repo=repo, + operation="ls-refs", + 
served_by=served_by, + error=error, + ) + else: + # For fetch, we need a valid ref - use a common one + # In real scenario, would ls-refs first + dummy_ref = "0" * 40 # Placeholder + success, duration, served_by, error = client.fetch(repo, dummy_ref) + result = TestResult( + success=success, + duration_ms=duration, + repo=repo, + operation="fetch", + served_by=served_by, + error=error, + ) + + results.append(result) + + # Progress indicator + if (i + 1) % 10 == 0: + print( + f"Worker {worker_id}: {i + 1}/{self.requests_per_worker} requests", + end="\r", + ) + + # Think time + if self.think_time_ms > 0: + time.sleep(self.think_time_ms / 1000) + + return results + + def run(self) -> Dict: + """Execute load test and return summary statistics.""" + print(f"Starting load test:") + print(f" Target: {self.target_url}") + print(f" Workers: {self.num_workers}") + print(f" Requests per worker: {self.requests_per_worker}") + print(f" Total requests: {self.num_workers * self.requests_per_worker}") + print(f" Repositories: {len(self.repositories)}") + print() + + start_time = time.time() + + # Execute workers in parallel + with concurrent.futures.ThreadPoolExecutor( + max_workers=self.num_workers + ) as executor: + futures = [ + executor.submit(self.worker_task, i) for i in range(self.num_workers) + ] + + for future in concurrent.futures.as_completed(futures): + self.results.extend(future.result()) + + total_duration = time.time() - start_time + + return self._compute_statistics(total_duration) + + def _compute_statistics(self, total_duration: float) -> Dict: + """Compute summary statistics from results.""" + total_requests = len(self.results) + successful = [r for r in self.results if r.success] + failed = [r for r in self.results if not r.success] + + durations = [r.duration_ms for r in successful] + + # Server distribution + server_counts = {} + for r in self.results: + if r.served_by: + server_counts[r.served_by] = server_counts.get(r.served_by, 0) + 1 + + # Repository 
distribution + repo_requests = {} + for r in self.results: + repo_requests[r.repo] = repo_requests.get(r.repo, 0) + 1 + + stats = { + "total_requests": total_requests, + "successful": len(successful), + "failed": len(failed), + "success_rate": len(successful) / total_requests * 100, + "total_duration_sec": total_duration, + "requests_per_sec": total_requests / total_duration, + "duration_ms": { + "min": min(durations) if durations else 0, + "max": max(durations) if durations else 0, + "mean": statistics.mean(durations) if durations else 0, + "median": statistics.median(durations) if durations else 0, + "p95": ( + sorted(durations)[int(len(durations) * 0.95)] + if durations + else 0 + ), + "p99": ( + sorted(durations)[int(len(durations) * 0.99)] + if durations + else 0 + ), + }, + "server_distribution": server_counts, + "repo_distribution": repo_requests, + "errors": {}, + } + + # Collect error types + for r in failed: + stats["errors"][r.error] = stats["errors"].get(r.error, 0) + 1 + + return stats + + def print_summary(self, stats: Dict): + """Print formatted summary statistics.""" + print("\n" + "=" * 60) + print("LOAD TEST RESULTS") + print("=" * 60) + print(f"\nTotal Requests: {stats['total_requests']}") + print(f"Successful: {stats['successful']}") + print(f"Failed: {stats['failed']}") + print(f"Success Rate: {stats['success_rate']:.2f}%") + print(f"Total Duration: {stats['total_duration_sec']:.2f}s") + print(f"Requests/sec: {stats['requests_per_sec']:.2f}") + + print(f"\nResponse Times (ms):") + print(f" Min: {stats['duration_ms']['min']:.2f}") + print(f" Max: {stats['duration_ms']['max']:.2f}") + print(f" Mean: {stats['duration_ms']['mean']:.2f}") + print(f" Median: {stats['duration_ms']['median']:.2f}") + print(f" P95: {stats['duration_ms']['p95']:.2f}") + print(f" P99: {stats['duration_ms']['p99']:.2f}") + + if stats["server_distribution"]: + print(f"\nServer Distribution:") + for server, count in sorted(stats["server_distribution"].items()): + pct = count 
/ stats["total_requests"] * 100 + print(f" {server:20s} {count:6d} ({pct:5.2f}%)") + + if stats["errors"]: + print(f"\nErrors:") + for error, count in sorted( + stats["errors"].items(), key=lambda x: x[1], reverse=True + ): + print(f" {error:40s} {count:6d}") + + print("\n" + "=" * 60 + "\n") + + +def main(): + parser = argparse.ArgumentParser( + description="Load test harness for Goblet Git caching proxy" + ) + parser.add_argument( + "--url", + default="http://localhost:8080", + help="Target URL (default: http://localhost:8080)", + ) + parser.add_argument( + "--workers", type=int, default=10, help="Number of concurrent workers" + ) + parser.add_argument( + "--requests", type=int, default=100, help="Requests per worker" + ) + parser.add_argument( + "--think-time", type=int, default=100, help="Think time between requests (ms)" + ) + parser.add_argument( + "--repos", + nargs="+", + default=[ + "github.com/kubernetes/kubernetes", + "github.com/golang/go", + "github.com/torvalds/linux", + "github.com/hashicorp/terraform", + ], + help="List of repository paths to test", + ) + parser.add_argument( + "--output", help="Output file for JSON results (optional)" + ) + + args = parser.parse_args() + + runner = LoadTestRunner( + target_url=args.url, + repositories=args.repos, + num_workers=args.workers, + requests_per_worker=args.requests, + think_time_ms=args.think_time, + ) + + stats = runner.run() + runner.print_summary(stats) + + if args.output: + with open(args.output, "w") as f: + json.dump(stats, f, indent=2) + print(f"Results saved to {args.output}") + + # Exit code based on success rate + sys.exit(0 if stats["success_rate"] >= 95 else 1) + + +if __name__ == "__main__": + main() diff --git a/loadtest/prometheus.yml b/loadtest/prometheus.yml new file mode 100644 index 0000000..02efce4 --- /dev/null +++ b/loadtest/prometheus.yml @@ -0,0 +1,34 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + cluster: 'loadtest' + +scrape_configs: + # 
Scrape Goblet instances + - job_name: 'goblet' + static_configs: + - targets: + - 'goblet-1:8080' + - 'goblet-2:8080' + - 'goblet-3:8080' + labels: + service: 'goblet' + metrics_path: '/metrics' + scrape_interval: 10s + + # Scrape HAProxy stats + - job_name: 'haproxy' + static_configs: + - targets: + - 'haproxy:8404' + labels: + service: 'haproxy' + metrics_path: '/metrics' + scrape_interval: 10s + + # Scrape Prometheus itself + - job_name: 'prometheus' + static_configs: + - targets: + - 'localhost:9090'