diff --git a/.env.prod.example b/.env.prod.example new file mode 100644 index 0000000..122244e --- /dev/null +++ b/.env.prod.example @@ -0,0 +1,81 @@ +# Production Environment Configuration +# Copy this file to .env.prod and fill in the actual values + +# Application +APP_NAME=rag7-langgraph +APP_ENV=production +APP_DEBUG=false +LOG_LEVEL=INFO + +# API Configuration +API_HOST=0.0.0.0 +API_PORT=8000 +API_WORKERS=4 +API_RELOAD=false + +# LangGraph Configuration +LANGGRAPH_API_URL=http://langgraph:8123 +LANGGRAPH_CHECKPOINT_STORE=postgres +LANGGRAPH_STREAM_MODE=values + +# Database (PostgreSQL for LangGraph checkpoints) +# TODO: Replace with actual production database credentials +POSTGRES_HOST=postgres +POSTGRES_PORT=5432 +POSTGRES_DB=langgraph_checkpoints +POSTGRES_USER=langgraph +POSTGRES_PASSWORD=CHANGEME_SECURE_PASSWORD + +# Redis (for caching and rate limiting) +# TODO: Replace with actual production Redis credentials +REDIS_HOST=redis +REDIS_PORT=6379 +REDIS_PASSWORD=CHANGEME_SECURE_PASSWORD +REDIS_DB=0 + +# Authentication & Security +# TODO: Generate a secure secret key (e.g., using: openssl rand -hex 32) +SECRET_KEY=CHANGEME_GENERATE_SECURE_KEY +API_KEY_SALT=CHANGEME_GENERATE_SECURE_SALT +ALLOWED_ORIGINS=https://yourdomain.com,https://www.yourdomain.com + +# Observability +ENABLE_METRICS=true +METRICS_PORT=9090 +JAEGER_AGENT_HOST=jaeger +JAEGER_AGENT_PORT=6831 +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 + +# LangChain/LangSmith (optional) +# TODO: Add your LangSmith API key if using +LANGCHAIN_TRACING_V2=false +LANGCHAIN_API_KEY= +LANGCHAIN_PROJECT=rag7-production + +# OpenAI API (if using OpenAI models) +# TODO: Add your OpenAI API key +OPENAI_API_KEY= + +# Other LLM Providers (as needed) +# TODO: Add your API keys for other providers +ANTHROPIC_API_KEY= +COHERE_API_KEY= +HUGGINGFACE_API_KEY= + +# Vector Store Configuration +VECTOR_STORE_TYPE=postgres # or 'pinecone', 'weaviate', 'qdrant' +VECTOR_DIMENSION=1536 + +# n8n Integration (if using) 
+N8N_WEBHOOK_URL=https://n8n.yourdomain.com/webhook
+N8N_API_KEY=CHANGEME_N8N_API_KEY
+
+# Rate Limiting
+RATE_LIMIT_ENABLED=true
+RATE_LIMIT_PER_MINUTE=60
+RATE_LIMIT_BURST=10
+
+# Feature Flags
+ENABLE_ASYNC_PROCESSING=true
+ENABLE_CACHING=true
+CACHE_TTL_SECONDS=3600
diff --git a/.github/workflows/cd-staging.yml b/.github/workflows/cd-staging.yml
new file mode 100644
index 0000000..7d3bcd9
--- /dev/null
+++ b/.github/workflows/cd-staging.yml
@@ -0,0 +1,73 @@
+name: CD - Deploy to Staging
+
+on:
+  push:
+    branches: [develop]
+  workflow_dispatch:
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+  DEPLOYMENT_NAME: langgraph-api
+  NAMESPACE: staging
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    environment: staging
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up kubectl
+        uses: azure/setup-kubectl@v3
+        with:
+          version: 'v1.28.0'
+
+      - name: Configure kubeconfig
+        run: |
+          mkdir -p $HOME/.kube
+          echo "${{ secrets.KUBECONFIG_STAGING }}" | base64 -d > $HOME/.kube/config
+          chmod 600 $HOME/.kube/config
+
+      - name: Verify cluster connection
+        run: |
+          kubectl cluster-info
+          kubectl get nodes
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set image tag
+        id: image
+        run: |
+          IMAGE_TAG="develop-${GITHUB_SHA::7}"  # must match CI metadata-action tags: type=sha (short) with branch prefix; a full-SHA tag is never pushed
+          echo "tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
+          echo "image=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${IMAGE_TAG}" >> $GITHUB_OUTPUT
+
+      - name: Update deployment image
+        run: |
+          kubectl set image deployment/${{ env.DEPLOYMENT_NAME }} \
+            langgraph=${{ steps.image.outputs.image }} \
+            -n ${{ env.NAMESPACE }}
+
+      - name: Wait for rollout
+        run: |
+          kubectl rollout status deployment/${{ env.DEPLOYMENT_NAME }} \
+            -n ${{ env.NAMESPACE }} \
+            --timeout=5m
+
+      - name: Verify deployment
+        run: |
+          kubectl get pods -n ${{ env.NAMESPACE }} -l app=langgraph
+          kubectl get service -n ${{ env.NAMESPACE
}} -l app=langgraph + + - name: Run smoke tests + run: | + # TODO: Add smoke test endpoint checks + # kubectl run smoke-test --image=curlimages/curl --rm -it --restart=Never \ + # -- curl -f http://${{ env.DEPLOYMENT_NAME }}.${{ env.NAMESPACE }}.svc.cluster.local/health + echo "Smoke tests would run here" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..6365e17 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,109 @@ +name: CI Pipeline + +on: + push: + branches: [main, develop] + pull_request: + branches: [main, develop] + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 black isort mypy + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + + - name: Lint with flake8 + run: | + # Stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # Exit-zero treats all errors as warnings + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + + - name: Check formatting with black + run: black --check . + continue-on-error: true + + - name: Check import ordering with isort + run: isort --check-only . + continue-on-error: true + + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pip install pytest pytest-cov pytest-asyncio + + - name: Run tests + run: | + pytest --cov=. 
--cov-report=xml --cov-report=term + continue-on-error: true + + - name: Upload coverage reports + uses: codecov/codecov-action@v3 + continue-on-error: true + + build: + runs-on: ubuntu-latest + needs: [lint, test] + permissions: + contents: read + packages: write + steps: + - uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=ref,event=branch + type=ref,event=pr + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=sha,prefix={{branch}}- + + - name: Build and push Docker image + uses: docker/build-push-action@v5 + with: + context: . + push: ${{ github.event_name != 'pull_request' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..79e1579 --- /dev/null +++ b/.gitignore @@ -0,0 +1,22 @@ +*.pyc +__pycache__/ +*.pyo +*.pyd +.Python +*.so +*.egg +*.egg-info/ +dist/ +build/ +.eggs/ +*.log +.env +.venv +venv/ +ENV/ +.DS_Store +.idea/ +.vscode/ +*.swp +*.swo +*~ diff --git a/README.md b/README.md index f5a8ce3..6889614 100644 --- a/README.md +++ b/README.md @@ -1 +1,3 @@ -# rag7 \ No newline at end of file +# rag7 + +> **📦 Production Templates Available**: This repository now includes production-ready deployment templates, CI/CD workflows, Kubernetes manifests, n8n workflows, and comprehensive documentation. See the `docs/` directory and related files to get started with deployment. 
\ No newline at end of file diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml new file mode 100644 index 0000000..62973eb --- /dev/null +++ b/docker-compose.prod.yml @@ -0,0 +1,149 @@ +version: '3.8' + +services: + langgraph: + image: ghcr.io/stacey77/rag7:latest + container_name: langgraph-api + restart: unless-stopped + env_file: + - .env.prod + ports: + - "8123:8123" + depends_on: + postgres: + condition: service_healthy + redis: + condition: service_healthy + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8123/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + networks: + - rag7-network + volumes: + - ./data:/app/data + deploy: + resources: + limits: + cpus: '2.0' + memory: 4G + reservations: + cpus: '1.0' + memory: 2G + + integration-api: + build: + context: ./integration/api + dockerfile: Dockerfile + container_name: integration-api + restart: unless-stopped + env_file: + - .env.prod + ports: + - "8000:8000" + depends_on: + langgraph: + condition: service_healthy + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 20s + networks: + - rag7-network + deploy: + resources: + limits: + cpus: '1.0' + memory: 2G + reservations: + cpus: '0.5' + memory: 1G + + postgres: + image: postgres:15-alpine + container_name: langgraph-postgres + restart: unless-stopped + environment: + POSTGRES_DB: ${POSTGRES_DB:-langgraph_checkpoints} + POSTGRES_USER: ${POSTGRES_USER:-langgraph} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-CHANGEME_SECURE_PASSWORD} + PGDATA: /var/lib/postgresql/data/pgdata + ports: + - "5432:5432" + volumes: + - postgres-data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-langgraph}"] + interval: 10s + timeout: 5s + retries: 5 + networks: + - rag7-network + deploy: + resources: + limits: + cpus: '1.0' + memory: 2G + reservations: + cpus: '0.5' + memory: 1G + + redis: + image: 
redis:7-alpine
+    container_name: langgraph-redis
+    restart: unless-stopped
+    command: redis-server --requirepass ${REDIS_PASSWORD:-CHANGEME_SECURE_PASSWORD}
+    ports:
+      - "127.0.0.1:6379:6379"  # loopback only in prod; containers reach redis over rag7-network
+    volumes:
+      - redis-data:/data
+    healthcheck:
+      test: ["CMD-SHELL", "redis-cli -a '${REDIS_PASSWORD:-CHANGEME_SECURE_PASSWORD}' ping | grep PONG"]  # requirepass is set, so an unauthenticated check always fails with NOAUTH
+      interval: 10s
+      timeout: 5s
+      retries: 5
+    networks:
+      - rag7-network
+    deploy:
+      resources:
+        limits:
+          cpus: '0.5'
+          memory: 512M
+        reservations:
+          cpus: '0.25'
+          memory: 256M
+
+  nginx:
+    image: nginx:alpine
+    container_name: rag7-nginx
+    restart: unless-stopped
+    ports:
+      - "80:80"
+      - "443:443"
+    volumes:
+      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
+      - ./nginx/ssl:/etc/nginx/ssl:ro
+    depends_on:
+      - integration-api
+      - langgraph
+    networks:
+      - rag7-network
+    deploy:
+      resources:
+        limits:
+          cpus: '0.5'
+          memory: 512M
+
+volumes:
+  postgres-data:
+    driver: local
+  redis-data:
+    driver: local
+
+networks:
+  rag7-network:
+    driver: bridge
diff --git a/docs/deployment.md b/docs/deployment.md
new file mode 100644
index 0000000..349556b
--- /dev/null
+++ b/docs/deployment.md
@@ -0,0 +1,355 @@
+# Deployment Guide
+
+This guide covers deploying the RAG7 LangGraph application to production environments.
+
+## Table of Contents
+
+- [Prerequisites](#prerequisites)
+- [Environment Configuration](#environment-configuration)
+- [Deployment Options](#deployment-options)
+  - [Docker Compose](#docker-compose)
+  - [Kubernetes](#kubernetes)
+- [Post-Deployment](#post-deployment)
+- [Troubleshooting](#troubleshooting)
+
+## Prerequisites
+
+### Required Tools
+
+- Docker & Docker Compose (v2.0+)
+- kubectl (v1.28+) for Kubernetes deployments
+- Access to container registry (GitHub Container Registry)
+- Database: PostgreSQL 15+
+- Cache: Redis 7+
+
+### Required Secrets
+
+Before deploying, you must configure the following secrets:
+
+1. **Database Credentials**: PostgreSQL password
+2. **Secret Keys**: Application secret key and API key salt
+3. 
**API Keys**: OpenAI, Anthropic, or other LLM provider keys +4. **Registry Access**: GitHub Container Registry token (for pulling images) +5. **Kubeconfig**: Kubernetes cluster credentials (for K8s deployments) + +## Environment Configuration + +### Step 1: Create Production Environment File + +```bash +cp .env.prod.example .env.prod +``` + +### Step 2: Update Required Values + +Edit `.env.prod` and replace all `CHANGEME_*` placeholders: + +```bash +# Generate secure secret key +openssl rand -hex 32 + +# Generate API key salt +openssl rand -hex 16 + +# Generate secure database password +openssl rand -base64 32 +``` + +### Step 3: Configure API Keys + +Add your LLM provider API keys: + +```bash +OPENAI_API_KEY=sk-your-actual-key-here +LANGCHAIN_API_KEY=ls__your-actual-key-here +``` + +## Deployment Options + +### Docker Compose + +Docker Compose is suitable for single-server deployments or staging environments. + +#### Deploy with Docker Compose + +```bash +# Pull latest images +docker-compose -f docker-compose.prod.yml pull + +# Start services +docker-compose -f docker-compose.prod.yml up -d + +# Check service status +docker-compose -f docker-compose.prod.yml ps + +# View logs +docker-compose -f docker-compose.prod.yml logs -f langgraph +``` + +#### Verify Deployment + +```bash +# Check health endpoint +curl http://localhost:8000/health + +# Check readiness endpoint +curl http://localhost:8000/ready + +# Test graph execution +curl -X POST http://localhost:8000/v1/graph/run \ + -H "Content-Type: application/json" \ + -d '{"input": {"query": "test"}}' +``` + +#### Stop Services + +```bash +docker-compose -f docker-compose.prod.yml down +``` + +### Kubernetes + +Kubernetes is recommended for production deployments requiring high availability and auto-scaling. + +#### Prerequisites + +1. **Configure kubectl** + +```bash +# Set up kubeconfig +export KUBECONFIG=/path/to/your/kubeconfig + +# Verify connection +kubectl cluster-info +kubectl get nodes +``` + +2. 
**Create Namespace** + +```bash +kubectl apply -f k8s/langgraph-deployment.yaml +# This creates the rag7 namespace +``` + +3. **Configure Secrets** + +Update the secrets in `k8s/langgraph-deployment.yaml` or create them via kubectl: + +```bash +# Create secrets from literal values +kubectl create secret generic langgraph-secrets \ + --namespace=rag7 \ + --from-literal=POSTGRES_PASSWORD='your-secure-password' \ + --from-literal=REDIS_PASSWORD='your-redis-password' \ + --from-literal=SECRET_KEY='your-secret-key' \ + --from-literal=API_KEY_SALT='your-api-salt' \ + --from-literal=OPENAI_API_KEY='sk-your-key' \ + --dry-run=client -o yaml | kubectl apply -f - +``` + +4. **Create Image Pull Secret** (for GHCR) + +```bash +kubectl create secret docker-registry ghcr-secret \ + --namespace=rag7 \ + --docker-server=ghcr.io \ + --docker-username=YOUR_GITHUB_USERNAME \ + --docker-password=YOUR_GITHUB_TOKEN \ + --docker-email=YOUR_EMAIL +``` + +#### Deploy to Kubernetes + +```bash +# Apply all manifests +kubectl apply -f k8s/langgraph-deployment.yaml +kubectl apply -f k8s/hpa.yaml + +# Check deployment status +kubectl get deployments -n rag7 +kubectl get pods -n rag7 +kubectl get services -n rag7 + +# Watch rollout +kubectl rollout status deployment/langgraph -n rag7 +``` + +#### Verify Kubernetes Deployment + +```bash +# Port-forward to test locally +kubectl port-forward -n rag7 svc/langgraph 8123:8123 + +# In another terminal, test endpoints +curl http://localhost:8123/health +curl http://localhost:8123/ready +``` + +#### Expose Service (Optional) + +Using an Ingress controller: + +```yaml +# ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: langgraph-ingress + namespace: rag7 + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod +spec: + ingressClassName: nginx + tls: + - hosts: + - api.yourdomain.com + secretName: langgraph-tls + rules: + - host: api.yourdomain.com + http: + paths: + - path: / + pathType: Prefix + backend: + 
service: + name: langgraph + port: + number: 8123 +``` + +```bash +kubectl apply -f ingress.yaml +``` + +## Post-Deployment + +### Database Migrations + +Run any necessary database migrations: + +```bash +# Docker Compose +docker-compose -f docker-compose.prod.yml exec langgraph python -m alembic upgrade head + +# Kubernetes +kubectl exec -n rag7 -it deployment/langgraph -- python -m alembic upgrade head +``` + +### Monitoring Setup + +1. **Check Metrics Endpoint** + +```bash +curl http://localhost:9090/metrics +``` + +2. **Configure Prometheus** (if using) + +Add scraping configuration for the metrics endpoint. + +3. **Set Up Alerts** + +Configure alerting rules for critical metrics. + +### Backup Configuration + +1. **Database Backups** + +```bash +# Set up automated PostgreSQL backups +kubectl create cronjob postgres-backup \ + --image=postgres:15-alpine \ + --schedule="0 2 * * *" \ + --restart=Never \ + -- pg_dump -h postgres.rag7.svc.cluster.local -U langgraph > /backup/db.sql +``` + +2. **Configuration Backups** + +Store ConfigMaps and Secrets in version control (encrypted). 
+ +## Troubleshooting + +### Common Issues + +#### Pods Not Starting + +```bash +# Check pod status +kubectl get pods -n rag7 + +# View pod logs +kubectl logs -n rag7 deployment/langgraph --tail=100 + +# Describe pod for events +kubectl describe pod -n rag7 POD_NAME +``` + +#### Connection Issues + +```bash +# Test database connectivity +kubectl run -n rag7 -it --rm debug --image=postgres:15-alpine --restart=Never \ + -- psql -h postgres.rag7.svc.cluster.local -U langgraph -d langgraph_checkpoints + +# Test Redis connectivity +kubectl run -n rag7 -it --rm debug --image=redis:7-alpine --restart=Never \ + -- redis-cli -h redis.rag7.svc.cluster.local -a PASSWORD ping +``` + +#### Image Pull Errors + +```bash +# Verify image pull secret +kubectl get secret ghcr-secret -n rag7 -o yaml + +# Test image pull manually +docker pull ghcr.io/stacey77/rag7:latest +``` + +#### Resource Constraints + +```bash +# Check resource usage +kubectl top pods -n rag7 + +# Check HPA status +kubectl get hpa -n rag7 + +# Describe HPA for details +kubectl describe hpa langgraph-hpa -n rag7 +``` + +### Logs and Debugging + +```bash +# Stream all logs +kubectl logs -n rag7 -l app=langgraph --tail=100 -f + +# Get logs from specific pod +kubectl logs -n rag7 POD_NAME --tail=200 + +# Get previous crashed container logs +kubectl logs -n rag7 POD_NAME --previous +``` + +### Rollback Deployment + +```bash +# View rollout history +kubectl rollout history deployment/langgraph -n rag7 + +# Rollback to previous version +kubectl rollout undo deployment/langgraph -n rag7 + +# Rollback to specific revision +kubectl rollout undo deployment/langgraph -n rag7 --to-revision=2 +``` + +## Next Steps + +- Configure observability (see [observability.md](./observability.md)) +- Set up alerting and on-call rotation (see [runbook.md](./runbook.md)) +- Import n8n workflows (see [../n8n/README.md](../n8n/README.md)) +- Configure CI/CD pipelines +- Set up backup and disaster recovery procedures diff --git 
a/docs/observability.md b/docs/observability.md new file mode 100644 index 0000000..a0c3051 --- /dev/null +++ b/docs/observability.md @@ -0,0 +1,454 @@ +# Observability Guide + +This guide covers monitoring, logging, and tracing for the RAG7 LangGraph application. + +## Table of Contents + +- [Metrics](#metrics) +- [Logging](#logging) +- [Tracing](#tracing) +- [Dashboards](#dashboards) +- [Alerting](#alerting) + +## Metrics + +### Prometheus Metrics Endpoint + +The application exposes Prometheus-compatible metrics at `/metrics` on port 9090. + +#### Key Metrics to Monitor + +**Application Metrics:** +- `http_requests_total` - Total HTTP requests by endpoint and status +- `http_request_duration_seconds` - Request duration histogram +- `langgraph_execution_duration_seconds` - Graph execution time +- `langgraph_errors_total` - Total graph execution errors +- `active_graph_executions` - Currently running graph executions + +**System Metrics:** +- `process_cpu_usage` - CPU usage percentage +- `process_memory_bytes` - Memory usage in bytes +- `process_open_fds` - Open file descriptors + +**Database Metrics:** +- `db_connections_active` - Active database connections +- `db_query_duration_seconds` - Database query duration +- `db_errors_total` - Database errors + +**Cache Metrics:** +- `cache_hits_total` - Cache hit count +- `cache_misses_total` - Cache miss count +- `cache_evictions_total` - Cache eviction count + +### Scraping Configuration + +Add to your Prometheus configuration: + +```yaml +scrape_configs: + - job_name: 'langgraph' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - rag7 + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + 
action: replace + target_label: __metrics_path__ + regex: (.+) +``` + +### Example Queries + +```promql +# Request rate per second +rate(http_requests_total[5m]) + +# 99th percentile latency +histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) + +# Error rate +rate(langgraph_errors_total[5m]) / rate(http_requests_total[5m]) + +# Memory usage over time +process_memory_bytes + +# Cache hit rate +rate(cache_hits_total[5m]) / (rate(cache_hits_total[5m]) + rate(cache_misses_total[5m])) +``` + +## Logging + +### Log Levels + +The application supports the following log levels: +- `DEBUG` - Detailed diagnostic information +- `INFO` - General informational messages +- `WARNING` - Warning messages for potentially harmful situations +- `ERROR` - Error events that might still allow the app to continue +- `CRITICAL` - Critical events that may cause the app to abort + +Set log level via environment variable: +```bash +LOG_LEVEL=INFO +``` + +### Log Format + +Logs are output in JSON format for easy parsing: + +```json +{ + "timestamp": "2024-01-15T10:30:45.123Z", + "level": "INFO", + "logger": "langgraph.api", + "message": "Graph execution completed", + "request_id": "req_abc123", + "graph_id": "graph_xyz789", + "duration_ms": 1234, + "user_id": "user_456" +} +``` + +### Accessing Logs + +#### Docker Compose + +```bash +# View logs from all services +docker-compose -f docker-compose.prod.yml logs -f + +# View logs from specific service +docker-compose -f docker-compose.prod.yml logs -f langgraph + +# View last 100 lines +docker-compose -f docker-compose.prod.yml logs --tail=100 langgraph +``` + +#### Kubernetes + +```bash +# View logs from all pods +kubectl logs -n rag7 -l app=langgraph --tail=100 -f + +# View logs from specific pod +kubectl logs -n rag7 POD_NAME --tail=200 -f + +# View logs from all containers in pod +kubectl logs -n rag7 POD_NAME --all-containers=true +``` + +### Centralized Logging + +#### ELK Stack (Elasticsearch, Logstash, Kibana) + 
+Configure Filebeat to ship logs to Elasticsearch: + +```yaml +# filebeat.yml +filebeat.inputs: + - type: container + paths: + - '/var/lib/docker/containers/*/*.log' + processors: + - add_kubernetes_metadata: + host: ${NODE_NAME} + matchers: + - logs_path: + logs_path: "/var/lib/docker/containers/" + +output.elasticsearch: + hosts: ["elasticsearch:9200"] + index: "langgraph-logs-%{+yyyy.MM.dd}" +``` + +#### Loki (Grafana Loki) + +Deploy Promtail to collect and forward logs: + +```yaml +# promtail-config.yaml +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + target_label: pod +``` + +## Tracing + +### OpenTelemetry Integration + +The application supports OpenTelemetry for distributed tracing. + +#### Configuration + +Set environment variables: + +```bash +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +JAEGER_AGENT_HOST=jaeger +JAEGER_AGENT_PORT=6831 +``` + +#### Trace Context + +Each request includes trace context propagation: +- `traceparent` - W3C Trace Context +- `tracestate` - Additional trace state + +#### Trace Attributes + +Custom attributes added to spans: +- `graph.id` - LangGraph graph identifier +- `graph.execution.id` - Unique execution ID +- `user.id` - User identifier +- `llm.provider` - LLM provider (OpenAI, Anthropic, etc.) 
+- `llm.model` - Model name +- `llm.tokens` - Token count + +### Jaeger UI + +Access Jaeger UI to view traces: + +```bash +# Port forward Jaeger UI +kubectl port-forward -n observability svc/jaeger-query 16686:16686 + +# Open in browser +open http://localhost:16686 +``` + +### Trace Analysis + +Use traces to identify: +- Slow operations and bottlenecks +- Service dependencies +- Error propagation +- Concurrent execution patterns + +## Dashboards + +### Grafana Dashboard + +Import the pre-configured Grafana dashboard: + +```bash +# TODO: Create and include dashboard JSON +# grafana-dashboard.json +``` + +#### Key Panels + +1. **Overview** + - Request rate (requests/sec) + - Error rate (%) + - 95th/99th percentile latency + - Active executions + +2. **Performance** + - Request duration distribution + - Graph execution time + - Database query time + - Cache hit rate + +3. **Resources** + - CPU usage + - Memory usage + - Network I/O + - Disk I/O + +4. **Errors** + - Error count by type + - Error rate trend + - Failed executions + +### Custom Dashboards + +Create custom dashboards using Grafana or your preferred tool. 
+ +#### Example: Request Rate Dashboard + +```json +{ + "title": "Request Rate", + "targets": [ + { + "expr": "rate(http_requests_total{namespace=\"rag7\"}[5m])", + "legendFormat": "{{method}} {{endpoint}}" + } + ] +} +``` + +## Alerting + +### Alert Rules + +Configure alerts for critical conditions: + +#### High Error Rate + +```yaml +groups: + - name: langgraph_alerts + interval: 30s + rules: + - alert: HighErrorRate + expr: rate(langgraph_errors_total[5m]) / rate(http_requests_total[5m]) > 0.05 + for: 5m + labels: + severity: warning + annotations: + summary: "High error rate detected" + description: "Error rate is {{ $value | humanizePercentage }} (threshold: 5%)" +``` + +#### High Latency + +```yaml + - alert: HighLatency + expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "High latency detected" + description: "99th percentile latency is {{ $value }}s (threshold: 5s)" +``` + +#### Pod Restart + +```yaml + - alert: PodRestarting + expr: rate(kube_pod_container_status_restarts_total{namespace="rag7"}[15m]) > 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Pod {{ $labels.pod }} is restarting" + description: "Pod has restarted {{ $value }} times in the last 15 minutes" +``` + +#### Resource Exhaustion + +```yaml + - alert: HighMemoryUsage + expr: container_memory_usage_bytes{namespace="rag7"} / container_spec_memory_limit_bytes > 0.9 + for: 5m + labels: + severity: warning + annotations: + summary: "High memory usage detected" + description: "Memory usage is {{ $value | humanizePercentage }}" +``` + +### Alert Channels + +Configure notification channels: + +#### Slack + +```yaml +receivers: + - name: 'slack-notifications' + slack_configs: + - api_url: 'YOUR_SLACK_WEBHOOK_URL' + channel: '#alerts' + title: 'Alert: {{ .GroupLabels.alertname }}' + text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}' +``` + +#### PagerDuty + +```yaml 
+receivers: + - name: 'pagerduty' + pagerduty_configs: + - service_key: 'YOUR_PAGERDUTY_KEY' + description: '{{ .GroupLabels.alertname }}: {{ .CommonAnnotations.summary }}' +``` + +#### Email + +```yaml +receivers: + - name: 'email' + email_configs: + - to: 'oncall@example.com' + from: 'alerts@example.com' + smarthost: 'smtp.example.com:587' + auth_username: 'alerts@example.com' + auth_password: 'YOUR_SMTP_PASSWORD' +``` + +## Health Checks + +### Endpoints + +- `GET /health` - Basic health check (liveness probe) +- `GET /ready` - Readiness check (includes dependencies) +- `GET /metrics` - Prometheus metrics + +### Example Health Check Response + +```json +{ + "status": "healthy", + "timestamp": "2024-01-15T10:30:45.123Z", + "checks": { + "database": "ok", + "redis": "ok", + "llm_provider": "ok" + }, + "version": "1.0.0", + "uptime_seconds": 86400 +} +``` + +### Curl Commands + +```bash +# Check health +curl -i http://localhost:8123/health + +# Check readiness +curl -i http://localhost:8123/ready + +# Fetch metrics +curl http://localhost:9090/metrics +``` + +## Best Practices + +1. **Set up alerts before issues occur** - Don't wait for production incidents +2. **Monitor the full stack** - Application, infrastructure, and dependencies +3. **Use structured logging** - Enables better searching and analysis +4. **Implement distributed tracing** - Essential for debugging microservices +5. **Create runbooks** - Document response procedures (see [runbook.md](./runbook.md)) +6. **Regular review** - Periodically review dashboards and alerts +7. 
**Load testing** - Test observability under realistic load conditions + +## Next Steps + +- Set up Prometheus and Grafana +- Configure alert notification channels +- Create custom dashboards for your use case +- Implement distributed tracing with Jaeger or Zipkin +- Review and test incident response procedures diff --git a/docs/runbook.md b/docs/runbook.md new file mode 100644 index 0000000..ee35bde --- /dev/null +++ b/docs/runbook.md @@ -0,0 +1,557 @@ +# Operations Runbook + +This runbook provides step-by-step procedures for common operational tasks and incident response. + +## Table of Contents + +- [Emergency Contacts](#emergency-contacts) +- [Common Incidents](#common-incidents) +- [Operational Procedures](#operational-procedures) +- [Maintenance Tasks](#maintenance-tasks) +- [Recovery Procedures](#recovery-procedures) + +## Emergency Contacts + +### On-Call Schedule + +| Role | Primary | Secondary | +|------|---------|-----------| +| Engineering | TODO: Add | TODO: Add | +| DevOps | TODO: Add | TODO: Add | +| Manager | TODO: Add | TODO: Add | + +### Escalation Path + +1. On-call engineer (15 min response time) +2. Secondary on-call (30 min response time) +3. Engineering manager (1 hour response time) + +### Communication Channels + +- **Slack**: #incidents (for incident coordination) +- **PagerDuty**: For critical alerts +- **Status Page**: TODO: Add URL + +## Common Incidents + +### High Error Rate + +**Symptoms:** +- Alert: "HighErrorRate" firing +- Increased 5xx responses +- User reports of failures + +**Diagnosis:** + +```bash +# Check error logs +kubectl logs -n rag7 -l app=langgraph --tail=100 | grep ERROR + +# Check error metrics +curl http://localhost:9090/metrics | grep error + +# Check recent deployments +kubectl rollout history deployment/langgraph -n rag7 +``` + +**Resolution Steps:** + +1. 
**Identify error type** + ```bash + # Group errors by type + kubectl logs -n rag7 -l app=langgraph | grep ERROR | cut -d' ' -f5- | sort | uniq -c | sort -rn + ``` + +2. **Check dependencies** + ```bash + # Test database connection + kubectl run -n rag7 -it --rm debug --image=postgres:15-alpine --restart=Never \ + -- psql -h postgres.rag7.svc.cluster.local -U langgraph -c "SELECT 1" + + # Test Redis connection + kubectl run -n rag7 -it --rm debug --image=redis:7-alpine --restart=Never \ + -- redis-cli -h redis.rag7.svc.cluster.local ping + ``` + +3. **Rollback if recent deployment** + ```bash + kubectl rollout undo deployment/langgraph -n rag7 + ``` + +4. **Scale up if capacity issue** + ```bash + kubectl scale deployment/langgraph -n rag7 --replicas=5 + ``` + +### High Latency + +**Symptoms:** +- Alert: "HighLatency" firing +- Slow response times +- Request timeouts + +**Diagnosis:** + +```bash +# Check request latency +curl http://localhost:9090/metrics | grep duration + +# Check resource usage +kubectl top pods -n rag7 + +# Check database slow queries +kubectl exec -n rag7 -it deployment/postgres -- \ + psql -U langgraph -c "SELECT query, calls, mean_exec_time FROM pg_stat_statements ORDER BY mean_exec_time DESC LIMIT 10" +``` + +**Resolution Steps:** + +1. **Check for resource constraints** + ```bash + kubectl describe pod -n rag7 POD_NAME | grep -A 5 "Limits\|Requests" + ``` + +2. **Scale horizontally** + ```bash + kubectl scale deployment/langgraph -n rag7 --replicas=8 + ``` + +3. **Check for memory leaks** + ```bash + kubectl top pods -n rag7 --sort-by=memory + ``` + +4. 
**Restart pods if necessary** + ```bash + kubectl rollout restart deployment/langgraph -n rag7 + ``` + +### Pod Crash Loop + +**Symptoms:** +- Alert: "PodRestarting" firing +- Pods in CrashLoopBackOff state +- Service degradation + +**Diagnosis:** + +```bash +# Check pod status +kubectl get pods -n rag7 + +# View recent logs +kubectl logs -n rag7 POD_NAME --tail=100 + +# View previous container logs +kubectl logs -n rag7 POD_NAME --previous + +# Describe pod for events +kubectl describe pod -n rag7 POD_NAME +``` + +**Resolution Steps:** + +1. **Check for configuration issues** + ```bash + kubectl get configmap langgraph-config -n rag7 -o yaml + kubectl get secret langgraph-secrets -n rag7 -o yaml + ``` + +2. **Verify image** + ```bash + kubectl get deployment langgraph -n rag7 -o jsonpath='{.spec.template.spec.containers[0].image}' + ``` + +3. **Check resource limits** + ```bash + kubectl describe pod -n rag7 POD_NAME | grep -A 5 "Limits" + ``` + +4. **Fix and redeploy** + ```bash + # Update configuration + kubectl edit configmap langgraph-config -n rag7 + + # Restart deployment + kubectl rollout restart deployment/langgraph -n rag7 + ``` + +### Database Connection Issues + +**Symptoms:** +- Database connection errors in logs +- "connection refused" or "connection timeout" +- Service unavailable + +**Diagnosis:** + +```bash +# Check PostgreSQL pod status +kubectl get pods -n rag7 -l app=postgres + +# Check PostgreSQL logs +kubectl logs -n rag7 -l app=postgres --tail=100 + +# Test connection from application pod +kubectl exec -n rag7 -it deployment/langgraph -- \ + python -c "import psycopg2; conn = psycopg2.connect('postgresql://langgraph:PASSWORD@postgres.rag7.svc.cluster.local/langgraph_checkpoints'); print('Connected')" +``` + +**Resolution Steps:** + +1. **Check database is running** + ```bash + kubectl get statefulset postgres -n rag7 + ``` + +2. 
**Check connection limits** + ```bash + kubectl exec -n rag7 -it statefulset/postgres -- \ + psql -U langgraph -c "SHOW max_connections" + + kubectl exec -n rag7 -it statefulset/postgres -- \ + psql -U langgraph -c "SELECT count(*) FROM pg_stat_activity" + ``` + +3. **Restart PostgreSQL if needed** + ```bash + kubectl delete pod -n rag7 -l app=postgres + ``` + +4. **Check for disk space issues** + ```bash + kubectl exec -n rag7 -it statefulset/postgres -- df -h + ``` + +### Out of Memory (OOM) + +**Symptoms:** +- Pods killed by OOMKiller +- Memory usage at 100% +- Frequent restarts + +**Diagnosis:** + +```bash +# Check memory usage +kubectl top pods -n rag7 + +# Check OOM events +kubectl get events -n rag7 --sort-by='.lastTimestamp' | grep OOM + +# Check memory limits +kubectl describe pod -n rag7 POD_NAME | grep -A 3 "Limits" +``` + +**Resolution Steps:** + +1. **Increase memory limits** + ```bash + kubectl edit deployment langgraph -n rag7 + # Update memory limits under resources + ``` + +2. **Check for memory leaks** + ```bash + # Monitor memory over time + kubectl top pods -n rag7 --watch + ``` + +3. **Scale horizontally instead** + ```bash + kubectl scale deployment/langgraph -n rag7 --replicas=6 + ``` + +### API Rate Limiting + +**Symptoms:** +- 429 Too Many Requests errors +- LLM provider rate limit errors +- Requests being throttled + +**Diagnosis:** + +```bash +# Check rate limit errors +kubectl logs -n rag7 -l app=langgraph | grep "rate limit" + +# Check metrics +curl http://localhost:9090/metrics | grep rate_limit +``` + +**Resolution Steps:** + +1. **Implement exponential backoff** (code change required) + +2. **Distribute load across multiple API keys** + ```bash + kubectl edit secret langgraph-secrets -n rag7 + # Add additional API keys + ``` + +3. **Cache responses to reduce API calls** + ```bash + # Verify Redis is working + kubectl get pods -n rag7 -l app=redis + ``` + +4. 
**Contact provider for rate limit increase** + +## Operational Procedures + +### Deployment Procedure + +**Standard Deployment:** + +```bash +# 1. Review changes +git diff main..feature-branch + +# 2. Merge to main +git checkout main +git merge feature-branch + +# 3. Tag release +git tag -a v1.0.1 -m "Release 1.0.1" +git push origin v1.0.1 + +# 4. Build and push image (CI does this automatically) +# CI will build and push ghcr.io/stacey77/rag7:v1.0.1 + +# 5. Update deployment +kubectl set image deployment/langgraph -n rag7 \ + langgraph=ghcr.io/stacey77/rag7:v1.0.1 + +# 6. Monitor rollout +kubectl rollout status deployment/langgraph -n rag7 + +# 7. Verify deployment +curl http://API_ENDPOINT/health +``` + +**Hotfix Deployment:** + +```bash +# 1. Create hotfix branch +git checkout -b hotfix/critical-fix main + +# 2. Make minimal fix +# ... edit files ... + +# 3. Test locally +docker build -t rag7:hotfix . +docker run -p 8123:8123 rag7:hotfix + +# 4. Deploy directly +git commit -am "Hotfix: description" +git push origin hotfix/critical-fix + +# 5. Trigger CD pipeline or deploy manually +kubectl set image deployment/langgraph -n rag7 \ + langgraph=ghcr.io/stacey77/rag7:hotfix-critical-fix + +# 6. Monitor closely +kubectl logs -n rag7 -l app=langgraph -f +``` + +### Rollback Procedure + +```bash +# 1. List rollout history +kubectl rollout history deployment/langgraph -n rag7 + +# 2. Rollback to previous version +kubectl rollout undo deployment/langgraph -n rag7 + +# 3. Or rollback to specific revision +kubectl rollout undo deployment/langgraph -n rag7 --to-revision=5 + +# 4. Monitor rollback +kubectl rollout status deployment/langgraph -n rag7 + +# 5. 
Verify service health +curl http://API_ENDPOINT/health +``` + +### Scaling Procedure + +**Manual Scaling:** + +```bash +# Scale up +kubectl scale deployment/langgraph -n rag7 --replicas=8 + +# Scale down +kubectl scale deployment/langgraph -n rag7 --replicas=2 + +# Verify +kubectl get pods -n rag7 -l app=langgraph +``` + +**Adjust HPA:** + +```bash +# Update HPA limits +kubectl edit hpa langgraph-hpa -n rag7 + +# Check HPA status +kubectl get hpa -n rag7 +kubectl describe hpa langgraph-hpa -n rag7 +``` + +### Backup Procedure + +**Database Backup:** + +```bash +# 1. Create backup +kubectl exec -n rag7 statefulset/postgres -- \ + pg_dump -U langgraph langgraph_checkpoints > backup_$(date +%Y%m%d_%H%M%S).sql + +# 2. Compress +gzip backup_*.sql + +# 3. Upload to storage +aws s3 cp backup_*.sql.gz s3://rag7-backups/$(date +%Y/%m/%d)/ + +# 4. Verify backup +gunzip -c backup_*.sql.gz | head -n 20 +``` + +**Configuration Backup:** + +```bash +# Export all configurations +kubectl get all,configmap,secret -n rag7 -o yaml > rag7_backup_$(date +%Y%m%d).yaml + +# Store securely +gpg -c rag7_backup_$(date +%Y%m%d).yaml +``` + +## Maintenance Tasks + +### Certificate Renewal + +```bash +# Check certificate expiration +kubectl get certificate -n rag7 + +# Renew certificate (cert-manager does this automatically) +kubectl describe certificate langgraph-tls -n rag7 + +# Manual renewal if needed +kubectl delete secret langgraph-tls -n rag7 +# cert-manager will recreate +``` + +### Database Maintenance + +```bash +# Vacuum database +kubectl exec -n rag7 statefulset/postgres -- \ + psql -U langgraph -c "VACUUM ANALYZE" + +# Check database size +kubectl exec -n rag7 statefulset/postgres -- \ + psql -U langgraph -c "SELECT pg_size_pretty(pg_database_size('langgraph_checkpoints'))" + +# Clean old checkpoints (if applicable) +kubectl exec -n rag7 statefulset/postgres -- \ + psql -U langgraph -c "DELETE FROM checkpoints WHERE created_at < NOW() - INTERVAL '30 days'" +``` + +### Log Rotation 
+ +```bash +# Check log volume sizes +kubectl exec -n rag7 POD_NAME -- du -sh /var/log + +# Logs are automatically rotated by Kubernetes +# Configure retention in logging backend (ELK, Loki, etc.) +``` + +## Recovery Procedures + +### Disaster Recovery + +**Complete Cluster Failure:** + +1. **Provision new cluster** +2. **Restore configurations** + ```bash + kubectl apply -f rag7_backup_YYYYMMDD.yaml + ``` +3. **Restore database** + ```bash + kubectl exec -n rag7 -it statefulset/postgres -- \ + psql -U langgraph langgraph_checkpoints < backup_YYYYMMDD.sql + ``` +4. **Verify services** +5. **Update DNS/Load Balancer** + +### Data Corruption + +```bash +# 1. Stop application +kubectl scale deployment/langgraph -n rag7 --replicas=0 + +# 2. Restore database from backup +kubectl exec -n rag7 -it statefulset/postgres -- \ + dropdb -U langgraph langgraph_checkpoints +kubectl exec -n rag7 -it statefulset/postgres -- \ + createdb -U langgraph langgraph_checkpoints +kubectl exec -n rag7 -it statefulset/postgres -- \ + psql -U langgraph langgraph_checkpoints < backup_YYYYMMDD.sql + +# 3. Restart application +kubectl scale deployment/langgraph -n rag7 --replicas=2 + +# 4. 
Verify data integrity +# Run validation queries +``` + +## Appendix + +### Useful Commands + +```bash +# Get cluster info +kubectl cluster-info + +# Get all resources in namespace +kubectl get all -n rag7 + +# Check resource usage +kubectl top nodes +kubectl top pods -n rag7 + +# Port forward for local testing +kubectl port-forward -n rag7 svc/langgraph 8123:8123 + +# Execute command in pod +kubectl exec -n rag7 -it POD_NAME -- /bin/bash + +# Copy files from pod +kubectl cp rag7/POD_NAME:/path/to/file ./local/path + +# View events +kubectl get events -n rag7 --sort-by='.lastTimestamp' +``` + +### Monitoring Dashboards + +- **Grafana**: TODO: Add URL +- **Prometheus**: TODO: Add URL +- **Jaeger**: TODO: Add URL +- **Kibana/Loki**: TODO: Add URL + +### Documentation Links + +- [Deployment Guide](./deployment.md) +- [Observability Guide](./observability.md) +- [n8n Workflows](../n8n/README.md) +- [API Documentation](TODO) + +--- + +**Remember:** Always follow the change management process and communicate with the team during incidents. diff --git a/integration/api/Dockerfile b/integration/api/Dockerfile new file mode 100644 index 0000000..065c2ca --- /dev/null +++ b/integration/api/Dockerfile @@ -0,0 +1,39 @@ +FROM python:3.11-slim + +LABEL org.opencontainers.image.source=https://github.com/Stacey77/rag7 +LABEL org.opencontainers.image.description="RAG7 Integration API" +LABEL org.opencontainers.image.licenses=MIT + +# Set working directory +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + gcc \ + g++ \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements first for better caching +COPY requirements.txt . + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY server.py . 
+ +# Create non-root user +RUN useradd -m -u 1000 appuser && \ + chown -R appuser:appuser /app +USER appuser + +# Expose port +EXPOSE 8000 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \ + CMD curl -f http://localhost:8000/health || exit 1 + +# Run the application +CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/integration/api/requirements.txt b/integration/api/requirements.txt new file mode 100644 index 0000000..f4813f6 --- /dev/null +++ b/integration/api/requirements.txt @@ -0,0 +1,40 @@ +# Integration API Requirements +# Python dependencies for the FastAPI integration layer + +# Web Framework +fastapi==0.104.1 +uvicorn[standard]==0.24.0 + +# HTTP Client +httpx==0.25.1 + +# Data Validation +pydantic==2.5.0 +pydantic-settings==2.1.0 + +# Utilities +python-multipart==0.0.6 +python-dotenv==1.0.0 + +# Optional: Authentication & Security +python-jose[cryptography]==3.3.0 +passlib[bcrypt]==1.7.4 + +# Optional: Caching +redis==5.0.1 +hiredis==2.2.3 + +# Optional: Database +psycopg2-binary==2.9.9 +sqlalchemy==2.0.23 + +# Optional: Observability +prometheus-client==0.19.0 +opentelemetry-api==1.21.0 +opentelemetry-sdk==1.21.0 +opentelemetry-instrumentation-fastapi==0.42b0 + +# Development dependencies (optional) +# pytest==7.4.3 +# pytest-asyncio==0.21.1 +# httpx==0.25.1 # for testing diff --git a/integration/api/server.py b/integration/api/server.py new file mode 100644 index 0000000..0d474ce --- /dev/null +++ b/integration/api/server.py @@ -0,0 +1,352 @@ +""" +RAG7 Integration API Server + +FastAPI-based integration layer for LangGraph orchestration. +Provides REST endpoints for graph execution, health checks, and monitoring. 
+""" + +import os +import logging +import time +from typing import Dict, Any, Optional +from datetime import datetime + +from fastapi import FastAPI, HTTPException, Request, status +from fastapi.responses import JSONResponse +from fastapi.middleware.cors import CORSMiddleware +from fastapi.middleware.gzip import GZipMiddleware +from pydantic import BaseModel, Field +import httpx + +# Configure logging +logging.basicConfig( + level=os.getenv("LOG_LEVEL", "INFO"), + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + +# Configuration +LANGGRAPH_API_URL = os.getenv("LANGGRAPH_API_URL", "http://langgraph:8123") +API_TIMEOUT = int(os.getenv("API_TIMEOUT", "60")) +ALLOWED_ORIGINS = os.getenv("ALLOWED_ORIGINS", "*").split(",") + +# Initialize FastAPI app +app = FastAPI( + title="RAG7 Integration API", + description="Integration layer for LangGraph-based RAG system", + version="1.0.0", + docs_url="/docs", + redoc_url="/redoc", +) + +# Middleware +app.add_middleware( + CORSMiddleware, + allow_origins=ALLOWED_ORIGINS, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) +app.add_middleware(GZipMiddleware, minimum_size=1000) + +# Request/Response Models +class GraphInput(BaseModel): + """Input schema for graph execution""" + query: str = Field(..., description="User query or prompt", min_length=1) + user_id: Optional[str] = Field(None, description="User identifier") + session_id: Optional[str] = Field(None, description="Session identifier") + config: Optional[Dict[str, Any]] = Field(default_factory=dict, description="Graph configuration") + metadata: Optional[Dict[str, Any]] = Field(default_factory=dict, description="Additional metadata") + + +class GraphOutput(BaseModel): + """Output schema for graph execution""" + status: str = Field(..., description="Execution status") + result: Optional[Dict[str, Any]] = Field(None, description="Graph output") + execution_id: Optional[str] = Field(None, 
description="Execution identifier") + duration_ms: Optional[int] = Field(None, description="Execution duration in milliseconds") + error: Optional[str] = Field(None, description="Error message if failed") + + +class HealthResponse(BaseModel): + """Health check response""" + status: str = Field(..., description="Health status") + timestamp: str = Field(..., description="Current timestamp") + version: str = Field(..., description="API version") + uptime_seconds: int = Field(..., description="Uptime in seconds") + + +class ReadyResponse(BaseModel): + """Readiness check response""" + ready: bool = Field(..., description="Readiness status") + checks: Dict[str, str] = Field(..., description="Component health checks") + timestamp: str = Field(..., description="Current timestamp") + + +# Global state +startup_time = time.time() + + +# Middleware for request logging +@app.middleware("http") +async def log_requests(request: Request, call_next): + """Log all requests with timing""" + start_time = time.time() + + # Generate request ID + request_id = f"req_{int(start_time * 1000)}" + + logger.info(f"Request {request_id}: {request.method} {request.url.path}") + + try: + response = await call_next(request) + duration = (time.time() - start_time) * 1000 + + logger.info( + f"Request {request_id} completed: " + f"status={response.status_code} duration={duration:.2f}ms" + ) + + return response + except Exception as e: + duration = (time.time() - start_time) * 1000 + logger.error( + f"Request {request_id} failed: {str(e)} duration={duration:.2f}ms" + ) + raise + + +# Health check endpoints +@app.get("/health", response_model=HealthResponse, tags=["Health"]) +async def health_check(): + """ + Basic health check endpoint (liveness probe). + + Returns basic service status without checking dependencies. 
+ """ + return HealthResponse( + status="healthy", + timestamp=datetime.utcnow().isoformat(), + version="1.0.0", + uptime_seconds=int(time.time() - startup_time) + ) + + +@app.get("/ready", response_model=ReadyResponse, tags=["Health"]) +async def readiness_check(): + """ + Readiness check endpoint. + + Checks if the service and its dependencies are ready to handle requests. + """ + checks = {} + ready = True + + # Check LangGraph API + try: + async with httpx.AsyncClient(timeout=5.0) as client: + response = await client.get(f"{LANGGRAPH_API_URL}/health") + if response.status_code == 200: + checks["langgraph"] = "ok" + else: + checks["langgraph"] = f"unhealthy (status {response.status_code})" + ready = False + except Exception as e: + checks["langgraph"] = f"error: {str(e)}" + ready = False + + # TODO: Add checks for other dependencies (database, Redis, etc.) + + return ReadyResponse( + ready=ready, + checks=checks, + timestamp=datetime.utcnow().isoformat() + ) + + +# Graph execution endpoint +@app.post("/v1/graph/run", response_model=GraphOutput, tags=["Graph"]) +async def run_graph(input_data: GraphInput): + """ + Execute a LangGraph workflow. + + This endpoint triggers a graph execution with the provided input + and returns the result. + + Args: + input_data: Graph input including query, user_id, session_id, etc. + + Returns: + GraphOutput: Execution result including status, output, and metadata. + + Raises: + HTTPException: If execution fails or times out. 
+ """ + start_time = time.time() + execution_id = f"exec_{int(start_time * 1000)}" + + logger.info(f"Starting graph execution {execution_id}") + + try: + # Prepare request to LangGraph API + payload = { + "input": { + "query": input_data.query, + "user_id": input_data.user_id or "anonymous", + "session_id": input_data.session_id or execution_id, + **input_data.metadata + }, + "config": { + "configurable": { + "thread_id": input_data.session_id or execution_id + }, + **input_data.config + } + } + + # Execute graph via LangGraph API + async with httpx.AsyncClient(timeout=API_TIMEOUT) as client: + response = await client.post( + f"{LANGGRAPH_API_URL}/v1/graph/run", + json=payload, + headers={ + "Content-Type": "application/json", + # TODO: Add authentication header if required + # "X-API-Key": os.getenv("LANGGRAPH_API_KEY", "") + } + ) + + duration_ms = int((time.time() - start_time) * 1000) + + if response.status_code == 200: + result = response.json() + logger.info(f"Graph execution {execution_id} completed in {duration_ms}ms") + + return GraphOutput( + status="success", + result=result, + execution_id=execution_id, + duration_ms=duration_ms + ) + else: + logger.error( + f"Graph execution {execution_id} failed: " + f"status={response.status_code} response={response.text}" + ) + raise HTTPException( + status_code=response.status_code, + detail=f"Graph execution failed: {response.text}" + ) + + except httpx.TimeoutException: + duration_ms = int((time.time() - start_time) * 1000) + logger.error(f"Graph execution {execution_id} timed out after {duration_ms}ms") + raise HTTPException( + status_code=status.HTTP_504_GATEWAY_TIMEOUT, + detail="Graph execution timed out" + ) + + except httpx.RequestError as e: + duration_ms = int((time.time() - start_time) * 1000) + logger.error(f"Graph execution {execution_id} failed: {str(e)}") + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail=f"Failed to connect to LangGraph API: {str(e)}" + ) + + except 
Exception as e: + duration_ms = int((time.time() - start_time) * 1000) + logger.error(f"Graph execution {execution_id} error: {str(e)}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Internal server error: {str(e)}" + ) + + +# Additional endpoints can be added here +@app.get("/v1/graph/status/{execution_id}", tags=["Graph"]) +async def get_execution_status(execution_id: str): + """ + Get the status of a graph execution. + + TODO: Implement execution status tracking. + """ + # This would query the execution status from LangGraph or a database + raise HTTPException( + status_code=status.HTTP_501_NOT_IMPLEMENTED, + detail="Execution status tracking not yet implemented" + ) + + +@app.get("/v1/graph/history", tags=["Graph"]) +async def get_execution_history( + user_id: Optional[str] = None, + session_id: Optional[str] = None, + limit: int = 10 +): + """ + Get execution history for a user or session. + + TODO: Implement execution history retrieval. + """ + # This would query execution history from a database + raise HTTPException( + status_code=status.HTTP_501_NOT_IMPLEMENTED, + detail="Execution history not yet implemented" + ) + + +# Exception handlers +@app.exception_handler(HTTPException) +async def http_exception_handler(request: Request, exc: HTTPException): + """Handle HTTP exceptions""" + return JSONResponse( + status_code=exc.status_code, + content={ + "error": exc.detail, + "status_code": exc.status_code, + "timestamp": datetime.utcnow().isoformat() + } + ) + + +@app.exception_handler(Exception) +async def general_exception_handler(request: Request, exc: Exception): + """Handle general exceptions""" + logger.error(f"Unhandled exception: {str(exc)}", exc_info=True) + return JSONResponse( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + content={ + "error": "Internal server error", + "status_code": 500, + "timestamp": datetime.utcnow().isoformat() + } + ) + + +# Startup event +@app.on_event("startup") +async def 
startup_event(): + """Run on application startup""" + logger.info("Starting RAG7 Integration API") + logger.info(f"LangGraph API URL: {LANGGRAPH_API_URL}") + logger.info(f"API Timeout: {API_TIMEOUT}s") + + +# Shutdown event +@app.on_event("shutdown") +async def shutdown_event(): + """Run on application shutdown""" + logger.info("Shutting down RAG7 Integration API") + + +if __name__ == "__main__": + import uvicorn + + uvicorn.run( + "server:app", + host="0.0.0.0", + port=8000, + log_level="info", + reload=os.getenv("API_RELOAD", "false").lower() == "true" + ) diff --git a/k8s/hpa.yaml b/k8s/hpa.yaml new file mode 100644 index 0000000..ec4d2b8 --- /dev/null +++ b/k8s/hpa.yaml @@ -0,0 +1,61 @@ +--- +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: langgraph-hpa + namespace: rag7 + labels: + app: langgraph +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: langgraph + minReplicas: 2 + maxReplicas: 10 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 + behavior: + scaleDown: + stabilizationWindowSeconds: 300 + policies: + - type: Percent + value: 50 + periodSeconds: 60 + - type: Pods + value: 2 + periodSeconds: 60 + selectPolicy: Min + scaleUp: + stabilizationWindowSeconds: 0 + policies: + - type: Percent + value: 100 + periodSeconds: 30 + - type: Pods + value: 4 + periodSeconds: 30 + selectPolicy: Max + +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: langgraph-pdb + namespace: rag7 +spec: + minAvailable: 1 + selector: + matchLabels: + app: langgraph diff --git a/k8s/langgraph-deployment.yaml b/k8s/langgraph-deployment.yaml new file mode 100644 index 0000000..78bef64 --- /dev/null +++ b/k8s/langgraph-deployment.yaml @@ -0,0 +1,350 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: rag7 + labels: + name: rag7 + environment: 
production + +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: langgraph-config + namespace: rag7 +data: + APP_ENV: "production" + APP_DEBUG: "false" + LOG_LEVEL: "INFO" + API_HOST: "0.0.0.0" + API_PORT: "8123" + LANGGRAPH_STREAM_MODE: "values" + ENABLE_METRICS: "true" + METRICS_PORT: "9090" + +--- +apiVersion: v1 +kind: Secret +metadata: + name: langgraph-secrets + namespace: rag7 +type: Opaque +stringData: + # TODO: Replace these with actual base64-encoded secrets using kubectl create secret + POSTGRES_PASSWORD: "CHANGEME_SECURE_PASSWORD" + REDIS_PASSWORD: "CHANGEME_SECURE_PASSWORD" + SECRET_KEY: "CHANGEME_GENERATE_SECURE_KEY" + API_KEY_SALT: "CHANGEME_GENERATE_SECURE_SALT" + OPENAI_API_KEY: "sk-CHANGEME" + LANGCHAIN_API_KEY: "" + +--- +apiVersion: v1 +kind: Service +metadata: + name: langgraph + namespace: rag7 + labels: + app: langgraph +spec: + type: ClusterIP + ports: + - port: 8123 + targetPort: 8123 + protocol: TCP + name: http + - port: 9090 + targetPort: 9090 + protocol: TCP + name: metrics + selector: + app: langgraph + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: langgraph + namespace: rag7 + labels: + app: langgraph +spec: + replicas: 2 + selector: + matchLabels: + app: langgraph + template: + metadata: + labels: + app: langgraph + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9090" + prometheus.io/path: "/metrics" + spec: + containers: + - name: langgraph + image: ghcr.io/stacey77/rag7:latest + imagePullPolicy: Always + ports: + - containerPort: 8123 + name: http + protocol: TCP + - containerPort: 9090 + name: metrics + protocol: TCP + env: + - name: APP_ENV + valueFrom: + configMapKeyRef: + name: langgraph-config + key: APP_ENV + - name: LOG_LEVEL + valueFrom: + configMapKeyRef: + name: langgraph-config + key: LOG_LEVEL + - name: API_HOST + valueFrom: + configMapKeyRef: + name: langgraph-config + key: API_HOST + - name: API_PORT + valueFrom: + configMapKeyRef: + name: langgraph-config + key: API_PORT + - 
name: POSTGRES_HOST + value: "postgres.rag7.svc.cluster.local" + - name: POSTGRES_PORT + value: "5432" + - name: POSTGRES_DB + value: "langgraph_checkpoints" + - name: POSTGRES_USER + value: "langgraph" + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: langgraph-secrets + key: POSTGRES_PASSWORD + - name: REDIS_HOST + value: "redis.rag7.svc.cluster.local" + - name: REDIS_PORT + value: "6379" + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: langgraph-secrets + key: REDIS_PASSWORD + - name: SECRET_KEY + valueFrom: + secretKeyRef: + name: langgraph-secrets + key: SECRET_KEY + - name: OPENAI_API_KEY + valueFrom: + secretKeyRef: + name: langgraph-secrets + key: OPENAI_API_KEY + - name: LANGCHAIN_API_KEY + valueFrom: + secretKeyRef: + name: langgraph-secrets + key: LANGCHAIN_API_KEY + optional: true + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: 2000m + memory: 4Gi + livenessProbe: + httpGet: + path: /health + port: 8123 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /ready + port: 8123 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + imagePullSecrets: + - name: ghcr-secret + +--- +apiVersion: v1 +kind: Service +metadata: + name: postgres + namespace: rag7 + labels: + app: postgres +spec: + type: ClusterIP + ports: + - port: 5432 + targetPort: 5432 + protocol: TCP + name: postgres + selector: + app: postgres + +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: postgres + namespace: rag7 + labels: + app: postgres +spec: + serviceName: postgres + replicas: 1 + selector: + matchLabels: + app: postgres + template: + metadata: + labels: + app: postgres + spec: + containers: + - name: postgres + image: postgres:15-alpine + ports: + - containerPort: 5432 + name: postgres + env: + - name: POSTGRES_DB + value: "langgraph_checkpoints" + - name: POSTGRES_USER + value: "langgraph" + - name: 
POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: langgraph-secrets + key: POSTGRES_PASSWORD + - name: PGDATA + value: /var/lib/postgresql/data/pgdata + volumeMounts: + - name: postgres-storage + mountPath: /var/lib/postgresql/data + resources: + requests: + cpu: 250m + memory: 512Mi + limits: + cpu: 1000m + memory: 2Gi + livenessProbe: + exec: + command: + - pg_isready + - -U + - langgraph + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + exec: + command: + - pg_isready + - -U + - langgraph + initialDelaySeconds: 5 + periodSeconds: 5 + volumeClaimTemplates: + - metadata: + name: postgres-storage + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 20Gi + +--- +apiVersion: v1 +kind: Service +metadata: + name: redis + namespace: rag7 + labels: + app: redis +spec: + type: ClusterIP + ports: + - port: 6379 + targetPort: 6379 + protocol: TCP + name: redis + selector: + app: redis + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: redis + namespace: rag7 + labels: + app: redis +spec: + replicas: 1 + selector: + matchLabels: + app: redis + template: + metadata: + labels: + app: redis + spec: + containers: + - name: redis + image: redis:7-alpine + ports: + - containerPort: 6379 + name: redis + command: + - redis-server + - --requirepass + - $(REDIS_PASSWORD) + env: + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: langgraph-secrets + key: REDIS_PASSWORD + volumeMounts: + - name: redis-storage + mountPath: /data + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + livenessProbe: + exec: + command: + - redis-cli + - ping + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + exec: + command: + - redis-cli + - ping + initialDelaySeconds: 5 + periodSeconds: 5 + volumes: + - name: redis-storage + emptyDir: {} diff --git a/n8n/README.md b/n8n/README.md new file mode 100644 index 0000000..487023a --- /dev/null +++ b/n8n/README.md @@ -0,0 +1,413 @@ +# n8n 
Workflows for RAG7 + +This directory contains n8n workflow templates for orchestrating the RAG7 LangGraph application. + +## Table of Contents + +- [Overview](#overview) +- [Prerequisites](#prerequisites) +- [Installation](#installation) +- [Workflows](#workflows) +- [Configuration](#configuration) +- [Usage](#usage) +- [Troubleshooting](#troubleshooting) + +## Overview + +n8n provides workflow automation for the RAG7 LangGraph system, enabling: + +- **API Orchestration**: Manage incoming requests and route them to LangGraph +- **Scheduled Processing**: Execute batch tasks on a schedule +- **Error Handling**: Graceful error handling and retry logic +- **Monitoring**: Track execution metrics and logs +- **Integration**: Connect with external systems and APIs + +## Prerequisites + +### Required + +- n8n instance (self-hosted or cloud) +- Access to RAG7 LangGraph API +- API keys for: + - LangGraph API + - OpenAI (or other LLM providers) + - LangChain/LangSmith (optional) + +### Optional + +- PostgreSQL credentials (for direct database access) +- Redis credentials (for caching) +- Task queue system credentials + +## Installation + +### 1. Set Up n8n + +**Self-hosted (Docker):** + +```bash +docker run -d \ + --name n8n \ + -p 5678:5678 \ + -v ~/.n8n:/home/node/.n8n \ + n8nio/n8n +``` + +**Self-hosted (Docker Compose):** + +```yaml +version: '3.8' + +services: + n8n: + image: n8nio/n8n + ports: + - "5678:5678" + environment: + - N8N_BASIC_AUTH_ACTIVE=true + - N8N_BASIC_AUTH_USER=admin + - N8N_BASIC_AUTH_PASSWORD=change-this-password + - N8N_HOST=n8n.yourdomain.com + - N8N_PROTOCOL=https + - WEBHOOK_URL=https://n8n.yourdomain.com/ + volumes: + - n8n_data:/home/node/.n8n + +volumes: + n8n_data: +``` + +**Cloud:** + +Sign up at [n8n.cloud](https://n8n.cloud) + +### 2. Configure Credentials + +1. Open n8n UI (http://localhost:5678 or your n8n URL) +2. Navigate to **Credentials** → **Add Credential** +3. 
Create credentials based on `credentials/credentials_template.json`: + +**LangGraph API Key:** +- Type: HTTP Header Auth +- Name: `X-API-Key` +- Value: Your LangGraph API key + +**OpenAI API:** +- Type: OpenAI +- API Key: Your OpenAI key + +**Database (Optional):** +- Type: PostgreSQL +- Host: `postgres.rag7.svc.cluster.local` (or external host) +- Port: `5432` +- Database: `langgraph_checkpoints` +- User: `langgraph` +- Password: Your database password + +### 3. Import Workflows + +1. Navigate to **Workflows** → **Add Workflow** +2. Click the **⋮** menu → **Import from File** +3. Import each workflow: + - `workflows/main_orchestrator.json` + - `workflows/langgraph_trigger.json` + +### 4. Configure Environment Variables + +In n8n, set environment variables for each workflow: + +```bash +LANGGRAPH_API_URL=http://langgraph.rag7.svc.cluster.local:8123 +TASK_QUEUE_URL=http://task-queue:8080 +LOGGING_ENDPOINT=http://logging-service:8080 +METRICS_ENDPOINT=http://metrics-service:9090 +``` + +Or set them in your n8n deployment: + +```yaml +environment: + - LANGGRAPH_API_URL=http://langgraph.rag7.svc.cluster.local:8123 + - TASK_QUEUE_URL=http://task-queue:8080 +``` + +## Workflows + +### 1. Main Orchestrator + +**File**: `workflows/main_orchestrator.json` + +**Purpose**: Webhook-based orchestration for real-time LangGraph execution. + +**Trigger**: Webhook (POST request) + +**Flow**: +1. Receives webhook request +2. Validates input +3. Triggers LangGraph execution +4. Processes result +5. Returns response +6. 
Logs execution + +**Webhook URL**: `https://your-n8n-instance/webhook/orchestrator` + +**Example Request**: + +```bash +curl -X POST https://your-n8n-instance/webhook/orchestrator \ + -H "Content-Type: application/json" \ + -d '{ + "query": "What is the capital of France?", + "user_id": "user123", + "session_id": "session456", + "metadata": { + "source": "web" + } + }' +``` + +**Response**: + +```json +{ + "status": "success", + "result": { + "answer": "The capital of France is Paris.", + "confidence": 0.95 + }, + "execution_id": "exec_789", + "duration_ms": 1234, + "metadata": { + "user_id": "user123", + "session_id": "session456", + "timestamp": "2024-01-15T10:30:45.123Z" + } +} +``` + +### 2. LangGraph Trigger + +**File**: `workflows/langgraph_trigger.json` + +**Purpose**: Scheduled batch processing of pending tasks. + +**Trigger**: Schedule (every 5 minutes by default) + +**Flow**: +1. Fetches pending tasks from queue +2. Checks if tasks exist +3. Splits tasks for parallel processing +4. Executes each task via LangGraph +5. Marks tasks as complete +6. Sends metrics + +**Configuration**: + +Adjust schedule in the workflow: +- Every 5 minutes: `*/5 * * * *` +- Every hour: `0 * * * *` +- Every day at 2 AM: `0 2 * * *` + +## Configuration + +### Workflow Settings + +**Timeout**: +- Default: 60 seconds +- For long-running graphs: Increase to 120-300 seconds + +**Retry Logic**: +- Configure in each HTTP Request node +- Recommended: 3 retries with exponential backoff + +**Batching**: +- Enable for high-volume processing +- Batch size: 5-10 requests +- Batch interval: 1-2 seconds + +### Error Handling + +All workflows include error handling: + +1. **Validation Errors**: Return 400 with error details +2. **API Errors**: Retry with exponential backoff +3. **Timeout Errors**: Return 504 Gateway Timeout +4. **Server Errors**: Return 500 with error message + +### Monitoring + +Enable execution logging: + +1. Navigate to **Settings** → **Executions** +2. 
Enable **Save execution data** +3. Set retention: 30 days (or as needed) + +## Usage + +### Activate Workflows + +1. Open workflow in n8n +2. Click **Active** toggle +3. Verify webhook URL or schedule + +### Test Workflows + +**Manual Test**: +1. Open workflow +2. Click **Execute Workflow** +3. Provide test data +4. Review execution results + +**Webhook Test**: + +```bash +# Test main orchestrator +curl -X POST https://your-n8n-instance/webhook/orchestrator \ + -H "Content-Type: application/json" \ + -d '{ + "query": "Test query", + "user_id": "test_user" + }' +``` + +### Monitor Executions + +1. Navigate to **Executions** +2. View execution list +3. Click execution to see details +4. Check logs for errors + +### Update Workflows + +1. Deactivate workflow +2. Make changes +3. Test changes +4. Activate workflow + +## Troubleshooting + +### Webhook Not Receiving Requests + +**Check**: +- Webhook URL is correct +- Workflow is active +- n8n is publicly accessible (or within VPN) +- Firewall allows traffic on port 5678 + +**Solution**: +```bash +# Test webhook locally +curl http://localhost:5678/webhook-test/orchestrator +``` + +### Authentication Errors + +**Check**: +- Credentials are configured correctly +- API keys are valid +- Headers are set properly + +**Solution**: +```bash +# Test API key manually +curl -H "X-API-Key: YOUR_KEY" http://langgraph:8123/health +``` + +### Timeout Errors + +**Check**: +- Graph execution time +- Network latency +- Timeout settings in HTTP Request nodes + +**Solution**: +- Increase timeout in node settings +- Optimize graph performance +- Use async processing + +### Connection Errors + +**Check**: +- Service is running +- Network connectivity +- DNS resolution + +**Solution**: +```bash +# Test connectivity +kubectl run -it --rm debug --image=curlimages/curl --restart=Never \ + -- curl http://langgraph.rag7.svc.cluster.local:8123/health +``` + +### Memory/Performance Issues + +**Check**: +- Execution data size +- Batch sizes +- 
Concurrent executions + +**Solution**: +- Reduce batch size +- Limit concurrent executions +- Archive old execution data + +## Advanced Usage + +### Custom Workflows + +Create custom workflows by: + +1. Combining existing nodes +2. Adding custom code nodes +3. Integrating external services + +### Integrations + +Connect with: +- **Slack**: Send notifications +- **Email**: Send reports +- **Webhooks**: Trigger external systems +- **Databases**: Store results +- **Cloud Storage**: Save artifacts + +### Example: Slack Notification + +Add a Slack node after successful execution: + +```json +{ + "parameters": { + "channel": "#notifications", + "text": "Graph execution completed: {{ $json.execution_id }}" + }, + "type": "n8n-nodes-base.slack" +} +``` + +## Best Practices + +1. **Use Descriptive Names**: Name workflows and nodes clearly +2. **Add Error Handling**: Always handle errors gracefully +3. **Enable Logging**: Keep execution logs for debugging +4. **Set Timeouts**: Prevent workflows from hanging +5. **Use Credentials**: Never hardcode API keys +6. **Test Thoroughly**: Test workflows before activating +7. **Monitor Executions**: Regularly check for failures +8. 
**Document Changes**: Keep notes on workflow modifications + +## Next Steps + +- Configure additional integrations +- Set up monitoring and alerting +- Create custom workflows for your use case +- Optimize performance and resource usage +- Review [Deployment Guide](../docs/deployment.md) +- Check [Observability Guide](../docs/observability.md) + +## Support + +For issues or questions: +- Check n8n documentation: https://docs.n8n.io +- Join n8n community: https://community.n8n.io +- Review LangGraph documentation +- Contact team via Slack #rag7-support diff --git a/n8n/credentials/credentials_template.json b/n8n/credentials/credentials_template.json new file mode 100644 index 0000000..db2ac40 --- /dev/null +++ b/n8n/credentials/credentials_template.json @@ -0,0 +1,60 @@ +{ + "credentials": [ + { + "name": "LangGraph API Key", + "type": "httpHeaderAuth", + "data": { + "name": "X-API-Key", + "value": "TODO_REPLACE_WITH_YOUR_LANGGRAPH_API_KEY" + } + }, + { + "name": "Task Queue API Key", + "type": "httpHeaderAuth", + "data": { + "name": "Authorization", + "value": "Bearer TODO_REPLACE_WITH_YOUR_TASK_QUEUE_TOKEN" + } + }, + { + "name": "OpenAI API", + "type": "openAiApi", + "data": { + "apiKey": "sk-TODO_REPLACE_WITH_YOUR_OPENAI_KEY" + } + }, + { + "name": "LangChain Credentials", + "type": "httpHeaderAuth", + "data": { + "name": "X-API-Key", + "value": "ls__TODO_REPLACE_WITH_YOUR_LANGCHAIN_KEY" + } + }, + { + "name": "Database Connection", + "type": "postgres", + "data": { + "host": "postgres.rag7.svc.cluster.local", + "port": 5432, + "database": "langgraph_checkpoints", + "user": "langgraph", + "password": "TODO_REPLACE_WITH_DATABASE_PASSWORD", + "ssl": { + "enabled": false + } + } + }, + { + "name": "Redis Connection", + "type": "redis", + "data": { + "host": "redis.rag7.svc.cluster.local", + "port": 6379, + "password": "TODO_REPLACE_WITH_REDIS_PASSWORD", + "database": 0 + } + } + ], + "instructions": "Replace all TODO_REPLACE_WITH_YOUR_* placeholders with actual 
credentials before importing into n8n." +} diff --git a/n8n/workflows/langgraph_trigger.json b/n8n/workflows/langgraph_trigger.json new file mode 100644 index 0000000..14facda --- /dev/null +++ b/n8n/workflows/langgraph_trigger.json @@ -0,0 +1,285 @@ +{ + "name": "LangGraph Trigger", + "nodes": [ + { + "parameters": { + "rule": { + "interval": [ + { + "field": "cronExpression", + "expression": "*/5 * * * *" + } + ] + } + }, + "id": "schedule-trigger", + "name": "Schedule Trigger", + "type": "n8n-nodes-base.scheduleTrigger", + "typeVersion": 1.1, + "position": [250, 300] + }, + { + "parameters": { + "authentication": "genericCredentialType", + "genericAuthType": "httpHeaderAuth", + "method": "GET", + "url": "={{ $env.TASK_QUEUE_URL }}/pending", + "options": {} + }, + "id": "fetch-pending-tasks", + "name": "Fetch Pending Tasks", + "type": "n8n-nodes-base.httpRequest", + "typeVersion": 4.1, + "position": [450, 300], + "credentials": { + "httpHeaderAuth": { + "id": "task-queue-api-key", + "name": "Task Queue API Key" + } + } + }, + { + "parameters": { + "conditions": { + "options": { + "caseSensitive": true, + "leftValue": "", + "typeValidation": "strict" + }, + "conditions": [ + { + "id": "has-tasks", + "leftValue": "={{ $json.tasks.length }}", + "rightValue": "0", + "operator": { + "type": "number", + "operation": "gt" + } + } + ], + "combinator": "and" + }, + "options": {} + }, + "id": "check-tasks", + "name": "Check Tasks", + "type": "n8n-nodes-base.if", + "typeVersion": 2, + "position": [650, 300] + }, + { + "parameters": { + "fieldToSplitOut": "tasks", + "options": {} + }, + "id": "split-tasks", + "name": "Split Tasks", + "type": "n8n-nodes-base.splitOut", + "typeVersion": 1, + "position": [850, 200] + }, + { + "parameters": { + "authentication": "genericCredentialType", + "genericAuthType": "httpHeaderAuth", + "method": "POST", + "url": "={{ $env.LANGGRAPH_API_URL }}/v1/graph/run", + "sendBody": true, + "bodyParameters": { + "parameters": [ + { + "name": 
"input", + "value": "={{ $json.input }}" + }, + { + "name": "config", + "value": "={{ { configurable: { thread_id: $json.task_id } } }}" + } + ] + }, + "options": { + "batching": { + "batch": { + "batchSize": 5, + "batchInterval": 1000 + } + }, + "timeout": 120000 + } + }, + "id": "execute-graph", + "name": "Execute Graph", + "type": "n8n-nodes-base.httpRequest", + "typeVersion": 4.1, + "position": [1050, 200], + "credentials": { + "httpHeaderAuth": { + "id": "langgraph-api-key", + "name": "LangGraph API Key" + } + } + }, + { + "parameters": { + "authentication": "genericCredentialType", + "genericAuthType": "httpHeaderAuth", + "method": "POST", + "url": "={{ $env.TASK_QUEUE_URL }}/complete", + "sendBody": true, + "bodyParameters": { + "parameters": [ + { + "name": "task_id", + "value": "={{ $json.task_id }}" + }, + { + "name": "result", + "value": "={{ $json.output }}" + }, + { + "name": "status", + "value": "completed" + } + ] + }, + "options": {} + }, + "id": "mark-complete", + "name": "Mark Complete", + "type": "n8n-nodes-base.httpRequest", + "typeVersion": 4.1, + "position": [1250, 200], + "credentials": { + "httpHeaderAuth": { + "id": "task-queue-api-key", + "name": "Task Queue API Key" + } + } + }, + { + "parameters": { + "jsCode": "// No tasks found\nreturn {\n status: 'idle',\n message: 'No pending tasks',\n timestamp: new Date().toISOString()\n};" + }, + "id": "no-tasks", + "name": "No Tasks", + "type": "n8n-nodes-base.code", + "typeVersion": 2, + "position": [850, 400] + }, + { + "parameters": { + "authentication": "genericCredentialType", + "genericAuthType": "httpHeaderAuth", + "method": "POST", + "url": "={{ $env.METRICS_ENDPOINT }}/metrics", + "sendBody": true, + "bodyParameters": { + "parameters": [ + { + "name": "metric", + "value": "tasks_processed" + }, + { + "name": "value", + "value": "={{ $itemIndex + 1 }}" + }, + { + "name": "labels", + "value": "={{ { workflow: 'langgraph_trigger', status: 'success' } }}" + } + ] + }, + "options": {} + }, + 
"id": "send-metrics", + "name": "Send Metrics", + "type": "n8n-nodes-base.httpRequest", + "typeVersion": 4.1, + "position": [1450, 200], + "continueOnFail": true + } + ], + "connections": { + "Schedule Trigger": { + "main": [ + [ + { + "node": "Fetch Pending Tasks", + "type": "main", + "index": 0 + } + ] + ] + }, + "Fetch Pending Tasks": { + "main": [ + [ + { + "node": "Check Tasks", + "type": "main", + "index": 0 + } + ] + ] + }, + "Check Tasks": { + "main": [ + [ + { + "node": "Split Tasks", + "type": "main", + "index": 0 + } + ], + [ + { + "node": "No Tasks", + "type": "main", + "index": 0 + } + ] + ] + }, + "Split Tasks": { + "main": [ + [ + { + "node": "Execute Graph", + "type": "main", + "index": 0 + } + ] + ] + }, + "Execute Graph": { + "main": [ + [ + { + "node": "Mark Complete", + "type": "main", + "index": 0 + } + ] + ] + }, + "Mark Complete": { + "main": [ + [ + { + "node": "Send Metrics", + "type": "main", + "index": 0 + } + ] + ] + } + }, + "settings": { + "executionOrder": "v1" + }, + "staticData": null, + "tags": [], + "triggerCount": 1, + "updatedAt": "2024-01-15T10:00:00.000Z", + "versionId": "1" +} diff --git a/n8n/workflows/main_orchestrator.json b/n8n/workflows/main_orchestrator.json new file mode 100644 index 0000000..69f80ff --- /dev/null +++ b/n8n/workflows/main_orchestrator.json @@ -0,0 +1,263 @@ +{ + "name": "Main Orchestrator", + "nodes": [ + { + "parameters": { + "httpMethod": "POST", + "path": "orchestrator", + "responseMode": "responseNode", + "options": {} + }, + "id": "webhook-trigger", + "name": "Webhook", + "type": "n8n-nodes-base.webhook", + "typeVersion": 1, + "position": [250, 300], + "webhookId": "main-orchestrator" + }, + { + "parameters": { + "jsCode": "// Extract and validate input\nconst body = $input.first().json.body;\n\nif (!body || !body.query) {\n throw new Error('Missing required field: query');\n}\n\nreturn {\n query: body.query,\n user_id: body.user_id || 'anonymous',\n session_id: body.session_id || $execution.id,\n 
metadata: body.metadata || {},\n timestamp: new Date().toISOString()\n};" + }, + "id": "validate-input", + "name": "Validate Input", + "type": "n8n-nodes-base.code", + "typeVersion": 2, + "position": [450, 300] + }, + { + "parameters": { + "authentication": "genericCredentialType", + "genericAuthType": "httpHeaderAuth", + "method": "POST", + "url": "={{ $env.LANGGRAPH_API_URL }}/v1/graph/run", + "sendBody": true, + "bodyParameters": { + "parameters": [ + { + "name": "input", + "value": "={{ { query: $json.query, user_id: $json.user_id, session_id: $json.session_id } }}" + }, + { + "name": "config", + "value": "={{ { configurable: { thread_id: $json.session_id } } }}" + } + ] + }, + "options": { + "timeout": 60000 + } + }, + "id": "trigger-langgraph", + "name": "Trigger LangGraph", + "type": "n8n-nodes-base.httpRequest", + "typeVersion": 4.1, + "position": [650, 300], + "credentials": { + "httpHeaderAuth": { + "id": "langgraph-api-key", + "name": "LangGraph API Key" + } + } + }, + { + "parameters": { + "conditions": { + "options": { + "caseSensitive": true, + "leftValue": "", + "typeValidation": "strict" + }, + "conditions": [ + { + "id": "success-condition", + "leftValue": "={{ $json.status }}", + "rightValue": "success", + "operator": { + "type": "string", + "operation": "equals" + } + } + ], + "combinator": "and" + }, + "options": {} + }, + "id": "check-success", + "name": "Check Success", + "type": "n8n-nodes-base.if", + "typeVersion": 2, + "position": [850, 300] + }, + { + "parameters": { + "jsCode": "// Process successful response\nconst response = $input.first().json;\n\nreturn {\n status: 'success',\n result: response.output,\n execution_id: response.execution_id,\n duration_ms: response.duration_ms,\n metadata: {\n user_id: response.user_id,\n session_id: response.session_id,\n timestamp: new Date().toISOString()\n }\n};" + }, + "id": "process-success", + "name": "Process Success", + "type": "n8n-nodes-base.code", + "typeVersion": 2, + "position": [1050, 
200] + }, + { + "parameters": { + "jsCode": "// Handle error\nconst error = $input.first().json;\n\nreturn {\n status: 'error',\n error: {\n message: error.error || 'Unknown error',\n code: error.code || 'INTERNAL_ERROR',\n details: error.details || {}\n },\n metadata: {\n timestamp: new Date().toISOString()\n }\n};" + }, + "id": "handle-error", + "name": "Handle Error", + "type": "n8n-nodes-base.code", + "typeVersion": 2, + "position": [1050, 400] + }, + { + "parameters": { + "respondWith": "json", + "responseBody": "={{ $json }}" + }, + "id": "respond-success", + "name": "Respond Success", + "type": "n8n-nodes-base.respondToWebhook", + "typeVersion": 1, + "position": [1250, 200] + }, + { + "parameters": { + "respondWith": "json", + "responseBody": "={{ $json }}", + "options": { + "responseCode": 500 + } + }, + "id": "respond-error", + "name": "Respond Error", + "type": "n8n-nodes-base.respondToWebhook", + "typeVersion": 1, + "position": [1250, 400] + }, + { + "parameters": { + "authentication": "genericCredentialType", + "genericAuthType": "httpHeaderAuth", + "method": "POST", + "url": "={{ $env.LOGGING_ENDPOINT }}/logs", + "sendBody": true, + "bodyParameters": { + "parameters": [ + { + "name": "level", + "value": "info" + }, + { + "name": "message", + "value": "Workflow execution completed" + }, + { + "name": "data", + "value": "={{ $json }}" + } + ] + }, + "options": {} + }, + "id": "log-execution", + "name": "Log Execution", + "type": "n8n-nodes-base.httpRequest", + "typeVersion": 4.1, + "position": [1050, 300], + "continueOnFail": true + } + ], + "connections": { + "Webhook": { + "main": [ + [ + { + "node": "Validate Input", + "type": "main", + "index": 0 + } + ] + ] + }, + "Validate Input": { + "main": [ + [ + { + "node": "Trigger LangGraph", + "type": "main", + "index": 0 + } + ] + ] + }, + "Trigger LangGraph": { + "main": [ + [ + { + "node": "Check Success", + "type": "main", + "index": 0 + } + ] + ] + }, + "Check Success": { + "main": [ + [ + { + "node": 
"Process Success", + "type": "main", + "index": 0 + } + ], + [ + { + "node": "Handle Error", + "type": "main", + "index": 0 + } + ] + ] + }, + "Process Success": { + "main": [ + [ + { + "node": "Log Execution", + "type": "main", + "index": 0 + }, + { + "node": "Respond Success", + "type": "main", + "index": 0 + } + ] + ] + }, + "Handle Error": { + "main": [ + [ + { + "node": "Log Execution", + "type": "main", + "index": 0 + }, + { + "node": "Respond Error", + "type": "main", + "index": 0 + } + ] + ] + } + }, + "settings": { + "executionOrder": "v1" + }, + "staticData": null, + "tags": [], + "triggerCount": 1, + "updatedAt": "2024-01-15T10:00:00.000Z", + "versionId": "1" +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..c90dd45 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,122 @@ +[build-system] +requires = ["setuptools>=68.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "rag7" +version = "1.0.0" +description = "RAG7 - Production-ready LangGraph-based RAG system" +readme = "README.md" +requires-python = ">=3.11" +license = {text = "MIT"} +authors = [ + {name = "RAG7 Team", email = "team@example.com"} +] +keywords = ["rag", "langgraph", "langchain", "llm", "ai"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Artificial Intelligence", +] + +dependencies = [ + "langgraph>=0.0.26", + "langchain>=0.1.0", + "langchain-core>=0.1.0", + "langchain-community>=0.0.13", + "openai>=1.6.1", + "anthropic>=0.8.1", + "chromadb>=0.4.22", + "psycopg2-binary>=2.9.9", + "sqlalchemy>=2.0.23", + "redis>=5.0.1", + "fastapi>=0.104.1", + "uvicorn[standard]>=0.24.0", + "httpx>=0.25.1", + "numpy>=1.26.2", + "pandas>=2.1.4", + "pydantic>=2.5.0", + "pydantic-settings>=2.1.0", + 
"tiktoken>=0.5.2", + "sentence-transformers>=2.2.2", + "python-dotenv>=1.0.0", + "tenacity>=8.2.3", + "aiofiles>=23.2.1", + "prometheus-client>=0.19.0", + "opentelemetry-api>=1.21.0", + "opentelemetry-sdk>=1.21.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.4.3", + "pytest-asyncio>=0.21.1", + "pytest-cov>=4.1.0", + "black>=23.12.1", + "flake8>=7.0.0", + "isort>=5.13.2", + "mypy>=1.7.1", +] + +[project.urls] +Homepage = "https://github.com/Stacey77/rag7" +Documentation = "https://github.com/Stacey77/rag7/blob/main/docs" +Repository = "https://github.com/Stacey77/rag7" +Issues = "https://github.com/Stacey77/rag7/issues" + +[tool.setuptools] +packages = ["rag7"] + +[tool.black] +line-length = 100 +target-version = ['py311'] +include = '\.pyi?$' +extend-exclude = ''' +/( + # directories + \.eggs + | \.git + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | build + | dist +)/ +''' + +[tool.isort] +profile = "black" +line_length = 100 +skip_gitignore = true + +[tool.pytest.ini_options] +minversion = "7.0" +addopts = "-ra -q --strict-markers" +testpaths = ["tests"] +pythonpath = ["."] + +[tool.mypy] +python_version = "3.11" +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = false +ignore_missing_imports = true + +[tool.coverage.run] +source = ["rag7"] +omit = ["*/tests/*", "*/test_*.py"] + +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "def __repr__", + "raise AssertionError", + "raise NotImplementedError", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..63b2edd --- /dev/null +++ b/requirements.txt @@ -0,0 +1,56 @@ +# RAG7 LangGraph Application Requirements +# Python dependencies for the RAG7 project + +# Core Framework +langgraph==0.0.26 +langchain==0.1.0 +langchain-core==0.1.0 +langchain-community==0.0.13 + +# LLM Providers +openai==1.6.1 +anthropic==0.8.1 + +# Vector Stores +chromadb==0.4.22 +faiss-cpu==1.7.4 + +# 
Database & Storage +psycopg2-binary==2.9.9 +sqlalchemy==2.0.23 +redis==5.0.1 + +# Web Framework (for API) +fastapi==0.104.1 +uvicorn[standard]==0.24.0 +httpx==0.25.1 + +# Data Processing +numpy==1.26.2 +pandas==2.1.4 +pydantic==2.5.0 +pydantic-settings==2.1.0 + +# Text Processing +tiktoken==0.5.2 +sentence-transformers==2.2.2 + +# Utilities +python-dotenv==1.0.0 +tenacity==8.2.3 +aiofiles==23.2.1 + +# Observability & Monitoring +prometheus-client==0.19.0 +opentelemetry-api==1.21.0 +opentelemetry-sdk==1.21.0 +opentelemetry-instrumentation==0.42b0 + +# Development & Testing (optional, uncomment if needed) +# pytest==7.4.3 +# pytest-asyncio==0.21.1 +# pytest-cov==4.1.0 +# black==23.12.1 +# flake8==7.0.0 +# isort==5.13.2 +# mypy==1.7.1