diff --git a/.env.prod.example b/.env.prod.example new file mode 100644 index 0000000..122244e --- /dev/null +++ b/.env.prod.example @@ -0,0 +1,81 @@ +# Production Environment Configuration +# Copy this file to .env.prod and fill in the actual values + +# Application +APP_NAME=rag7-langgraph +APP_ENV=production +APP_DEBUG=false +LOG_LEVEL=INFO + +# API Configuration +API_HOST=0.0.0.0 +API_PORT=8000 +API_WORKERS=4 +API_RELOAD=false + +# LangGraph Configuration +LANGGRAPH_API_URL=http://langgraph:8123 +LANGGRAPH_CHECKPOINT_STORE=postgres +LANGGRAPH_STREAM_MODE=values + +# Database (PostgreSQL for LangGraph checkpoints) +# TODO: Replace with actual production database credentials +POSTGRES_HOST=postgres +POSTGRES_PORT=5432 +POSTGRES_DB=langgraph_checkpoints +POSTGRES_USER=langgraph +POSTGRES_PASSWORD=CHANGEME_SECURE_PASSWORD + +# Redis (for caching and rate limiting) +# TODO: Replace with actual production Redis credentials +REDIS_HOST=redis +REDIS_PORT=6379 +REDIS_PASSWORD=CHANGEME_SECURE_PASSWORD +REDIS_DB=0 + +# Authentication & Security +# TODO: Generate a secure secret key (e.g., using: openssl rand -hex 32) +SECRET_KEY=CHANGEME_GENERATE_SECURE_KEY +API_KEY_SALT=CHANGEME_GENERATE_SECURE_SALT +ALLOWED_ORIGINS=https://yourdomain.com,https://www.yourdomain.com + +# Observability +ENABLE_METRICS=true +METRICS_PORT=9090 +JAEGER_AGENT_HOST=jaeger +JAEGER_AGENT_PORT=6831 +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 + +# LangChain/LangSmith (optional) +# TODO: Add your LangSmith API key if using +LANGCHAIN_TRACING_V2=false +LANGCHAIN_API_KEY= +LANGCHAIN_PROJECT=rag7-production + +# OpenAI API (if using OpenAI models) +# TODO: Add your OpenAI API key +OPENAI_API_KEY= + +# Other LLM Providers (as needed) +# TODO: Add your API keys for other providers +ANTHROPIC_API_KEY= +COHERE_API_KEY= +HUGGINGFACE_API_KEY= + +# Vector Store Configuration +VECTOR_STORE_TYPE=postgres # or 'pinecone', 'weaviate', 'qdrant' +VECTOR_DIMENSION=1536 + +# n8n Integration (if using) 
+N8N_WEBHOOK_URL=https://n8n.yourdomain.com/webhook
+N8N_API_KEY=CHANGEME_N8N_API_KEY
+
+# Rate Limiting
+RATE_LIMIT_ENABLED=true
+RATE_LIMIT_PER_MINUTE=60
+RATE_LIMIT_BURST=10
+
+# Feature Flags
+ENABLE_ASYNC_PROCESSING=true
+ENABLE_CACHING=true
+CACHE_TTL_SECONDS=3600
diff --git a/.github/workflows/cd-staging.yml b/.github/workflows/cd-staging.yml
new file mode 100644
index 0000000..7d3bcd9
--- /dev/null
+++ b/.github/workflows/cd-staging.yml
@@ -0,0 +1,73 @@
+name: CD - Deploy to Staging
+
+on:
+  push:
+    branches: [develop]
+  workflow_dispatch:
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+  DEPLOYMENT_NAME: langgraph-api
+  NAMESPACE: staging
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    environment: staging
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up kubectl
+        uses: azure/setup-kubectl@v3
+        with:
+          version: 'v1.28.0'
+
+      - name: Configure kubeconfig
+        run: |
+          mkdir -p $HOME/.kube
+          echo "${{ secrets.KUBECONFIG_STAGING }}" | base64 -d > $HOME/.kube/config
+          chmod 600 $HOME/.kube/config
+
+      - name: Verify cluster connection
+        run: |
+          kubectl cluster-info
+          kubectl get nodes
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set image tag
+        id: image
+        run: |
+          IMAGE_TAG="develop-${GITHUB_SHA::7}"  # must match CI metadata-action tags: type=sha (short) with branch prefix; a full-SHA tag is never pushed
+          echo "tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT
+          echo "image=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${IMAGE_TAG}" >> $GITHUB_OUTPUT
+
+      - name: Update deployment image
+        run: |
+          kubectl set image deployment/${{ env.DEPLOYMENT_NAME }} \
+            langgraph=${{ steps.image.outputs.image }} \
+            -n ${{ env.NAMESPACE }}
+
+      - name: Wait for rollout
+        run: |
+          kubectl rollout status deployment/${{ env.DEPLOYMENT_NAME }} \
+            -n ${{ env.NAMESPACE }} \
+            --timeout=5m
+
+      - name: Verify deployment
+        run: |
+          kubectl get pods -n ${{ env.NAMESPACE }} -l app=langgraph
+          kubectl get service -n ${{ env.NAMESPACE
}} -l app=langgraph + + - name: Run smoke tests + run: | + # TODO: Add smoke test endpoint checks + # kubectl run smoke-test --image=curlimages/curl --rm -it --restart=Never \ + # -- curl -f http://${{ env.DEPLOYMENT_NAME }}.${{ env.NAMESPACE }}.svc.cluster.local/health + echo "Smoke tests would run here" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..6365e17 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,109 @@ +name: CI Pipeline + +on: + push: + branches: [main, develop] + pull_request: + branches: [main, develop] + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 black isort mypy + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + + - name: Lint with flake8 + run: | + # Stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # Exit-zero treats all errors as warnings + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + + - name: Check formatting with black + run: black --check . + continue-on-error: true + + - name: Check import ordering with isort + run: isort --check-only . + continue-on-error: true + + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pip install pytest pytest-cov pytest-asyncio + + - name: Run tests + run: | + pytest --cov=. 
--cov-report=xml --cov-report=term + continue-on-error: true + + - name: Upload coverage reports + uses: codecov/codecov-action@v3 + continue-on-error: true + + build: + runs-on: ubuntu-latest + needs: [lint, test] + permissions: + contents: read + packages: write + steps: + - uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=ref,event=branch + type=ref,event=pr + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=sha,prefix={{branch}}- + + - name: Build and push Docker image + uses: docker/build-push-action@v5 + with: + context: . + push: ${{ github.event_name != 'pull_request' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..79e1579 --- /dev/null +++ b/.gitignore @@ -0,0 +1,22 @@ +*.pyc +__pycache__/ +*.pyo +*.pyd +.Python +*.so +*.egg +*.egg-info/ +dist/ +build/ +.eggs/ +*.log +.env +.venv +venv/ +ENV/ +.DS_Store +.idea/ +.vscode/ +*.swp +*.swo +*~ diff --git a/README.md b/README.md index f5a8ce3..6889614 100644 --- a/README.md +++ b/README.md @@ -1 +1,3 @@ -# rag7 \ No newline at end of file +# rag7 + +> **📦 Production Templates Available**: This repository now includes production-ready deployment templates, CI/CD workflows, Kubernetes manifests, n8n workflows, and comprehensive documentation. See the `docs/` directory and related files to get started with deployment. 
\ No newline at end of file diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml new file mode 100644 index 0000000..62973eb --- /dev/null +++ b/docker-compose.prod.yml @@ -0,0 +1,149 @@ +version: '3.8' + +services: + langgraph: + image: ghcr.io/stacey77/rag7:latest + container_name: langgraph-api + restart: unless-stopped + env_file: + - .env.prod + ports: + - "8123:8123" + depends_on: + postgres: + condition: service_healthy + redis: + condition: service_healthy + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8123/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + networks: + - rag7-network + volumes: + - ./data:/app/data + deploy: + resources: + limits: + cpus: '2.0' + memory: 4G + reservations: + cpus: '1.0' + memory: 2G + + integration-api: + build: + context: ./integration/api + dockerfile: Dockerfile + container_name: integration-api + restart: unless-stopped + env_file: + - .env.prod + ports: + - "8000:8000" + depends_on: + langgraph: + condition: service_healthy + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 20s + networks: + - rag7-network + deploy: + resources: + limits: + cpus: '1.0' + memory: 2G + reservations: + cpus: '0.5' + memory: 1G + + postgres: + image: postgres:15-alpine + container_name: langgraph-postgres + restart: unless-stopped + environment: + POSTGRES_DB: ${POSTGRES_DB:-langgraph_checkpoints} + POSTGRES_USER: ${POSTGRES_USER:-langgraph} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-CHANGEME_SECURE_PASSWORD} + PGDATA: /var/lib/postgresql/data/pgdata + ports: + - "5432:5432" + volumes: + - postgres-data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-langgraph}"] + interval: 10s + timeout: 5s + retries: 5 + networks: + - rag7-network + deploy: + resources: + limits: + cpus: '1.0' + memory: 2G + reservations: + cpus: '0.5' + memory: 1G + + redis: + image: 
redis:7-alpine
+    container_name: langgraph-redis
+    restart: unless-stopped
+    command: redis-server --requirepass ${REDIS_PASSWORD:-CHANGEME_SECURE_PASSWORD}
+    ports:
+      - "127.0.0.1:6379:6379"  # loopback only in prod; containers reach redis over rag7-network
+    volumes:
+      - redis-data:/data
+    healthcheck:
+      test: ["CMD-SHELL", "redis-cli -a '${REDIS_PASSWORD:-CHANGEME_SECURE_PASSWORD}' ping | grep PONG"]  # requirepass is set, so an unauthenticated check always fails with NOAUTH
+      interval: 10s
+      timeout: 5s
+      retries: 5
+    networks:
+      - rag7-network
+    deploy:
+      resources:
+        limits:
+          cpus: '0.5'
+          memory: 512M
+        reservations:
+          cpus: '0.25'
+          memory: 256M
+
+  nginx:
+    image: nginx:alpine
+    container_name: rag7-nginx
+    restart: unless-stopped
+    ports:
+      - "80:80"
+      - "443:443"
+    volumes:
+      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
+      - ./nginx/ssl:/etc/nginx/ssl:ro
+    depends_on:
+      - integration-api
+      - langgraph
+    networks:
+      - rag7-network
+    deploy:
+      resources:
+        limits:
+          cpus: '0.5'
+          memory: 512M
+
+volumes:
+  postgres-data:
+    driver: local
+  redis-data:
+    driver: local
+
+networks:
+  rag7-network:
+    driver: bridge
diff --git a/docs/deployment.md b/docs/deployment.md
new file mode 100644
index 0000000..349556b
--- /dev/null
+++ b/docs/deployment.md
@@ -0,0 +1,355 @@
+# Deployment Guide
+
+This guide covers deploying the RAG7 LangGraph application to production environments.
+
+## Table of Contents
+
+- [Prerequisites](#prerequisites)
+- [Environment Configuration](#environment-configuration)
+- [Deployment Options](#deployment-options)
+  - [Docker Compose](#docker-compose)
+  - [Kubernetes](#kubernetes)
+- [Post-Deployment](#post-deployment)
+- [Troubleshooting](#troubleshooting)
+
+## Prerequisites
+
+### Required Tools
+
+- Docker & Docker Compose (v2.0+)
+- kubectl (v1.28+) for Kubernetes deployments
+- Access to container registry (GitHub Container Registry)
+- Database: PostgreSQL 15+
+- Cache: Redis 7+
+
+### Required Secrets
+
+Before deploying, you must configure the following secrets:
+
+1. **Database Credentials**: PostgreSQL password
+2. **Secret Keys**: Application secret key and API key salt
+3. 
**API Keys**: OpenAI, Anthropic, or other LLM provider keys +4. **Registry Access**: GitHub Container Registry token (for pulling images) +5. **Kubeconfig**: Kubernetes cluster credentials (for K8s deployments) + +## Environment Configuration + +### Step 1: Create Production Environment File + +```bash +cp .env.prod.example .env.prod +``` + +### Step 2: Update Required Values + +Edit `.env.prod` and replace all `CHANGEME_*` placeholders: + +```bash +# Generate secure secret key +openssl rand -hex 32 + +# Generate API key salt +openssl rand -hex 16 + +# Generate secure database password +openssl rand -base64 32 +``` + +### Step 3: Configure API Keys + +Add your LLM provider API keys: + +```bash +OPENAI_API_KEY=sk-your-actual-key-here +LANGCHAIN_API_KEY=ls__your-actual-key-here +``` + +## Deployment Options + +### Docker Compose + +Docker Compose is suitable for single-server deployments or staging environments. + +#### Deploy with Docker Compose + +```bash +# Pull latest images +docker-compose -f docker-compose.prod.yml pull + +# Start services +docker-compose -f docker-compose.prod.yml up -d + +# Check service status +docker-compose -f docker-compose.prod.yml ps + +# View logs +docker-compose -f docker-compose.prod.yml logs -f langgraph +``` + +#### Verify Deployment + +```bash +# Check health endpoint +curl http://localhost:8000/health + +# Check readiness endpoint +curl http://localhost:8000/ready + +# Test graph execution +curl -X POST http://localhost:8000/v1/graph/run \ + -H "Content-Type: application/json" \ + -d '{"input": {"query": "test"}}' +``` + +#### Stop Services + +```bash +docker-compose -f docker-compose.prod.yml down +``` + +### Kubernetes + +Kubernetes is recommended for production deployments requiring high availability and auto-scaling. + +#### Prerequisites + +1. **Configure kubectl** + +```bash +# Set up kubeconfig +export KUBECONFIG=/path/to/your/kubeconfig + +# Verify connection +kubectl cluster-info +kubectl get nodes +``` + +2. 
**Create Namespace** + +```bash +kubectl apply -f k8s/langgraph-deployment.yaml +# This creates the rag7 namespace +``` + +3. **Configure Secrets** + +Update the secrets in `k8s/langgraph-deployment.yaml` or create them via kubectl: + +```bash +# Create secrets from literal values +kubectl create secret generic langgraph-secrets \ + --namespace=rag7 \ + --from-literal=POSTGRES_PASSWORD='your-secure-password' \ + --from-literal=REDIS_PASSWORD='your-redis-password' \ + --from-literal=SECRET_KEY='your-secret-key' \ + --from-literal=API_KEY_SALT='your-api-salt' \ + --from-literal=OPENAI_API_KEY='sk-your-key' \ + --dry-run=client -o yaml | kubectl apply -f - +``` + +4. **Create Image Pull Secret** (for GHCR) + +```bash +kubectl create secret docker-registry ghcr-secret \ + --namespace=rag7 \ + --docker-server=ghcr.io \ + --docker-username=YOUR_GITHUB_USERNAME \ + --docker-password=YOUR_GITHUB_TOKEN \ + --docker-email=YOUR_EMAIL +``` + +#### Deploy to Kubernetes + +```bash +# Apply all manifests +kubectl apply -f k8s/langgraph-deployment.yaml +kubectl apply -f k8s/hpa.yaml + +# Check deployment status +kubectl get deployments -n rag7 +kubectl get pods -n rag7 +kubectl get services -n rag7 + +# Watch rollout +kubectl rollout status deployment/langgraph -n rag7 +``` + +#### Verify Kubernetes Deployment + +```bash +# Port-forward to test locally +kubectl port-forward -n rag7 svc/langgraph 8123:8123 + +# In another terminal, test endpoints +curl http://localhost:8123/health +curl http://localhost:8123/ready +``` + +#### Expose Service (Optional) + +Using an Ingress controller: + +```yaml +# ingress.yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: langgraph-ingress + namespace: rag7 + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod +spec: + ingressClassName: nginx + tls: + - hosts: + - api.yourdomain.com + secretName: langgraph-tls + rules: + - host: api.yourdomain.com + http: + paths: + - path: / + pathType: Prefix + backend: + 
service: + name: langgraph + port: + number: 8123 +``` + +```bash +kubectl apply -f ingress.yaml +``` + +## Post-Deployment + +### Database Migrations + +Run any necessary database migrations: + +```bash +# Docker Compose +docker-compose -f docker-compose.prod.yml exec langgraph python -m alembic upgrade head + +# Kubernetes +kubectl exec -n rag7 -it deployment/langgraph -- python -m alembic upgrade head +``` + +### Monitoring Setup + +1. **Check Metrics Endpoint** + +```bash +curl http://localhost:9090/metrics +``` + +2. **Configure Prometheus** (if using) + +Add scraping configuration for the metrics endpoint. + +3. **Set Up Alerts** + +Configure alerting rules for critical metrics. + +### Backup Configuration + +1. **Database Backups** + +```bash +# Set up automated PostgreSQL backups +kubectl create cronjob postgres-backup \ + --image=postgres:15-alpine \ + --schedule="0 2 * * *" \ + --restart=Never \ + -- pg_dump -h postgres.rag7.svc.cluster.local -U langgraph > /backup/db.sql +``` + +2. **Configuration Backups** + +Store ConfigMaps and Secrets in version control (encrypted). 
+ +## Troubleshooting + +### Common Issues + +#### Pods Not Starting + +```bash +# Check pod status +kubectl get pods -n rag7 + +# View pod logs +kubectl logs -n rag7 deployment/langgraph --tail=100 + +# Describe pod for events +kubectl describe pod -n rag7 POD_NAME +``` + +#### Connection Issues + +```bash +# Test database connectivity +kubectl run -n rag7 -it --rm debug --image=postgres:15-alpine --restart=Never \ + -- psql -h postgres.rag7.svc.cluster.local -U langgraph -d langgraph_checkpoints + +# Test Redis connectivity +kubectl run -n rag7 -it --rm debug --image=redis:7-alpine --restart=Never \ + -- redis-cli -h redis.rag7.svc.cluster.local -a PASSWORD ping +``` + +#### Image Pull Errors + +```bash +# Verify image pull secret +kubectl get secret ghcr-secret -n rag7 -o yaml + +# Test image pull manually +docker pull ghcr.io/stacey77/rag7:latest +``` + +#### Resource Constraints + +```bash +# Check resource usage +kubectl top pods -n rag7 + +# Check HPA status +kubectl get hpa -n rag7 + +# Describe HPA for details +kubectl describe hpa langgraph-hpa -n rag7 +``` + +### Logs and Debugging + +```bash +# Stream all logs +kubectl logs -n rag7 -l app=langgraph --tail=100 -f + +# Get logs from specific pod +kubectl logs -n rag7 POD_NAME --tail=200 + +# Get previous crashed container logs +kubectl logs -n rag7 POD_NAME --previous +``` + +### Rollback Deployment + +```bash +# View rollout history +kubectl rollout history deployment/langgraph -n rag7 + +# Rollback to previous version +kubectl rollout undo deployment/langgraph -n rag7 + +# Rollback to specific revision +kubectl rollout undo deployment/langgraph -n rag7 --to-revision=2 +``` + +## Next Steps + +- Configure observability (see [observability.md](./observability.md)) +- Set up alerting and on-call rotation (see [runbook.md](./runbook.md)) +- Import n8n workflows (see [../n8n/README.md](../n8n/README.md)) +- Configure CI/CD pipelines +- Set up backup and disaster recovery procedures diff --git 
a/docs/observability.md b/docs/observability.md new file mode 100644 index 0000000..a0c3051 --- /dev/null +++ b/docs/observability.md @@ -0,0 +1,454 @@ +# Observability Guide + +This guide covers monitoring, logging, and tracing for the RAG7 LangGraph application. + +## Table of Contents + +- [Metrics](#metrics) +- [Logging](#logging) +- [Tracing](#tracing) +- [Dashboards](#dashboards) +- [Alerting](#alerting) + +## Metrics + +### Prometheus Metrics Endpoint + +The application exposes Prometheus-compatible metrics at `/metrics` on port 9090. + +#### Key Metrics to Monitor + +**Application Metrics:** +- `http_requests_total` - Total HTTP requests by endpoint and status +- `http_request_duration_seconds` - Request duration histogram +- `langgraph_execution_duration_seconds` - Graph execution time +- `langgraph_errors_total` - Total graph execution errors +- `active_graph_executions` - Currently running graph executions + +**System Metrics:** +- `process_cpu_usage` - CPU usage percentage +- `process_memory_bytes` - Memory usage in bytes +- `process_open_fds` - Open file descriptors + +**Database Metrics:** +- `db_connections_active` - Active database connections +- `db_query_duration_seconds` - Database query duration +- `db_errors_total` - Database errors + +**Cache Metrics:** +- `cache_hits_total` - Cache hit count +- `cache_misses_total` - Cache miss count +- `cache_evictions_total` - Cache eviction count + +### Scraping Configuration + +Add to your Prometheus configuration: + +```yaml +scrape_configs: + - job_name: 'langgraph' + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - rag7 + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + 
action: replace + target_label: __metrics_path__ + regex: (.+) +``` + +### Example Queries + +```promql +# Request rate per second +rate(http_requests_total[5m]) + +# 99th percentile latency +histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) + +# Error rate +rate(langgraph_errors_total[5m]) / rate(http_requests_total[5m]) + +# Memory usage over time +process_memory_bytes + +# Cache hit rate +rate(cache_hits_total[5m]) / (rate(cache_hits_total[5m]) + rate(cache_misses_total[5m])) +``` + +## Logging + +### Log Levels + +The application supports the following log levels: +- `DEBUG` - Detailed diagnostic information +- `INFO` - General informational messages +- `WARNING` - Warning messages for potentially harmful situations +- `ERROR` - Error events that might still allow the app to continue +- `CRITICAL` - Critical events that may cause the app to abort + +Set log level via environment variable: +```bash +LOG_LEVEL=INFO +``` + +### Log Format + +Logs are output in JSON format for easy parsing: + +```json +{ + "timestamp": "2024-01-15T10:30:45.123Z", + "level": "INFO", + "logger": "langgraph.api", + "message": "Graph execution completed", + "request_id": "req_abc123", + "graph_id": "graph_xyz789", + "duration_ms": 1234, + "user_id": "user_456" +} +``` + +### Accessing Logs + +#### Docker Compose + +```bash +# View logs from all services +docker-compose -f docker-compose.prod.yml logs -f + +# View logs from specific service +docker-compose -f docker-compose.prod.yml logs -f langgraph + +# View last 100 lines +docker-compose -f docker-compose.prod.yml logs --tail=100 langgraph +``` + +#### Kubernetes + +```bash +# View logs from all pods +kubectl logs -n rag7 -l app=langgraph --tail=100 -f + +# View logs from specific pod +kubectl logs -n rag7 POD_NAME --tail=200 -f + +# View logs from all containers in pod +kubectl logs -n rag7 POD_NAME --all-containers=true +``` + +### Centralized Logging + +#### ELK Stack (Elasticsearch, Logstash, Kibana) + 
+Configure Filebeat to ship logs to Elasticsearch: + +```yaml +# filebeat.yml +filebeat.inputs: + - type: container + paths: + - '/var/lib/docker/containers/*/*.log' + processors: + - add_kubernetes_metadata: + host: ${NODE_NAME} + matchers: + - logs_path: + logs_path: "/var/lib/docker/containers/" + +output.elasticsearch: + hosts: ["elasticsearch:9200"] + index: "langgraph-logs-%{+yyyy.MM.dd}" +``` + +#### Loki (Grafana Loki) + +Deploy Promtail to collect and forward logs: + +```yaml +# promtail-config.yaml +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + target_label: pod +``` + +## Tracing + +### OpenTelemetry Integration + +The application supports OpenTelemetry for distributed tracing. + +#### Configuration + +Set environment variables: + +```bash +OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 +JAEGER_AGENT_HOST=jaeger +JAEGER_AGENT_PORT=6831 +``` + +#### Trace Context + +Each request includes trace context propagation: +- `traceparent` - W3C Trace Context +- `tracestate` - Additional trace state + +#### Trace Attributes + +Custom attributes added to spans: +- `graph.id` - LangGraph graph identifier +- `graph.execution.id` - Unique execution ID +- `user.id` - User identifier +- `llm.provider` - LLM provider (OpenAI, Anthropic, etc.) 
+- `llm.model` - Model name +- `llm.tokens` - Token count + +### Jaeger UI + +Access Jaeger UI to view traces: + +```bash +# Port forward Jaeger UI +kubectl port-forward -n observability svc/jaeger-query 16686:16686 + +# Open in browser +open http://localhost:16686 +``` + +### Trace Analysis + +Use traces to identify: +- Slow operations and bottlenecks +- Service dependencies +- Error propagation +- Concurrent execution patterns + +## Dashboards + +### Grafana Dashboard + +Import the pre-configured Grafana dashboard: + +```bash +# TODO: Create and include dashboard JSON +# grafana-dashboard.json +``` + +#### Key Panels + +1. **Overview** + - Request rate (requests/sec) + - Error rate (%) + - 95th/99th percentile latency + - Active executions + +2. **Performance** + - Request duration distribution + - Graph execution time + - Database query time + - Cache hit rate + +3. **Resources** + - CPU usage + - Memory usage + - Network I/O + - Disk I/O + +4. **Errors** + - Error count by type + - Error rate trend + - Failed executions + +### Custom Dashboards + +Create custom dashboards using Grafana or your preferred tool. 
+ +#### Example: Request Rate Dashboard + +```json +{ + "title": "Request Rate", + "targets": [ + { + "expr": "rate(http_requests_total{namespace=\"rag7\"}[5m])", + "legendFormat": "{{method}} {{endpoint}}" + } + ] +} +``` + +## Alerting + +### Alert Rules + +Configure alerts for critical conditions: + +#### High Error Rate + +```yaml +groups: + - name: langgraph_alerts + interval: 30s + rules: + - alert: HighErrorRate + expr: rate(langgraph_errors_total[5m]) / rate(http_requests_total[5m]) > 0.05 + for: 5m + labels: + severity: warning + annotations: + summary: "High error rate detected" + description: "Error rate is {{ $value | humanizePercentage }} (threshold: 5%)" +``` + +#### High Latency + +```yaml + - alert: HighLatency + expr: histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])) > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "High latency detected" + description: "99th percentile latency is {{ $value }}s (threshold: 5s)" +``` + +#### Pod Restart + +```yaml + - alert: PodRestarting + expr: rate(kube_pod_container_status_restarts_total{namespace="rag7"}[15m]) > 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Pod {{ $labels.pod }} is restarting" + description: "Pod has restarted {{ $value }} times in the last 15 minutes" +``` + +#### Resource Exhaustion + +```yaml + - alert: HighMemoryUsage + expr: container_memory_usage_bytes{namespace="rag7"} / container_spec_memory_limit_bytes > 0.9 + for: 5m + labels: + severity: warning + annotations: + summary: "High memory usage detected" + description: "Memory usage is {{ $value | humanizePercentage }}" +``` + +### Alert Channels + +Configure notification channels: + +#### Slack + +```yaml +receivers: + - name: 'slack-notifications' + slack_configs: + - api_url: 'YOUR_SLACK_WEBHOOK_URL' + channel: '#alerts' + title: 'Alert: {{ .GroupLabels.alertname }}' + text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}' +``` + +#### PagerDuty + +```yaml 
+receivers: + - name: 'pagerduty' + pagerduty_configs: + - service_key: 'YOUR_PAGERDUTY_KEY' + description: '{{ .GroupLabels.alertname }}: {{ .CommonAnnotations.summary }}' +``` + +#### Email + +```yaml +receivers: + - name: 'email' + email_configs: + - to: 'oncall@example.com' + from: 'alerts@example.com' + smarthost: 'smtp.example.com:587' + auth_username: 'alerts@example.com' + auth_password: 'YOUR_SMTP_PASSWORD' +``` + +## Health Checks + +### Endpoints + +- `GET /health` - Basic health check (liveness probe) +- `GET /ready` - Readiness check (includes dependencies) +- `GET /metrics` - Prometheus metrics + +### Example Health Check Response + +```json +{ + "status": "healthy", + "timestamp": "2024-01-15T10:30:45.123Z", + "checks": { + "database": "ok", + "redis": "ok", + "llm_provider": "ok" + }, + "version": "1.0.0", + "uptime_seconds": 86400 +} +``` + +### Curl Commands + +```bash +# Check health +curl -i http://localhost:8123/health + +# Check readiness +curl -i http://localhost:8123/ready + +# Fetch metrics +curl http://localhost:9090/metrics +``` + +## Best Practices + +1. **Set up alerts before issues occur** - Don't wait for production incidents +2. **Monitor the full stack** - Application, infrastructure, and dependencies +3. **Use structured logging** - Enables better searching and analysis +4. **Implement distributed tracing** - Essential for debugging microservices +5. **Create runbooks** - Document response procedures (see [runbook.md](./runbook.md)) +6. **Regular review** - Periodically review dashboards and alerts +7. 
**Load testing** - Test observability under realistic load conditions + +## Next Steps + +- Set up Prometheus and Grafana +- Configure alert notification channels +- Create custom dashboards for your use case +- Implement distributed tracing with Jaeger or Zipkin +- Review and test incident response procedures diff --git a/docs/runbook.md b/docs/runbook.md new file mode 100644 index 0000000..ee35bde --- /dev/null +++ b/docs/runbook.md @@ -0,0 +1,557 @@ +# Operations Runbook + +This runbook provides step-by-step procedures for common operational tasks and incident response. + +## Table of Contents + +- [Emergency Contacts](#emergency-contacts) +- [Common Incidents](#common-incidents) +- [Operational Procedures](#operational-procedures) +- [Maintenance Tasks](#maintenance-tasks) +- [Recovery Procedures](#recovery-procedures) + +## Emergency Contacts + +### On-Call Schedule + +| Role | Primary | Secondary | +|------|---------|-----------| +| Engineering | TODO: Add | TODO: Add | +| DevOps | TODO: Add | TODO: Add | +| Manager | TODO: Add | TODO: Add | + +### Escalation Path + +1. On-call engineer (15 min response time) +2. Secondary on-call (30 min response time) +3. Engineering manager (1 hour response time) + +### Communication Channels + +- **Slack**: #incidents (for incident coordination) +- **PagerDuty**: For critical alerts +- **Status Page**: TODO: Add URL + +## Common Incidents + +### High Error Rate + +**Symptoms:** +- Alert: "HighErrorRate" firing +- Increased 5xx responses +- User reports of failures + +**Diagnosis:** + +```bash +# Check error logs +kubectl logs -n rag7 -l app=langgraph --tail=100 | grep ERROR + +# Check error metrics +curl http://localhost:9090/metrics | grep error + +# Check recent deployments +kubectl rollout history deployment/langgraph -n rag7 +``` + +**Resolution Steps:** + +1. 
**Identify error type** + ```bash + # Group errors by type + kubectl logs -n rag7 -l app=langgraph | grep ERROR | cut -d' ' -f5- | sort | uniq -c | sort -rn + ``` + +2. **Check dependencies** + ```bash + # Test database connection + kubectl run -n rag7 -it --rm debug --image=postgres:15-alpine --restart=Never \ + -- psql -h postgres.rag7.svc.cluster.local -U langgraph -c "SELECT 1" + + # Test Redis connection + kubectl run -n rag7 -it --rm debug --image=redis:7-alpine --restart=Never \ + -- redis-cli -h redis.rag7.svc.cluster.local ping + ``` + +3. **Rollback if recent deployment** + ```bash + kubectl rollout undo deployment/langgraph -n rag7 + ``` + +4. **Scale up if capacity issue** + ```bash + kubectl scale deployment/langgraph -n rag7 --replicas=5 + ``` + +### High Latency + +**Symptoms:** +- Alert: "HighLatency" firing +- Slow response times +- Request timeouts + +**Diagnosis:** + +```bash +# Check request latency +curl http://localhost:9090/metrics | grep duration + +# Check resource usage +kubectl top pods -n rag7 + +# Check database slow queries +kubectl exec -n rag7 -it deployment/postgres -- \ + psql -U langgraph -c "SELECT query, calls, mean_exec_time FROM pg_stat_statements ORDER BY mean_exec_time DESC LIMIT 10" +``` + +**Resolution Steps:** + +1. **Check for resource constraints** + ```bash + kubectl describe pod -n rag7 POD_NAME | grep -A 5 "Limits\|Requests" + ``` + +2. **Scale horizontally** + ```bash + kubectl scale deployment/langgraph -n rag7 --replicas=8 + ``` + +3. **Check for memory leaks** + ```bash + kubectl top pods -n rag7 --sort-by=memory + ``` + +4. 
**Restart pods if necessary** + ```bash + kubectl rollout restart deployment/langgraph -n rag7 + ``` + +### Pod Crash Loop + +**Symptoms:** +- Alert: "PodRestarting" firing +- Pods in CrashLoopBackOff state +- Service degradation + +**Diagnosis:** + +```bash +# Check pod status +kubectl get pods -n rag7 + +# View recent logs +kubectl logs -n rag7 POD_NAME --tail=100 + +# View previous container logs +kubectl logs -n rag7 POD_NAME --previous + +# Describe pod for events +kubectl describe pod -n rag7 POD_NAME +``` + +**Resolution Steps:** + +1. **Check for configuration issues** + ```bash + kubectl get configmap langgraph-config -n rag7 -o yaml + kubectl get secret langgraph-secrets -n rag7 -o yaml + ``` + +2. **Verify image** + ```bash + kubectl get deployment langgraph -n rag7 -o jsonpath='{.spec.template.spec.containers[0].image}' + ``` + +3. **Check resource limits** + ```bash + kubectl describe pod -n rag7 POD_NAME | grep -A 5 "Limits" + ``` + +4. **Fix and redeploy** + ```bash + # Update configuration + kubectl edit configmap langgraph-config -n rag7 + + # Restart deployment + kubectl rollout restart deployment/langgraph -n rag7 + ``` + +### Database Connection Issues + +**Symptoms:** +- Database connection errors in logs +- "connection refused" or "connection timeout" +- Service unavailable + +**Diagnosis:** + +```bash +# Check PostgreSQL pod status +kubectl get pods -n rag7 -l app=postgres + +# Check PostgreSQL logs +kubectl logs -n rag7 -l app=postgres --tail=100 + +# Test connection from application pod +kubectl exec -n rag7 -it deployment/langgraph -- \ + python -c "import psycopg2; conn = psycopg2.connect('postgresql://langgraph:PASSWORD@postgres.rag7.svc.cluster.local/langgraph_checkpoints'); print('Connected')" +``` + +**Resolution Steps:** + +1. **Check database is running** + ```bash + kubectl get statefulset postgres -n rag7 + ``` + +2. 
**Check connection limits** + ```bash + kubectl exec -n rag7 -it statefulset/postgres -- \ + psql -U langgraph -c "SHOW max_connections" + + kubectl exec -n rag7 -it statefulset/postgres -- \ + psql -U langgraph -c "SELECT count(*) FROM pg_stat_activity" + ``` + +3. **Restart PostgreSQL if needed** + ```bash + kubectl delete pod -n rag7 -l app=postgres + ``` + +4. **Check for disk space issues** + ```bash + kubectl exec -n rag7 -it statefulset/postgres -- df -h + ``` + +### Out of Memory (OOM) + +**Symptoms:** +- Pods killed by OOMKiller +- Memory usage at 100% +- Frequent restarts + +**Diagnosis:** + +```bash +# Check memory usage +kubectl top pods -n rag7 + +# Check OOM events +kubectl get events -n rag7 --sort-by='.lastTimestamp' | grep OOM + +# Check memory limits +kubectl describe pod -n rag7 POD_NAME | grep -A 3 "Limits" +``` + +**Resolution Steps:** + +1. **Increase memory limits** + ```bash + kubectl edit deployment langgraph -n rag7 + # Update memory limits under resources + ``` + +2. **Check for memory leaks** + ```bash + # Monitor memory over time + kubectl top pods -n rag7 --watch + ``` + +3. **Scale horizontally instead** + ```bash + kubectl scale deployment/langgraph -n rag7 --replicas=6 + ``` + +### API Rate Limiting + +**Symptoms:** +- 429 Too Many Requests errors +- LLM provider rate limit errors +- Requests being throttled + +**Diagnosis:** + +```bash +# Check rate limit errors +kubectl logs -n rag7 -l app=langgraph | grep "rate limit" + +# Check metrics +curl http://localhost:9090/metrics | grep rate_limit +``` + +**Resolution Steps:** + +1. **Implement exponential backoff** (code change required) + +2. **Distribute load across multiple API keys** + ```bash + kubectl edit secret langgraph-secrets -n rag7 + # Add additional API keys + ``` + +3. **Cache responses to reduce API calls** + ```bash + # Verify Redis is working + kubectl get pods -n rag7 -l app=redis + ``` + +4. 
**Contact provider for rate limit increase** + +## Operational Procedures + +### Deployment Procedure + +**Standard Deployment:** + +```bash +# 1. Review changes +git diff main..feature-branch + +# 2. Merge to main +git checkout main +git merge feature-branch + +# 3. Tag release +git tag -a v1.0.1 -m "Release 1.0.1" +git push origin v1.0.1 + +# 4. Build and push image (CI does this automatically) +# CI will build and push ghcr.io/stacey77/rag7:v1.0.1 + +# 5. Update deployment +kubectl set image deployment/langgraph -n rag7 \ + langgraph=ghcr.io/stacey77/rag7:v1.0.1 + +# 6. Monitor rollout +kubectl rollout status deployment/langgraph -n rag7 + +# 7. Verify deployment +curl http://API_ENDPOINT/health +``` + +**Hotfix Deployment:** + +```bash +# 1. Create hotfix branch +git checkout -b hotfix/critical-fix main + +# 2. Make minimal fix +# ... edit files ... + +# 3. Test locally +docker build -t rag7:hotfix . +docker run -p 8123:8123 rag7:hotfix + +# 4. Deploy directly +git commit -am "Hotfix: description" +git push origin hotfix/critical-fix + +# 5. Trigger CD pipeline or deploy manually +kubectl set image deployment/langgraph -n rag7 \ + langgraph=ghcr.io/stacey77/rag7:hotfix-critical-fix + +# 6. Monitor closely +kubectl logs -n rag7 -l app=langgraph -f +``` + +### Rollback Procedure + +```bash +# 1. List rollout history +kubectl rollout history deployment/langgraph -n rag7 + +# 2. Rollback to previous version +kubectl rollout undo deployment/langgraph -n rag7 + +# 3. Or rollback to specific revision +kubectl rollout undo deployment/langgraph -n rag7 --to-revision=5 + +# 4. Monitor rollback +kubectl rollout status deployment/langgraph -n rag7 + +# 5. 
Verify service health +curl http://API_ENDPOINT/health +``` + +### Scaling Procedure + +**Manual Scaling:** + +```bash +# Scale up +kubectl scale deployment/langgraph -n rag7 --replicas=8 + +# Scale down +kubectl scale deployment/langgraph -n rag7 --replicas=2 + +# Verify +kubectl get pods -n rag7 -l app=langgraph +``` + +**Adjust HPA:** + +```bash +# Update HPA limits +kubectl edit hpa langgraph-hpa -n rag7 + +# Check HPA status +kubectl get hpa -n rag7 +kubectl describe hpa langgraph-hpa -n rag7 +``` + +### Backup Procedure + +**Database Backup:** + +```bash +# 1. Create backup +kubectl exec -n rag7 statefulset/postgres -- \ + pg_dump -U langgraph langgraph_checkpoints > backup_$(date +%Y%m%d_%H%M%S).sql + +# 2. Compress +gzip backup_*.sql + +# 3. Upload to storage +aws s3 cp backup_*.sql.gz s3://rag7-backups/$(date +%Y/%m/%d)/ + +# 4. Verify backup +gunzip -c backup_*.sql.gz | head -n 20 +``` + +**Configuration Backup:** + +```bash +# Export all configurations +kubectl get all,configmap,secret -n rag7 -o yaml > rag7_backup_$(date +%Y%m%d).yaml + +# Store securely +gpg -c rag7_backup_$(date +%Y%m%d).yaml +``` + +## Maintenance Tasks + +### Certificate Renewal + +```bash +# Check certificate expiration +kubectl get certificate -n rag7 + +# Renew certificate (cert-manager does this automatically) +kubectl describe certificate langgraph-tls -n rag7 + +# Manual renewal if needed +kubectl delete secret langgraph-tls -n rag7 +# cert-manager will recreate +``` + +### Database Maintenance + +```bash +# Vacuum database +kubectl exec -n rag7 statefulset/postgres -- \ + psql -U langgraph -c "VACUUM ANALYZE" + +# Check database size +kubectl exec -n rag7 statefulset/postgres -- \ + psql -U langgraph -c "SELECT pg_size_pretty(pg_database_size('langgraph_checkpoints'))" + +# Clean old checkpoints (if applicable) +kubectl exec -n rag7 statefulset/postgres -- \ + psql -U langgraph -c "DELETE FROM checkpoints WHERE created_at < NOW() - INTERVAL '30 days'" +``` + +### Log Rotation 
+ +```bash +# Check log volume sizes +kubectl exec -n rag7 POD_NAME -- du -sh /var/log + +# Logs are automatically rotated by Kubernetes +# Configure retention in logging backend (ELK, Loki, etc.) +``` + +## Recovery Procedures + +### Disaster Recovery + +**Complete Cluster Failure:** + +1. **Provision new cluster** +2. **Restore configurations** + ```bash + kubectl apply -f rag7_backup_YYYYMMDD.yaml + ``` +3. **Restore database** + ```bash + kubectl exec -n rag7 -it statefulset/postgres -- \ + psql -U langgraph langgraph_checkpoints < backup_YYYYMMDD.sql + ``` +4. **Verify services** +5. **Update DNS/Load Balancer** + +### Data Corruption + +```bash +# 1. Stop application +kubectl scale deployment/langgraph -n rag7 --replicas=0 + +# 2. Restore database from backup +kubectl exec -n rag7 -it statefulset/postgres -- \ + dropdb -U langgraph langgraph_checkpoints +kubectl exec -n rag7 -it statefulset/postgres -- \ + createdb -U langgraph langgraph_checkpoints +kubectl exec -n rag7 -it statefulset/postgres -- \ + psql -U langgraph langgraph_checkpoints < backup_YYYYMMDD.sql + +# 3. Restart application +kubectl scale deployment/langgraph -n rag7 --replicas=2 + +# 4. 
Verify data integrity +# Run validation queries +``` + +## Appendix + +### Useful Commands + +```bash +# Get cluster info +kubectl cluster-info + +# Get all resources in namespace +kubectl get all -n rag7 + +# Check resource usage +kubectl top nodes +kubectl top pods -n rag7 + +# Port forward for local testing +kubectl port-forward -n rag7 svc/langgraph 8123:8123 + +# Execute command in pod +kubectl exec -n rag7 -it POD_NAME -- /bin/bash + +# Copy files from pod +kubectl cp rag7/POD_NAME:/path/to/file ./local/path + +# View events +kubectl get events -n rag7 --sort-by='.lastTimestamp' +``` + +### Monitoring Dashboards + +- **Grafana**: TODO: Add URL +- **Prometheus**: TODO: Add URL +- **Jaeger**: TODO: Add URL +- **Kibana/Loki**: TODO: Add URL + +### Documentation Links + +- [Deployment Guide](./deployment.md) +- [Observability Guide](./observability.md) +- [n8n Workflows](../n8n/README.md) +- [API Documentation](TODO) + +--- + +**Remember:** Always follow the change management process and communicate with the team during incidents. diff --git a/integration/api/Dockerfile b/integration/api/Dockerfile new file mode 100644 index 0000000..065c2ca --- /dev/null +++ b/integration/api/Dockerfile @@ -0,0 +1,39 @@ +FROM python:3.11-slim + +LABEL org.opencontainers.image.source=https://github.com/Stacey77/rag7 +LABEL org.opencontainers.image.description="RAG7 Integration API" +LABEL org.opencontainers.image.licenses=MIT + +# Set working directory +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + gcc \ + g++ \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements first for better caching +COPY requirements.txt . + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY server.py . 
+ +# Create non-root user +RUN useradd -m -u 1000 appuser && \ + chown -R appuser:appuser /app +USER appuser + +# Expose port +EXPOSE 8000 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \ + CMD curl -f http://localhost:8000/health || exit 1 + +# Run the application +CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/integration/api/requirements.txt b/integration/api/requirements.txt new file mode 100644 index 0000000..f4813f6 --- /dev/null +++ b/integration/api/requirements.txt @@ -0,0 +1,40 @@ +# Integration API Requirements +# Python dependencies for the FastAPI integration layer + +# Web Framework +fastapi==0.104.1 +uvicorn[standard]==0.24.0 + +# HTTP Client +httpx==0.25.1 + +# Data Validation +pydantic==2.5.0 +pydantic-settings==2.1.0 + +# Utilities +python-multipart==0.0.6 +python-dotenv==1.0.0 + +# Optional: Authentication & Security +python-jose[cryptography]==3.3.0 +passlib[bcrypt]==1.7.4 + +# Optional: Caching +redis==5.0.1 +hiredis==2.2.3 + +# Optional: Database +psycopg2-binary==2.9.9 +sqlalchemy==2.0.23 + +# Optional: Observability +prometheus-client==0.19.0 +opentelemetry-api==1.21.0 +opentelemetry-sdk==1.21.0 +opentelemetry-instrumentation-fastapi==0.42b0 + +# Development dependencies (optional) +# pytest==7.4.3 +# pytest-asyncio==0.21.1 +# httpx==0.25.1 # for testing diff --git a/integration/api/server.py b/integration/api/server.py new file mode 100644 index 0000000..0d474ce --- /dev/null +++ b/integration/api/server.py @@ -0,0 +1,352 @@ +""" +RAG7 Integration API Server + +FastAPI-based integration layer for LangGraph orchestration. +Provides REST endpoints for graph execution, health checks, and monitoring. 
+""" + +import os +import logging +import time +from typing import Dict, Any, Optional +from datetime import datetime + +from fastapi import FastAPI, HTTPException, Request, status +from fastapi.responses import JSONResponse +from fastapi.middleware.cors import CORSMiddleware +from fastapi.middleware.gzip import GZipMiddleware +from pydantic import BaseModel, Field +import httpx + +# Configure logging +logging.basicConfig( + level=os.getenv("LOG_LEVEL", "INFO"), + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + +# Configuration +LANGGRAPH_API_URL = os.getenv("LANGGRAPH_API_URL", "http://langgraph:8123") +API_TIMEOUT = int(os.getenv("API_TIMEOUT", "60")) +ALLOWED_ORIGINS = os.getenv("ALLOWED_ORIGINS", "*").split(",") + +# Initialize FastAPI app +app = FastAPI( + title="RAG7 Integration API", + description="Integration layer for LangGraph-based RAG system", + version="1.0.0", + docs_url="/docs", + redoc_url="/redoc", +) + +# Middleware +app.add_middleware( + CORSMiddleware, + allow_origins=ALLOWED_ORIGINS, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) +app.add_middleware(GZipMiddleware, minimum_size=1000) + +# Request/Response Models +class GraphInput(BaseModel): + """Input schema for graph execution""" + query: str = Field(..., description="User query or prompt", min_length=1) + user_id: Optional[str] = Field(None, description="User identifier") + session_id: Optional[str] = Field(None, description="Session identifier") + config: Optional[Dict[str, Any]] = Field(default_factory=dict, description="Graph configuration") + metadata: Optional[Dict[str, Any]] = Field(default_factory=dict, description="Additional metadata") + + +class GraphOutput(BaseModel): + """Output schema for graph execution""" + status: str = Field(..., description="Execution status") + result: Optional[Dict[str, Any]] = Field(None, description="Graph output") + execution_id: Optional[str] = Field(None, 
description="Execution identifier") + duration_ms: Optional[int] = Field(None, description="Execution duration in milliseconds") + error: Optional[str] = Field(None, description="Error message if failed") + + +class HealthResponse(BaseModel): + """Health check response""" + status: str = Field(..., description="Health status") + timestamp: str = Field(..., description="Current timestamp") + version: str = Field(..., description="API version") + uptime_seconds: int = Field(..., description="Uptime in seconds") + + +class ReadyResponse(BaseModel): + """Readiness check response""" + ready: bool = Field(..., description="Readiness status") + checks: Dict[str, str] = Field(..., description="Component health checks") + timestamp: str = Field(..., description="Current timestamp") + + +# Global state +startup_time = time.time() + + +# Middleware for request logging +@app.middleware("http") +async def log_requests(request: Request, call_next): + """Log all requests with timing""" + start_time = time.time() + + # Generate request ID + request_id = f"req_{int(start_time * 1000)}" + + logger.info(f"Request {request_id}: {request.method} {request.url.path}") + + try: + response = await call_next(request) + duration = (time.time() - start_time) * 1000 + + logger.info( + f"Request {request_id} completed: " + f"status={response.status_code} duration={duration:.2f}ms" + ) + + return response + except Exception as e: + duration = (time.time() - start_time) * 1000 + logger.error( + f"Request {request_id} failed: {str(e)} duration={duration:.2f}ms" + ) + raise + + +# Health check endpoints +@app.get("/health", response_model=HealthResponse, tags=["Health"]) +async def health_check(): + """ + Basic health check endpoint (liveness probe). + + Returns basic service status without checking dependencies. 
+ """ + return HealthResponse( + status="healthy", + timestamp=datetime.utcnow().isoformat(), + version="1.0.0", + uptime_seconds=int(time.time() - startup_time) + ) + + +@app.get("/ready", response_model=ReadyResponse, tags=["Health"]) +async def readiness_check(): + """ + Readiness check endpoint. + + Checks if the service and its dependencies are ready to handle requests. + """ + checks = {} + ready = True + + # Check LangGraph API + try: + async with httpx.AsyncClient(timeout=5.0) as client: + response = await client.get(f"{LANGGRAPH_API_URL}/health") + if response.status_code == 200: + checks["langgraph"] = "ok" + else: + checks["langgraph"] = f"unhealthy (status {response.status_code})" + ready = False + except Exception as e: + checks["langgraph"] = f"error: {str(e)}" + ready = False + + # TODO: Add checks for other dependencies (database, Redis, etc.) + + return ReadyResponse( + ready=ready, + checks=checks, + timestamp=datetime.utcnow().isoformat() + ) + + +# Graph execution endpoint +@app.post("/v1/graph/run", response_model=GraphOutput, tags=["Graph"]) +async def run_graph(input_data: GraphInput): + """ + Execute a LangGraph workflow. + + This endpoint triggers a graph execution with the provided input + and returns the result. + + Args: + input_data: Graph input including query, user_id, session_id, etc. + + Returns: + GraphOutput: Execution result including status, output, and metadata. + + Raises: + HTTPException: If execution fails or times out. 
+ """ + start_time = time.time() + execution_id = f"exec_{int(start_time * 1000)}" + + logger.info(f"Starting graph execution {execution_id}") + + try: + # Prepare request to LangGraph API + payload = { + "input": { + "query": input_data.query, + "user_id": input_data.user_id or "anonymous", + "session_id": input_data.session_id or execution_id, + **input_data.metadata + }, + "config": { + "configurable": { + "thread_id": input_data.session_id or execution_id + }, + **input_data.config + } + } + + # Execute graph via LangGraph API + async with httpx.AsyncClient(timeout=API_TIMEOUT) as client: + response = await client.post( + f"{LANGGRAPH_API_URL}/v1/graph/run", + json=payload, + headers={ + "Content-Type": "application/json", + # TODO: Add authentication header if required + # "X-API-Key": os.getenv("LANGGRAPH_API_KEY", "") + } + ) + + duration_ms = int((time.time() - start_time) * 1000) + + if response.status_code == 200: + result = response.json() + logger.info(f"Graph execution {execution_id} completed in {duration_ms}ms") + + return GraphOutput( + status="success", + result=result, + execution_id=execution_id, + duration_ms=duration_ms + ) + else: + logger.error( + f"Graph execution {execution_id} failed: " + f"status={response.status_code} response={response.text}" + ) + raise HTTPException( + status_code=response.status_code, + detail=f"Graph execution failed: {response.text}" + ) + + except httpx.TimeoutException: + duration_ms = int((time.time() - start_time) * 1000) + logger.error(f"Graph execution {execution_id} timed out after {duration_ms}ms") + raise HTTPException( + status_code=status.HTTP_504_GATEWAY_TIMEOUT, + detail="Graph execution timed out" + ) + + except httpx.RequestError as e: + duration_ms = int((time.time() - start_time) * 1000) + logger.error(f"Graph execution {execution_id} failed: {str(e)}") + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail=f"Failed to connect to LangGraph API: {str(e)}" + ) + + except 
Exception as e: + duration_ms = int((time.time() - start_time) * 1000) + logger.error(f"Graph execution {execution_id} error: {str(e)}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Internal server error: {str(e)}" + ) + + +# Additional endpoints can be added here +@app.get("/v1/graph/status/{execution_id}", tags=["Graph"]) +async def get_execution_status(execution_id: str): + """ + Get the status of a graph execution. + + TODO: Implement execution status tracking. + """ + # This would query the execution status from LangGraph or a database + raise HTTPException( + status_code=status.HTTP_501_NOT_IMPLEMENTED, + detail="Execution status tracking not yet implemented" + ) + + +@app.get("/v1/graph/history", tags=["Graph"]) +async def get_execution_history( + user_id: Optional[str] = None, + session_id: Optional[str] = None, + limit: int = 10 +): + """ + Get execution history for a user or session. + + TODO: Implement execution history retrieval. + """ + # This would query execution history from a database + raise HTTPException( + status_code=status.HTTP_501_NOT_IMPLEMENTED, + detail="Execution history not yet implemented" + ) + + +# Exception handlers +@app.exception_handler(HTTPException) +async def http_exception_handler(request: Request, exc: HTTPException): + """Handle HTTP exceptions""" + return JSONResponse( + status_code=exc.status_code, + content={ + "error": exc.detail, + "status_code": exc.status_code, + "timestamp": datetime.utcnow().isoformat() + } + ) + + +@app.exception_handler(Exception) +async def general_exception_handler(request: Request, exc: Exception): + """Handle general exceptions""" + logger.error(f"Unhandled exception: {str(exc)}", exc_info=True) + return JSONResponse( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + content={ + "error": "Internal server error", + "status_code": 500, + "timestamp": datetime.utcnow().isoformat() + } + ) + + +# Startup event +@app.on_event("startup") +async def 
startup_event(): + """Run on application startup""" + logger.info("Starting RAG7 Integration API") + logger.info(f"LangGraph API URL: {LANGGRAPH_API_URL}") + logger.info(f"API Timeout: {API_TIMEOUT}s") + + +# Shutdown event +@app.on_event("shutdown") +async def shutdown_event(): + """Run on application shutdown""" + logger.info("Shutting down RAG7 Integration API") + + +if __name__ == "__main__": + import uvicorn + + uvicorn.run( + "server:app", + host="0.0.0.0", + port=8000, + log_level="info", + reload=os.getenv("API_RELOAD", "false").lower() == "true" + ) diff --git a/k8s/hpa.yaml b/k8s/hpa.yaml new file mode 100644 index 0000000..ec4d2b8 --- /dev/null +++ b/k8s/hpa.yaml @@ -0,0 +1,61 @@ +--- +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: langgraph-hpa + namespace: rag7 + labels: + app: langgraph +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: langgraph + minReplicas: 2 + maxReplicas: 10 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: 80 + behavior: + scaleDown: + stabilizationWindowSeconds: 300 + policies: + - type: Percent + value: 50 + periodSeconds: 60 + - type: Pods + value: 2 + periodSeconds: 60 + selectPolicy: Min + scaleUp: + stabilizationWindowSeconds: 0 + policies: + - type: Percent + value: 100 + periodSeconds: 30 + - type: Pods + value: 4 + periodSeconds: 30 + selectPolicy: Max + +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: langgraph-pdb + namespace: rag7 +spec: + minAvailable: 1 + selector: + matchLabels: + app: langgraph diff --git a/k8s/langgraph-deployment.yaml b/k8s/langgraph-deployment.yaml new file mode 100644 index 0000000..78bef64 --- /dev/null +++ b/k8s/langgraph-deployment.yaml @@ -0,0 +1,350 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: rag7 + labels: + name: rag7 + environment: 
production + +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: langgraph-config + namespace: rag7 +data: + APP_ENV: "production" + APP_DEBUG: "false" + LOG_LEVEL: "INFO" + API_HOST: "0.0.0.0" + API_PORT: "8123" + LANGGRAPH_STREAM_MODE: "values" + ENABLE_METRICS: "true" + METRICS_PORT: "9090" + +--- +apiVersion: v1 +kind: Secret +metadata: + name: langgraph-secrets + namespace: rag7 +type: Opaque +stringData: + # TODO: Replace these with actual base64-encoded secrets using kubectl create secret + POSTGRES_PASSWORD: "CHANGEME_SECURE_PASSWORD" + REDIS_PASSWORD: "CHANGEME_SECURE_PASSWORD" + SECRET_KEY: "CHANGEME_GENERATE_SECURE_KEY" + API_KEY_SALT: "CHANGEME_GENERATE_SECURE_SALT" + OPENAI_API_KEY: "sk-CHANGEME" + LANGCHAIN_API_KEY: "" + +--- +apiVersion: v1 +kind: Service +metadata: + name: langgraph + namespace: rag7 + labels: + app: langgraph +spec: + type: ClusterIP + ports: + - port: 8123 + targetPort: 8123 + protocol: TCP + name: http + - port: 9090 + targetPort: 9090 + protocol: TCP + name: metrics + selector: + app: langgraph + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: langgraph + namespace: rag7 + labels: + app: langgraph +spec: + replicas: 2 + selector: + matchLabels: + app: langgraph + template: + metadata: + labels: + app: langgraph + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9090" + prometheus.io/path: "/metrics" + spec: + containers: + - name: langgraph + image: ghcr.io/stacey77/rag7:latest + imagePullPolicy: Always + ports: + - containerPort: 8123 + name: http + protocol: TCP + - containerPort: 9090 + name: metrics + protocol: TCP + env: + - name: APP_ENV + valueFrom: + configMapKeyRef: + name: langgraph-config + key: APP_ENV + - name: LOG_LEVEL + valueFrom: + configMapKeyRef: + name: langgraph-config + key: LOG_LEVEL + - name: API_HOST + valueFrom: + configMapKeyRef: + name: langgraph-config + key: API_HOST + - name: API_PORT + valueFrom: + configMapKeyRef: + name: langgraph-config + key: API_PORT + - 
name: POSTGRES_HOST + value: "postgres.rag7.svc.cluster.local" + - name: POSTGRES_PORT + value: "5432" + - name: POSTGRES_DB + value: "langgraph_checkpoints" + - name: POSTGRES_USER + value: "langgraph" + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: langgraph-secrets + key: POSTGRES_PASSWORD + - name: REDIS_HOST + value: "redis.rag7.svc.cluster.local" + - name: REDIS_PORT + value: "6379" + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: langgraph-secrets + key: REDIS_PASSWORD + - name: SECRET_KEY + valueFrom: + secretKeyRef: + name: langgraph-secrets + key: SECRET_KEY + - name: OPENAI_API_KEY + valueFrom: + secretKeyRef: + name: langgraph-secrets + key: OPENAI_API_KEY + - name: LANGCHAIN_API_KEY + valueFrom: + secretKeyRef: + name: langgraph-secrets + key: LANGCHAIN_API_KEY + optional: true + resources: + requests: + cpu: 500m + memory: 1Gi + limits: + cpu: 2000m + memory: 4Gi + livenessProbe: + httpGet: + path: /health + port: 8123 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /ready + port: 8123 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 3 + imagePullSecrets: + - name: ghcr-secret + +--- +apiVersion: v1 +kind: Service +metadata: + name: postgres + namespace: rag7 + labels: + app: postgres +spec: + type: ClusterIP + ports: + - port: 5432 + targetPort: 5432 + protocol: TCP + name: postgres + selector: + app: postgres + +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: postgres + namespace: rag7 + labels: + app: postgres +spec: + serviceName: postgres + replicas: 1 + selector: + matchLabels: + app: postgres + template: + metadata: + labels: + app: postgres + spec: + containers: + - name: postgres + image: postgres:15-alpine + ports: + - containerPort: 5432 + name: postgres + env: + - name: POSTGRES_DB + value: "langgraph_checkpoints" + - name: POSTGRES_USER + value: "langgraph" + - name: 
POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: langgraph-secrets + key: POSTGRES_PASSWORD + - name: PGDATA + value: /var/lib/postgresql/data/pgdata + volumeMounts: + - name: postgres-storage + mountPath: /var/lib/postgresql/data + resources: + requests: + cpu: 250m + memory: 512Mi + limits: + cpu: 1000m + memory: 2Gi + livenessProbe: + exec: + command: + - pg_isready + - -U + - langgraph + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + exec: + command: + - pg_isready + - -U + - langgraph + initialDelaySeconds: 5 + periodSeconds: 5 + volumeClaimTemplates: + - metadata: + name: postgres-storage + spec: + accessModes: [ "ReadWriteOnce" ] + resources: + requests: + storage: 20Gi + +--- +apiVersion: v1 +kind: Service +metadata: + name: redis + namespace: rag7 + labels: + app: redis +spec: + type: ClusterIP + ports: + - port: 6379 + targetPort: 6379 + protocol: TCP + name: redis + selector: + app: redis + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: redis + namespace: rag7 + labels: + app: redis +spec: + replicas: 1 + selector: + matchLabels: + app: redis + template: + metadata: + labels: + app: redis + spec: + containers: + - name: redis + image: redis:7-alpine + ports: + - containerPort: 6379 + name: redis + command: + - redis-server + - --requirepass + - $(REDIS_PASSWORD) + env: + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: langgraph-secrets + key: REDIS_PASSWORD + volumeMounts: + - name: redis-storage + mountPath: /data + resources: + requests: + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + livenessProbe: + exec: + command: + - redis-cli + - ping + initialDelaySeconds: 30 + periodSeconds: 10 + readinessProbe: + exec: + command: + - redis-cli + - ping + initialDelaySeconds: 5 + periodSeconds: 5 + volumes: + - name: redis-storage + emptyDir: {} diff --git a/n8n/README.md b/n8n/README.md new file mode 100644 index 0000000..487023a --- /dev/null +++ b/n8n/README.md @@ -0,0 +1,413 @@ +# n8n 
Workflows for RAG7 + +This directory contains n8n workflow templates for orchestrating the RAG7 LangGraph application. + +## Table of Contents + +- [Overview](#overview) +- [Prerequisites](#prerequisites) +- [Installation](#installation) +- [Workflows](#workflows) +- [Configuration](#configuration) +- [Usage](#usage) +- [Troubleshooting](#troubleshooting) + +## Overview + +n8n provides workflow automation for the RAG7 LangGraph system, enabling: + +- **API Orchestration**: Manage incoming requests and route them to LangGraph +- **Scheduled Processing**: Execute batch tasks on a schedule +- **Error Handling**: Graceful error handling and retry logic +- **Monitoring**: Track execution metrics and logs +- **Integration**: Connect with external systems and APIs + +## Prerequisites + +### Required + +- n8n instance (self-hosted or cloud) +- Access to RAG7 LangGraph API +- API keys for: + - LangGraph API + - OpenAI (or other LLM providers) + - LangChain/LangSmith (optional) + +### Optional + +- PostgreSQL credentials (for direct database access) +- Redis credentials (for caching) +- Task queue system credentials + +## Installation + +### 1. Set Up n8n + +**Self-hosted (Docker):** + +```bash +docker run -d \ + --name n8n \ + -p 5678:5678 \ + -v ~/.n8n:/home/node/.n8n \ + n8nio/n8n +``` + +**Self-hosted (Docker Compose):** + +```yaml +version: '3.8' + +services: + n8n: + image: n8nio/n8n + ports: + - "5678:5678" + environment: + - N8N_BASIC_AUTH_ACTIVE=true + - N8N_BASIC_AUTH_USER=admin + - N8N_BASIC_AUTH_PASSWORD=change-this-password + - N8N_HOST=n8n.yourdomain.com + - N8N_PROTOCOL=https + - WEBHOOK_URL=https://n8n.yourdomain.com/ + volumes: + - n8n_data:/home/node/.n8n + +volumes: + n8n_data: +``` + +**Cloud:** + +Sign up at [n8n.cloud](https://n8n.cloud) + +### 2. Configure Credentials + +1. Open n8n UI (http://localhost:5678 or your n8n URL) +2. Navigate to **Credentials** → **Add Credential** +3. 
Create credentials based on `credentials/credentials_template.json`: + +**LangGraph API Key:** +- Type: HTTP Header Auth +- Name: `X-API-Key` +- Value: Your LangGraph API key + +**OpenAI API:** +- Type: OpenAI +- API Key: Your OpenAI key + +**Database (Optional):** +- Type: PostgreSQL +- Host: `postgres.rag7.svc.cluster.local` (or external host) +- Port: `5432` +- Database: `langgraph_checkpoints` +- User: `langgraph` +- Password: Your database password + +### 3. Import Workflows + +1. Navigate to **Workflows** → **Add Workflow** +2. Click the **⋮** menu → **Import from File** +3. Import each workflow: + - `workflows/main_orchestrator.json` + - `workflows/langgraph_trigger.json` + +### 4. Configure Environment Variables + +In n8n, set environment variables for each workflow: + +```bash +LANGGRAPH_API_URL=http://langgraph.rag7.svc.cluster.local:8123 +TASK_QUEUE_URL=http://task-queue:8080 +LOGGING_ENDPOINT=http://logging-service:8080 +METRICS_ENDPOINT=http://metrics-service:9090 +``` + +Or set them in your n8n deployment: + +```yaml +environment: + - LANGGRAPH_API_URL=http://langgraph.rag7.svc.cluster.local:8123 + - TASK_QUEUE_URL=http://task-queue:8080 +``` + +## Workflows + +### 1. Main Orchestrator + +**File**: `workflows/main_orchestrator.json` + +**Purpose**: Webhook-based orchestration for real-time LangGraph execution. + +**Trigger**: Webhook (POST request) + +**Flow**: +1. Receives webhook request +2. Validates input +3. Triggers LangGraph execution +4. Processes result +5. Returns response +6. 
Logs execution + +**Webhook URL**: `https://your-n8n-instance/webhook/orchestrator` + +**Example Request**: + +```bash +curl -X POST https://your-n8n-instance/webhook/orchestrator \ + -H "Content-Type: application/json" \ + -d '{ + "query": "What is the capital of France?", + "user_id": "user123", + "session_id": "session456", + "metadata": { + "source": "web" + } + }' +``` + +**Response**: + +```json +{ + "status": "success", + "result": { + "answer": "The capital of France is Paris.", + "confidence": 0.95 + }, + "execution_id": "exec_789", + "duration_ms": 1234, + "metadata": { + "user_id": "user123", + "session_id": "session456", + "timestamp": "2024-01-15T10:30:45.123Z" + } +} +``` + +### 2. LangGraph Trigger + +**File**: `workflows/langgraph_trigger.json` + +**Purpose**: Scheduled batch processing of pending tasks. + +**Trigger**: Schedule (every 5 minutes by default) + +**Flow**: +1. Fetches pending tasks from queue +2. Checks if tasks exist +3. Splits tasks for parallel processing +4. Executes each task via LangGraph +5. Marks tasks as complete +6. Sends metrics + +**Configuration**: + +Adjust schedule in the workflow: +- Every 5 minutes: `*/5 * * * *` +- Every hour: `0 * * * *` +- Every day at 2 AM: `0 2 * * *` + +## Configuration + +### Workflow Settings + +**Timeout**: +- Default: 60 seconds +- For long-running graphs: Increase to 120-300 seconds + +**Retry Logic**: +- Configure in each HTTP Request node +- Recommended: 3 retries with exponential backoff + +**Batching**: +- Enable for high-volume processing +- Batch size: 5-10 requests +- Batch interval: 1-2 seconds + +### Error Handling + +All workflows include error handling: + +1. **Validation Errors**: Return 400 with error details +2. **API Errors**: Retry with exponential backoff +3. **Timeout Errors**: Return 504 Gateway Timeout +4. **Server Errors**: Return 500 with error message + +### Monitoring + +Enable execution logging: + +1. Navigate to **Settings** → **Executions** +2. 
Enable **Save execution data** +3. Set retention: 30 days (or as needed) + +## Usage + +### Activate Workflows + +1. Open workflow in n8n +2. Click **Active** toggle +3. Verify webhook URL or schedule + +### Test Workflows + +**Manual Test**: +1. Open workflow +2. Click **Execute Workflow** +3. Provide test data +4. Review execution results + +**Webhook Test**: + +```bash +# Test main orchestrator +curl -X POST https://your-n8n-instance/webhook/orchestrator \ + -H "Content-Type: application/json" \ + -d '{ + "query": "Test query", + "user_id": "test_user" + }' +``` + +### Monitor Executions + +1. Navigate to **Executions** +2. View execution list +3. Click execution to see details +4. Check logs for errors + +### Update Workflows + +1. Deactivate workflow +2. Make changes +3. Test changes +4. Activate workflow + +## Troubleshooting + +### Webhook Not Receiving Requests + +**Check**: +- Webhook URL is correct +- Workflow is active +- n8n is publicly accessible (or within VPN) +- Firewall allows traffic on port 5678 + +**Solution**: +```bash +# Test webhook locally +curl http://localhost:5678/webhook-test/orchestrator +``` + +### Authentication Errors + +**Check**: +- Credentials are configured correctly +- API keys are valid +- Headers are set properly + +**Solution**: +```bash +# Test API key manually +curl -H "X-API-Key: YOUR_KEY" http://langgraph:8123/health +``` + +### Timeout Errors + +**Check**: +- Graph execution time +- Network latency +- Timeout settings in HTTP Request nodes + +**Solution**: +- Increase timeout in node settings +- Optimize graph performance +- Use async processing + +### Connection Errors + +**Check**: +- Service is running +- Network connectivity +- DNS resolution + +**Solution**: +```bash +# Test connectivity +kubectl run -it --rm debug --image=curlimages/curl --restart=Never \ + -- curl http://langgraph.rag7.svc.cluster.local:8123/health +``` + +### Memory/Performance Issues + +**Check**: +- Execution data size +- Batch sizes +- 
Concurrent executions + +**Solution**: +- Reduce batch size +- Limit concurrent executions +- Archive old execution data + +## Advanced Usage + +### Custom Workflows + +Create custom workflows by: + +1. Combining existing nodes +2. Adding custom code nodes +3. Integrating external services + +### Integrations + +Connect with: +- **Slack**: Send notifications +- **Email**: Send reports +- **Webhooks**: Trigger external systems +- **Databases**: Store results +- **Cloud Storage**: Save artifacts + +### Example: Slack Notification + +Add a Slack node after successful execution: + +```json +{ + "parameters": { + "channel": "#notifications", + "text": "Graph execution completed: {{ $json.execution_id }}" + }, + "type": "n8n-nodes-base.slack" +} +``` + +## Best Practices + +1. **Use Descriptive Names**: Name workflows and nodes clearly +2. **Add Error Handling**: Always handle errors gracefully +3. **Enable Logging**: Keep execution logs for debugging +4. **Set Timeouts**: Prevent workflows from hanging +5. **Use Credentials**: Never hardcode API keys +6. **Test Thoroughly**: Test workflows before activating +7. **Monitor Executions**: Regularly check for failures +8. 
**Document Changes**: Keep notes on workflow modifications + +## Next Steps + +- Configure additional integrations +- Set up monitoring and alerting +- Create custom workflows for your use case +- Optimize performance and resource usage +- Review [Deployment Guide](../docs/deployment.md) +- Check [Observability Guide](../docs/observability.md) + +## Support + +For issues or questions: +- Check n8n documentation: https://docs.n8n.io +- Join n8n community: https://community.n8n.io +- Review LangGraph documentation +- Contact team via Slack #rag7-support diff --git a/n8n/credentials/credentials_template.json b/n8n/credentials/credentials_template.json new file mode 100644 index 0000000..db2ac40 --- /dev/null +++ b/n8n/credentials/credentials_template.json @@ -0,0 +1,60 @@ +{ + "credentials": [ + { + "name": "LangGraph API Key", + "type": "httpHeaderAuth", + "data": { + "name": "X-API-Key", + "value": "TODO_REPLACE_WITH_YOUR_LANGGRAPH_API_KEY" + } + }, + { + "name": "Task Queue API Key", + "type": "httpHeaderAuth", + "data": { + "name": "Authorization", + "value": "Bearer TODO_REPLACE_WITH_YOUR_TASK_QUEUE_TOKEN" + } + }, + { + "name": "OpenAI API", + "type": "openAiApi", + "data": { + "apiKey": "sk-TODO_REPLACE_WITH_YOUR_OPENAI_KEY" + } + }, + { + "name": "LangChain Credentials", + "type": "httpHeaderAuth", + "data": { + "name": "X-API-Key", + "value": "ls__TODO_REPLACE_WITH_YOUR_LANGCHAIN_KEY" + } + }, + { + "name": "Database Connection", + "type": "postgres", + "data": { + "host": "postgres.rag7.svc.cluster.local", + "port": 5432, + "database": "langgraph_checkpoints", + "user": "langgraph", + "password": "TODO_REPLACE_WITH_DATABASE_PASSWORD", + "ssl": { + "enabled": false + } + } + }, + { + "name": "Redis Connection", + "type": "redis", + "data": { + "host": "redis.rag7.svc.cluster.local", + "port": 6379, + "password": "TODO_REPLACE_WITH_REDIS_PASSWORD", + "database": 0 + } + } + ], + "instructions": "Replace all TODO_REPLACE_WITH_YOUR_* placeholders with actual 
credentials before importing into n8n." +} diff --git a/n8n/workflows/langgraph_trigger.json b/n8n/workflows/langgraph_trigger.json new file mode 100644 index 0000000..14facda --- /dev/null +++ b/n8n/workflows/langgraph_trigger.json @@ -0,0 +1,285 @@ +{ + "name": "LangGraph Trigger", + "nodes": [ + { + "parameters": { + "rule": { + "interval": [ + { + "field": "cronExpression", + "expression": "*/5 * * * *" + } + ] + } + }, + "id": "schedule-trigger", + "name": "Schedule Trigger", + "type": "n8n-nodes-base.scheduleTrigger", + "typeVersion": 1.1, + "position": [250, 300] + }, + { + "parameters": { + "authentication": "genericCredentialType", + "genericAuthType": "httpHeaderAuth", + "method": "GET", + "url": "={{ $env.TASK_QUEUE_URL }}/pending", + "options": {} + }, + "id": "fetch-pending-tasks", + "name": "Fetch Pending Tasks", + "type": "n8n-nodes-base.httpRequest", + "typeVersion": 4.1, + "position": [450, 300], + "credentials": { + "httpHeaderAuth": { + "id": "task-queue-api-key", + "name": "Task Queue API Key" + } + } + }, + { + "parameters": { + "conditions": { + "options": { + "caseSensitive": true, + "leftValue": "", + "typeValidation": "strict" + }, + "conditions": [ + { + "id": "has-tasks", + "leftValue": "={{ $json.tasks.length }}", + "rightValue": "0", + "operator": { + "type": "number", + "operation": "gt" + } + } + ], + "combinator": "and" + }, + "options": {} + }, + "id": "check-tasks", + "name": "Check Tasks", + "type": "n8n-nodes-base.if", + "typeVersion": 2, + "position": [650, 300] + }, + { + "parameters": { + "fieldToSplitOut": "tasks", + "options": {} + }, + "id": "split-tasks", + "name": "Split Tasks", + "type": "n8n-nodes-base.splitOut", + "typeVersion": 1, + "position": [850, 200] + }, + { + "parameters": { + "authentication": "genericCredentialType", + "genericAuthType": "httpHeaderAuth", + "method": "POST", + "url": "={{ $env.LANGGRAPH_API_URL }}/v1/graph/run", + "sendBody": true, + "bodyParameters": { + "parameters": [ + { + "name": 
"input", + "value": "={{ $json.input }}" + }, + { + "name": "config", + "value": "={{ { configurable: { thread_id: $json.task_id } } }}" + } + ] + }, + "options": { + "batching": { + "batch": { + "batchSize": 5, + "batchInterval": 1000 + } + }, + "timeout": 120000 + } + }, + "id": "execute-graph", + "name": "Execute Graph", + "type": "n8n-nodes-base.httpRequest", + "typeVersion": 4.1, + "position": [1050, 200], + "credentials": { + "httpHeaderAuth": { + "id": "langgraph-api-key", + "name": "LangGraph API Key" + } + } + }, + { + "parameters": { + "authentication": "genericCredentialType", + "genericAuthType": "httpHeaderAuth", + "method": "POST", + "url": "={{ $env.TASK_QUEUE_URL }}/complete", + "sendBody": true, + "bodyParameters": { + "parameters": [ + { + "name": "task_id", + "value": "={{ $json.task_id }}" + }, + { + "name": "result", + "value": "={{ $json.output }}" + }, + { + "name": "status", + "value": "completed" + } + ] + }, + "options": {} + }, + "id": "mark-complete", + "name": "Mark Complete", + "type": "n8n-nodes-base.httpRequest", + "typeVersion": 4.1, + "position": [1250, 200], + "credentials": { + "httpHeaderAuth": { + "id": "task-queue-api-key", + "name": "Task Queue API Key" + } + } + }, + { + "parameters": { + "jsCode": "// No tasks found\nreturn {\n status: 'idle',\n message: 'No pending tasks',\n timestamp: new Date().toISOString()\n};" + }, + "id": "no-tasks", + "name": "No Tasks", + "type": "n8n-nodes-base.code", + "typeVersion": 2, + "position": [850, 400] + }, + { + "parameters": { + "authentication": "genericCredentialType", + "genericAuthType": "httpHeaderAuth", + "method": "POST", + "url": "={{ $env.METRICS_ENDPOINT }}/metrics", + "sendBody": true, + "bodyParameters": { + "parameters": [ + { + "name": "metric", + "value": "tasks_processed" + }, + { + "name": "value", + "value": "={{ $itemIndex + 1 }}" + }, + { + "name": "labels", + "value": "={{ { workflow: 'langgraph_trigger', status: 'success' } }}" + } + ] + }, + "options": {} + }, + 
"id": "send-metrics", + "name": "Send Metrics", + "type": "n8n-nodes-base.httpRequest", + "typeVersion": 4.1, + "position": [1450, 200], + "continueOnFail": true + } + ], + "connections": { + "Schedule Trigger": { + "main": [ + [ + { + "node": "Fetch Pending Tasks", + "type": "main", + "index": 0 + } + ] + ] + }, + "Fetch Pending Tasks": { + "main": [ + [ + { + "node": "Check Tasks", + "type": "main", + "index": 0 + } + ] + ] + }, + "Check Tasks": { + "main": [ + [ + { + "node": "Split Tasks", + "type": "main", + "index": 0 + } + ], + [ + { + "node": "No Tasks", + "type": "main", + "index": 0 + } + ] + ] + }, + "Split Tasks": { + "main": [ + [ + { + "node": "Execute Graph", + "type": "main", + "index": 0 + } + ] + ] + }, + "Execute Graph": { + "main": [ + [ + { + "node": "Mark Complete", + "type": "main", + "index": 0 + } + ] + ] + }, + "Mark Complete": { + "main": [ + [ + { + "node": "Send Metrics", + "type": "main", + "index": 0 + } + ] + ] + } + }, + "settings": { + "executionOrder": "v1" + }, + "staticData": null, + "tags": [], + "triggerCount": 1, + "updatedAt": "2024-01-15T10:00:00.000Z", + "versionId": "1" +} diff --git a/n8n/workflows/main_orchestrator.json b/n8n/workflows/main_orchestrator.json new file mode 100644 index 0000000..69f80ff --- /dev/null +++ b/n8n/workflows/main_orchestrator.json @@ -0,0 +1,263 @@ +{ + "name": "Main Orchestrator", + "nodes": [ + { + "parameters": { + "httpMethod": "POST", + "path": "orchestrator", + "responseMode": "responseNode", + "options": {} + }, + "id": "webhook-trigger", + "name": "Webhook", + "type": "n8n-nodes-base.webhook", + "typeVersion": 1, + "position": [250, 300], + "webhookId": "main-orchestrator" + }, + { + "parameters": { + "jsCode": "// Extract and validate input\nconst body = $input.first().json.body;\n\nif (!body || !body.query) {\n throw new Error('Missing required field: query');\n}\n\nreturn {\n query: body.query,\n user_id: body.user_id || 'anonymous',\n session_id: body.session_id || $execution.id,\n 
metadata: body.metadata || {},\n timestamp: new Date().toISOString()\n};" + }, + "id": "validate-input", + "name": "Validate Input", + "type": "n8n-nodes-base.code", + "typeVersion": 2, + "position": [450, 300] + }, + { + "parameters": { + "authentication": "genericCredentialType", + "genericAuthType": "httpHeaderAuth", + "method": "POST", + "url": "={{ $env.LANGGRAPH_API_URL }}/v1/graph/run", + "sendBody": true, + "bodyParameters": { + "parameters": [ + { + "name": "input", + "value": "={{ { query: $json.query, user_id: $json.user_id, session_id: $json.session_id } }}" + }, + { + "name": "config", + "value": "={{ { configurable: { thread_id: $json.session_id } } }}" + } + ] + }, + "options": { + "timeout": 60000 + } + }, + "id": "trigger-langgraph", + "name": "Trigger LangGraph", + "type": "n8n-nodes-base.httpRequest", + "typeVersion": 4.1, + "position": [650, 300], + "credentials": { + "httpHeaderAuth": { + "id": "langgraph-api-key", + "name": "LangGraph API Key" + } + } + }, + { + "parameters": { + "conditions": { + "options": { + "caseSensitive": true, + "leftValue": "", + "typeValidation": "strict" + }, + "conditions": [ + { + "id": "success-condition", + "leftValue": "={{ $json.status }}", + "rightValue": "success", + "operator": { + "type": "string", + "operation": "equals" + } + } + ], + "combinator": "and" + }, + "options": {} + }, + "id": "check-success", + "name": "Check Success", + "type": "n8n-nodes-base.if", + "typeVersion": 2, + "position": [850, 300] + }, + { + "parameters": { + "jsCode": "// Process successful response\nconst response = $input.first().json;\n\nreturn {\n status: 'success',\n result: response.output,\n execution_id: response.execution_id,\n duration_ms: response.duration_ms,\n metadata: {\n user_id: response.user_id,\n session_id: response.session_id,\n timestamp: new Date().toISOString()\n }\n};" + }, + "id": "process-success", + "name": "Process Success", + "type": "n8n-nodes-base.code", + "typeVersion": 2, + "position": [1050, 
200] + }, + { + "parameters": { + "jsCode": "// Handle error\nconst error = $input.first().json;\n\nreturn {\n status: 'error',\n error: {\n message: error.error || 'Unknown error',\n code: error.code || 'INTERNAL_ERROR',\n details: error.details || {}\n },\n metadata: {\n timestamp: new Date().toISOString()\n }\n};" + }, + "id": "handle-error", + "name": "Handle Error", + "type": "n8n-nodes-base.code", + "typeVersion": 2, + "position": [1050, 400] + }, + { + "parameters": { + "respondWith": "json", + "responseBody": "={{ $json }}" + }, + "id": "respond-success", + "name": "Respond Success", + "type": "n8n-nodes-base.respondToWebhook", + "typeVersion": 1, + "position": [1250, 200] + }, + { + "parameters": { + "respondWith": "json", + "responseBody": "={{ $json }}", + "options": { + "responseCode": 500 + } + }, + "id": "respond-error", + "name": "Respond Error", + "type": "n8n-nodes-base.respondToWebhook", + "typeVersion": 1, + "position": [1250, 400] + }, + { + "parameters": { + "authentication": "genericCredentialType", + "genericAuthType": "httpHeaderAuth", + "method": "POST", + "url": "={{ $env.LOGGING_ENDPOINT }}/logs", + "sendBody": true, + "bodyParameters": { + "parameters": [ + { + "name": "level", + "value": "info" + }, + { + "name": "message", + "value": "Workflow execution completed" + }, + { + "name": "data", + "value": "={{ $json }}" + } + ] + }, + "options": {} + }, + "id": "log-execution", + "name": "Log Execution", + "type": "n8n-nodes-base.httpRequest", + "typeVersion": 4.1, + "position": [1050, 300], + "continueOnFail": true + } + ], + "connections": { + "Webhook": { + "main": [ + [ + { + "node": "Validate Input", + "type": "main", + "index": 0 + } + ] + ] + }, + "Validate Input": { + "main": [ + [ + { + "node": "Trigger LangGraph", + "type": "main", + "index": 0 + } + ] + ] + }, + "Trigger LangGraph": { + "main": [ + [ + { + "node": "Check Success", + "type": "main", + "index": 0 + } + ] + ] + }, + "Check Success": { + "main": [ + [ + { + "node": 
"Process Success", + "type": "main", + "index": 0 + } + ], + [ + { + "node": "Handle Error", + "type": "main", + "index": 0 + } + ] + ] + }, + "Process Success": { + "main": [ + [ + { + "node": "Log Execution", + "type": "main", + "index": 0 + }, + { + "node": "Respond Success", + "type": "main", + "index": 0 + } + ] + ] + }, + "Handle Error": { + "main": [ + [ + { + "node": "Log Execution", + "type": "main", + "index": 0 + }, + { + "node": "Respond Error", + "type": "main", + "index": 0 + } + ] + ] + } + }, + "settings": { + "executionOrder": "v1" + }, + "staticData": null, + "tags": [], + "triggerCount": 1, + "updatedAt": "2024-01-15T10:00:00.000Z", + "versionId": "1" +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..c90dd45 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,122 @@ +[build-system] +requires = ["setuptools>=68.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "rag7" +version = "1.0.0" +description = "RAG7 - Production-ready LangGraph-based RAG system" +readme = "README.md" +requires-python = ">=3.11" +license = {text = "MIT"} +authors = [ + {name = "RAG7 Team", email = "team@example.com"} +] +keywords = ["rag", "langgraph", "langchain", "llm", "ai"] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Artificial Intelligence", +] + +dependencies = [ + "langgraph>=0.0.26", + "langchain>=0.1.0", + "langchain-core>=0.1.0", + "langchain-community>=0.0.13", + "openai>=1.6.1", + "anthropic>=0.8.1", + "chromadb>=0.4.22", + "psycopg2-binary>=2.9.9", + "sqlalchemy>=2.0.23", + "redis>=5.0.1", + "fastapi>=0.104.1", + "uvicorn[standard]>=0.24.0", + "httpx>=0.25.1", + "numpy>=1.26.2", + "pandas>=2.1.4", + "pydantic>=2.5.0", + "pydantic-settings>=2.1.0", + 
"tiktoken>=0.5.2", + "sentence-transformers>=2.2.2", + "python-dotenv>=1.0.0", + "tenacity>=8.2.3", + "aiofiles>=23.2.1", + "prometheus-client>=0.19.0", + "opentelemetry-api>=1.21.0", + "opentelemetry-sdk>=1.21.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=7.4.3", + "pytest-asyncio>=0.21.1", + "pytest-cov>=4.1.0", + "black>=23.12.1", + "flake8>=7.0.0", + "isort>=5.13.2", + "mypy>=1.7.1", +] + +[project.urls] +Homepage = "https://github.com/Stacey77/rag7" +Documentation = "https://github.com/Stacey77/rag7/blob/main/docs" +Repository = "https://github.com/Stacey77/rag7" +Issues = "https://github.com/Stacey77/rag7/issues" + +[tool.setuptools] +packages = ["rag7"] + +[tool.black] +line-length = 100 +target-version = ['py311'] +include = '\.pyi?$' +extend-exclude = ''' +/( + # directories + \.eggs + | \.git + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | build + | dist +)/ +''' + +[tool.isort] +profile = "black" +line_length = 100 +skip_gitignore = true + +[tool.pytest.ini_options] +minversion = "7.0" +addopts = "-ra -q --strict-markers" +testpaths = ["tests"] +pythonpath = ["."] + +[tool.mypy] +python_version = "3.11" +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = false +ignore_missing_imports = true + +[tool.coverage.run] +source = ["rag7"] +omit = ["*/tests/*", "*/test_*.py"] + +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "def __repr__", + "raise AssertionError", + "raise NotImplementedError", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..63b2edd --- /dev/null +++ b/requirements.txt @@ -0,0 +1,56 @@ +# RAG7 LangGraph Application Requirements +# Python dependencies for the RAG7 project + +# Core Framework +langgraph==0.0.26 +langchain==0.1.0 +langchain-core==0.1.0 +langchain-community==0.0.13 + +# LLM Providers +openai==1.6.1 +anthropic==0.8.1 + +# Vector Stores +chromadb==0.4.22 +faiss-cpu==1.7.4 + +# 
Database & Storage +psycopg2-binary==2.9.9 +sqlalchemy==2.0.23 +redis==5.0.1 + +# Web Framework (for API) +fastapi==0.104.1 +uvicorn[standard]==0.24.0 +httpx==0.25.1 + +# Data Processing +numpy==1.26.2 +pandas==2.1.4 +pydantic==2.5.0 +pydantic-settings==2.1.0 + +# Text Processing +tiktoken==0.5.2 +sentence-transformers==2.2.2 + +# Utilities +python-dotenv==1.0.0 +tenacity==8.2.3 +aiofiles==23.2.1 + +# Observability & Monitoring +prometheus-client==0.19.0 +opentelemetry-api==1.21.0 +opentelemetry-sdk==1.21.0 +opentelemetry-instrumentation==0.42b0 + +# Development & Testing (optional, uncomment if needed) +# pytest==7.4.3 +# pytest-asyncio==0.21.1 +# pytest-cov==4.1.0 +# black==23.12.1 +# flake8==7.0.0 +# isort==5.13.2 +# mypy==1.7.1