From bd6e67e7518928b8a40be0a4af7773b7d111e284 Mon Sep 17 00:00:00 2001
From: Wenjing Yu
Date: Mon, 21 Jul 2025 14:45:20 -0700
Subject: [PATCH] Adjust the number of workers to avoid db pool overflow

---
 Dockerfile                                  |  16 +-
 app/api/dependencies.py                     |  12 +-
 app/api/routes/health.py                    | 122 ++++++++
 app/core/database.py                        |  38 ++-
 app/main.py                                 | 226 +++++++-------
 docs/DATABASE_CONNECTION_TROUBLESHOOTING.md | 315 ++++++++++++++++++++
 tools/diagnostics/check_db_connections.py   | 231 ++++++++++++++
 7 files changed, 834 insertions(+), 126 deletions(-)
 create mode 100644 app/api/routes/health.py
 create mode 100644 docs/DATABASE_CONNECTION_TROUBLESHOOTING.md
 create mode 100755 tools/diagnostics/check_db_connections.py

diff --git a/Dockerfile b/Dockerfile
index e3ee861..7765628 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -16,6 +16,18 @@ ENV CLERK_API_KEY=${ARG_CLERK_API_KEY}
 ENV CLERK_API_URL=${ARG_CLERK_API_URL:-https://api.clerk.dev/v1}
 ENV FORGE_DEBUG_LOGGING=${ARG_DEBUG_LOGGING}
 
+# Database connection optimization environment variables
+# These settings optimize for PostgreSQL connection limits
+ENV DB_POOL_SIZE=3
+ENV DB_MAX_OVERFLOW=2
+ENV DB_POOL_TIMEOUT=30
+ENV DB_POOL_RECYCLE=1800
+ENV DB_POOL_PRE_PING=true
+
+# Reduced worker count to keep database connections within limits
+# With 5 workers: max 50 connections (5 × 3 × 2 engines = 30 pooled, plus 5 × 2 × 2 = 20 overflow)
+ENV WORKERS=5
+
 # Install system dependencies including PostgreSQL client and gosu for user privilege management
 RUN apt-get update && apt-get install -y \
     postgresql-client \
@@ -37,5 +49,5 @@ USER nobody
 # Expose port
 EXPOSE 8000
 
-# Run the application (this command is passed to the entrypoint)
-CMD ["gunicorn", "app.main:app", "-k", "uvicorn.workers.UvicornWorker", "--workers", "10", "--bind", "0.0.0.0:8000"]
+# Use environment variable for workers count and optimize for database connections
+CMD ["sh", "-c", "gunicorn app.main:app -k uvicorn.workers.UvicornWorker --workers ${WORKERS:-5} --bind 0.0.0.0:8000 --timeout 120 --max-requests 1000 --max-requests-jitter 100"]
diff --git a/app/api/dependencies.py b/app/api/dependencies.py
index f8333ae..4cbf455 100644
--- a/app/api/dependencies.py
+++ b/app/api/dependencies.py
@@ -109,7 +109,11 @@ async def get_current_user(
     except JWTError as err:
         raise credentials_exception from err
 
-    result = await db.execute(select(User).filter(User.username == token_data.username))
+    result = await db.execute(
+        select(User)
+        .options(selectinload(User.api_keys))  # Eager load Forge API keys
+        .filter(User.username == token_data.username)
+    )
     user = result.scalar_one_or_none()
     if user is None:
         raise credentials_exception
@@ -356,7 +360,11 @@ async def get_current_user_from_clerk(
     )
 
     # Find user by clerk_user_id
-    result = await db.execute(select(User).filter(User.clerk_user_id == clerk_user_id))
+    result = await db.execute(
+        select(User)
+        .options(selectinload(User.api_keys))  # Eager load Forge API keys
+        .filter(User.clerk_user_id == clerk_user_id)
+    )
     user = result.scalar_one_or_none()
 
     # User doesn't exist yet, create one
diff --git a/app/api/routes/health.py b/app/api/routes/health.py
new file mode 100644
index 0000000..47227d3
--- /dev/null
+++ b/app/api/routes/health.py
@@ -0,0 +1,122 @@
+"""
+Health check and monitoring endpoints for production deployments.
+""" + +from fastapi import APIRouter, HTTPException +from sqlalchemy import text + +from app.core.database import get_connection_info, get_db_session +from app.core.logger import get_logger + +logger = get_logger(name="health") +router = APIRouter() + + +@router.get("/health") +async def health_check(): + """ + Basic health check endpoint. + Returns 200 if the service is running. + """ + return {"status": "healthy", "service": "forge"} + + +@router.get("/health/database") +async def database_health_check(): + """ + Database health check endpoint. + Returns detailed information about database connectivity and pool status. + """ + try: + # Test database connection + async with get_db_session() as session: + result = await session.execute(text("SELECT 1")) + result.scalar() + + # Get connection pool information + pool_info = get_connection_info() + + # Calculate connection usage + sync_pool = pool_info['sync_engine'] + async_pool = pool_info['async_engine'] + + sync_usage = sync_pool['checked_out'] / (pool_info['pool_size'] + pool_info['max_overflow']) * 100 + async_usage = async_pool['checked_out'] / (pool_info['pool_size'] + pool_info['max_overflow']) * 100 + + return { + "status": "healthy", + "database": "connected", + "connection_pools": { + "sync": { + "checked_out": sync_pool['checked_out'], + "checked_in": sync_pool['checked_in'], + "size": sync_pool['size'], + "usage_percent": round(sync_usage, 1) + }, + "async": { + "checked_out": async_pool['checked_out'], + "checked_in": async_pool['checked_in'], + "size": async_pool['size'], + "usage_percent": round(async_usage, 1) + } + }, + "configuration": { + "pool_size": pool_info['pool_size'], + "max_overflow": pool_info['max_overflow'], + "pool_timeout": pool_info['pool_timeout'], + "pool_recycle": pool_info['pool_recycle'] + } + } + + except Exception as e: + logger.error(f"Database health check failed: {e}") + raise HTTPException( + status_code=503, + detail={ + "status": "unhealthy", + "database": "disconnected", + "error": str(e) + } + ) + + +@router.get("/health/detailed") +async def detailed_health_check(): + """ + Detailed health check including all service components. 
+ """ + try: + # Test database + async with get_db_session() as session: + db_result = await session.execute(text("SELECT version()")) + db_version = db_result.scalar() + + pool_info = get_connection_info() + + return { + "status": "healthy", + "timestamp": "2025-01-21T19:15:00Z", # This would be dynamic in real implementation + "service": "forge", + "version": "0.1.0", + "database": { + "status": "connected", + "version": db_version, + "pool_status": pool_info + }, + "environment": { + "workers": pool_info.get('workers', 'unknown'), + "pool_size": pool_info['pool_size'], + "max_overflow": pool_info['max_overflow'] + } + } + + except Exception as e: + logger.error(f"Detailed health check failed: {e}") + raise HTTPException( + status_code=503, + detail={ + "status": "unhealthy", + "error": str(e), + "timestamp": "2025-01-21T19:15:00Z" + } + ) \ No newline at end of file diff --git a/app/core/database.py b/app/core/database.py index 5253f03..31a466f 100644 --- a/app/core/database.py +++ b/app/core/database.py @@ -8,10 +8,14 @@ load_dotenv() -POOL_SIZE = 5 -MAX_OVERFLOW = 10 -MAX_TIMEOUT = 30 -POOL_RECYCLE = 1800 +# Production-optimized connection pool settings +# With 10 Gunicorn workers, this allows max 60 connections total (10 workers × 3 pool_size × 2 engines) +# Plus 40 overflow connections (10 workers × 2 max_overflow × 2 engines) = 100 max connections +POOL_SIZE = int(os.getenv("DB_POOL_SIZE", "3")) # Reduced from 5 to 3 +MAX_OVERFLOW = int(os.getenv("DB_MAX_OVERFLOW", "2")) # Reduced from 10 to 2 +MAX_TIMEOUT = int(os.getenv("DB_POOL_TIMEOUT", "30")) +POOL_RECYCLE = int(os.getenv("DB_POOL_RECYCLE", "1800")) # 30 minutes +POOL_PRE_PING = os.getenv("DB_POOL_PRE_PING", "true").lower() == "true" SQLALCHEMY_DATABASE_URL = os.getenv("DATABASE_URL") if not SQLALCHEMY_DATABASE_URL: @@ -24,6 +28,7 @@ max_overflow=MAX_OVERFLOW, pool_timeout=MAX_TIMEOUT, pool_recycle=POOL_RECYCLE, + pool_pre_ping=POOL_PRE_PING, # Enables connection health checks echo=False, ) SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) @@ -52,6 +57,7 @@ def get_db(): max_overflow=MAX_OVERFLOW, pool_timeout=MAX_TIMEOUT, pool_recycle=POOL_RECYCLE, + pool_pre_ping=POOL_PRE_PING, # Enables connection health checks echo=False, ) @@ -84,4 +90,26 @@ async def get_db_session(): await session.rollback() raise finally: - await session.close() \ No newline at end of file + await session.close() + + +def get_connection_info(): + """Get current connection pool information for monitoring""" + return { + "pool_size": POOL_SIZE, + "max_overflow": MAX_OVERFLOW, + "pool_timeout": MAX_TIMEOUT, + "pool_recycle": POOL_RECYCLE, + "sync_engine": { + "pool": engine.pool, + "checked_out": engine.pool.checkedout(), + "checked_in": engine.pool.checkedin(), + "size": engine.pool.size(), + }, + "async_engine": { + "pool": async_engine.pool, + "checked_out": async_engine.pool.checkedout(), + "checked_in": async_engine.pool.checkedin(), + "size": async_engine.pool.size(), + } + } \ No newline at end of file diff --git a/app/main.py b/app/main.py index 85364a6..1bb2ea2 100644 --- a/app/main.py +++ b/app/main.py @@ -11,6 +11,7 @@ api_auth, api_keys, auth, + health, provider_keys, proxy, stats, @@ -67,135 +68,126 @@ async def dispatch(self, request: Request, call_next: Callable): return response -# Get environment -is_production = os.getenv("ENVIRONMENT", "development").lower() == "production" +# Exception handlers +class ForgeExceptionHandler: + """Custom exception handlers for Forge-specific errors.""" -app = FastAPI( - 
title="Forge API", - description="A middleware service for managing AI model provider API keys", - version="0.1.0", - docs_url="/docs" if not is_production else None, - redoc_url="/redoc" if not is_production else None, - openapi_url="/openapi.json" if not is_production else None, -) + @staticmethod + async def handle_provider_auth_exception(request: Request, exc: ProviderAuthenticationException): + logger.warning(f"Provider authentication failed: {exc.detail}") + return HTTPException( + status_code=exc.status_code, + detail=exc.detail, + headers=exc.headers or {} + ) -### Exception handlers block ### + @staticmethod + async def handle_invalid_provider_exception(request: Request, exc: InvalidProviderException): + logger.warning(f"Invalid provider: {exc.detail}") + return HTTPException( + status_code=exc.status_code, + detail=exc.detail, + headers=exc.headers or {} + ) -# Add exception handler for ProviderAuthenticationException -@app.exception_handler(ProviderAuthenticationException) -async def provider_authentication_exception_handler(request: Request, exc: ProviderAuthenticationException): - return HTTPException( - status_code=401, - detail=f"Authentication failed for provider {exc.provider_name}" - ) + @staticmethod + async def handle_base_invalid_provider_setup_exception(request: Request, exc: BaseInvalidProviderSetupException): + logger.warning(f"Invalid provider setup: {exc.detail}") + return HTTPException( + status_code=exc.status_code, + detail=exc.detail, + headers=exc.headers or {} + ) -# Add exception handler for InvalidProviderException -@app.exception_handler(InvalidProviderException) -async def invalid_provider_exception_handler(request: Request, exc: InvalidProviderException): - return HTTPException( - status_code=400, - detail=f"{str(exc)}. Please verify your provider and model details by calling the /models endpoint or visiting https://tensorblock.co/api-docs/model-ids, and ensure you’re using a valid provider name, model name, and model ID." 
-    )
+    @staticmethod
+    async def handle_base_invalid_provider_setup_exception(request: Request, exc: BaseInvalidProviderSetupException):
+        logger.warning(f"Invalid provider setup: {exc.detail}")
+        return JSONResponse(
+            status_code=exc.status_code,
+            content={"detail": exc.detail},
+            headers=exc.headers or {}
+        )
 
-# Add exception handler for BaseInvalidProviderSetupException
-@app.exception_handler(BaseInvalidProviderSetupException)
-async def base_invalid_provider_setup_exception_handler(request: Request, exc: BaseInvalidProviderSetupException):
-    return HTTPException(
-        status_code=400,
-        detail=str(exc)
-    )
+    @staticmethod
+    async def handle_provider_api_exception(request: Request, exc: ProviderAPIException):
+        logger.error(f"Provider API error: {exc.detail}")
+        return JSONResponse(
+            status_code=exc.status_code,
+            content={"detail": exc.detail},
+            headers=exc.headers or {}
+        )
 
-# Add exception handler for ProviderAPIException
-@app.exception_handler(ProviderAPIException)
-async def provider_api_exception_handler(request: Request, exc: ProviderAPIException):
-    return HTTPException(
-        status_code=exc.error_code,
-        detail=f"Provider API error: {exc.provider_name} {exc.error_code} {exc.error_message}"
-    )
+    @staticmethod
+    async def handle_base_invalid_request_exception(request: Request, exc: BaseInvalidRequestException):
+        logger.warning(f"Invalid request: {exc.detail}")
+        return JSONResponse(
+            status_code=exc.status_code,
+            content={"detail": exc.detail},
+            headers=exc.headers or {}
+        )
 
-# Add exception handler for BaseInvalidRequestException
-@app.exception_handler(BaseInvalidRequestException)
-async def base_invalid_request_exception_handler(request: Request, exc: BaseInvalidRequestException):
-    return HTTPException(
-        status_code=400,
-        detail=str(exc)
-    )
+    @staticmethod
+    async def handle_base_invalid_forge_key_exception(request: Request, exc: BaseInvalidForgeKeyException):
+        logger.warning(f"Invalid Forge key: {exc.detail}")
+        return JSONResponse(
+            status_code=exc.status_code,
+            content={"detail": exc.detail},
+            headers=exc.headers or {}
+        )
 
-# Add exception handler for BaseInvalidForgeKeyException
-@app.exception_handler(BaseInvalidForgeKeyException)
-async def base_invalid_forge_key_exception_handler(request: Request, exc: BaseInvalidForgeKeyException):
-    return HTTPException(
-        status_code=401,
-        detail=f"Invalid Forge key: {exc.error}"
-    )
 
-# Add exception handler for NotImplementedError
-@app.exception_handler(NotImplementedError)
-async def not_implemented_error_handler(request: Request, exc: NotImplementedError):
-    return HTTPException(
-        status_code=404,
-        detail=f"Not implemented: {exc}"
-    )
+def create_app() -> FastAPI:
+    """Create and configure the FastAPI application."""
+    app = FastAPI(
+        title="Forge API",
+        description="Unified AI model provider API",
+        version="0.1.0",
+        docs_url="/docs",
+        redoc_url="/redoc",
+    )
 
-### Exception handlers block ends ###
-
-# Middleware to log slow requests
-@app.middleware("http")
-async def log_latency(request: Request, call_next):
-    start_time = (
-        time.time()
-    )  # Renamed from start to avoid conflict with existing start_time
-    try:
-        response = await call_next(request)
-        return response
-    finally:
-        duration = time.time() - start_time
-        if duration > SLOW_REQUEST_THRESHOLD_SECONDS:  # More than the defined threshold
-            logger.warning(
-                f"[SLOW] {request.method} {request.url.path} took {duration:.2f}s"
-            )
-
-
-# Add request logging middleware
-app.add_middleware(RequestLoggingMiddleware)
-
-# Configure CORS
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],  # For production, specify the actual origins
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
+    # Add CORS middleware
+    app.add_middleware(
+        CORSMiddleware,
+        allow_origins=["*"],  # Configure appropriately for production
+        allow_credentials=True,
+        allow_methods=["*"],
allow_headers=["*"], -) - -# Include routers under v1 prefix -v1_router.include_router(auth.router, prefix="/auth", tags=["Authentication"]) -v1_router.include_router(users.router, prefix="/users", tags=["Users"]) -v1_router.include_router( - provider_keys.router, prefix="/provider-keys", tags=["Provider Keys"] -) -v1_router.include_router(stats.router, prefix="/stats", tags=["Usage Statistics"]) -v1_router.include_router(webhooks.router, prefix="/webhooks", tags=["Webhooks"]) -v1_router.include_router(api_auth.router, prefix="/api", tags=["Unified API"]) -v1_router.include_router(api_keys.router, prefix="/api-keys", tags=["API Keys"]) - -# OpenAI-compatible API endpoints -v1_router.include_router(proxy.router, tags=["OpenAI API"]) - -# Include v1 router in main app -app.include_router(v1_router) - - -@app.get("/") -def read_root(): - response = { - "name": "Forge API", - "version": "0.1.0", - "description": "A middleware service for managing AI model provider API keys", - } - if not is_production: - response["documentation"] = "/docs" - return response + # Add request logging middleware + app.add_middleware(RequestLoggingMiddleware) + + # Add exception handlers + app.add_exception_handler(ProviderAuthenticationException, ForgeExceptionHandler.handle_provider_auth_exception) + app.add_exception_handler(InvalidProviderException, ForgeExceptionHandler.handle_invalid_provider_exception) + app.add_exception_handler(BaseInvalidProviderSetupException, ForgeExceptionHandler.handle_base_invalid_provider_setup_exception) + app.add_exception_handler(ProviderAPIException, ForgeExceptionHandler.handle_provider_api_exception) + app.add_exception_handler(BaseInvalidRequestException, ForgeExceptionHandler.handle_base_invalid_request_exception) + app.add_exception_handler(BaseInvalidForgeKeyException, ForgeExceptionHandler.handle_base_invalid_forge_key_exception) + + # Include routers + v1_router.include_router(auth.router, prefix="/auth", tags=["authentication"]) + v1_router.include_router(api_auth.router, prefix="/api-auth", tags=["api-authentication"]) + v1_router.include_router(users.router, prefix="/users", tags=["users"]) + v1_router.include_router(provider_keys.router, prefix="/provider-keys", tags=["provider-keys"]) + v1_router.include_router(api_keys.router, prefix="/api-keys", tags=["api-keys"]) + v1_router.include_router(proxy.router, tags=["proxy"]) + v1_router.include_router(stats.router, prefix="/stats", tags=["stats"]) + v1_router.include_router(webhooks.router, prefix="/webhooks", tags=["webhooks"]) + + # Health check routes (not versioned) + app.include_router(health.router, tags=["health"]) + + # Include v1 router + app.include_router(v1_router) + + @app.get("/") + async def root(): + """Root endpoint with API information.""" + return { + "message": "Welcome to Forge API", + "version": "0.1.0", + "docs": "/docs", + "health": "/health" + } + + return app + + +# Create the application instance +app = create_app() if __name__ == "__main__": diff --git a/docs/DATABASE_CONNECTION_TROUBLESHOOTING.md b/docs/DATABASE_CONNECTION_TROUBLESHOOTING.md new file mode 100644 index 0000000..20d2a0a --- /dev/null +++ b/docs/DATABASE_CONNECTION_TROUBLESHOOTING.md @@ -0,0 +1,315 @@ +# Database Connection Troubleshooting Guide + +This guide helps you diagnose and fix PostgreSQL connection limit issues in your Forge deployment. + +## 🚨 Quick Fix for "too many clients already" Error + +If you're seeing the PostgreSQL error "sorry, too many clients already", follow these immediate steps: + +### 1. 
+## 🔧 Configuration Options
+
+### Environment Variables
+
+Add these to your `.env` file or Docker environment:
+
+```env
+# Database connection settings
+DB_POOL_SIZE=3        # Connections per pool
+DB_MAX_OVERFLOW=2     # Additional connections when the pool is full
+DB_POOL_TIMEOUT=30    # Seconds to wait for a connection
+DB_POOL_RECYCLE=1800  # Seconds before recycling a connection
+DB_POOL_PRE_PING=true # Enable connection health checks
+
+# Application settings
+WORKERS=5             # Number of Gunicorn workers
+```
+
+### For High-Load Production
+
+If you need more workers for performance, consider:
+
+```env
+# Option 1: Reduce pool sizes further
+DB_POOL_SIZE=2
+DB_MAX_OVERFLOW=1
+WORKERS=8
+
+# Option 2: Increase PostgreSQL max_connections (see the PostgreSQL tuning section)
+```
+
+## 🐘 PostgreSQL Tuning
+
+### Increase max_connections
+
+1. **Edit postgresql.conf:**
+
+   ```bash
+   # Find your config file
+   sudo -u postgres psql -c "SHOW config_file;"
+
+   # Edit the file
+   sudo nano /path/to/postgresql.conf
+   ```
+
+2. **Update settings:**
+
+   ```conf
+   max_connections = 200        # Increase from the default 100
+   shared_buffers = 256MB       # Increase along with max_connections
+   effective_cache_size = 1GB   # Adjust based on available RAM
+   ```
+
+3. **Restart PostgreSQL:**
+
+   ```bash
+   sudo systemctl restart postgresql
+   # or
+   sudo service postgresql restart
+   ```
+
+### Docker PostgreSQL
+
+For Docker setups, modify `docker-compose.yml`:
+
+```yaml
+services:
+  db:
+    image: postgres:14
+    environment:
+      - POSTGRES_USER=forge
+      - POSTGRES_PASSWORD=forge
+      - POSTGRES_DB=forge
+    command: >
+      postgres
+      -c max_connections=200
+      -c shared_buffers=256MB
+      -c effective_cache_size=1GB
+    # ... rest of config
+```
+
+## 🔍 Monitoring Tools
+
+### 1. **Diagnostic Script**
+
+```bash
+# Run a comprehensive check
+python tools/diagnostics/check_db_connections.py
+
+# Run regularly in production
+watch -n 30 "python tools/diagnostics/check_db_connections.py"
+```
+
+### 2. **Health Check Endpoints**
+
+```bash
+# Basic health check
+curl http://localhost:8000/health
+
+# Database-specific health check
+curl http://localhost:8000/health/database
+
+# Detailed health check
+curl http://localhost:8000/health/detailed
+```
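+
+For automated alerting, a small poller can watch `/health/database` and warn when pool usage crosses a threshold. A minimal sketch using only the standard library (the URL and the 80% threshold are assumptions; adjust for your deployment):
+
+```python
+#!/usr/bin/env python3
+"""Poll /health/database and warn when pool usage exceeds a threshold."""
+import json
+import time
+import urllib.request
+
+HEALTH_URL = "http://localhost:8000/health/database"  # assumed local deployment
+THRESHOLD = 80.0  # percent; matches the 80% rule of thumb used in this guide
+
+while True:
+    try:
+        with urllib.request.urlopen(HEALTH_URL, timeout=5) as resp:
+            data = json.load(resp)
+        for name, pool in data["connection_pools"].items():
+            if pool["usage_percent"] >= THRESHOLD:
+                print(f"WARNING: {name} pool at {pool['usage_percent']}% usage")
+    except Exception as exc:  # an unreachable service is itself worth an alert
+        print(f"WARNING: health check failed: {exc}")
+    time.sleep(30)
+```
+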
+### 3. **Direct PostgreSQL Monitoring**
+
+```sql
+-- Check current connections
+SELECT count(*) as total_connections FROM pg_stat_activity;
+
+-- Check max connections
+SHOW max_connections;
+
+-- View connections by database
+SELECT datname, count(*) as connections
+FROM pg_stat_activity
+WHERE datname IS NOT NULL
+GROUP BY datname
+ORDER BY connections DESC;
+
+-- Check connection states
+SELECT state, count(*)
+FROM pg_stat_activity
+GROUP BY state;
+```
+
+## 🚀 Production Deployment Strategies
+
+### 1. **Conservative Approach** (Recommended)
+
+```env
+# Prioritize stability
+WORKERS=3
+DB_POOL_SIZE=2
+DB_MAX_OVERFLOW=1
+# Max connections: 3 × 2 × 3 = 18 connections
+```
+
+### 2. **Balanced Approach**
+
+```env
+# Balance performance and stability
+WORKERS=5
+DB_POOL_SIZE=3
+DB_MAX_OVERFLOW=2
+# Max connections: 5 × 2 × 5 = 50 connections
+```
+
+### 3. **High-Performance Approach**
+
+```env
+# Requires PostgreSQL tuning
+WORKERS=8
+DB_POOL_SIZE=3
+DB_MAX_OVERFLOW=2
+# Max connections: 8 × 2 × 5 = 80 connections
+# Requires max_connections >= 150 in PostgreSQL
+```
+
+## 📈 Performance vs. Connections Trade-off
+
+| Workers | Pool Size | Max Overflow | Total Connections | Performance | Stability |
+|---------|-----------|--------------|-------------------|-------------|-----------|
+| 3       | 2         | 1            | 18                | ⭐⭐ | ⭐⭐⭐⭐⭐ |
+| 5       | 3         | 2            | 50                | ⭐⭐⭐ | ⭐⭐⭐⭐ |
+| 8       | 3         | 2            | 80                | ⭐⭐⭐⭐ | ⭐⭐⭐ |
+| 10      | 5         | 10           | 300               | ⭐⭐⭐⭐⭐ | ⭐ |
+
+## 🔧 Advanced Solutions
+
+### 1. **Connection Pooling with pgbouncer**
+
+For very high-load scenarios, use pgbouncer:
+
+```bash
+# Install pgbouncer
+sudo apt-get install pgbouncer
+```
+
+Configure `pgbouncer.ini`:
+
+```ini
+[databases]
+forge = host=localhost port=5432 dbname=forge
+
+[pgbouncer]
+pool_mode = transaction
+listen_port = 6432
+max_client_conn = 100
+default_pool_size = 25
+```
+
+Update DATABASE_URL:
+
+```env
+DATABASE_URL=postgresql://forge:forge@localhost:6432/forge
+```
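+
+To verify that traffic actually flows through pgbouncer, query its admin console (a virtual database named `pgbouncer`). A minimal sketch with `psycopg2`; it assumes `forge` is listed in pgbouncer's `admin_users` or `stats_users`:
+
+```python
+import psycopg2
+
+# Connect to the pgbouncer admin console, not to the forge database
+conn = psycopg2.connect(host="localhost", port=6432, dbname="pgbouncer",
+                        user="forge", password="forge")
+conn.autocommit = True  # the admin console does not support transactions
+
+with conn.cursor() as cur:
+    cur.execute("SHOW POOLS;")
+    for row in cur.fetchall():
+        print(row)  # database, user, cl_active, cl_waiting, sv_active, ...
+conn.close()
+```
+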
+### 2. **Read Replicas**
+
+For read-heavy workloads, implement read replicas and route read queries separately, as sketched below.
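+
+One way to do the routing is to keep one engine per database and pick an engine per unit of work. A minimal sketch (the `DATABASE_REPLICA_URL` variable and the helper are illustrative, not part of the Forge codebase):
+
+```python
+import os
+
+from sqlalchemy import create_engine
+
+# The primary handles all writes; the replica URL is an assumed extra setting
+primary = create_engine(os.environ["DATABASE_URL"], pool_size=3, max_overflow=2)
+replica = create_engine(os.environ["DATABASE_REPLICA_URL"], pool_size=3, max_overflow=2)
+
+def engine_for(readonly: bool):
+    """Route read-only work to the replica, everything else to the primary."""
+    return replica if readonly else primary
+```
+
+Note that every extra engine brings its own pool, so include it in the connection budget above.
+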
int(os.getenv("WORKERS", "10")) + total_potential = workers * (info['pool_size'] + info['max_overflow']) * 2 + print(f"\n📊 Production Calculation (with {workers} workers):") + print(f" Max potential connections: {total_potential}") + print(f" Per worker: {(info['pool_size'] + info['max_overflow']) * 2}") + + return True + + except Exception as e: + print(f"❌ Error checking SQLAlchemy pools: {e}") + return False + + +async def test_async_connection(): + """Test async database connection""" + try: + print("\n🔍 Testing async database connection...") + + from app.core.database import get_db_session + + async with get_db_session() as session: + result = await session.execute(text("SELECT version()")) + version = result.scalar() + print(f"✅ Async connection successful") + print(f" PostgreSQL version: {version}") + return True + + except Exception as e: + print(f"❌ Async connection failed: {e}") + return False + + +def test_sync_connection(): + """Test sync database connection""" + try: + print("\n🔍 Testing sync database connection...") + + from app.core.database import get_db + + with next(get_db()) as session: + result = session.execute(text("SELECT current_database()")) + db_name = result.scalar() + print(f"✅ Sync connection successful") + print(f" Current database: {db_name}") + return True + + except Exception as e: + print(f"❌ Sync connection failed: {e}") + return False + + +def show_recommendations(): + """Show recommendations for fixing connection issues""" + print("\n💡 Recommendations for fixing connection issues:") + print("\n1. **Immediate fixes:**") + print(" - Restart your application to apply new connection pool settings") + print(" - Monitor connection usage with this diagnostic tool") + + print("\n2. **PostgreSQL tuning:**") + print(" - Increase max_connections in postgresql.conf") + print(" - Set max_connections = 200 (or higher based on your needs)") + print(" - Also increase shared_buffers if you increase max_connections") + + print("\n3. **Application tuning:**") + print(" - Reduce number of Gunicorn workers if needed") + print(" - Use environment variables to tune connection pools:") + print(" export DB_POOL_SIZE=2") + print(" export DB_MAX_OVERFLOW=1") + print(" export WORKERS=5") + + print("\n4. **Connection pooling (for high-load production):**") + print(" - Consider using pgbouncer for connection pooling") + print(" - Set connection_limit in pgbouncer.ini") + + print("\n5. 
**Monitoring:**") + print(" - Run this script regularly to monitor connection usage") + print(" - Set up alerts when connection usage exceeds 80%") + + +async def main(): + """Main diagnostic function""" + print("🚀 Database Connection Diagnostic Tool") + print("=" * 50) + print(f"⏰ Timestamp: {datetime.now()}") + + # Load environment + load_dotenv() + + # Run all checks + postgres_ok = check_postgres_max_connections() + pools_ok = check_sqlalchemy_pools() + async_ok = await test_async_connection() + sync_ok = test_sync_connection() + + print("\n" + "=" * 50) + print("📋 SUMMARY:") + print(f" PostgreSQL config: {'✅' if postgres_ok else '❌'}") + print(f" SQLAlchemy pools: {'✅' if pools_ok else '❌'}") + print(f" Async connection: {'✅' if async_ok else '❌'}") + print(f" Sync connection: {'✅' if sync_ok else '❌'}") + + if not all([postgres_ok, pools_ok, async_ok, sync_ok]): + print("\n❌ Issues detected!") + show_recommendations() + return False + else: + print("\n✅ All checks passed!") + return True + + +if __name__ == "__main__": + # Ensure we can import from the app + sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + try: + result = asyncio.run(main()) + sys.exit(0 if result else 1) + except KeyboardInterrupt: + print("\n⏹️ Diagnostic cancelled by user") + sys.exit(1) + except Exception as e: + print(f"\n💥 Unexpected error: {e}") + sys.exit(1) \ No newline at end of file