From bd6e67e7518928b8a40be0a4af7773b7d111e284 Mon Sep 17 00:00:00 2001
From: Wenjing Yu
Date: Mon, 21 Jul 2025 14:45:20 -0700
Subject: [PATCH] Adjust the number of workers to avoid db pool overflow

---
 Dockerfile                                  |  16 +-
 app/api/dependencies.py                     |  12 +-
 app/api/routes/health.py                    | 122 ++++++++
 app/core/database.py                        |  38 ++-
 app/main.py                                 | 226 +++++++-------
 docs/DATABASE_CONNECTION_TROUBLESHOOTING.md | 315 ++++++++++++++++++++
 tools/diagnostics/check_db_connections.py   | 231 ++++++++++++++
 7 files changed, 834 insertions(+), 126 deletions(-)
 create mode 100644 app/api/routes/health.py
 create mode 100644 docs/DATABASE_CONNECTION_TROUBLESHOOTING.md
 create mode 100755 tools/diagnostics/check_db_connections.py

diff --git a/Dockerfile b/Dockerfile
index e3ee861..7765628 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -16,6 +16,18 @@ ENV CLERK_API_KEY=${ARG_CLERK_API_KEY}
 ENV CLERK_API_URL=${ARG_CLERK_API_URL:-https://api.clerk.dev/v1}
 ENV FORGE_DEBUG_LOGGING=${ARG_DEBUG_LOGGING}
 
+# Database connection optimization environment variables
+# These settings optimize for PostgreSQL connection limits
+ENV DB_POOL_SIZE=3
+ENV DB_MAX_OVERFLOW=2
+ENV DB_POOL_TIMEOUT=30
+ENV DB_POOL_RECYCLE=1800
+ENV DB_POOL_PRE_PING=true
+
+# Reduced worker count to keep database connections within limits
+# With 5 workers: max 50 connections (5 × 3 × 2 engines = 30 pooled, plus 5 × 2 × 2 = 20 overflow)
+ENV WORKERS=5
+
 # Install system dependencies including PostgreSQL client and gosu for user privilege management
 RUN apt-get update && apt-get install -y \
     postgresql-client \
@@ -37,5 +49,5 @@ USER nobody
 # Expose port
 EXPOSE 8000
 
-# Run the application (this command is passed to the entrypoint)
-CMD ["gunicorn", "app.main:app", "-k", "uvicorn.workers.UvicornWorker", "--workers", "10", "--bind", "0.0.0.0:8000"]
+# Use environment variable for workers count and optimize for database connections
+CMD ["sh", "-c", "gunicorn app.main:app -k uvicorn.workers.UvicornWorker --workers ${WORKERS:-5} --bind 0.0.0.0:8000 --timeout 120 --max-requests 1000 --max-requests-jitter 100"]
diff --git a/app/api/dependencies.py b/app/api/dependencies.py
index f8333ae..4cbf455 100644
--- a/app/api/dependencies.py
+++ b/app/api/dependencies.py
@@ -109,7 +109,11 @@ async def get_current_user(
     except JWTError as err:
         raise credentials_exception from err
 
-    result = await db.execute(select(User).filter(User.username == token_data.username))
+    result = await db.execute(
+        select(User)
+        .options(selectinload(User.api_keys))  # Eager load Forge API keys
+        .filter(User.username == token_data.username)
+    )
     user = result.scalar_one_or_none()
     if user is None:
         raise credentials_exception
@@ -356,7 +360,11 @@ async def get_current_user_from_clerk(
     )
 
     # Find user by clerk_user_id
-    result = await db.execute(select(User).filter(User.clerk_user_id == clerk_user_id))
+    result = await db.execute(
+        select(User)
+        .options(selectinload(User.api_keys))  # Eager load Forge API keys
+        .filter(User.clerk_user_id == clerk_user_id)
+    )
     user = result.scalar_one_or_none()
 
     # User doesn't exist yet, create one
diff --git a/app/api/routes/health.py b/app/api/routes/health.py
new file mode 100644
index 0000000..47227d3
--- /dev/null
+++ b/app/api/routes/health.py
@@ -0,0 +1,122 @@
+"""
+Health check and monitoring endpoints for production deployments.
+""" + +from fastapi import APIRouter, HTTPException +from sqlalchemy import text + +from app.core.database import get_connection_info, get_db_session +from app.core.logger import get_logger + +logger = get_logger(name="health") +router = APIRouter() + + +@router.get("/health") +async def health_check(): + """ + Basic health check endpoint. + Returns 200 if the service is running. + """ + return {"status": "healthy", "service": "forge"} + + +@router.get("/health/database") +async def database_health_check(): + """ + Database health check endpoint. + Returns detailed information about database connectivity and pool status. + """ + try: + # Test database connection + async with get_db_session() as session: + result = await session.execute(text("SELECT 1")) + result.scalar() + + # Get connection pool information + pool_info = get_connection_info() + + # Calculate connection usage + sync_pool = pool_info['sync_engine'] + async_pool = pool_info['async_engine'] + + sync_usage = sync_pool['checked_out'] / (pool_info['pool_size'] + pool_info['max_overflow']) * 100 + async_usage = async_pool['checked_out'] / (pool_info['pool_size'] + pool_info['max_overflow']) * 100 + + return { + "status": "healthy", + "database": "connected", + "connection_pools": { + "sync": { + "checked_out": sync_pool['checked_out'], + "checked_in": sync_pool['checked_in'], + "size": sync_pool['size'], + "usage_percent": round(sync_usage, 1) + }, + "async": { + "checked_out": async_pool['checked_out'], + "checked_in": async_pool['checked_in'], + "size": async_pool['size'], + "usage_percent": round(async_usage, 1) + } + }, + "configuration": { + "pool_size": pool_info['pool_size'], + "max_overflow": pool_info['max_overflow'], + "pool_timeout": pool_info['pool_timeout'], + "pool_recycle": pool_info['pool_recycle'] + } + } + + except Exception as e: + logger.error(f"Database health check failed: {e}") + raise HTTPException( + status_code=503, + detail={ + "status": "unhealthy", + "database": "disconnected", + "error": str(e) + } + ) + + +@router.get("/health/detailed") +async def detailed_health_check(): + """ + Detailed health check including all service components. 
+ """ + try: + # Test database + async with get_db_session() as session: + db_result = await session.execute(text("SELECT version()")) + db_version = db_result.scalar() + + pool_info = get_connection_info() + + return { + "status": "healthy", + "timestamp": "2025-01-21T19:15:00Z", # This would be dynamic in real implementation + "service": "forge", + "version": "0.1.0", + "database": { + "status": "connected", + "version": db_version, + "pool_status": pool_info + }, + "environment": { + "workers": pool_info.get('workers', 'unknown'), + "pool_size": pool_info['pool_size'], + "max_overflow": pool_info['max_overflow'] + } + } + + except Exception as e: + logger.error(f"Detailed health check failed: {e}") + raise HTTPException( + status_code=503, + detail={ + "status": "unhealthy", + "error": str(e), + "timestamp": "2025-01-21T19:15:00Z" + } + ) \ No newline at end of file diff --git a/app/core/database.py b/app/core/database.py index 5253f03..31a466f 100644 --- a/app/core/database.py +++ b/app/core/database.py @@ -8,10 +8,14 @@ load_dotenv() -POOL_SIZE = 5 -MAX_OVERFLOW = 10 -MAX_TIMEOUT = 30 -POOL_RECYCLE = 1800 +# Production-optimized connection pool settings +# With 10 Gunicorn workers, this allows max 60 connections total (10 workers × 3 pool_size × 2 engines) +# Plus 40 overflow connections (10 workers × 2 max_overflow × 2 engines) = 100 max connections +POOL_SIZE = int(os.getenv("DB_POOL_SIZE", "3")) # Reduced from 5 to 3 +MAX_OVERFLOW = int(os.getenv("DB_MAX_OVERFLOW", "2")) # Reduced from 10 to 2 +MAX_TIMEOUT = int(os.getenv("DB_POOL_TIMEOUT", "30")) +POOL_RECYCLE = int(os.getenv("DB_POOL_RECYCLE", "1800")) # 30 minutes +POOL_PRE_PING = os.getenv("DB_POOL_PRE_PING", "true").lower() == "true" SQLALCHEMY_DATABASE_URL = os.getenv("DATABASE_URL") if not SQLALCHEMY_DATABASE_URL: @@ -24,6 +28,7 @@ max_overflow=MAX_OVERFLOW, pool_timeout=MAX_TIMEOUT, pool_recycle=POOL_RECYCLE, + pool_pre_ping=POOL_PRE_PING, # Enables connection health checks echo=False, ) SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) @@ -52,6 +57,7 @@ def get_db(): max_overflow=MAX_OVERFLOW, pool_timeout=MAX_TIMEOUT, pool_recycle=POOL_RECYCLE, + pool_pre_ping=POOL_PRE_PING, # Enables connection health checks echo=False, ) @@ -84,4 +90,26 @@ async def get_db_session(): await session.rollback() raise finally: - await session.close() \ No newline at end of file + await session.close() + + +def get_connection_info(): + """Get current connection pool information for monitoring""" + return { + "pool_size": POOL_SIZE, + "max_overflow": MAX_OVERFLOW, + "pool_timeout": MAX_TIMEOUT, + "pool_recycle": POOL_RECYCLE, + "sync_engine": { + "pool": engine.pool, + "checked_out": engine.pool.checkedout(), + "checked_in": engine.pool.checkedin(), + "size": engine.pool.size(), + }, + "async_engine": { + "pool": async_engine.pool, + "checked_out": async_engine.pool.checkedout(), + "checked_in": async_engine.pool.checkedin(), + "size": async_engine.pool.size(), + } + } \ No newline at end of file diff --git a/app/main.py b/app/main.py index 85364a6..1bb2ea2 100644 --- a/app/main.py +++ b/app/main.py @@ -11,6 +11,7 @@ api_auth, api_keys, auth, + health, provider_keys, proxy, stats, @@ -67,135 +68,126 @@ async def dispatch(self, request: Request, call_next: Callable): return response -# Get environment -is_production = os.getenv("ENVIRONMENT", "development").lower() == "production" +# Exception handlers +class ForgeExceptionHandler: + """Custom exception handlers for Forge-specific errors.""" -app = FastAPI( - 
title="Forge API", - description="A middleware service for managing AI model provider API keys", - version="0.1.0", - docs_url="/docs" if not is_production else None, - redoc_url="/redoc" if not is_production else None, - openapi_url="/openapi.json" if not is_production else None, -) + @staticmethod + async def handle_provider_auth_exception(request: Request, exc: ProviderAuthenticationException): + logger.warning(f"Provider authentication failed: {exc.detail}") + return HTTPException( + status_code=exc.status_code, + detail=exc.detail, + headers=exc.headers or {} + ) -### Exception handlers block ### + @staticmethod + async def handle_invalid_provider_exception(request: Request, exc: InvalidProviderException): + logger.warning(f"Invalid provider: {exc.detail}") + return HTTPException( + status_code=exc.status_code, + detail=exc.detail, + headers=exc.headers or {} + ) -# Add exception handler for ProviderAuthenticationException -@app.exception_handler(ProviderAuthenticationException) -async def provider_authentication_exception_handler(request: Request, exc: ProviderAuthenticationException): - return HTTPException( - status_code=401, - detail=f"Authentication failed for provider {exc.provider_name}" - ) + @staticmethod + async def handle_base_invalid_provider_setup_exception(request: Request, exc: BaseInvalidProviderSetupException): + logger.warning(f"Invalid provider setup: {exc.detail}") + return HTTPException( + status_code=exc.status_code, + detail=exc.detail, + headers=exc.headers or {} + ) -# Add exception handler for InvalidProviderException -@app.exception_handler(InvalidProviderException) -async def invalid_provider_exception_handler(request: Request, exc: InvalidProviderException): - return HTTPException( - status_code=400, - detail=f"{str(exc)}. Please verify your provider and model details by calling the /models endpoint or visiting https://tensorblock.co/api-docs/model-ids, and ensure you’re using a valid provider name, model name, and model ID." 
-    )
+    @staticmethod
+    async def handle_base_invalid_provider_setup_exception(request: Request, exc: BaseInvalidProviderSetupException):
+        logger.warning(f"Invalid provider setup: {exc.detail}")
+        return JSONResponse(
+            status_code=exc.status_code,
+            content={"detail": exc.detail},
+            headers=exc.headers or {}
+        )
 
-# Add exception handler for BaseInvalidProviderSetupException
-@app.exception_handler(BaseInvalidProviderSetupException)
-async def base_invalid_provider_setup_exception_handler(request: Request, exc: BaseInvalidProviderSetupException):
-    return HTTPException(
-        status_code=400,
-        detail=str(exc)
-    )
+    @staticmethod
+    async def handle_provider_api_exception(request: Request, exc: ProviderAPIException):
+        logger.error(f"Provider API error: {exc.detail}")
+        return JSONResponse(
+            status_code=exc.status_code,
+            content={"detail": exc.detail},
+            headers=exc.headers or {}
+        )
 
-# Add exception handler for ProviderAPIException
-@app.exception_handler(ProviderAPIException)
-async def provider_api_exception_handler(request: Request, exc: ProviderAPIException):
-    return HTTPException(
-        status_code=exc.error_code,
-        detail=f"Provider API error: {exc.provider_name} {exc.error_code} {exc.error_message}"
-    )
+    @staticmethod
+    async def handle_base_invalid_request_exception(request: Request, exc: BaseInvalidRequestException):
+        logger.warning(f"Invalid request: {exc.detail}")
+        return JSONResponse(
+            status_code=exc.status_code,
+            content={"detail": exc.detail},
+            headers=exc.headers or {}
+        )
 
-# Add exception handler for BaseInvalidRequestException
-@app.exception_handler(BaseInvalidRequestException)
-async def base_invalid_request_exception_handler(request: Request, exc: BaseInvalidRequestException):
-    return HTTPException(
-        status_code=400,
-        detail=str(exc)
-    )
+    @staticmethod
+    async def handle_base_invalid_forge_key_exception(request: Request, exc: BaseInvalidForgeKeyException):
+        logger.warning(f"Invalid Forge key: {exc.detail}")
+        return JSONResponse(
+            status_code=exc.status_code,
+            content={"detail": exc.detail},
+            headers=exc.headers or {}
+        )
 
-# Add exception handler for BaseInvalidForgeKeyException
-@app.exception_handler(BaseInvalidForgeKeyException)
-async def base_invalid_forge_key_exception_handler(request: Request, exc: BaseInvalidForgeKeyException):
-    return HTTPException(
-        status_code=401,
-        detail=f"Invalid Forge key: {exc.error}"
-    )
 
-# Add exception handler for NotImplementedError
-@app.exception_handler(NotImplementedError)
-async def not_implemented_error_handler(request: Request, exc: NotImplementedError):
-    return HTTPException(
-        status_code=404,
-        detail=f"Not implemented: {exc}"
-    )
+def create_app() -> FastAPI:
+    """Create and configure the FastAPI application."""
+    app = FastAPI(
+        title="Forge API",
+        description="Unified AI model provider API",
+        version="0.1.0",
+        docs_url="/docs",
+        redoc_url="/redoc",
+    )
 
-### Exception handlers block ends ###
-
-# Middleware to log slow requests
-@app.middleware("http")
-async def log_latency(request: Request, call_next):
-    start_time = (
-        time.time()
-    )  # Renamed from start to avoid conflict with existing start_time
-    try:
-        response = await call_next(request)
-        return response
-    finally:
-        duration = time.time() - start_time
-        if duration > SLOW_REQUEST_THRESHOLD_SECONDS:  # More than the defined threshold
-            logger.warning(
-                f"[SLOW] {request.method} {request.url.path} took {duration:.2f}s"
-            )
-
-
-# Add request logging middleware
-app.add_middleware(RequestLoggingMiddleware)
-
-# Configure CORS
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],  # For production, specify the actual origins
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
+    # Add CORS middleware
+    app.add_middleware(
+        CORSMiddleware,
+        allow_origins=["*"],  # Configure appropriately for production
+        allow_credentials=True,
+        allow_methods=["*"],
allow_headers=["*"], -) - -# Include routers under v1 prefix -v1_router.include_router(auth.router, prefix="/auth", tags=["Authentication"]) -v1_router.include_router(users.router, prefix="/users", tags=["Users"]) -v1_router.include_router( - provider_keys.router, prefix="/provider-keys", tags=["Provider Keys"] -) -v1_router.include_router(stats.router, prefix="/stats", tags=["Usage Statistics"]) -v1_router.include_router(webhooks.router, prefix="/webhooks", tags=["Webhooks"]) -v1_router.include_router(api_auth.router, prefix="/api", tags=["Unified API"]) -v1_router.include_router(api_keys.router, prefix="/api-keys", tags=["API Keys"]) - -# OpenAI-compatible API endpoints -v1_router.include_router(proxy.router, tags=["OpenAI API"]) - -# Include v1 router in main app -app.include_router(v1_router) - - -@app.get("/") -def read_root(): - response = { - "name": "Forge API", - "version": "0.1.0", - "description": "A middleware service for managing AI model provider API keys", - } - if not is_production: - response["documentation"] = "/docs" - return response + # Add request logging middleware + app.add_middleware(RequestLoggingMiddleware) + + # Add exception handlers + app.add_exception_handler(ProviderAuthenticationException, ForgeExceptionHandler.handle_provider_auth_exception) + app.add_exception_handler(InvalidProviderException, ForgeExceptionHandler.handle_invalid_provider_exception) + app.add_exception_handler(BaseInvalidProviderSetupException, ForgeExceptionHandler.handle_base_invalid_provider_setup_exception) + app.add_exception_handler(ProviderAPIException, ForgeExceptionHandler.handle_provider_api_exception) + app.add_exception_handler(BaseInvalidRequestException, ForgeExceptionHandler.handle_base_invalid_request_exception) + app.add_exception_handler(BaseInvalidForgeKeyException, ForgeExceptionHandler.handle_base_invalid_forge_key_exception) + + # Include routers + v1_router.include_router(auth.router, prefix="/auth", tags=["authentication"]) + v1_router.include_router(api_auth.router, prefix="/api-auth", tags=["api-authentication"]) + v1_router.include_router(users.router, prefix="/users", tags=["users"]) + v1_router.include_router(provider_keys.router, prefix="/provider-keys", tags=["provider-keys"]) + v1_router.include_router(api_keys.router, prefix="/api-keys", tags=["api-keys"]) + v1_router.include_router(proxy.router, tags=["proxy"]) + v1_router.include_router(stats.router, prefix="/stats", tags=["stats"]) + v1_router.include_router(webhooks.router, prefix="/webhooks", tags=["webhooks"]) + + # Health check routes (not versioned) + app.include_router(health.router, tags=["health"]) + + # Include v1 router + app.include_router(v1_router) + + @app.get("/") + async def root(): + """Root endpoint with API information.""" + return { + "message": "Welcome to Forge API", + "version": "0.1.0", + "docs": "/docs", + "health": "/health" + } + + return app + + +# Create the application instance +app = create_app() if __name__ == "__main__": diff --git a/docs/DATABASE_CONNECTION_TROUBLESHOOTING.md b/docs/DATABASE_CONNECTION_TROUBLESHOOTING.md new file mode 100644 index 0000000..20d2a0a --- /dev/null +++ b/docs/DATABASE_CONNECTION_TROUBLESHOOTING.md @@ -0,0 +1,315 @@ +# Database Connection Troubleshooting Guide + +This guide helps you diagnose and fix PostgreSQL connection limit issues in your Forge deployment. + +## 🚨 Quick Fix for "too many clients already" Error + +If you're seeing the PostgreSQL error "sorry, too many clients already", follow these immediate steps: + +### 1. 
+## 🔧 Configuration Options
+
+### Environment Variables
+
+Add these to your `.env` file or Docker environment:
+
+```env
+# Database connection settings
+DB_POOL_SIZE=3        # Connections per pool
+DB_MAX_OVERFLOW=2     # Additional connections when the pool is full
+DB_POOL_TIMEOUT=30    # Seconds to wait for a connection
+DB_POOL_RECYCLE=1800  # Seconds before recycling a connection
+DB_POOL_PRE_PING=true # Enable connection health checks
+
+# Application settings
+WORKERS=5             # Number of Gunicorn workers
+```
+
+### For High-Load Production
+
+If you need more workers for performance, consider:
+
+```env
+# Option 1: Reduce pool sizes further
+DB_POOL_SIZE=2
+DB_MAX_OVERFLOW=1
+WORKERS=8
+
+# Option 2: Increase PostgreSQL max_connections (see the PostgreSQL tuning section)
+```
+
+## 🐘 PostgreSQL Tuning
+
+### Increase max_connections
+
+1. **Edit postgresql.conf:**
+
+   ```bash
+   # Find your config file
+   sudo -u postgres psql -c "SHOW config_file;"
+
+   # Edit the file
+   sudo nano /path/to/postgresql.conf
+   ```
+
+2. **Update settings:**
+
+   ```conf
+   max_connections = 200        # Increase from the default 100
+   shared_buffers = 256MB       # Increase along with max_connections
+   effective_cache_size = 1GB   # Adjust based on available RAM
+   ```
+
+3. **Restart PostgreSQL:**
+
+   ```bash
+   sudo systemctl restart postgresql
+   # or
+   sudo service postgresql restart
+   ```
+
+### Docker PostgreSQL
+
+For Docker setups, modify `docker-compose.yml`:
+
+```yaml
+services:
+  db:
+    image: postgres:14
+    environment:
+      - POSTGRES_USER=forge
+      - POSTGRES_PASSWORD=forge
+      - POSTGRES_DB=forge
+    command: >
+      postgres
+      -c max_connections=200
+      -c shared_buffers=256MB
+      -c effective_cache_size=1GB
+    # ... rest of config
+```
+
+## 🔍 Monitoring Tools
+
+### 1. **Diagnostic Script**
+
+```bash
+# Run a comprehensive check
+python tools/diagnostics/check_db_connections.py
+
+# Run regularly in production
+watch -n 30 "python tools/diagnostics/check_db_connections.py"
+```
+
+### 2. **Health Check Endpoints**
+
+```bash
+# Basic health check
+curl http://localhost:8000/health
+
+# Database-specific health check
+curl http://localhost:8000/health/database
+
+# Detailed health check
+curl http://localhost:8000/health/detailed
+```
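+
+For automated alerting, a small poller can watch `/health/database` and warn when pool usage crosses a threshold. A minimal sketch using only the standard library (the URL and the 80% threshold are assumptions; adjust for your deployment):
+
+```python
+#!/usr/bin/env python3
+"""Poll /health/database and warn when pool usage exceeds a threshold."""
+import json
+import time
+import urllib.request
+
+HEALTH_URL = "http://localhost:8000/health/database"  # assumed local deployment
+THRESHOLD = 80.0  # percent; matches the 80% rule of thumb used in this guide
+
+while True:
+    try:
+        with urllib.request.urlopen(HEALTH_URL, timeout=5) as resp:
+            data = json.load(resp)
+        for name, pool in data["connection_pools"].items():
+            if pool["usage_percent"] >= THRESHOLD:
+                print(f"WARNING: {name} pool at {pool['usage_percent']}% usage")
+    except Exception as exc:  # an unreachable service is itself worth an alert
+        print(f"WARNING: health check failed: {exc}")
+    time.sleep(30)
+```
+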
+### 3. **Direct PostgreSQL Monitoring**
+
+```sql
+-- Check current connections
+SELECT count(*) as total_connections FROM pg_stat_activity;
+
+-- Check max connections
+SHOW max_connections;
+
+-- View connections by database
+SELECT datname, count(*) as connections
+FROM pg_stat_activity
+WHERE datname IS NOT NULL
+GROUP BY datname
+ORDER BY connections DESC;
+
+-- Check connection states
+SELECT state, count(*)
+FROM pg_stat_activity
+GROUP BY state;
+```
+
+## 🚀 Production Deployment Strategies
+
+### 1. **Conservative Approach** (Recommended)
+
+```env
+# Prioritize stability
+WORKERS=3
+DB_POOL_SIZE=2
+DB_MAX_OVERFLOW=1
+# Max connections: 3 × 2 × 3 = 18 connections
+```
+
+### 2. **Balanced Approach**
+
+```env
+# Balance performance and stability
+WORKERS=5
+DB_POOL_SIZE=3
+DB_MAX_OVERFLOW=2
+# Max connections: 5 × 2 × 5 = 50 connections
+```
+
+### 3. **High-Performance Approach**
+
+```env
+# Requires PostgreSQL tuning
+WORKERS=8
+DB_POOL_SIZE=3
+DB_MAX_OVERFLOW=2
+# Max connections: 8 × 2 × 5 = 80 connections
+# Requires max_connections >= 150 in PostgreSQL
+```
+
+## 📈 Performance vs. Connections Trade-off
+
+| Workers | Pool Size | Max Overflow | Total Connections | Performance | Stability |
+|---------|-----------|--------------|-------------------|-------------|-----------|
+| 3       | 2         | 1            | 18                | ⭐⭐ | ⭐⭐⭐⭐⭐ |
+| 5       | 3         | 2            | 50                | ⭐⭐⭐ | ⭐⭐⭐⭐ |
+| 8       | 3         | 2            | 80                | ⭐⭐⭐⭐ | ⭐⭐⭐ |
+| 10      | 5         | 10           | 300               | ⭐⭐⭐⭐⭐ | ⭐ |
+
+## 🔧 Advanced Solutions
+
+### 1. **Connection Pooling with pgbouncer**
+
+For very high-load scenarios, use pgbouncer:
+
+```bash
+# Install pgbouncer
+sudo apt-get install pgbouncer
+```
+
+Configure `pgbouncer.ini`:
+
+```ini
+[databases]
+forge = host=localhost port=5432 dbname=forge
+
+[pgbouncer]
+pool_mode = transaction
+listen_port = 6432
+max_client_conn = 100
+default_pool_size = 25
+```
+
+Update DATABASE_URL:
+
+```env
+DATABASE_URL=postgresql://forge:forge@localhost:6432/forge
+```
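+
+To verify that traffic actually flows through pgbouncer, query its admin console (a virtual database named `pgbouncer`). A minimal sketch with `psycopg2`; it assumes `forge` is listed in pgbouncer's `admin_users` or `stats_users`:
+
+```python
+import psycopg2
+
+# Connect to the pgbouncer admin console, not to the forge database
+conn = psycopg2.connect(host="localhost", port=6432, dbname="pgbouncer",
+                        user="forge", password="forge")
+conn.autocommit = True  # the admin console does not support transactions
+
+with conn.cursor() as cur:
+    cur.execute("SHOW POOLS;")
+    for row in cur.fetchall():
+        print(row)  # database, user, cl_active, cl_waiting, sv_active, ...
+conn.close()
+```
+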
+### 2. **Read Replicas**
+
+For read-heavy workloads, implement read replicas and route read queries separately, as sketched below.
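+
+One way to do the routing is to keep one engine per database and pick an engine per unit of work. A minimal sketch (the `DATABASE_REPLICA_URL` variable and the helper are illustrative, not part of the Forge codebase):
+
+```python
+import os
+
+from sqlalchemy import create_engine
+
+# The primary handles all writes; the replica URL is an assumed extra setting
+primary = create_engine(os.environ["DATABASE_URL"], pool_size=3, max_overflow=2)
+replica = create_engine(os.environ["DATABASE_REPLICA_URL"], pool_size=3, max_overflow=2)
+
+def engine_for(readonly: bool):
+    """Route read-only work to the replica, everything else to the primary."""
+    return replica if readonly else primary
+```
+
+Note that every extra engine brings its own pool, so include it in the connection budget above.
+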
int(os.getenv("WORKERS", "10")) + total_potential = workers * (info['pool_size'] + info['max_overflow']) * 2 + print(f"\n📊 Production Calculation (with {workers} workers):") + print(f" Max potential connections: {total_potential}") + print(f" Per worker: {(info['pool_size'] + info['max_overflow']) * 2}") + + return True + + except Exception as e: + print(f"❌ Error checking SQLAlchemy pools: {e}") + return False + + +async def test_async_connection(): + """Test async database connection""" + try: + print("\n🔍 Testing async database connection...") + + from app.core.database import get_db_session + + async with get_db_session() as session: + result = await session.execute(text("SELECT version()")) + version = result.scalar() + print(f"✅ Async connection successful") + print(f" PostgreSQL version: {version}") + return True + + except Exception as e: + print(f"❌ Async connection failed: {e}") + return False + + +def test_sync_connection(): + """Test sync database connection""" + try: + print("\n🔍 Testing sync database connection...") + + from app.core.database import get_db + + with next(get_db()) as session: + result = session.execute(text("SELECT current_database()")) + db_name = result.scalar() + print(f"✅ Sync connection successful") + print(f" Current database: {db_name}") + return True + + except Exception as e: + print(f"❌ Sync connection failed: {e}") + return False + + +def show_recommendations(): + """Show recommendations for fixing connection issues""" + print("\n💡 Recommendations for fixing connection issues:") + print("\n1. **Immediate fixes:**") + print(" - Restart your application to apply new connection pool settings") + print(" - Monitor connection usage with this diagnostic tool") + + print("\n2. **PostgreSQL tuning:**") + print(" - Increase max_connections in postgresql.conf") + print(" - Set max_connections = 200 (or higher based on your needs)") + print(" - Also increase shared_buffers if you increase max_connections") + + print("\n3. **Application tuning:**") + print(" - Reduce number of Gunicorn workers if needed") + print(" - Use environment variables to tune connection pools:") + print(" export DB_POOL_SIZE=2") + print(" export DB_MAX_OVERFLOW=1") + print(" export WORKERS=5") + + print("\n4. **Connection pooling (for high-load production):**") + print(" - Consider using pgbouncer for connection pooling") + print(" - Set connection_limit in pgbouncer.ini") + + print("\n5. 
**Monitoring:**") + print(" - Run this script regularly to monitor connection usage") + print(" - Set up alerts when connection usage exceeds 80%") + + +async def main(): + """Main diagnostic function""" + print("🚀 Database Connection Diagnostic Tool") + print("=" * 50) + print(f"⏰ Timestamp: {datetime.now()}") + + # Load environment + load_dotenv() + + # Run all checks + postgres_ok = check_postgres_max_connections() + pools_ok = check_sqlalchemy_pools() + async_ok = await test_async_connection() + sync_ok = test_sync_connection() + + print("\n" + "=" * 50) + print("📋 SUMMARY:") + print(f" PostgreSQL config: {'✅' if postgres_ok else '❌'}") + print(f" SQLAlchemy pools: {'✅' if pools_ok else '❌'}") + print(f" Async connection: {'✅' if async_ok else '❌'}") + print(f" Sync connection: {'✅' if sync_ok else '❌'}") + + if not all([postgres_ok, pools_ok, async_ok, sync_ok]): + print("\n❌ Issues detected!") + show_recommendations() + return False + else: + print("\n✅ All checks passed!") + return True + + +if __name__ == "__main__": + # Ensure we can import from the app + sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + try: + result = asyncio.run(main()) + sys.exit(0 if result else 1) + except KeyboardInterrupt: + print("\n⏹️ Diagnostic cancelled by user") + sys.exit(1) + except Exception as e: + print(f"\n💥 Unexpected error: {e}") + sys.exit(1) \ No newline at end of file