Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,18 @@ ENV CLERK_API_KEY=${ARG_CLERK_API_KEY}
ENV CLERK_API_URL=${ARG_CLERK_API_URL:-https://api.clerk.dev/v1}
ENV FORGE_DEBUG_LOGGING=${ARG_DEBUG_LOGGING}

# Database connection optimization environment variables
# These settings optimize for PostgreSQL connection limits
ENV DB_POOL_SIZE=3
ENV DB_MAX_OVERFLOW=2
ENV DB_POOL_TIMEOUT=30
ENV DB_POOL_RECYCLE=1800
ENV DB_POOL_PRE_PING=true

# Reduced worker count to manage database connections
# With 5 workers: max 50 connections (5 × 3 × 2 engines = 30 pooled + 5 × 2 × 2 = 20 overflow)
ENV WORKERS=5

# Install system dependencies including PostgreSQL client and gosu for user privilege management
RUN apt-get update && apt-get install -y \
postgresql-client \
Expand All @@ -37,5 +49,5 @@ USER nobody
# Expose port
EXPOSE 8000

# Run the application (this command is passed to the entrypoint)
CMD ["gunicorn", "app.main:app", "-k", "uvicorn.workers.UvicornWorker", "--workers", "10", "--bind", "0.0.0.0:8000"]
# Use environment variable for workers count and optimize for database connections
CMD ["sh", "-c", "gunicorn app.main:app -k uvicorn.workers.UvicornWorker --workers ${WORKERS:-5} --bind 0.0.0.0:8000 --timeout 120 --max-requests 1000 --max-requests-jitter 100"]
12 changes: 10 additions & 2 deletions app/api/dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,11 @@ async def get_current_user(
except JWTError as err:
raise credentials_exception from err

result = await db.execute(select(User).filter(User.username == token_data.username))
result = await db.execute(
select(User)
.options(selectinload(User.api_keys)) # Eager load Forge API keys
.filter(User.username == token_data.username)
)
user = result.scalar_one_or_none()
if user is None:
raise credentials_exception
Expand Down Expand Up @@ -356,7 +360,11 @@ async def get_current_user_from_clerk(
)

# Find user by clerk_user_id
result = await db.execute(select(User).filter(User.clerk_user_id == clerk_user_id))
result = await db.execute(
select(User)
.options(selectinload(User.api_keys)) # Eager load Forge API keys
.filter(User.clerk_user_id == clerk_user_id)
)
user = result.scalar_one_or_none()

# User doesn't exist yet, create one
Expand Down
122 changes: 122 additions & 0 deletions app/api/routes/health.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""
Health check and monitoring endpoints for production deployments.
"""

from datetime import datetime, timezone

from fastapi import APIRouter, HTTPException
from sqlalchemy import text

from app.core.database import get_connection_info, get_db_session
from app.core.logger import get_logger

logger = get_logger(name="health")
router = APIRouter()


@router.get("/health")
async def health_check():
    """
    Basic liveness probe.

    Touches no downstream dependencies; a 200 response simply means the
    process is up and able to serve requests.
    """
    payload = {"status": "healthy", "service": "forge"}
    return payload


@router.get("/health/database")
async def database_health_check():
    """
    Database health check endpoint.

    Round-trips a trivial query to confirm connectivity, then reports
    live usage counters for both the sync and async connection pools.

    Returns:
        dict: pool checkout/checkin counts, sizes, usage percentages,
        and the configured pool settings.

    Raises:
        HTTPException: 503 with diagnostic detail when the database is
            unreachable or pool inspection fails.
    """
    try:
        # Cheapest possible connectivity probe.
        async with get_db_session() as session:
            result = await session.execute(text("SELECT 1"))
            result.scalar()

        # Connection pool information (shared config for both engines).
        pool_info = get_connection_info()

        sync_pool = pool_info['sync_engine']
        async_pool = pool_info['async_engine']

        # Per-engine capacity = base pool + overflow. Both values come from
        # env vars (DB_POOL_SIZE / DB_MAX_OVERFLOW), so guard against a
        # zero capacity to avoid ZeroDivisionError in the usage math.
        capacity = pool_info['pool_size'] + pool_info['max_overflow']
        if capacity > 0:
            sync_usage = sync_pool['checked_out'] / capacity * 100
            async_usage = async_pool['checked_out'] / capacity * 100
        else:
            sync_usage = async_usage = 0.0

        return {
            "status": "healthy",
            "database": "connected",
            "connection_pools": {
                "sync": {
                    "checked_out": sync_pool['checked_out'],
                    "checked_in": sync_pool['checked_in'],
                    "size": sync_pool['size'],
                    "usage_percent": round(sync_usage, 1)
                },
                "async": {
                    "checked_out": async_pool['checked_out'],
                    "checked_in": async_pool['checked_in'],
                    "size": async_pool['size'],
                    "usage_percent": round(async_usage, 1)
                }
            },
            "configuration": {
                "pool_size": pool_info['pool_size'],
                "max_overflow": pool_info['max_overflow'],
                "pool_timeout": pool_info['pool_timeout'],
                "pool_recycle": pool_info['pool_recycle']
            }
        }

    except Exception as e:
        logger.error(f"Database health check failed: {e}")
        raise HTTPException(
            status_code=503,
            detail={
                "status": "unhealthy",
                "database": "disconnected",
                "error": str(e)
            }
        )


@router.get("/health/detailed")
async def detailed_health_check():
    """
    Detailed health check including all service components.

    Reports the database server version, connection pool status, and a
    real UTC timestamp (the previous implementation hard-coded one).

    Returns:
        dict: service status, timestamp, database version, and pool info.

    Raises:
        HTTPException: 503 with the error and timestamp when any
            component check fails.
    """
    # Capture once so the success and failure responses share the same value.
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    try:
        # Test database and grab the server version string in one query.
        async with get_db_session() as session:
            db_result = await session.execute(text("SELECT version()"))
            db_version = db_result.scalar()

        pool_info = get_connection_info()

        return {
            "status": "healthy",
            "timestamp": timestamp,
            "service": "forge",
            "version": "0.1.0",
            "database": {
                "status": "connected",
                "version": db_version,
                "pool_status": pool_info
            },
            "environment": {
                # 'workers' is only present if get_connection_info exposes it.
                "workers": pool_info.get('workers', 'unknown'),
                "pool_size": pool_info['pool_size'],
                "max_overflow": pool_info['max_overflow']
            }
        }

    except Exception as e:
        logger.error(f"Detailed health check failed: {e}")
        raise HTTPException(
            status_code=503,
            detail={
                "status": "unhealthy",
                "error": str(e),
                "timestamp": timestamp
            }
        )
38 changes: 33 additions & 5 deletions app/core/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,14 @@

load_dotenv()

POOL_SIZE = 5
MAX_OVERFLOW = 10
MAX_TIMEOUT = 30
POOL_RECYCLE = 1800
# Production-optimized connection pool settings
# With 5 Gunicorn workers (WORKERS in the Dockerfile), the base pools allow at most
# 30 connections (5 workers × 3 pool_size × 2 engines), plus 20 overflow connections
# (5 workers × 2 max_overflow × 2 engines) = 50 max connections
POOL_SIZE = int(os.getenv("DB_POOL_SIZE", "3")) # Reduced from 5 to 3
MAX_OVERFLOW = int(os.getenv("DB_MAX_OVERFLOW", "2")) # Reduced from 10 to 2
MAX_TIMEOUT = int(os.getenv("DB_POOL_TIMEOUT", "30"))
POOL_RECYCLE = int(os.getenv("DB_POOL_RECYCLE", "1800")) # 30 minutes
POOL_PRE_PING = os.getenv("DB_POOL_PRE_PING", "true").lower() == "true"

SQLALCHEMY_DATABASE_URL = os.getenv("DATABASE_URL")
if not SQLALCHEMY_DATABASE_URL:
Expand All @@ -24,6 +28,7 @@
max_overflow=MAX_OVERFLOW,
pool_timeout=MAX_TIMEOUT,
pool_recycle=POOL_RECYCLE,
pool_pre_ping=POOL_PRE_PING, # Enables connection health checks
echo=False,
)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
Expand Down Expand Up @@ -52,6 +57,7 @@ def get_db():
max_overflow=MAX_OVERFLOW,
pool_timeout=MAX_TIMEOUT,
pool_recycle=POOL_RECYCLE,
pool_pre_ping=POOL_PRE_PING, # Enables connection health checks
echo=False,
)

Expand Down Expand Up @@ -84,4 +90,26 @@ async def get_db_session():
await session.rollback()
raise
finally:
await session.close()
await session.close()


def get_connection_info():
    """Return current connection pool configuration and live usage counters.

    Every value in the returned dict is JSON-serializable so it can be
    returned directly from monitoring endpoints (the previous version
    embedded the raw pool objects under a "pool" key, which cannot be
    serialized into a JSON response).

    Returns:
        dict: pool settings, deployment worker count, and per-engine
        checkout/checkin/size counters plus a human-readable pool status.
    """
    return {
        "pool_size": POOL_SIZE,
        "max_overflow": MAX_OVERFLOW,
        "pool_timeout": MAX_TIMEOUT,
        "pool_recycle": POOL_RECYCLE,
        # Worker count is set by the deployment (WORKERS env var in the
        # Dockerfile); consumers fall back to 'unknown' if unset.
        "workers": os.getenv("WORKERS", "unknown"),
        "sync_engine": {
            # status() yields a short summary string, e.g. "Pool size: 3 ...".
            "status": engine.pool.status(),
            "checked_out": engine.pool.checkedout(),
            "checked_in": engine.pool.checkedin(),
            "size": engine.pool.size(),
        },
        "async_engine": {
            "status": async_engine.pool.status(),
            "checked_out": async_engine.pool.checkedout(),
            "checked_in": async_engine.pool.checkedin(),
            "size": async_engine.pool.size(),
        },
    }
Loading
Loading