MemexLLM-backend/Dockerfile at main · MohitGoyal09/MemexLLM-backend · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# ============================================================================
# FAST Multi-Stage Dockerfile - Optimized for Quick Builds
# ============================================================================
#
# DEFAULT MODE (Combined API + Worker):
#   The API server runs with an embedded Procrastinate worker.
#   This is cost-effective for Railway/Render ($5 plans).
#   Set ENABLE_EMBEDDED_WORKER=true (default) to use this mode.
#
# SEPARATE WORKER MODE (for scaling):
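#   Deploy two services (always pass --target explicitly; without it Docker
#   builds whichever stage comes last in this file):
#   1. API: Build with --target runtime and set ENABLE_EMBEDDED_WORKER=false
#   2. Worker: Build with --target worker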
#
# ============================================================================

FROM python:3.11.8-slim-bookworm AS base

# Interpreter and pip settings inherited by every stage built on this one.
ENV PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# -----------------------------------------------------------------------------
# OS packages (listed alphabetically below) and why each one is installed
# -----------------------------------------------------------------------------
#   build-essential      compiling Python packages
#   chromium,
#   chromium-driver      browser for PO Token generation (yt-dlp-getpot-wpc)
#   ffmpeg               audio/video processing
#   libgl1               OpenGL support for image processing
#   libglib2.0-0, libgomp1, libsm6, libxext6, libxrender-dev
#                        image processing (OpenCV dependency)
#   libmagic-dev         file type detection (python-magic)
#   libpq-dev            PostgreSQL development headers
#   libsndfile1          NOTE(review): presumably audio file I/O - confirm
#                        which Python package needs it
#   poppler-utils        PDF processing (pdfinfo, pdftotext)
#   tesseract-ocr        OCR for images and scanned PDFs
#   tesseract-ocr-eng    English language pack for OCR
#
# Deliberately NOT installed (add to the list below if needed):
#   tesseract-ocr-all    all OCR languages (adds ~400MB)
#   libreoffice          MS Office document conversion
# -----------------------------------------------------------------------------
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        chromium \
        chromium-driver \
        ffmpeg \
        libgl1 \
        libglib2.0-0 \
        libgomp1 \
        libmagic-dev \
        libpq-dev \
        libsm6 \
        libsndfile1 \
        libxext6 \
        libxrender-dev \
        poppler-utils \
        tesseract-ocr \
        tesseract-ocr-eng \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Fail the build right away if the tesseract binary cannot be executed.
RUN tesseract --version

# ============================================================================
# Builder Stage - Install Python packages
# ============================================================================
FROM base AS builder

# Build an isolated virtualenv. Putting its bin/ directory first on PATH makes
# every later `python` invocation in this stage resolve to the venv.
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

WORKDIR /build

# Bring in nothing but the requirements file, so the dependency layer below is
# rebuilt only when that file changes.
COPY requirements-docker.txt ./

# Upgrade pip first, then install the pinned set with the legacy resolver to
# avoid dependency backtracking.
RUN python -m pip install --upgrade pip \
    && python -m pip install --use-deprecated=legacy-resolver -r requirements-docker.txt

# Best-effort smoke test: try importing Unstructured's auto partitioner. A
# failed import only prints a warning; it does not fail the build.
RUN python -c "from unstructured.partition.auto import partition; print('✓ Unstructured installed successfully')" \
    || echo "⚠ Unstructured import warning"

# Pre-download removed (Kokoro replaced by Google TTS)

# ============================================================================
# Runtime Stage
# ============================================================================
FROM base AS runtime

# Create an unprivileged system user (with a home directory) to run the app.
RUN groupadd -r appuser && useradd -r -g appuser -m -d /home/appuser appuser

# Copy the fully-installed virtualenv from the builder stage and put it first
# on PATH so `python` / `uvicorn` resolve to it.
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

WORKDIR /app

# Copy application code, already owned by appuser.
COPY --chown=appuser:appuser src ./src
COPY --chown=appuser:appuser alembic.ini ./
COPY --chown=appuser:appuser migrations ./migrations
COPY --chown=appuser:appuser scripts ./scripts
# youtube cookies and tokens are optional and should be mounted as volumes or secrets if needed

# Create data directories and cache folders (including the Unstructured cache
# referenced by UNSTRUCTURED_CACHE_DIR below).
# Ownership: the COPY --chown lines above already cover the copied trees, so
# only /app itself and the freshly created directories need chown here. A
# blanket `chown -R /app` would re-touch every copied file in a new layer.
RUN mkdir -p data/uploads data/audio_output logs \
        .cache/huggingface .cache/fastembed .cache/unstructured && \
    chown appuser:appuser /app && \
    chown -R appuser:appuser data logs .cache

# Set cache directories to writable locations (HF_HOME is the modern standard)
ENV HF_HOME=/app/.cache/huggingface \
    SENTENCE_TRANSFORMERS_HOME=/app/.cache/huggingface \
    FASTEMBED_CACHE_PATH=/app/.cache/fastembed \
    # Unstructured settings
    UNSTRUCTURED_CACHE_DIR=/app/.cache/unstructured

# Everything from here on (model download and the running service) is non-root.
USER appuser

# Pre-download models to bake them into the image
# This prevents 30s+ timeout on startup and OOM kills
RUN python scripts/download_models.py

# Documentation only: the API listens on 8000 (see CMD below).
EXPOSE 8000

# Lighter health check.
# raise_for_status() is required: httpx.get() returns normally for 4xx/5xx
# responses, so without it an erroring liveness endpoint would still exit 0
# and the container would be reported healthy.
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
    CMD python -c "import httpx; httpx.get('http://localhost:8000/api/v1/health/liveness', timeout=5).raise_for_status()" || exit 1

# Default: Run API with embedded worker (combined mode)
# ENABLE_EMBEDDED_WORKER=true is the default
CMD ["uvicorn", "src.app:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1"]

# ============================================================================
# Worker variant (for separate deployment when scaling)
# Use this target only if you need to run worker separately:
#   docker build --target worker .
# Remember to set ENABLE_EMBEDDED_WORKER=false on the API service
# ============================================================================
FROM runtime AS worker
ENV ENABLE_EMBEDDED_WORKER=false
# The HEALTHCHECK inherited from the runtime stage probes the HTTP API on
# localhost:8000, but this stage's CMD starts the queue worker module instead
# of uvicorn, so that probe would mark the container unhealthy. Disable it.
# NOTE(review): assumes the worker does not serve HTTP on :8000 - confirm, or
# replace with a worker-specific check.
HEALTHCHECK NONE
CMD ["python", "-m", "src.services.queue.worker"]

# ============================================================================
# Default build target
# Docker builds the LAST stage in the file when no --target is given. Keeping
# this alias of "runtime" last makes a plain `docker build .` produce the
# combined API + embedded-worker image instead of the worker image.
# It adds no instructions: CMD, HEALTHCHECK, ENV and USER all come from
# the runtime stage.
# ============================================================================
FROM runtime AS api