-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDockerfile
More file actions
193 lines (165 loc) · 6.72 KB
/
Dockerfile
File metadata and controls
193 lines (165 loc) · 6.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
################################################################################
# llama.cpp Studio - Multi-stage Docker build
################################################################################
# Base image for the python-builder and runtime stages (Stage 1 pins node:20-slim
# independently). Overridable at build time, e.g.:
#   docker build --build-arg BASE_IMAGE=ubuntu:22.04 .
ARG BASE_IMAGE=ubuntu:24.04
################################################################################
# Stage 1: Frontend Builder
# Purpose: Compile Vue.js frontend with Vite
################################################################################
FROM node:20-slim AS frontend-builder
WORKDIR /build

# Install dependencies first so this layer stays cached until package*.json
# changes. devDependencies are needed: Vite and the build tooling live there.
# Prefer the reproducible `npm ci` when a lockfile exists; fall back to
# `npm install` otherwise.
COPY package*.json ./
RUN if [ -f package-lock.json ] || [ -f npm-shrinkwrap.json ]; then \
        npm ci; \
    else \
        npm install; \
    fi

# Bring in the frontend sources in the same layout the repo-root build scripts
# expect, then emit the production bundle into /build/frontend/dist.
COPY frontend/ ./frontend/
RUN npm run build
################################################################################
# Stage 2: Python Builder
# Purpose: Create isolated venv with all Python dependencies
################################################################################
FROM ${BASE_IMAGE} AS python-builder
# Build-time only; this stage is discarded, so the env var never reaches runtime.
ENV DEBIAN_FRONTEND=noninteractive
# Build dependencies required for python wheels and runtime compilation (llama.cpp)
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3 \
    python3-venv \
    python3-pip \
    python3-dev \
    build-essential \
    cmake \
    gcc \
    g++ \
    pkg-config \
    libssl-dev \
    libffi-dev \
    libcurl4-openssl-dev \
    libopenblas-dev \
    git \
    curl \
    wget \
    ca-certificates \
    pciutils \
    lsb-release \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# Install Rust (required to compile the `tokenizers` wheel when no prebuilt
# wheel matches). Download the installer to a file instead of piping curl
# into sh: the default /bin/sh has no pipefail, so with `curl | sh` a failed
# download would be silently masked and the layer would "succeed" with no
# Rust toolchain installed.
RUN curl --proto '=https' --tlsv1.2 -fsSL https://sh.rustup.rs -o /tmp/rustup-init.sh \
    && sh /tmp/rustup-init.sh -y --profile minimal --default-toolchain stable \
    && rm /tmp/rustup-init.sh
ENV PATH="/root/.cargo/bin:${PATH}"
# Create an isolated venv and install Python packages into it; the whole venv
# is copied into the runtime stage later.
ENV VENV_PATH=/opt/venv
RUN python3 -m venv ${VENV_PATH}
ENV PATH="${VENV_PATH}/bin:${PATH}"
COPY requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \
    pip install --no-cache-dir -r /tmp/requirements.txt && \
    rm /tmp/requirements.txt
################################################################################
# Stage 3: Runtime
# Purpose: Minimal production image with compiled artifacts
################################################################################
FROM ${BASE_IMAGE} AS runtime
# Notes on the environment below:
# - DEBIAN_FRONTEND is deliberately left in the runtime env: this image
#   installs packages and compiles llama.cpp at runtime (see next RUN).
# - CUDA_VISIBLE_DEVICES is intentionally NOT set. It accepts only
#   comma-separated device indices/UUIDs; an invalid value such as "all"
#   makes the CUDA runtime hide every GPU. Leaving it unset exposes all
#   devices, and container-level GPU exposure is already controlled by
#   NVIDIA_VISIBLE_DEVICES (where "all" IS valid).
# - LD_LIBRARY_PATH is set without appending ":${LD_LIBRARY_PATH}": the base
#   image leaves it undefined, and the resulting empty trailing element would
#   add the current working directory to the dynamic linker search path.
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    NVIDIA_VISIBLE_DEVICES=all \
    NVIDIA_DRIVER_CAPABILITIES=compute,utility \
    HF_HOME=/app/data/hf-cache \
    HUGGINGFACE_HUB_CACHE=/app/data/hf-cache/hub \
    VENV_PATH=/opt/venv \
    PYTHONPATH=/app \
    PATH="/app/data/cuda/current/bin:${PATH}" \
    LD_LIBRARY_PATH="/app/data/cuda/current/lib64"
# Runtime dependencies. The build toolchain (compilers, headers, git, ninja)
# is retained on purpose: llama.cpp is compiled inside the running container,
# and pciutils is used for GPU detection.
RUN apt-get update && apt-get install -y --no-install-recommends \
    # Python runtime + headers for native extensions
    python3 \
    python3-dev \
    python3-pip \
    python3-venv \
    # Toolchain for runtime llama.cpp builds
    build-essential \
    git \
    ninja-build \
    pkg-config \
    # Download helpers
    ca-certificates \
    curl \
    wget \
    # Core libs for Python packages
    libcurl4 \
    libcurl4-openssl-dev \
    libffi8 \
    libopenblas0 \
    libssl-dev \
    libssl3 \
    # GPU acceleration support (CUDA only)
    libnuma1 \
    ocl-icd-libopencl1 \
    pciutils \
    # Optional: ROCm tooling — tolerated to fail on non-ROCm hosts
    && (apt-get install -y --no-install-recommends rocminfo rocm-smi || echo "ROCm unavailable") \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean
# CUDA installation itself happens at runtime via the built-in CUDA installer;
# the toolkit, cuDNN and TensorRT all land under /app/data/cuda/, which is
# volume-backed and so persists across container restarts.
#
# Modern CMake (3.28+) is required for CUDA 12.x support. Ubuntu 24.04 may
# ship a newer cmake, but a fixed version is installed for consistency, and
# early in the file so application-code changes do not invalidate this layer.
ARG CMAKE_VERSION=3.31.3
RUN curl -fsSL "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh" -o /tmp/cmake.sh \
    && sh /tmp/cmake.sh --skip-license --prefix=/usr/local \
    && rm /tmp/cmake.sh \
    && cmake --version
# Install the llama-swap proxy binary from its GitHub release tarball; the
# trailing --version call fails the build early if the binary is unusable.
ARG LLAMA_SWAP_VERSION=197
RUN curl -fsSL "https://github.com/mostlygeek/llama-swap/releases/download/v${LLAMA_SWAP_VERSION}/llama-swap_${LLAMA_SWAP_VERSION}_linux_amd64.tar.gz" -o /tmp/llama-swap.tar.gz \
    && tar -xzf /tmp/llama-swap.tar.gz -C /tmp \
    && install -m 0755 /tmp/llama-swap /usr/local/bin/llama-swap \
    && rm -rf /tmp/* \
    && llama-swap --version
# Copy the pre-built Python venv from the builder stage (VENV_PATH=/opt/venv,
# set in this stage's ENV block) and put its bin/ first on PATH so "python"
# and "pip" resolve to the venv interpreters.
COPY --from=python-builder ${VENV_PATH} ${VENV_PATH}
ENV PATH="${VENV_PATH}/bin:${PATH}"
# Set up application directory (created automatically by WORKDIR)
WORKDIR /app
# Copy application code (data/ is excluded via .dockerignore)
COPY backend/ ./backend/
COPY --from=frontend-builder /build/frontend/dist ./frontend/dist
COPY frontend/public ./frontend/public
# Entrypoint script and CUDA environment helper. Normalize to Unix line
# endings (a CRLF after the shebang breaks exec), make both executable, and
# fail the build early if the entrypoint lacks the expected bash shebang.
# The previous `[ -f ]` / `[ -x ]` checks were dead code: `sed -i` already
# fails the build when the file is missing, and `chmod 755` has just set the
# execute bit.
COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh
COPY setup-cuda-env.sh /app/setup-cuda-env.sh
RUN sed -i 's/\r$//' /usr/local/bin/docker-entrypoint.sh /app/setup-cuda-env.sh && \
    chmod 755 /usr/local/bin/docker-entrypoint.sh /app/setup-cuda-env.sh && \
    head -n 1 /usr/local/bin/docker-entrypoint.sh | grep -q "^#!/bin/bash"
# Create python symlink for tools that invoke "python" outside the venv PATH
RUN ln -sf /usr/bin/python3 /usr/bin/python
# Create the non-root runtime user and the data directory skeleton.
# /app/data is volume-backed; final ownership/permissions are (re)applied at
# container start, this just makes the image usable standalone.
# (The earlier RUN already chmod 755'd the entrypoint script — repeating it
# here was redundant and has been dropped.)
RUN useradd -m -s /bin/bash appuser && \
    mkdir -p /app/data/models /app/data/config /app/data/logs /app/data/llama-cpp /app/data/hf-cache/hub && \
    chown -R appuser:appuser /app
# Note: /app/data is a volume mount, so permissions will be set by the entrypoint or at runtime
# Expose API port (documentation only — publish with -p / compose at run time)
EXPOSE 8080
# Declare volume for persistent data (declared after the directories were
# created above, so the skeleton is baked into the image)
VOLUME ["/app/data"]
# Everything from here on runs as the non-root user
USER appuser
# Health check against the API status endpoint; generous start period allows
# for first-boot CUDA/model setup before probes count against retries
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CMD curl -f http://localhost:8080/api/status || exit 1
# Exec-form entrypoint configures the CUDA environment, then should exec the
# CMD below so the app runs as PID 1 and receives SIGTERM from `docker stop`
# — NOTE(review): confirm docker-entrypoint.sh ends with `exec "$@"`.
ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
# Default arguments passed to the entrypoint; overridable at `docker run`
CMD ["python", "backend/main.py"]