diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 00000000..f69435b1 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,65 @@ +name: Pre-commit Checks + +on: + push: + branches: [master, main, develop] + pull_request: + branches: [master, main, develop] + +jobs: + pre-commit: + name: Run Pre-commit Hooks + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + cache: 'pip' + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + cache: 'npm' + cache-dependency-path: frontend/package-lock.json + + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y shellcheck + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install pre-commit + + - name: Cache pre-commit environments + uses: actions/cache@v4 + with: + path: ~/.cache/pre-commit + key: pre-commit-${{ hashFiles('.pre-commit-config.yaml') }} + restore-keys: | + pre-commit- + + - name: Run pre-commit + run: pre-commit run --all-files --show-diff-on-failure + + - name: Generate pre-commit summary + if: always() + run: | + echo "## Pre-commit Check Results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "Pre-commit hooks executed on all files." 
>> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Hooks Executed:" >> $GITHUB_STEP_SUMMARY + echo "- Code formatting (Ruff, Prettier)" >> $GITHUB_STEP_SUMMARY + echo "- Linting (Ruff, mypy, ESLint)" >> $GITHUB_STEP_SUMMARY + echo "- Security checks (Gitleaks, Bandit)" >> $GITHUB_STEP_SUMMARY + echo "- Shell script validation (shellcheck)" >> $GITHUB_STEP_SUMMARY + echo "- Dockerfile linting (Hadolint)" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/security-scan.yml b/.github/workflows/security-scan.yml new file mode 100644 index 00000000..b86453a3 --- /dev/null +++ b/.github/workflows/security-scan.yml @@ -0,0 +1,248 @@ +name: Security Scanning + +on: + push: + branches: [master, main, develop] + paths: + - 'backend/**' + - 'frontend/**' + - '**/Dockerfile*' + - '.github/workflows/security-scan.yml' + pull_request: + branches: [master, main, develop] + paths: + - 'backend/**' + - 'frontend/**' + - '**/Dockerfile*' + - '.github/workflows/security-scan.yml' + schedule: + # Run weekly security scans on Sundays at 00:00 UTC + - cron: '0 0 * * 0' + workflow_dispatch: + inputs: + component: + description: 'Component to scan (backend, frontend, or all)' + required: false + default: 'all' + type: choice + options: + - all + - backend + - frontend + +jobs: + dockerfile-lint: + name: Lint Dockerfiles + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Run Hadolint on backend Dockerfile + uses: hadolint/hadolint-action@v3.1.0 + with: + dockerfile: backend/Dockerfile.prod + config: .hadolint.yaml + failure-threshold: warning + + - name: Run Hadolint on frontend Dockerfile + uses: hadolint/hadolint-action@v3.1.0 + with: + dockerfile: frontend/Dockerfile.prod + config: .hadolint.yaml + failure-threshold: warning + + build-and-scan-backend: + name: Build & Scan Backend + runs-on: ubuntu-latest + permissions: + contents: read + security-events: write + steps: + - name: Checkout code + uses: 
actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build backend image + uses: docker/build-push-action@v5 + with: + context: ./backend + file: ./backend/Dockerfile.prod + tags: opentranscribe-backend:${{ github.sha }} + load: true + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master + with: + image-ref: opentranscribe-backend:${{ github.sha }} + format: 'sarif' + output: 'trivy-backend-results.sarif' + severity: 'CRITICAL,HIGH,MEDIUM' + + - name: Upload Trivy results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v3 + if: always() + with: + sarif_file: 'trivy-backend-results.sarif' + category: 'trivy-backend' + + - name: Run Trivy vulnerability scanner (table output) + uses: aquasecurity/trivy-action@master + with: + image-ref: opentranscribe-backend:${{ github.sha }} + format: 'table' + severity: 'CRITICAL,HIGH,MEDIUM' + + - name: Run Dockle + uses: erzz/dockle-action@v1 + with: + image: opentranscribe-backend:${{ github.sha }} + exit-code: '0' + failure-threshold: warn + + - name: Generate SBOM with Syft + uses: anchore/sbom-action@v0 + with: + image: opentranscribe-backend:${{ github.sha }} + artifact-name: backend-sbom.spdx.json + output-file: backend-sbom.spdx.json + + - name: Scan SBOM with Grype + uses: anchore/scan-action@v3 + with: + sbom: backend-sbom.spdx.json + fail-build: false + severity-cutoff: medium + + - name: Upload SBOM artifact + uses: actions/upload-artifact@v4 + if: always() + with: + name: backend-sbom + path: backend-sbom.spdx.json + retention-days: 90 + + build-and-scan-frontend: + name: Build & Scan Frontend + runs-on: ubuntu-latest + permissions: + contents: read + security-events: write + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build frontend image + uses: docker/build-push-action@v5 
+ with: + context: ./frontend + file: ./frontend/Dockerfile.prod + tags: opentranscribe-frontend:${{ github.sha }} + load: true + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master + with: + image-ref: opentranscribe-frontend:${{ github.sha }} + format: 'sarif' + output: 'trivy-frontend-results.sarif' + severity: 'CRITICAL,HIGH,MEDIUM' + + - name: Upload Trivy results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v3 + if: always() + with: + sarif_file: 'trivy-frontend-results.sarif' + category: 'trivy-frontend' + + - name: Run Trivy vulnerability scanner (table output) + uses: aquasecurity/trivy-action@master + with: + image-ref: opentranscribe-frontend:${{ github.sha }} + format: 'table' + severity: 'CRITICAL,HIGH,MEDIUM' + + - name: Run Dockle + uses: erzz/dockle-action@v1 + with: + image: opentranscribe-frontend:${{ github.sha }} + exit-code: '0' + failure-threshold: warn + + - name: Generate SBOM with Syft + uses: anchore/sbom-action@v0 + with: + image: opentranscribe-frontend:${{ github.sha }} + artifact-name: frontend-sbom.spdx.json + output-file: frontend-sbom.spdx.json + + - name: Scan SBOM with Grype + uses: anchore/scan-action@v3 + with: + sbom: frontend-sbom.spdx.json + fail-build: false + severity-cutoff: medium + + - name: Upload SBOM artifact + uses: actions/upload-artifact@v4 + if: always() + with: + name: frontend-sbom + path: frontend-sbom.spdx.json + retention-days: 90 + + dependency-scan: + name: Scan Dependencies + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Run Trivy vulnerability scanner on repository + uses: aquasecurity/trivy-action@master + with: + scan-type: 'fs' + scan-ref: '.' + format: 'table' + severity: 'CRITICAL,HIGH' + skip-dirs: 'node_modules,venv,.venv' + + - name: Run Trivy config scanner + uses: aquasecurity/trivy-action@master + with: + scan-type: 'config' + scan-ref: '.' 
+ format: 'table' + severity: 'CRITICAL,HIGH' + skip-dirs: 'node_modules,venv,.venv' + + summary: + name: Security Scan Summary + runs-on: ubuntu-latest + needs: [dockerfile-lint, build-and-scan-backend, build-and-scan-frontend, dependency-scan] + if: always() + steps: + - name: Check scan results + run: | + echo "## Security Scan Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "All security scans completed. Check individual job logs for details." >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Completed Scans:" >> $GITHUB_STEP_SUMMARY + echo "- ✅ Dockerfile Linting (Hadolint)" >> $GITHUB_STEP_SUMMARY + echo "- ✅ Backend Image Scanning (Trivy, Dockle, Grype)" >> $GITHUB_STEP_SUMMARY + echo "- ✅ Frontend Image Scanning (Trivy, Dockle, Grype)" >> $GITHUB_STEP_SUMMARY + echo "- ✅ Dependency Scanning (Trivy)" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Reports Available:" >> $GITHUB_STEP_SUMMARY + echo "- SBOM artifacts uploaded for both backend and frontend" >> $GITHUB_STEP_SUMMARY + echo "- Vulnerability results uploaded to GitHub Security tab" >> $GITHUB_STEP_SUMMARY diff --git a/.gitignore b/.gitignore index 7ffb6d2c..c3da8ad9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,8 @@ # OpenTranscribe .gitignore .github/DRAFT_ISSUES/ +offline-package-build/ +security-reports-* # Environment variables .env @@ -238,3 +240,4 @@ frontend/static/ffmpeg/ # Private project management files GITHUB_OPTIMIZATION_TASKS.md +security-reports/ diff --git a/.hadolint.yaml b/.hadolint.yaml new file mode 100644 index 00000000..2b1bc689 --- /dev/null +++ b/.hadolint.yaml @@ -0,0 +1,33 @@ +# Hadolint configuration for OpenTranscribe Dockerfiles +# https://github.com/hadolint/hadolint + +# Ignore specific rules +ignored: + - DL3008 # Pin versions in apt-get install (Debian/Ubuntu) + - DL3009 # Delete the apt-get lists after installing (already handled) + - DL3015 # Additional packages by --no-install-recommends 
(sometimes we need them) + - DL3018 # Pin versions in apk add (Alpine - can break builds) + - DL3059 # Multiple consecutive RUN instructions (acceptable for layer caching) + +# Trust specific registries +trustedRegistries: + - docker.io + - ghcr.io + - nvcr.io + +# Override default rules severity +override: + error: + - DL3002 # Do not switch to root USER + - DL3020 # Use COPY instead of ADD for files and folders + - DL4006 # Set SHELL option -o pipefail before RUN with pipe + warning: + - DL3003 # Use WORKDIR to switch to a directory + - DL3007 # Using latest tag for image + - DL3042 # Avoid cache directory with pip + info: + - DL3013 # Pin versions in pip install + - DL3016 # Pin versions in npm install + +# Strict mode - warn on all other issues +failure-threshold: warning diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..c170a3e7 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,98 @@ +# Pre-commit hooks configuration for OpenTranscribe +# See https://pre-commit.com for more information + +repos: + # General pre-commit hooks + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + args: [--unsafe] # Allow custom YAML tags in docker-compose + - id: check-json + - id: check-added-large-files + args: [--maxkb=10240] # 10MB max file size + - id: check-merge-conflict + - id: check-toml + - id: mixed-line-ending + args: [--fix=lf] + - id: check-executables-have-shebangs + - id: check-shebang-scripts-are-executable + + # Python formatting and linting with Ruff (replaces Black, isort, flake8) + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.2.2 + hooks: + - id: ruff + args: [--fix, --show-fixes] + files: ^backend/ + - id: ruff-format + files: ^backend/ + + # Python type checking + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.8.0 + hooks: + - id: mypy + files: ^backend/ + 
additional_dependencies: + - types-requests + - types-python-dateutil + - types-pyyaml + - types-redis + args: [--ignore-missing-imports, --check-untyped-defs] + + # Frontend formatting with Prettier + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v3.1.0 + hooks: + - id: prettier + files: ^frontend/ + types_or: [javascript, jsx, ts, tsx, json, yaml, html, css, scss, markdown] + args: [--write, --ignore-unknown] + + # Dockerfile linting + - repo: https://github.com/hadolint/hadolint + rev: v2.12.0 + hooks: + - id: hadolint-docker + name: Lint Dockerfiles + files: Dockerfile.* + entry: hadolint + language: system + + # Secret detection + - repo: https://github.com/gitleaks/gitleaks + rev: v8.18.1 + hooks: + - id: gitleaks + args: [--verbose, --no-banner] + + # Python security linting + - repo: https://github.com/PyCQA/bandit + rev: 1.7.7 + hooks: + - id: bandit + files: ^backend/ + args: [-c, pyproject.toml, -r, backend/] + additional_dependencies: ["bandit[toml]"] + + # Shell script linting + - repo: https://github.com/shellcheck-py/shellcheck-py + rev: v0.9.0.6 + hooks: + - id: shellcheck + args: [--severity=warning] + + # Commit message linting (optional) + - repo: https://github.com/compilerla/conventional-pre-commit + rev: v3.0.0 + hooks: + - id: conventional-pre-commit + stages: [commit-msg] + args: [--force-scope] + +# Configuration for specific hooks +default_stages: [commit] +fail_fast: false diff --git a/backend/DOCKER_STRATEGY.md b/backend/DOCKER_STRATEGY.md new file mode 100644 index 00000000..bf201d63 --- /dev/null +++ b/backend/DOCKER_STRATEGY.md @@ -0,0 +1,188 @@ +# Docker Build Strategy - OpenTranscribe Backend + +## Overview + +The OpenTranscribe backend uses two Docker build strategies optimized for different use cases: + +1. **Dockerfile.prod** - Standard production build (currently in use) +2. 
**Dockerfile.prod.optimized** - Multi-stage build for enhanced security (future use) + +## Current Configuration + +### Active Dockerfile: `Dockerfile.prod` + +**Base Image:** `python:3.12-slim-bookworm` (Debian 12) + +**Key Features:** +- ✅ Single-stage build for faster iteration +- ✅ CUDA 12.8 & cuDNN 9 compatibility +- ✅ Security updates (CVE-2025-32434 fixed) +- ✅ Root user (required for GPU access in development) + +**Used By:** +- `backend` service (docker-compose.yml:80) +- `celery-worker` service (docker-compose.yml:152) +- `flower` service (docker-compose.yml:254) + +### ML/AI Stack (All cuDNN 9 Compatible) + +| Package | Version | Notes | +|---------|---------|-------| +| PyTorch | 2.8.0+cu128 | CVE-2025-32434 fixed, CUDA 12.8 | +| CTranslate2 | ≥4.6.0 | cuDNN 9 support | +| WhisperX | 3.7.0 | Latest with ctranslate2 4.5+ support | +| PyAnnote Audio | ≥3.3.2 | PyTorch 2.6+ compatible | +| NumPy | ≥1.25.2 | 2.x compatible, no CVEs | + +### Critical Configuration + +**LD_LIBRARY_PATH** (Line 28): +```dockerfile +ENV LD_LIBRARY_PATH=/usr/local/lib/python3.12/site-packages/nvidia/cudnn/lib:/usr/local/lib/python3.12/site-packages/nvidia/cuda_runtime/lib +``` + +**Why This Matters:** +- PyAnnote diarization requires cuDNN 9 libraries +- Libraries are in Python package directory, not system path +- Without this, you get: `Unable to load libcudnn_cnn.so.9` → SIGABRT crash +- Must be set at Dockerfile level (persistent, can't be overridden) + +## Future Strategy: Optimized Build + +### Dockerfile.prod.optimized (Not Yet Active) + +**When to Use:** +- Production deployments requiring maximum security +- Environments that support non-root containers +- CI/CD pipelines with security scanning + +**Key Improvements:** + +1. **Multi-Stage Build** + - Stage 1 (builder): Compiles dependencies with build tools + - Stage 2 (runtime): Minimal image, only runtime dependencies + - Result: ~40% smaller image size + +2. 
**Non-Root User** + - Runs as `appuser` (UID 1000) + - Follows principle of least privilege + - Better for production security posture + +3. **Security Enhancements** + - No build tools in final image + - No curl/git (attack surface reduction) + - OCI-compliant labels for tracking + - Built-in health checks + +4. **Library Paths** (Adjusted for non-root) + ```dockerfile + ENV LD_LIBRARY_PATH=/home/appuser/.local/lib/python3.12/site-packages/nvidia/cudnn/lib:/home/appuser/.local/lib/python3.12/site-packages/nvidia/cuda_runtime/lib + ``` + +### Migration Path + +**Phase 1: Current** ✅ +- Using `Dockerfile.prod` (root user) +- Verified working with GPU/CUDA +- All services stable + +**Phase 2: Testing** (Next Step) +1. Test `Dockerfile.prod.optimized` with same workload +2. Verify GPU access works with non-root user +3. Confirm cuDNN libraries load correctly +4. Run full transcription pipeline test + +**Phase 3: Migration** +1. Update docker-compose.yml to use `Dockerfile.prod.optimized` +2. Update GPU device permissions if needed +3. Deploy to staging environment +4. Monitor for 48 hours +5. 
Production rollout + +## Troubleshooting + +### Common Issues + +**Problem:** `Unable to load libcudnn_cnn.so.9` +- **Cause:** LD_LIBRARY_PATH not set +- **Fix:** Ensure LD_LIBRARY_PATH in Dockerfile (not docker-compose) + +**Problem:** `Worker exited with SIGABRT` +- **Cause:** cuDNN library version mismatch +- **Fix:** Verify PyTorch 2.8.0+cu128 → cuDNN 9.10.2 + +**Problem:** GPU not accessible in optimized build +- **Cause:** Non-root user lacks GPU permissions +- **Fix:** Add user to `video` group or use `--privileged` + +## Development Workflow + +### Local Development (with venv) +```bash +cd backend/ +source venv/bin/activate +pip install -r requirements-dev.txt # Includes testing tools +``` + +### Container Testing +```bash +# Current production build +./opentr.sh start prod + +# Test optimized build (after migration) +docker compose -f docker-compose.yml -f docker-compose.optimized.yml up +``` + +### Building Images +```bash +# Standard build +docker compose build backend celery-worker flower + +# Optimized build (future) +docker compose build -f Dockerfile.prod.optimized backend +``` + +## Security Considerations + +### Current (Dockerfile.prod) +- ✅ Updated base image (Debian 12 Bookworm) +- ✅ CVE-2025-32434 fixed (PyTorch 2.8.0) +- ✅ Minimal package installation +- ⚠️ Runs as root (required for current GPU setup) + +### Future (Dockerfile.prod.optimized) +- ✅ All above, plus: +- ✅ Non-root user execution +- ✅ Multi-stage build (no build tools in runtime) +- ✅ Explicit OCI labels for compliance +- ✅ Health check integration + +## File Structure + +``` +backend/ +├── Dockerfile.prod # Current production (in use) +├── Dockerfile.prod.optimized # Future optimized build +├── requirements.txt # Production dependencies +├── requirements-dev.txt # Development tools +├── DOCKER_STRATEGY.md # This file +└── .dockerignore # Excludes venv, etc. +``` + +## Key Takeaways + +1. **Always use Dockerfile.prod for now** - verified working +2. 
**LD_LIBRARY_PATH is critical** - must be in Dockerfile +3. **cuDNN 9 compatibility** - all packages updated +4. **Optimized build is ready** - awaiting GPU permission testing +5. **No downgrade needed** - NumPy 2.x works perfectly + +## Change History + +- **2025-10-11**: Initial strategy with cuDNN 9 migration + - Updated PyTorch 2.2.2 → 2.8.0+cu128 + - Updated CTranslate2 4.4.0 → 4.6.0 + - Updated WhisperX 3.4.3 → 3.7.0 + - Fixed LD_LIBRARY_PATH for cuDNN libraries + - Removed obsolete Dockerfile.dev variants + - Created Dockerfile.prod.optimized for future use diff --git a/backend/Dockerfile.dev b/backend/Dockerfile.dev deleted file mode 100644 index 842de9f1..00000000 --- a/backend/Dockerfile.dev +++ /dev/null @@ -1,33 +0,0 @@ -FROM python:3.12.11-slim-bullseye - -WORKDIR /app - -# Install system dependencies -RUN apt-get update && apt-get install -y \ - build-essential \ - curl \ - git \ - ffmpeg \ - libsndfile1 \ - libimage-exiftool-perl \ - && rm -rf /var/lib/apt/lists/* - -# Copy requirements file -COPY requirements.txt . - -# Install Python dependencies -RUN pip install --no-cache-dir -r requirements.txt -# https://github.com/OpenNMT/CTranslate2/issues/1806, also requires downgrading torch to be compatible with ver8 -RUN pip install --no-cache-dir "torch==2.2.2" "torchaudio==2.2.2" "numpy<2.0" - -# Create directories for models and temporary files -RUN mkdir -p /app/models /app/temp - -# Copy application code -COPY . . 
- -# Expose port -EXPOSE 8080 - -# Command to run the application -CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8080", "--reload"] diff --git a/backend/Dockerfile.prod b/backend/Dockerfile.prod index 1462c6d8..5766cc79 100644 --- a/backend/Dockerfile.prod +++ b/backend/Dockerfile.prod @@ -1,4 +1,4 @@ -FROM python:3.12.11-slim-bullseye +FROM python:3.12-slim-bookworm WORKDIR /app @@ -6,7 +6,6 @@ WORKDIR /app RUN apt-get update && apt-get install -y \ build-essential \ curl \ - git \ ffmpeg \ libsndfile1 \ libimage-exiftool-perl \ @@ -16,9 +15,17 @@ RUN apt-get update && apt-get install -y \ COPY requirements.txt . # Install Python dependencies +# All packages now use cuDNN 9 for CUDA 12.8 compatibility +# PyTorch 2.8.0+cu128 - includes CVE-2025-32434 security fix +# CTranslate2 4.6.0+ - cuDNN 9 support +# WhisperX 3.7.0 - latest version with ctranslate2 4.5+ compatibility +# NumPy 2.x - fully compatible with all packages, no security issues RUN pip install --no-cache-dir -r requirements.txt -# https://github.com/OpenNMT/CTranslate2/issues/1806, also requires downgrading torch to be compatible with ver8 -RUN pip install --no-cache-dir "torch==2.2.2" "torchaudio==2.2.2" "numpy<2.0" + +# Set LD_LIBRARY_PATH for cuDNN libraries from PyTorch package +# This ensures PyAnnote and other tools can find cuDNN 9 libraries +# Must be set at build time to persist in the container +ENV LD_LIBRARY_PATH=/usr/local/lib/python3.12/site-packages/nvidia/cudnn/lib:/usr/local/lib/python3.12/site-packages/nvidia/cuda_runtime/lib # Create directories for models and temporary files RUN mkdir -p /app/models /app/temp diff --git a/backend/Dockerfile.prod.optimized b/backend/Dockerfile.prod.optimized new file mode 100644 index 00000000..3c2f9889 --- /dev/null +++ b/backend/Dockerfile.prod.optimized @@ -0,0 +1,90 @@ +# ============================================================================= +# OpenTranscribe Backend - Production Dockerfile (Optimized) +# Multi-stage 
build optimized for security and minimal image size +# Updated with cuDNN 9 compatibility for PyTorch 2.8.0+cu128 +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Stage 1: Build Stage - Install Python dependencies with compilation +# ----------------------------------------------------------------------------- +FROM python:3.12-slim-bookworm AS builder + +WORKDIR /build + +# Install build dependencies (only in this stage) +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + gcc \ + g++ \ + && rm -rf /var/lib/apt/lists/* + +# Copy only requirements first for better layer caching +COPY requirements.txt . + +# Install Python dependencies +# All packages now use cuDNN 9 for CUDA 12.8 compatibility +# PyTorch 2.8.0+cu128 - includes CVE-2025-32434 security fix +# CTranslate2 4.6.0+ - cuDNN 9 support +# WhisperX 3.7.0 - latest version with ctranslate2 4.5+ compatibility +# NumPy 2.x - fully compatible with all packages, no security issues +# Use --user to install to /root/.local which we'll copy to final stage +RUN pip install --user --no-cache-dir --no-warn-script-location -r requirements.txt + +# ----------------------------------------------------------------------------- +# Stage 2: Runtime Stage - Minimal production image +# ----------------------------------------------------------------------------- +FROM python:3.12-slim-bookworm + +# OCI annotations for metadata +LABEL org.opencontainers.image.title="OpenTranscribe Backend" \ + org.opencontainers.image.description="AI-powered transcription backend with WhisperX and PyAnnote" \ + org.opencontainers.image.vendor="OpenTranscribe" \ + org.opencontainers.image.authors="OpenTranscribe Contributors" \ + org.opencontainers.image.licenses="MIT" \ + org.opencontainers.image.source="https://github.com/yourusername/transcribe-app" \ + 
org.opencontainers.image.documentation="https://github.com/yourusername/transcribe-app/blob/main/README.md" + +# Install only runtime dependencies (no build tools, no git, no curl) +RUN apt-get update && apt-get install -y --no-install-recommends \ + ffmpeg \ + libsndfile1 \ + libimage-exiftool-perl \ + libgomp1 \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Create non-root user for security +RUN groupadd -r appuser && \ + useradd -r -g appuser -u 1000 -m -s /bin/bash appuser && \ + mkdir -p /app /app/models /app/temp && \ + chown -R appuser:appuser /app + +# Set working directory +WORKDIR /app + +# Copy Python packages from builder stage +COPY --from=builder --chown=appuser:appuser /root/.local /home/appuser/.local + +# Ensure scripts in .local are usable by adding to PATH +# Set LD_LIBRARY_PATH for cuDNN libraries from PyTorch package +# This ensures PyAnnote and other tools can find cuDNN 9 libraries +ENV PATH=/home/appuser/.local/bin:$PATH \ + PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + LD_LIBRARY_PATH=/home/appuser/.local/lib/python3.12/site-packages/nvidia/cudnn/lib:/home/appuser/.local/lib/python3.12/site-packages/nvidia/cuda_runtime/lib + +# Copy application code +COPY --chown=appuser:appuser . . 
+ +# Switch to non-root user +USER appuser + +# Health check for container orchestration +HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \ + CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8080/health').read()" || exit 1 + +# Expose application port +EXPOSE 8080 + +# Run application with auto-scaling workers (Uvicorn detects CPU cores) +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8080"] diff --git a/backend/app/auth/direct_auth.py b/backend/app/auth/direct_auth.py index f7ea361d..cc5f2d50 100644 --- a/backend/app/auth/direct_auth.py +++ b/backend/app/auth/direct_auth.py @@ -13,8 +13,15 @@ from app.core.config import settings -# Password hashing -pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto") +# Password hashing with bcrypt_sha256 to handle long passwords properly +# bcrypt_sha256 pre-hashes with SHA256 to work around bcrypt's 72-byte limitation +# We keep "bcrypt" in the list to verify existing hashes, but new hashes use bcrypt_sha256 +pwd_context = CryptContext( + schemes=["bcrypt_sha256", "bcrypt"], # bcrypt_sha256 for new, bcrypt for legacy + deprecated=["bcrypt"], # Mark plain bcrypt as deprecated (will auto-upgrade on verify) + bcrypt_sha256__default_rounds=12, + bcrypt__default_rounds=12, +) # Database connection parameters DB_USER = os.getenv("POSTGRES_USER", "postgres") diff --git a/backend/app/core/config.py b/backend/app/core/config.py index 59146e5b..2b134de1 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -172,8 +172,8 @@ def effective_batch_size(self) -> int: return 1 return int(self.BATCH_SIZE) - # Storage paths - DATA_DIR: Path = Path(os.getenv("DATA_DIR", "/mnt/nvm/repos/transcribe-app/data")) + # Storage paths (container paths, mounted from host via docker-compose volumes) + DATA_DIR: Path = Path(os.getenv("DATA_DIR", "/app/data")) UPLOAD_DIR: Path = DATA_DIR / "uploads" MODEL_BASE_DIR: Path = Path(os.getenv("MODELS_DIR", 
"/app/models")) diff --git a/backend/app/core/security.py b/backend/app/core/security.py index d3915b66..b95683ed 100644 --- a/backend/app/core/security.py +++ b/backend/app/core/security.py @@ -15,7 +15,15 @@ from app.core.config import settings from app.models.user import User -pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto") +# Configure password hashing with bcrypt_sha256 to handle long passwords properly +# bcrypt_sha256 pre-hashes with SHA256 to work around bcrypt's 72-byte limitation +# We keep "bcrypt" in the list to verify existing hashes, but new hashes use bcrypt_sha256 +pwd_context = CryptContext( + schemes=["bcrypt_sha256", "bcrypt"], # bcrypt_sha256 for new, bcrypt for legacy + deprecated=["bcrypt"], # Mark plain bcrypt as deprecated (will auto-upgrade on verify) + bcrypt_sha256__default_rounds=12, + bcrypt__default_rounds=12, +) def create_access_token(subject: Union[str, Any], expires_delta: Optional[timedelta] = None) -> str: diff --git a/backend/app/tasks/transcription/core.py b/backend/app/tasks/transcription/core.py index 0532369a..e9f75f69 100644 --- a/backend/app/tasks/transcription/core.py +++ b/backend/app/tasks/transcription/core.py @@ -376,18 +376,47 @@ def whisperx_progress_callback(progress, message): except Exception as e: logger.error(f"Error in WhisperX processing: {str(e)}") + + # Provide user-friendly error messages for common issues + error_message = str(e) + if "libcudnn" in error_message.lower(): + error_message = ( + "Audio processing failed due to a system library compatibility issue. " + "The transcription service requires updated dependencies. " + "Please contact support for assistance." + ) + elif "cuda" in error_message.lower() and "out of memory" in error_message.lower(): + error_message = ( + "GPU out of memory error. The audio file may be too large for available GPU resources. " + "Please try with a shorter audio file or contact support." 
+ ) + elif "cuda" in error_message.lower() or "gpu" in error_message.lower(): + error_message = ( + "GPU processing error occurred during transcription. " + "The system may need reconfiguration. " + "Please try again or contact support if the issue persists." + ) + elif "model" in error_message.lower() and ( + "download" in error_message.lower() or "load" in error_message.lower() + ): + error_message = ( + "Failed to download or load AI models. " + "Please check your internet connection and try again. " + "If the problem persists, contact support." + ) + with session_scope() as db: update_task_status( db, task_id, "failed", - error_message=f"Processing error: {str(e)}", + error_message=error_message, completed=True, ) update_media_file_status(db, file_id, FileStatus.ERROR) - send_error_notification(user_id, file_id, str(e)) - return {"status": "error", "message": str(e)} + send_error_notification(user_id, file_id, error_message) + return {"status": "error", "message": error_message} except Exception as e: logger.error(f"Error processing file {file_id}: {str(e)}") diff --git a/backend/app/tasks/transcription/whisperx_service.py b/backend/app/tasks/transcription/whisperx_service.py index 3c42f7de..518578f1 100644 --- a/backend/app/tasks/transcription/whisperx_service.py +++ b/backend/app/tasks/transcription/whisperx_service.py @@ -12,6 +12,18 @@ class WhisperXService: """Service for handling WhisperX transcription operations with cross-platform support.""" def __init__(self, model_name: str = None, models_dir: str = None): + # Add safe globals for PyTorch 2.6+ compatibility with PyAnnote + # PyTorch 2.6+ changed torch.load() default to weights_only=True + # PyAnnote models require ListConfig to be whitelisted + try: + import torch.serialization + from omegaconf.listconfig import ListConfig + + torch.serialization.add_safe_globals([ListConfig]) + logger.debug("Added safe globals for PyTorch 2.6+ compatibility") + except Exception as e: + logger.warning(f"Could not add 
safe globals for torch.load: {e}") + # Initialize hardware detection self.hardware_config = detect_hardware() @@ -210,6 +222,10 @@ def perform_speaker_diarization( Returns: Diarization result + + Raises: + RuntimeError: If cuDNN library compatibility issues occur + ImportError: If WhisperX is not installed """ try: import whisperx @@ -218,17 +234,39 @@ def perform_speaker_diarization( logger.info("Performing speaker diarization...") - diarize_params = {"max_speakers": max_speakers, "min_speakers": min_speakers} + try: + diarize_params = {"max_speakers": max_speakers, "min_speakers": min_speakers} - # Use PyAnnote-compatible device configuration - pyannote_config = self.hardware_config.get_pyannote_config() + # Use PyAnnote-compatible device configuration + pyannote_config = self.hardware_config.get_pyannote_config() - diarize_model = whisperx.diarize.DiarizationPipeline( - use_auth_token=hf_token, device=pyannote_config["device"] - ) + diarize_model = whisperx.diarize.DiarizationPipeline( + use_auth_token=hf_token, device=pyannote_config["device"] + ) + + diarize_segments = diarize_model(audio, **diarize_params) + return diarize_segments + + except Exception as e: + error_msg = str(e) + + # Detect cuDNN library compatibility issues + if "libcudnn" in error_msg.lower(): + raise RuntimeError( + "CUDA cuDNN library compatibility error detected. This indicates a version " + "mismatch between PyTorch and CTranslate2. The system requires all packages " + "to use cuDNN 9 for CUDA 12.8 compatibility. 
" + f"Technical details: {error_msg}" + ) from e + + # Detect general CUDA errors + if "cuda" in error_msg.lower() or "gpu" in error_msg.lower(): + raise RuntimeError( + f"GPU processing error during speaker diarization: {error_msg}" + ) from e - diarize_segments = diarize_model(audio, **diarize_params) - return diarize_segments + # Re-raise other exceptions + raise def assign_speakers_to_words( self, diarize_segments, aligned_result: dict[str, Any] diff --git a/backend/requirements-dev.txt b/backend/requirements-dev.txt new file mode 100644 index 00000000..1d17aa00 --- /dev/null +++ b/backend/requirements-dev.txt @@ -0,0 +1,27 @@ +# Development Dependencies for OpenTranscribe Backend +# Install with: pip install -r requirements.txt -r requirements-dev.txt + +# Production dependencies +-r requirements.txt + +# Code Quality & Formatting +black>=25.1.0 +ruff>=0.12.0 +mypy>=1.17.0 +mypy-extensions>=1.1.0 + +# Testing +pytest>=8.4.0 +pytest-asyncio>=0.23.0 # For testing async code +pytest-cov>=4.1.0 # Code coverage reports + +# Git Hooks & Pre-commit +pre-commit>=4.3.0 + +# Type Stubs for Better Type Checking +types-redis>=4.6.0 +types-passlib>=1.7.0 + +# Development Utilities +ipython>=8.12.0 # Enhanced Python shell +ipdb>=0.13.0 # IPython debugger diff --git a/backend/requirements.txt b/backend/requirements.txt index 8fd3e054..d42ec54a 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -17,20 +17,32 @@ flower>=2.0.0 psycopg2-binary>=2.9.7 minio>=7.1.17 opensearch-py>=2.3.1 -numpy>=1.25.2 httpx>=0.24.1 python-dotenv>=1.0.0 pytest>=7.4.2 -whisperx==3.4.3 -ctranslate2==4.4.0 + +# AI/ML Stack - All cuDNN 9 compatible for CUDA 12.8 +# NumPy 2.x - fully compatible with PyTorch 2.8+, CTranslate2 4.6+, PyAnnote 3.4+ +numpy>=1.25.2 + +# PyTorch with CUDA 12.8 support (CVE-2025-32434 fixed in 2.6.0+) +--extra-index-url https://download.pytorch.org/whl/cu128 +torch==2.8.0+cu128 +torchaudio==2.8.0+cu128 + +# WhisperX latest version with ctranslate2 4.5+ 
support +whisperx==3.7.0 + +# CTranslate2 with cuDNN 9 support (required for CUDA 12.8) +ctranslate2>=4.6.0 + +# PyAnnote (compatible with NumPy 2.x and PyTorch 2.6+) pyannote.audio>=3.3.2 + +# Supporting libraries omegaconf>=2.3.0 ffmpeg-python>=0.2.0 sentencepiece>=0.1.99 -transformers>=4.34.0 -# PyTorch dependencies - installed per platform in Dockerfile -# torch>=2.1.0 -# torchaudio>=2.1.0 psutil>=5.9.5 pyexiftool>=0.5.0 -yt-dlp>=2023.11.16 \ No newline at end of file +yt-dlp>=2023.11.16 diff --git a/docker-compose.yml b/docker-compose.yml index bc30b808..829a4eb6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -75,9 +75,9 @@ services: # Development services that mirror production setup backend: - build: + build: context: ./backend - dockerfile: Dockerfile.dev + dockerfile: Dockerfile.prod restart: always volumes: - ./backend:/app @@ -147,9 +147,9 @@ services: command: uvicorn app.main:app --host 0.0.0.0 --port 8080 --reload celery-worker: - build: + build: context: ./backend - dockerfile: Dockerfile.dev + dockerfile: Dockerfile.prod restart: always command: celery -A app.core.celery worker --loglevel=info -Q gpu,nlp,utility,celery --concurrency=1 runtime: nvidia @@ -249,9 +249,9 @@ services: condition: service_healthy flower: - build: + build: context: ./backend - dockerfile: Dockerfile.dev + dockerfile: Dockerfile.prod restart: always command: > python -m celery -A app.core.celery flower diff --git a/docs/SECURITY_SCANNING.md b/docs/SECURITY_SCANNING.md new file mode 100644 index 00000000..9b8819bb --- /dev/null +++ b/docs/SECURITY_SCANNING.md @@ -0,0 +1,532 @@ +# Security Scanning Guide + +This guide explains how to use the free, open-source security scanning tools integrated into OpenTranscribe for local development and CI/CD pipelines. 
+ +## Table of Contents + +- [Overview](#overview) +- [Tools Used](#tools-used) +- [Local Scanning](#local-scanning) +- [Pre-commit Hooks](#pre-commit-hooks) +- [CI/CD Integration](#cicd-integration) +- [Understanding Reports](#understanding-reports) +- [Best Practices](#best-practices) +- [Troubleshooting](#troubleshooting) + +## Overview + +OpenTranscribe uses a comprehensive security scanning approach with **free, open-source tools** that run locally—no Docker Hub subscription or paid services required. + +### Key Benefits + +- ✅ **100% Free**: All tools are open-source and free to use +- ✅ **No Cloud Required**: Scan images locally on your machine +- ✅ **Multi-layered**: Combines linting, best practices, and vulnerability scanning +- ✅ **SBOM Generation**: Create Software Bill of Materials for compliance +- ✅ **CI/CD Ready**: Automated scanning in GitHub Actions +- ✅ **Fast**: Smart caching and SBOM-based scanning + +## Tools Used + +### 1. Hadolint - Dockerfile Linter +**Purpose**: Lint Dockerfiles for best practices and common mistakes + +```bash +# Install +brew install hadolint # macOS +# Or download binary for Linux + +# Scan +hadolint backend/Dockerfile.prod +``` + +**What it catches**: Deprecated instructions, inefficient layer caching, security issues in Dockerfile syntax + +### 2. Dockle - Container Image Best Practices +**Purpose**: Check container images against CIS Docker Benchmarks + +```bash +# Run via Docker (no installation needed) +docker run --rm -v /var/run/docker.sock:/var/run/docker.sock \ + goodwithtech/dockle:latest your-image:tag +``` + +**What it catches**: Missing security labels, exposed secrets, improper permissions, CIS violations + +### 3. 
Trivy - Comprehensive Vulnerability Scanner +**Purpose**: Scan for CVEs in OS packages, language dependencies, and configurations + +```bash +# Install +brew install trivy # macOS +curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sh # Linux + +# Scan +trivy image your-image:tag +``` + +**What it catches**: Known vulnerabilities (CVEs), misconfigurations, exposed secrets, license issues + +### 4. Grype - Fast Vulnerability Scanner +**Purpose**: High-performance vulnerability scanning with SBOM support + +```bash +# Install +curl -sSfL https://raw.githubusercontent.com/anchore/grype/main/install.sh | sh + +# Scan +grype your-image:tag +grype sbom:./sbom.json # Faster, scan from SBOM +``` + +**What it catches**: CVEs in packages and dependencies with detailed fix recommendations + +### 5. Syft - SBOM Generator +**Purpose**: Generate Software Bill of Materials for compliance and auditing + +```bash +# Install +curl -sSfL https://raw.githubusercontent.com/anchore/syft/main/install.sh | sh + +# Generate SBOM +syft your-image:tag -o cyclonedx-json > sbom.json +syft your-image:tag -o spdx-json > sbom-spdx.json +``` + +**What it provides**: Complete inventory of all software components, versions, licenses + +## Local Scanning + +### Quick Start + +1. **Install all tools at once**: + ```bash + ./scripts/security-scan.sh install + ``` + +2. **Scan both images**: + ```bash + ./scripts/security-scan.sh all + ``` + +3. 
**Scan specific component**: + ```bash + ./scripts/security-scan.sh backend + ./scripts/security-scan.sh frontend + ``` + +### Advanced Usage + +**Customize severity threshold**: +```bash +SEVERITY_THRESHOLD=HIGH ./scripts/security-scan.sh all +``` + +**Fail on critical vulnerabilities**: +```bash +FAIL_ON_CRITICAL=true ./scripts/security-scan.sh backend +``` + +**Custom output directory**: +```bash +OUTPUT_DIR=./my-reports ./scripts/security-scan.sh all +``` + +**Scan with Docker build integration**: +```bash +# Build and scan automatically +./scripts/docker-build-push.sh backend + +# Build without scanning (faster) +SKIP_SECURITY_SCAN=true ./scripts/docker-build-push.sh backend + +# Build and fail on security issues +FAIL_ON_SECURITY_ISSUES=true FAIL_ON_CRITICAL=true ./scripts/docker-build-push.sh all +``` + +### Typical Local Workflow + +```bash +# 1. Make changes to Dockerfile or code +vim backend/Dockerfile.prod + +# 2. Build image locally (single platform for speed) +cd backend +docker build -f Dockerfile.prod -t opentranscribe-backend:test . +cd .. + +# 3. Lint Dockerfile +hadolint backend/Dockerfile.prod + +# 4. Check best practices +docker run --rm -v /var/run/docker.sock:/var/run/docker.sock \ + goodwithtech/dockle:latest opentranscribe-backend:test + +# 5. Generate SBOM once +syft opentranscribe-backend:test -o cyclonedx-json > sbom.json + +# 6. 
Scan for vulnerabilities +trivy image opentranscribe-backend:test +grype sbom:sbom.json + +# Or use the all-in-one script +./scripts/security-scan.sh backend +``` + +## Pre-commit Hooks + +### Setup + +Install pre-commit hooks to automatically lint Dockerfiles before commits: + +```bash +# Install pre-commit +pip install pre-commit + +# Install git hooks +pre-commit install + +# Install commit message hook (optional) +pre-commit install --hook-type commit-msg +``` + +### What Gets Checked + +Every commit automatically runs: +- ✅ **Hadolint**: Lints all Dockerfile.* files +- ✅ **Gitleaks**: Scans for exposed secrets +- ✅ **Bandit**: Python security linting +- ✅ **Shellcheck**: Shell script validation +- ✅ **Ruff**: Python formatting and linting +- ✅ **Prettier**: Frontend code formatting +- ✅ **mypy**: Python type checking + +### Manual Runs + +```bash +# Run on staged files only +pre-commit run + +# Run on all files +pre-commit run --all-files + +# Run specific hook +pre-commit run hadolint-docker --all-files + +# Skip hooks temporarily (not recommended) +git commit --no-verify +``` + +### Configuration + +Edit `.pre-commit-config.yaml` to customize hooks: + +```yaml +repos: + - repo: https://github.com/hadolint/hadolint + rev: v2.12.0 + hooks: + - id: hadolint-docker + args: [--config, .hadolint.yaml] +``` + +Edit `.hadolint.yaml` to configure Dockerfile linting rules: + +```yaml +ignored: + - DL3008 # Pin versions in apt-get (can ignore for dev) + +override: + error: + - DL3002 # Never switch to root USER + - DL3020 # Use COPY instead of ADD +``` + +## CI/CD Integration + +### GitHub Actions Workflows + +Two workflows are included: + +1. **`security-scan.yml`** - Comprehensive image scanning +2. **`pre-commit.yml`** - Pre-commit hooks in CI + +### Security Scan Workflow + +**Triggers**: +- Push to main/master/develop branches +- Pull requests +- Weekly schedule (Sundays at 00:00 UTC) +- Manual workflow dispatch + +**What it does**: +1. 
Lints Dockerfiles with Hadolint +2. Builds both backend and frontend images +3. Scans with Trivy (uploads to GitHub Security tab) +4. Checks best practices with Dockle +5. Generates SBOMs with Syft +6. Scans SBOMs with Grype +7. Scans repository dependencies +8. Uploads artifacts and reports + +**Viewing Results**: +- GitHub Security tab shows Trivy findings +- Workflow logs show detailed scan results +- SBOM artifacts available for download (90-day retention) + +**Manual Trigger**: +```bash +# Via GitHub UI: Actions → Security Scanning → Run workflow + +# Via GitHub CLI +gh workflow run security-scan.yml -f component=backend +``` + +### Pre-commit Workflow + +Runs all pre-commit hooks on every push/PR: +- Ensures code quality standards +- Catches issues before merge +- Provides quick feedback loop + +### Badge (Optional) + +Add to README.md: + +```markdown +[![Security Scanning](https://github.com/yourusername/transcribe-app/actions/workflows/security-scan.yml/badge.svg)](https://github.com/yourusername/transcribe-app/actions/workflows/security-scan.yml) +``` + +## Understanding Reports + +### Report Files + +All reports are saved to `./security-reports/`: + +``` +security-reports/ +├── backend-hadolint.txt # Dockerfile linting results +├── backend-dockle.json # CIS best practices check +├── backend-sbom.json # SBOM (CycloneDX format) +├── backend-sbom.txt # SBOM (human-readable table) +├── backend-trivy.json # Trivy scan (JSON) +├── backend-trivy.txt # Trivy scan (table) +├── backend-grype.json # Grype scan (JSON) +├── backend-grype.txt # Grype scan (table) +└── frontend-* # Same for frontend +``` + +### Reading Trivy Reports + +```bash +# View summary +cat security-reports/backend-trivy.txt + +# Query JSON for specific severity +jq '.Results[].Vulnerabilities[] | select(.Severity == "CRITICAL")' \ + security-reports/backend-trivy.json + +# Count vulnerabilities by severity +jq '[.Results[].Vulnerabilities[] | .Severity] | group_by(.) 
| map({severity: .[0], count: length})' \ + security-reports/backend-trivy.json +``` + +### Reading Grype Reports + +```bash +# View summary +cat security-reports/backend-grype.txt + +# Query JSON for fixable vulnerabilities +jq '.matches[] | select(.vulnerability.fix.state == "fixed")' \ + security-reports/backend-grype.json + +# Group by package +jq '[.matches[] | {package: .artifact.name, cve: .vulnerability.id, severity: .vulnerability.severity}] | group_by(.package)' \ + security-reports/backend-grype.json +``` + +### Understanding Severity Levels + +- **CRITICAL**: Immediate action required (exploitable, high impact) +- **HIGH**: Should fix soon (significant risk) +- **MEDIUM**: Fix when feasible (moderate risk) +- **LOW**: Informational (minimal risk) + +### Common Vulnerability Types + +1. **OS Package CVEs**: Outdated system packages (fix: update base image) +2. **Language Dependencies**: Vulnerable npm/pip packages (fix: update requirements) +3. **Configuration Issues**: Misconfigurations (fix: update Dockerfile/config) +4. **Embedded Secrets**: Exposed keys/passwords (fix: use environment variables) + +## Best Practices + +### Development Workflow + +1. **Lint before build**: Run `hadolint` on Dockerfiles before building +2. **Build with security in mind**: Use minimal base images, multi-stage builds +3. **Scan early and often**: Run scans locally before pushing +4. **Generate SBOMs**: Keep SBOMs for compliance and faster re-scanning +5. 
**Fix high/critical first**: Prioritize by severity and exploitability + +### Dockerfile Security Tips + +```dockerfile +# ✅ GOOD: Minimal base, specific version +FROM python:3.11-slim-bookworm + +# ❌ BAD: Using 'latest' tag +FROM python:latest + +# ✅ GOOD: Multi-stage build to reduce attack surface +FROM node:20 AS builder +WORKDIR /app +COPY package*.json ./ +RUN npm ci --omit=dev + +FROM node:20-alpine +COPY --from=builder /app/node_modules ./node_modules + +# ✅ GOOD: Non-root user +RUN adduser --disabled-password --gecos '' appuser +USER appuser + +# ❌ BAD: Running as root (default) +# (no USER specified) + +# ✅ GOOD: Clean up in same layer +RUN apt-get update && \ + apt-get install -y --no-install-recommends pkg && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# ❌ BAD: Separate layers, cache gets huge +RUN apt-get update +RUN apt-get install -y pkg +RUN apt-get clean +``` + +### Continuous Improvement + +- **Weekly scans**: Run scheduled scans to catch new CVEs +- **Update dependencies**: Regularly update base images and packages +- **Track trends**: Monitor vulnerability counts over time +- **Automate fixes**: Use Dependabot or Renovate for dependency updates +- **Document exceptions**: If you can't fix a vulnerability, document why + +### CI/CD Best Practices + +```bash +# For pull requests: Scan but don't fail +FAIL_ON_CRITICAL=false ./scripts/security-scan.sh all + +# For main branch: Fail on critical issues +FAIL_ON_CRITICAL=true FAIL_ON_SECURITY_ISSUES=true ./scripts/security-scan.sh all + +# For releases: Full scan with strict settings +SEVERITY_THRESHOLD=MEDIUM FAIL_ON_CRITICAL=true ./scripts/security-scan.sh all +``` + +## Troubleshooting + +### "Command not found" errors + +Install missing tools: +```bash +./scripts/security-scan.sh install +``` + +Or install individually: +```bash +brew install trivy hadolint +curl -sSfL https://raw.githubusercontent.com/anchore/grype/main/install.sh | sh +curl -sSfL 
https://raw.githubusercontent.com/anchore/syft/main/install.sh | sh +``` + +### "Image not found" errors + +Build or pull the image first: +```bash +docker build -f backend/Dockerfile.prod -t davidamacey/opentranscribe-backend:latest ./backend +# or +docker pull davidamacey/opentranscribe-backend:latest +``` + +### Slow scans + +Use SBOM-based scanning for speed: +```bash +# Generate SBOM once +syft your-image:tag -o cyclonedx-json > sbom.json + +# Scan from SBOM (much faster) +grype sbom:sbom.json +``` + +### False positives + +**Trivy**: Create `.trivyignore`: +``` +# Ignore specific CVE +CVE-2023-12345 + +# Ignore by package +pkg:pypi/package-name +``` + +**Grype**: Create `.grype.yaml`: +```yaml +ignore: + - vulnerability: CVE-2023-12345 + reason: False positive - not applicable to our use case +``` + +### Database update failures + +Clear cache and update: +```bash +# Trivy +trivy image --clear-cache +trivy image --download-db-only + +# Grype +grype db update +``` + +### Pre-commit hook failures + +Skip temporarily (not recommended): +```bash +git commit --no-verify +``` + +Fix issues and commit again: +```bash +# Run specific hook to see details +pre-commit run hadolint-docker --all-files + +# Fix the issue +vim backend/Dockerfile.prod + +# Commit again +git add backend/Dockerfile.prod +git commit -m "fix: resolve Dockerfile linting issues" +``` + +## Additional Resources + +- [Trivy Documentation](https://aquasecurity.github.io/trivy/) +- [Grype Documentation](https://github.com/anchore/grype) +- [Syft Documentation](https://github.com/anchore/syft) +- [Hadolint Documentation](https://github.com/hadolint/hadolint) +- [Dockle Documentation](https://github.com/goodwithtech/dockle) +- [Docker Security Best Practices](https://docs.docker.com/develop/security-best-practices/) +- [CIS Docker Benchmark](https://www.cisecurity.org/benchmark/docker) + +## Support + +For issues or questions: +1. Check the [Troubleshooting](#troubleshooting) section +2. 
Review tool documentation (links above) +3. Open an issue on GitHub with scan reports attached diff --git a/frontend/Dockerfile.prod b/frontend/Dockerfile.prod index e57d0ef7..764d3429 100644 --- a/frontend/Dockerfile.prod +++ b/frontend/Dockerfile.prod @@ -29,7 +29,7 @@ RUN chmod -R 755 static/fonts RUN npm run build # Production stage -FROM nginx:stable-alpine +FROM nginx:alpine # Copy the built files from the build stage COPY --from=build /app/dist /usr/share/nginx/html diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..4d6e9c84 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,116 @@ +# OpenTranscribe Python Project Configuration + +[project] +name = "opentranscribe" +version = "2.0.0" +description = "AI-powered transcription application with speaker diarization and LLM features" +authors = [ + {name = "OpenTranscribe Contributors"} +] +requires-python = ">=3.11" + +[tool.black] +line-length = 100 +target-version = ['py311'] +include = '\.pyi?$' +extend-exclude = ''' +/( + # directories + \.eggs + | \.git + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | venv + | _build + | buck-out + | build + | dist + | node_modules +)/ +''' + +[tool.ruff] +line-length = 100 +target-version = "py311" + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "I", # isort + "B", # flake8-bugbear + "C4", # flake8-comprehensions + "UP", # pyupgrade + "ARG", # flake8-unused-arguments + "SIM", # flake8-simplify +] +ignore = [ + "E501", # Line too long - handled by black + "B008", # Do not perform function calls in argument defaults + "W191", # Indentation contains tabs + "B904", # Allow raising exceptions without from e, for HTTPException +] + +[tool.ruff.lint.isort] +known-first-party = ["app"] + +[tool.mypy] +python_version = "3.11" +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = false +disallow_incomplete_defs = false +check_untyped_defs = true +no_implicit_optional = 
true +warn_redundant_casts = true +warn_unused_ignores = false +warn_no_return = true +strict_equality = true +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = [ + "celery.*", + "whisperx.*", + "pyannote.*", + "minio.*", + "opensearchpy.*", +] +ignore_missing_imports = true + +[tool.bandit] +exclude_dirs = ["tests", "venv", ".venv", "node_modules"] +skips = ["B101", "B601"] # Skip assert_used and paramiko_calls + +[tool.bandit.assert_used] +skips = ["*_test.py", "test_*.py"] + +[tool.pytest.ini_options] +testpaths = ["backend/tests"] +python_files = ["test_*.py", "*_test.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] +addopts = "-v --tb=short --strict-markers" +markers = [ + "slow: marks tests as slow (deselect with '-m \"not slow\"')", + "integration: marks tests as integration tests", + "unit: marks tests as unit tests", +] + +[tool.coverage.run] +source = ["app"] +omit = [ + "*/tests/*", + "*/test_*.py", + "*/__pycache__/*", + "*/venv/*", + "*/.venv/*", +] + +[tool.coverage.report] +precision = 2 +show_missing = true +skip_covered = false diff --git a/scripts/build-offline-package.sh b/scripts/build-offline-package.sh index 69e0ac16..5d13a5b7 100755 --- a/scripts/build-offline-package.sh +++ b/scripts/build-offline-package.sh @@ -13,6 +13,11 @@ BLUE='\033[0;34m' CYAN='\033[0;36m' NC='\033[0m' # No Color +# Load .env file if it exists and HUGGINGFACE_TOKEN is not already set +if [ -z "$HUGGINGFACE_TOKEN" ] && [ -f .env ]; then + export HUGGINGFACE_TOKEN=$(grep "^HUGGINGFACE_TOKEN=" .env | cut -d'=' -f2) +fi + # Configuration VERSION="${1:-$(git rev-parse --short HEAD)}" PACKAGE_NAME="opentranscribe-offline-v${VERSION}" @@ -307,8 +312,8 @@ copy_configuration() { # INSTALLATION SCRIPTS ####################### -create_installation_scripts() { - print_header "Creating Installation Scripts" +copy_installation_scripts() { + print_header "Copying Installation Scripts" # Copy installation script print_info "Copying install.sh..." 
@@ -327,8 +332,8 @@ create_installation_scripts() { # DOCUMENTATION ####################### -create_documentation() { - print_header "Creating Documentation" +copy_documentation() { + print_header "Copying Documentation" print_info "Copying README-OFFLINE.md..." cp README-OFFLINE.md "${PACKAGE_DIR}/" @@ -417,21 +422,14 @@ main() { print_warning "This process will take 1-2 hours and requires internet access" echo - read -p "Continue? (y/N) " -n 1 -r - echo - if [[ ! $REPLY =~ ^[Yy]$ ]]; then - print_info "Aborted by user" - exit 0 - fi - # Execute build steps preflight_checks setup_directories pull_and_save_images download_models copy_configuration - create_installation_scripts - create_documentation + copy_installation_scripts + copy_documentation finalize_package compress_package diff --git a/scripts/docker-build-push.sh b/scripts/docker-build-push.sh index 15ee4fb7..7283dffc 100755 --- a/scripts/docker-build-push.sh +++ b/scripts/docker-build-push.sh @@ -79,6 +79,36 @@ detect_changes() { fi } +# Function to run security scan if enabled +run_security_scan() { + local component=$1 + + if [ "${SKIP_SECURITY_SCAN}" = "true" ]; then + print_warning "Security scanning skipped (SKIP_SECURITY_SCAN=true)" + return 0 + fi + + if [ ! -f "./scripts/security-scan.sh" ]; then + print_warning "Security scan script not found, skipping..." + return 0 + fi + + print_info "Running security scan for ${component}..." 
+ if OUTPUT_DIR="./security-reports" FAIL_ON_CRITICAL="${FAIL_ON_CRITICAL:-false}" ./scripts/security-scan.sh "${component}"; then + print_success "Security scan passed for ${component}" + return 0 + else + print_warning "Security scan found issues for ${component}" + if [ "${FAIL_ON_SECURITY_ISSUES}" = "true" ]; then + print_error "Failing build due to security issues (FAIL_ON_SECURITY_ISSUES=true)" + return 1 + else + print_warning "Continuing despite security issues (set FAIL_ON_SECURITY_ISSUES=true to fail)" + return 0 + fi + fi +} + # Function to build and push backend build_backend() { print_info "Building backend image..." @@ -100,6 +130,9 @@ build_backend() { print_success "Backend image built and pushed successfully" print_info "Tags: ${REPO_BACKEND}:latest, ${REPO_BACKEND}:${COMMIT_SHA}" + + # Run security scan after build + run_security_scan "backend" } # Function to build and push frontend @@ -123,6 +156,9 @@ build_frontend() { print_success "Frontend image built and pushed successfully" print_info "Tags: ${REPO_FRONTEND}:latest, ${REPO_FRONTEND}:${COMMIT_SHA}" + + # Run security scan after build + run_security_scan "frontend" } # Function to show usage @@ -140,17 +176,36 @@ Options: help Show this help message Environment Variables: - DOCKERHUB_USERNAME Docker Hub username (default: davidamacey) - PLATFORMS Target platforms (default: linux/amd64,linux/arm64) + DOCKERHUB_USERNAME Docker Hub username (default: davidamacey) + PLATFORMS Target platforms (default: linux/amd64,linux/arm64) + SKIP_SECURITY_SCAN Skip security scanning (default: false) + FAIL_ON_SECURITY_ISSUES Fail build if security issues found (default: false) + FAIL_ON_CRITICAL Fail scan if CRITICAL vulnerabilities found (default: false) Examples: - $0 # Build and push both images + $0 # Build and push both images with security scanning $0 backend # Build and push only backend $0 auto # Auto-detect and build changed components # Build only for current platform (faster) PLATFORMS=linux/amd64 
$0 backend + # Skip security scanning for faster builds + SKIP_SECURITY_SCAN=true $0 all + + # Fail build if security issues found (recommended for CI) + FAIL_ON_SECURITY_ISSUES=true FAIL_ON_CRITICAL=true $0 all + +Security Scanning: + After building, images are automatically scanned with: + - Hadolint: Dockerfile linting + - Dockle: CIS best practices + - Trivy: Vulnerability scanning + - Grype: Additional vulnerability scanning + - Syft: SBOM generation + + Reports are saved to ./security-reports/ + EOF } diff --git a/scripts/security-scan.sh b/scripts/security-scan.sh new file mode 100755 index 00000000..cb8c1f96 --- /dev/null +++ b/scripts/security-scan.sh @@ -0,0 +1,530 @@ +#!/bin/bash +set -e + +# Security Scanning Script for OpenTranscribe Docker Images +# Uses free, open-source tools to scan for vulnerabilities and security issues +# No Docker Hub/Scout subscription required + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +CYAN='\033[0;36m' +NC='\033[0m' # No Color + +# Configuration +DOCKERHUB_USERNAME="${DOCKERHUB_USERNAME:-davidamacey}" +REPO_BACKEND="${DOCKERHUB_USERNAME}/opentranscribe-backend" +REPO_FRONTEND="${DOCKERHUB_USERNAME}/opentranscribe-frontend" +SCAN_TARGET="${1:-all}" +OUTPUT_DIR="${OUTPUT_DIR:-./security-reports}" +SEVERITY_THRESHOLD="${SEVERITY_THRESHOLD:-MEDIUM}" +FAIL_ON_CRITICAL="${FAIL_ON_CRITICAL:-true}" + +# Create output directory +mkdir -p "${OUTPUT_DIR}" + +# Function to print colored output +print_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +print_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +print_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +print_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +print_header() { + echo -e "${CYAN}========================================${NC}" + echo -e "${CYAN}$1${NC}" + echo -e "${CYAN}========================================${NC}" +} + +# Function to check if a command exists +command_exists() { + command -v "$1" 
>/dev/null 2>&1 +} + +# Function to install Trivy +install_trivy() { + if command_exists trivy; then + print_info "Trivy already installed: $(trivy --version | head -1)" + return 0 + fi + + print_warning "Trivy not found. Installing..." + + if [[ "$OSTYPE" == "darwin"* ]]; then + if command_exists brew; then + brew install trivy + else + print_error "Homebrew not found. Please install Trivy manually: https://aquasecurity.github.io/trivy/latest/getting-started/installation/" + return 1 + fi + else + curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sh -s -- -b /usr/local/bin + fi + + print_success "Trivy installed successfully" +} + +# Function to install Grype +install_grype() { + if command_exists grype; then + print_info "Grype already installed: $(grype version | head -1)" + return 0 + fi + + print_warning "Grype not found. Installing..." + curl -sSfL https://raw.githubusercontent.com/anchore/grype/main/install.sh | sh -s -- -b /usr/local/bin + print_success "Grype installed successfully" +} + +# Function to install Syft +install_syft() { + if command_exists syft; then + print_info "Syft already installed: $(syft version | head -1)" + return 0 + fi + + print_warning "Syft not found. Installing..." + curl -sSfL https://raw.githubusercontent.com/anchore/syft/main/install.sh | sh -s -- -b /usr/local/bin + print_success "Syft installed successfully" +} + +# Function to install Hadolint +install_hadolint() { + if command_exists hadolint; then + print_info "Hadolint already installed: $(hadolint --version)" + return 0 + fi + + print_warning "Hadolint not found. Installing..." + + if [[ "$OSTYPE" == "darwin"* ]]; then + if command_exists brew; then + brew install hadolint + else + print_error "Homebrew not found. 
Please install Hadolint manually: https://github.com/hadolint/hadolint"
            return 1
        fi
    else
        HADOLINT_VERSION=$(curl -s https://api.github.com/repos/hadolint/hadolint/releases/latest | grep '"tag_name":' | sed -E 's/.*"v([^"]+)".*/\1/')
        curl -sL -o /usr/local/bin/hadolint "https://github.com/hadolint/hadolint/releases/download/v${HADOLINT_VERSION}/hadolint-Linux-x86_64"
        chmod +x /usr/local/bin/hadolint
    fi

    print_success "Hadolint installed successfully"
}

# Function to check and install Dockle
# Dockle is never installed locally; it runs via its Docker image, so all we
# verify here is that Docker itself is available.
check_dockle() {
    if ! command_exists docker; then
        print_warning "Docker not found. Dockle requires Docker to run."
        return 1
    fi

    print_info "Dockle will run via Docker image (no installation needed)"
    return 0
}

# Function to lint Dockerfile with Hadolint
# Args: $1 = path to Dockerfile, $2 = component name (used in report filename)
# Returns 0 if the Dockerfile passes, 1 if Hadolint reports issues.
lint_dockerfile() {
    local dockerfile=$1
    local component=$2

    print_header "Linting Dockerfile: ${dockerfile}"

    local output_file="${OUTPUT_DIR}/${component}-hadolint.txt"

    # tee would mask hadolint's exit status ([ $? ] would test tee, not
    # hadolint) — capture the first pipeline element's status explicitly.
    hadolint "${dockerfile}" | tee "${output_file}"
    local hadolint_status=${PIPESTATUS[0]}

    if [ "${hadolint_status}" -eq 0 ]; then
        print_success "Dockerfile passed Hadolint checks"
        return 0
    else
        print_warning "Dockerfile has linting issues (see ${output_file})"
        return 1
    fi
}

# Function to run Dockle on image
# Args: $1 = image reference, $2 = component name (used in report filename)
# Runs Dockle via its Docker image; the JSON report is written through a bind
# mount (writing to a host path from inside the container silently discards
# the file otherwise).
run_dockle() {
    local image=$1
    local component=$2

    print_header "Running Dockle on ${image}"

    local output_file="${OUTPUT_DIR}/${component}-dockle.json"

    # The report directory must exist before we can bind-mount it.
    mkdir -p "${OUTPUT_DIR}"
    local report_dir
    report_dir=$(cd "${OUTPUT_DIR}" && pwd)

    # Run Dockle via Docker. The report path given to --output is a path
    # INSIDE the container, so OUTPUT_DIR is mounted at /report.
    if docker run --rm \
        -v /var/run/docker.sock:/var/run/docker.sock \
        -v "${report_dir}:/report" \
        goodwithtech/dockle:latest \
        --format json \
        --output "/report/${component}-dockle.json" \
        "${image}"; then
        print_success "Dockle scan completed (see ${output_file})"

        # Display human-readable summary on the console
        docker run --rm \
            -v /var/run/docker.sock:/var/run/docker.sock \
            goodwithtech/dockle:latest \
            "${image}"

        return 0
    else
        print_error "Dockle scan failed"
        return 1
    fi
}

# Function to generate SBOM with Syft
# Args: $1 = image reference, $2 = component name (used in report filenames)
# Prints the CycloneDX SBOM path on stdout (captured by the caller via command
# substitution), so ALL informational output must go to stderr here —
# otherwise it would corrupt the captured value.
generate_sbom() {
    local image=$1
    local component=$2

    print_header "Generating SBOM for ${image}" >&2

    local sbom_file="${OUTPUT_DIR}/${component}-sbom.json"

    syft "${image}" -o cyclonedx-json > "${sbom_file}"
    print_success "SBOM generated: ${sbom_file}" >&2

    # Also generate human-readable table format
    syft "${image}" -o table > "${OUTPUT_DIR}/${component}-sbom.txt"
    print_info "Human-readable SBOM: ${OUTPUT_DIR}/${component}-sbom.txt" >&2

    # stdout carries only the machine-readable SBOM path
    echo "${sbom_file}"
}

# Function to scan vulnerabilities with Trivy
# Args: $1 = image reference, $2 = component name (used in report filenames)
# Returns 1 when FAIL_ON_CRITICAL=true and CRITICAL findings exist.
scan_trivy() {
    local image=$1
    local component=$2

    print_header "Scanning ${image} with Trivy"

    local json_output="${OUTPUT_DIR}/${component}-trivy.json"
    local html_output="${OUTPUT_DIR}/${component}-trivy.html"
    local txt_output="${OUTPUT_DIR}/${component}-trivy.txt"

    # Run Trivy scan with multiple output formats
    trivy image \
        --severity "${SEVERITY_THRESHOLD},HIGH,CRITICAL" \
        --format json \
        --output "${json_output}" \
        "${image}"

    trivy image \
        --severity "${SEVERITY_THRESHOLD},HIGH,CRITICAL" \
        --format table \
        --output "${txt_output}" \
        "${image}"

    # Display summary
    print_info "Trivy scan results:"
    trivy image \
        --severity "${SEVERITY_THRESHOLD},HIGH,CRITICAL" \
        "${image}"

    print_success "Trivy reports generated:"
    print_info "  - JSON: ${json_output}"
    print_info "  - Text: ${txt_output}"

    # Check for CRITICAL vulnerabilities. Declare and assign separately so a
    # jq failure is not masked (SC2155), and default to 0 so the numeric
    # comparisons below never see an empty string.
    local critical_count high_count
    critical_count=$(jq '[.Results[]?.Vulnerabilities[]? | select(.Severity == "CRITICAL")] | length' "${json_output}" 2>/dev/null || echo 0)
    high_count=$(jq '[.Results[]?.Vulnerabilities[]? | select(.Severity == "HIGH")] | length' "${json_output}" 2>/dev/null || echo 0)

    print_info "Found ${critical_count} CRITICAL and ${high_count} HIGH severity vulnerabilities"

    if [ "${FAIL_ON_CRITICAL}" = "true" ] && [ "${critical_count}" -gt 0 ]; then
        print_error "CRITICAL vulnerabilities found - scan failed"
        return 1
    fi

    return 0
}

# Function to scan vulnerabilities with Grype
# Args: $1 = image reference, $2 = component name, $3 = optional SBOM path
# Scans from the SBOM when available (much faster); otherwise scans the image
# directly. Returns 1 when FAIL_ON_CRITICAL=true and Critical findings exist.
scan_grype() {
    local image=$1
    local component=$2
    local sbom_file=$3

    print_header "Scanning with Grype"

    local json_output="${OUTPUT_DIR}/${component}-grype.json"
    local txt_output="${OUTPUT_DIR}/${component}-grype.txt"

    # Scan from SBOM for speed
    if [ -n "${sbom_file}" ] && [ -f "${sbom_file}" ]; then
        print_info "Scanning from SBOM for faster results..."
        grype "sbom:${sbom_file}" \
            --output json \
            --file "${json_output}"

        grype "sbom:${sbom_file}" \
            --output table \
            | tee "${txt_output}"
    else
        # Scan image directly
        grype "${image}" \
            --output json \
            --file "${json_output}"

        grype "${image}" \
            --output table \
            | tee "${txt_output}"
    fi

    print_success "Grype reports generated:"
    print_info "  - JSON: ${json_output}"
    print_info "  - Text: ${txt_output}"

    # Check for CRITICAL vulnerabilities — same SC2155 / empty-value guard as
    # in scan_trivy. Note Grype capitalizes severities ("Critical"/"High").
    local critical_count high_count
    critical_count=$(jq '[.matches[]? | select(.vulnerability.severity == "Critical")] | length' "${json_output}" 2>/dev/null || echo 0)
    high_count=$(jq '[.matches[]? | select(.vulnerability.severity == "High")] | length' "${json_output}" 2>/dev/null || echo 0)

    print_info "Found ${critical_count} Critical and ${high_count} High severity vulnerabilities"

    if [ "${FAIL_ON_CRITICAL}" = "true" ] && [ "${critical_count}" -gt 0 ]; then
        print_error "CRITICAL vulnerabilities found - scan failed"
        return 1
    fi

    return 0
}

# Function to scan a component (backend or frontend)
# Args: $1 = "backend" or "frontend"
# Pipeline: Dockerfile lint -> Dockle (CIS) -> SBOM -> Trivy -> Grype.
# Returns nonzero if any stage fails; all stages run regardless.
scan_component() {
    local component=$1
    local dockerfile=""
    local image=""

    case "${component}" in
        backend)
            dockerfile="backend/Dockerfile.prod"
            image="${REPO_BACKEND}:latest"
            ;;
        frontend)
            dockerfile="frontend/Dockerfile.prod"
            image="${REPO_FRONTEND}:latest"
            ;;
        *)
            print_error "Invalid component: ${component}"
            return 1
            ;;
    esac

    print_header "Security Scanning: ${component}"
    print_info "Image: ${image}"
    print_info "Dockerfile: ${dockerfile}"
    echo ""

    local exit_code=0

    # Step 1: Lint Dockerfile
    if [ -f "${dockerfile}" ]; then
        lint_dockerfile "${dockerfile}" "${component}" || exit_code=$?
    else
        print_warning "Dockerfile not found: ${dockerfile}"
    fi

    echo ""

    # Check if image exists locally (build if needed or pull from registry)
    if ! docker image inspect "${image}" >/dev/null 2>&1; then
        print_warning "Image not found locally: ${image}"
        print_info "Attempting to pull from registry..."
        if ! docker pull "${image}"; then
            print_error "Failed to pull image. Please build it first."
            return 1
        fi
    fi

    # Step 2: Run Dockle for CIS best practices.
    # Written as explicit if/else rather than `a && b || c` to avoid the
    # precedence trap; an unavailable Dockle still counts as a failure,
    # matching the original semantics.
    if check_dockle; then
        run_dockle "${image}" "${component}" || exit_code=$?
    else
        exit_code=1
    fi
    echo ""

    # Step 3: Generate SBOM (generate_sbom prints only the SBOM path on stdout)
    local sbom_file
    sbom_file=$(generate_sbom "${image}" "${component}")
    echo ""

    # Step 4: Scan with Trivy
    scan_trivy "${image}" "${component}" || exit_code=$?
    echo ""

    # Step 5: Scan with Grype
    scan_grype "${image}" "${component}" "${sbom_file}" || exit_code=$?
    echo ""

    if [ ${exit_code} -eq 0 ]; then
        print_success "Security scan completed for ${component}"
    else
        print_error "Security scan failed for ${component}"
    fi

    return ${exit_code}
}

# Function to generate summary report
# Lists every report file produced under OUTPUT_DIR.
generate_summary() {
    print_header "Security Scan Summary"

    print_info "All reports saved to: ${OUTPUT_DIR}"
    echo ""

    print_info "Report files:"
    ls -lh "${OUTPUT_DIR}" | tail -n +2
    echo ""

    # Generate HTML summary if reports exist
    if [ -f "${OUTPUT_DIR}/backend-trivy.json" ] || [ -f "${OUTPUT_DIR}/frontend-trivy.json" ]; then
        print_info "To view detailed reports:"
        for file in "${OUTPUT_DIR}"/*.json; do
            [ -f "$file" ] && print_info "  - $(basename "${file}")"
        done
    fi
}

# Function to show usage
show_usage() {
    cat << EOF
Usage: $0 [OPTION]

Security scanning for OpenTranscribe Docker images using free, open-source tools

Tools used:
  - Hadolint: Dockerfile linter
  - Dockle: Container image CIS best practices checker
  - Syft: SBOM (Software Bill of Materials) generator
  - Trivy: Comprehensive vulnerability scanner
  - Grype: Fast vulnerability scanner

Options:
  backend     Scan only backend image
  frontend    Scan only frontend image
  all         Scan both images (default)
  install     Install all required tools
  help        Show this help message

Environment Variables:
  OUTPUT_DIR           Report output directory (default: ./security-reports)
  SEVERITY_THRESHOLD   Minimum severity to report (default: MEDIUM)
  FAIL_ON_CRITICAL     Fail if CRITICAL vulnerabilities found (default: true)
  DOCKERHUB_USERNAME   Docker Hub username (default: davidamacey)

Examples:
  $0                   # Scan both images
  $0 backend           # Scan only backend
  $0 install           # Install all required tools

  # Customize scanning
  OUTPUT_DIR=./reports SEVERITY_THRESHOLD=HIGH $0 all
  FAIL_ON_CRITICAL=false $0 backend

Reports:
  All reports are saved to \${OUTPUT_DIR}/ with multiple formats:
  - *-hadolint.txt: Dockerfile linting results
  - *-dockle.json: CIS best practices check
  - *-sbom.json: Software Bill of Materials (CycloneDX format)
  - *-trivy.json: Trivy vulnerability scan (JSON)
  - *-trivy.txt: Trivy vulnerability scan (human-readable)
  - *-grype.json: Grype vulnerability scan (JSON)
  - *-grype.txt: Grype vulnerability scan (human-readable)

EOF
}

# Function to install all tools
install_all_tools() {
    print_header "Installing Security Scanning Tools"

    install_trivy
    install_grype
    install_syft
    install_hadolint
    check_dockle

    print_success "All tools installed successfully!"
    echo ""
    print_info "Tool versions:"
    command_exists trivy && trivy --version | head -1
    command_exists grype && grype version | head -1
    command_exists syft && syft version | head -1
    command_exists hadolint && hadolint --version
    print_info "Dockle: runs via Docker image"
}

# Main function
# Dispatches on SCAN_TARGET (set from the command line before this point):
# install/help exit immediately; backend/frontend/all ensure tools are present,
# run the scans, print a summary, and exit nonzero on any failure.
main() {
    print_header "OpenTranscribe Security Scanner"
    print_info "Output directory: ${OUTPUT_DIR}"
    print_info "Severity threshold: ${SEVERITY_THRESHOLD}"
    print_info "Fail on critical: ${FAIL_ON_CRITICAL}"
    echo ""

    case "${SCAN_TARGET}" in
        install)
            install_all_tools
            exit 0
            ;;
        help|--help|-h)
            show_usage
            exit 0
            ;;
        backend|frontend)
            # Check required tools
            install_trivy
            install_grype
            install_syft
            install_hadolint
            check_dockle

            scan_component "${SCAN_TARGET}"
            exit_code=$?
            ;;
        all)
            # Check required tools
            install_trivy
            install_grype
            install_syft
            install_hadolint
            check_dockle

            scan_component "backend"
            backend_exit=$?

            scan_component "frontend"
            frontend_exit=$?

            exit_code=$((backend_exit + frontend_exit))
            ;;
        *)
            print_error "Invalid option: ${SCAN_TARGET}"
            show_usage
            exit 1
            ;;
    esac

    echo ""
    generate_summary

    if [ ${exit_code} -eq 0 ]; then
        print_success "All security scans passed!"
        exit 0
    else
        print_error "Security scans failed"
        exit 1
    fi
}

# Run main function
main