Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
scratch*

# Python
__pycache__/
*.py[cod]
Expand All @@ -14,6 +16,7 @@ build/
# Environment variables
.env
.env.local
.env.prod

# Logs
logs/
Expand Down
21 changes: 10 additions & 11 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ WORKDIR /app
# Install build dependencies in a single layer
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
vim \
gcc \
g++ \
git \
Expand All @@ -18,10 +19,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
# Copy only requirements first for better caching
COPY requirements.txt .

# Use pip cache and install in parallel
RUN --mount=type=cache,target=/root/.cache/pip \
python -m pip install --upgrade pip setuptools wheel && \
pip install -r requirements.txt --user --no-warn-script-location
# Install to explicit location (single layer; transitive deps are included,
# so the separate re-install pass below was redundant and has been merged)
RUN python -m pip install --no-cache-dir --prefix=/install --upgrade pip setuptools wheel && \
    pip install --no-cache-dir --prefix=/install -r requirements.txt

# ============================================
# Final stage - minimal runtime image
Expand All @@ -37,18 +39,15 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
&& rm -rf /var/lib/apt/lists/*

# Copy installed packages from builder
COPY --from=builder /root/.local /root/.local

ENV PATH=/root/.local/bin:$PATH
COPY --from=builder /install /usr/local

# Create necessary directories
RUN mkdir -p /app/data /app/logs /root/.cache/clip

# NOTE(review): dropped leftover debug step (`RUN ls`) — it only added a useless image layer

# Copy application code (do this last for better caching)
COPY . .

# Lightweight healthcheck
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
CMD python -c "import sys; sys.exit(0)" || exit 1

CMD ["python", "app.py"]
CMD ["python", "app.py"]
20 changes: 13 additions & 7 deletions Dockerfile.api
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.13-slim
FROM python:3.13-slim as builder

WORKDIR /app

Expand All @@ -7,9 +7,18 @@ RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf

# Copy requirements and install dependencies
COPY api/requirements.txt /app/api/requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python -m pip install --upgrade pip && \
pip install -r /app/api/requirements.txt
RUN python -m pip install --upgrade pip && \
    pip install --prefix=/install -r /app/api/requirements.txt --no-cache-dir

# ============================================
# Final stage - minimal runtime image
# ============================================
FROM python:3.13-slim

WORKDIR /app

# Copy installed packages from builder
COPY --from=builder /install /usr/local

# Copy application code
COPY api/ /app/api/
Expand All @@ -18,9 +27,6 @@ COPY utils/ /app/utils/
# Expose API port
EXPOSE 8000

# Healthcheck (check if uvicorn is responding)
HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
CMD curl -f http://localhost:8000/ || exit 1

# Run the API
CMD ["uvicorn", "api.api:app", "--host", "0.0.0.0", "--port", "8000"]
11 changes: 11 additions & 0 deletions api/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import aio_pika
import json
import uuid

from dotenv import load_dotenv
import os
Expand All @@ -26,13 +27,23 @@
)
logger = logging.getLogger(__name__)

load_dotenv()

# RabbitMQ Configuration
RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "localhost")
RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", "5672")
RABBITMQ_VHOST = os.getenv("RABBITMQ_VHOST", "/")
RABBITMQ_USER = os.getenv("RABBITMQ_USER", "admin")
RABBITMQ_PASS = os.getenv("RABBITMQ_PASS", "admin123")

#PRINT THE RABBITMQ CONFIG FOR DEBUGGING
# logger.info("###################")
# logger.info(f"RABBITMQ_HOST={RABBITMQ_HOST}")
# logger.info(f"RABBITMQ_PORT={RABBITMQ_PORT}")
# logger.info(f"RABBITMQ_VHOST={RABBITMQ_VHOST}")
# logger.info(f"RABBITMQ_USER={RABBITMQ_USER}")
# logger.info("###################")

SUBMISSION_QUEUE = os.getenv("SUBMISSION_QUEUE", "plagiarism_submissions")
FEEDBACK_QUEUE = os.getenv("FEEDBACK_QUEUE", "plagiarism_feedback")

Expand Down
4 changes: 4 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ def validate_configuration():
]

missing = [var for var in required_env_vars if not os.getenv(var)]
#print the required env vars and their values for debugging
# for var in required_env_vars:
# logger.info("###################")
# logger.info(f"{var}={os.getenv(var)}")
if missing:
logger.error(f"Missing required environment variables: {missing}")
raise ValueError(f"Missing required environment variables: {missing}")
Expand Down
2 changes: 1 addition & 1 deletion config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ class DetectionConfig(BaseSettings):

exact_dup_threshold: float = Field(default=0.95, env="EXACT_DUPLICATE_THRESHOLD")
near_dup_threshold: float = Field(default=0.90, env="NEAR_DUPLICATE_THRESHOLD")
semantic_threshold: float = Field(default=0.80, env="SEMANTIC_MATCH_THRESHOLD")
semantic_threshold: float = Field(default=0.70, env="SEMANTIC_MATCH_THRESHOLD")

# Hash matching thresholds (Hamming distance, 0-64 bits)
hash_threshold: int = Field(default=8, env="HASH_MATCH_THRESHOLD")
Expand Down
7 changes: 5 additions & 2 deletions database/db_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ async def init_pool(self):
db_name = os.getenv("POSTGRES_DB") or os.getenv("DB_NAME")
db_host = os.getenv("POSTGRES_HOST") or os.getenv("DB_HOST", "localhost")
db_port = int(os.getenv("POSTGRES_PORT") or os.getenv("DB_PORT", "5432"))
# db_port = 5435 # TEMP OVERRIDE FOR TESTING



if not all([db_user, db_password, db_name]):
raise ValueError("Missing required database environment variables")
Expand Down Expand Up @@ -511,10 +514,10 @@ async def fetch_reference_images_by_id(self, reference_id):
raise RuntimeError("Database pool not initialized")

try:
image_path = await self._fetch(
image_path = await self._fetchval(
"""
SELECT image_path
FROM reference_images where id = $1;
FROM reference_images where reference_id = $1;
""",
reference_id,
)
Expand Down
4 changes: 2 additions & 2 deletions database/init.sql
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,12 @@ USING hnsw (clip_embedding vector_ip_ops);
-- Reference images corpus
CREATE TABLE IF NOT EXISTS reference_images (
id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
reference_id VARCHAR(100) UNIQUE NOT NULL,
reference_id VARCHAR(200) UNIQUE NOT NULL, -- will be used as assignment id when fetching references from assignments
image_path TEXT NOT NULL,
phash VARCHAR(64) NOT NULL,
dhash VARCHAR(64) NOT NULL,
ahash VARCHAR(64) NOT NULL,
category VARCHAR(100),
category VARCHAR(200),
description TEXT,
source VARCHAR(200),
faiss_index_position INTEGER,
Expand Down
84 changes: 77 additions & 7 deletions docker-compose-prod.yml
Original file line number Diff line number Diff line change
@@ -1,39 +1,109 @@
version: '3.8'

services:
# ===================================
# POSTGRESQL - Database
# ===================================
postgres:
image: pgvector/pgvector:pg16
container_name: plg-postgres
ports:
- "5432:5432"
environment:
POSTGRES_DB: ${POSTGRES_DB}
POSTGRES_USER: ${POSTGRES_USER}
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
POSTGRES_INITDB_ARGS: "-E UTF8"
POSTGRES_MAX_CONNECTIONS: 20
PGDATA: /var/lib/postgresql/data/pgdata
volumes:
- postgres_data:/var/lib/postgresql/data
- ./database/init.sql:/docker-entrypoint-initdb.d/init.sql
healthcheck:
test: ["CMD-SHELL", "pg_isready -U $$POSTGRES_USER -d $$POSTGRES_DB"]
interval: 30s
timeout: 10s
retries: 3
start_period: 40s
deploy:
resources:
limits:
cpus: '0.5'
memory: 512M
reservations:
cpus: '0.25'
memory: 256M
networks:
- plg-network
restart: unless-stopped

# ===================================
# PLAGIARISM CHECKER SERVICE
# ===================================
plagiarism-checker:
build:
context: .
dockerfile: Dockerfile
container_name: mentorme-plagiarism-checker
container_name: plg-checker
env_file:
- .env
volumes:
- ./data:/app/data
- ./logs:/app/logs
depends_on:
rabbitmq:
postgres:
condition: service_healthy
deploy:
resources:
limits:
cpus: '8.0' # Increased from 4.0 - allows up to 8 CPU cores
memory: 8G
reservations:
cpus: '2' # Increased from 1 - guarantees 2 cores minimum
memory: 2G
restart: unless-stopped
networks:
- plg-network

# ===================================
# API SERVICE
# ===================================
api:
build:
context: .
dockerfile: Dockerfile.api
container_name: plg-api
env_file:
- .env
volumes:
- ./data:/app/data
- ./logs:/app/logs
depends_on:
postgres:
condition: service_healthy
deploy:
resources:
limits:
cpus: '1.0'
memory: 4G
cpus: '8.0' # Increased from 1.0 - allows up to 8 CPU cores
memory: 8G
reservations:
cpus: '0.5'
cpus: '2' # Increased from 0.5 - guarantees 2 cores minimum
memory: 2G
restart: unless-stopped
networks:
- mentorme-plagiarism-network
- plg-network


# ===================================
# VOLUMES
# ===================================
volumes:
postgres_data:
driver: local

# ===================================
# NETWORKS
# ===================================
networks:
mentorme-plagiarism-network:
plg-network:
driver: bridge
6 changes: 3 additions & 3 deletions docs/DOCUMENTATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -776,11 +776,11 @@ asyncio.run(test())
#### Build Docker Image
```bash
# Standard build (model downloaded on first run)
docker build -t mentorme-plagiarism:latest .
docker build -t plg:latest .

# With HuggingFace token for model prefetch during build (optional)
# This pre-downloads the CLIP model into the Docker image
docker build -t mentorme-plagiarism:latest \
docker build -t plg:latest \
--build-arg HUGGINGFACE_HUB_TOKEN=your_token_here .

# Note: HuggingFace token is optional - public models can be downloaded without authentication
Expand All @@ -804,7 +804,7 @@ docker-compose down
# docker-compose.yml snippet
services:
worker:
image: mentorme-plagiarism:latest
image: plg:latest
environment:
- POSTGRES_HOST=postgres
- RABBITMQ_HOST=rabbitmq
Expand Down
Loading