From 4baabf289f95beddd584ff26e7acd4e548c3fbed Mon Sep 17 00:00:00 2001 From: TAP Date: Wed, 17 Dec 2025 22:42:41 +0530 Subject: [PATCH 1/7] feedback payload fix --- .env.prod | 119 ++++ api/api.py | 11 + app.py | 4 + database/db_manager.py | 11 + mq/rmq_client.py | 41 +- plag_checker/submissions_checker.py | 7 + requirements.txt | 6 +- start-dev-env.README.md | 698 ++++++++++---------- start-dev-env.sh | 944 ++++++++++++++-------------- 9 files changed, 1009 insertions(+), 832 deletions(-) create mode 100644 .env.prod mode change 100644 => 100755 start-dev-env.sh diff --git a/.env.prod b/.env.prod new file mode 100644 index 0000000..101b48d --- /dev/null +++ b/.env.prod @@ -0,0 +1,119 @@ +# RABBITMQ CONFIGURATION +RABBITMQ_HOST=armadillo.rmq.cloudamqp.com +RABBITMQ_PORT=5672 +RABBITMQ_USER=fzdqidte +RABBITMQ_PASS=0SMrDogBVcWUcu9brWwp2QhET_kArl59 +RABBITMQ_VHOST=fzdqidte +RABBITMQ_MANAGEMENT_PORT=15672 +RABBITMQ_PREFETCH_COUNT=1 + +# Message retry configuration +# Maximum number of retries before sending to DLQ (prevents poison messages) +MAX_RETRIES=3 + +# Queue Names +SUBMISSION_QUEUE=plagiarism_submissions +FEEDBACK_QUEUE=plagiarism_feedback +# Dead Letter Queue (optional - leave empty to disable) +DEAD_LETTER_QUEUE=plagiarism_failed_submissions + +# POSTGRESQL CONFIGURATION +POSTGRES_HOST=db.example.com +POSTGRES_PORT=5432 +POSTGRES_DB=plagiarism_db +POSTGRES_USER=postgres +POSTGRES_PASSWORD=postgres + +# PGADMIN CONFIGURATION (Optional - for development only) +PGADMIN_EMAIL=admin@admin.com +PGADMIN_PASSWORD=admin123 + +# Connection Pool +POSTGRES_POOL_SIZE=10 +POSTGRES_MAX_OVERFLOW=20 + +# PLAGIARISM DETECTION THRESHOLDS +EXACT_DUPLICATE_THRESHOLD=0.95 +NEAR_DUPLICATE_THRESHOLD=0.90 +SEMANTIC_MATCH_THRESHOLD=0.80 + +# ==== PRODUCTION: Uncomment below for 7-day window ==== +RESUBMISSION_WINDOW_DAYS=14 + +# Hash comparison threshold (Hamming distance) +HASH_MATCH_THRESHOLD=10 + +# IMAGE PROCESSING +# Maximum image size in MB +MAX_IMAGE_SIZE_MB=10 + +# Image 
download timeout in seconds +IMAGE_DOWNLOAD_TIMEOUT=30 + +# Image validation thresholds +# Min variance to detect blank images (lower = more strict) +IMAGE_MIN_VARIANCE=5.0 +# Min unique colors required +IMAGE_MIN_UNIQUE_COLORS=10 +# Max ratio of dominant color (higher = more permissive) +IMAGE_MAX_SOLID_COLOR_RATIO=0.95 + +# CLIP Model Configuration +CLIP_MODEL=ViT-L/14 +CLIP_DEVICE=cpu +CLIP_PRETRAINED=laion2B-s32B-b82K + +# Local Model Path (Optional - use pre-downloaded models) +# If set, the system will load the model from this path instead of downloading from HuggingFace +# Example: CLIP_LOCAL_MODEL_PATH=./models/clip/open_clip_pytorch_model.bin +# Download models from: https://huggingface.co/laion/CLIP-ViT-L-14-laion2B-s32B-b82K +CLIP_LOCAL_MODEL_PATH=/app/models/clip/open_clip_pytorch_model.bin + +# Disable SSL verification for HuggingFace downloads (for corporate proxy/self-signed certs) +# Set to "true" only if you encounter SSL certificate errors +DISABLE_SSL_VERIFY=true +PYTHONHTTPSVERIFY=0 + +# VECTOR SEARCH CONFIGURATION +# Use pgvector (PostgreSQL) or FAISS for vector similarity search +USE_PGVECTOR=true + +# FAISS Configuration +FAISS_INDEX_PATH=/app/data/faiss_index.bin +FAISS_METADATA_PATH=/app/data/faiss_metadata.json +FAISS_DIMENSION=768 +FAISS_TOP_K=4 # Number of top candidates to retrieve from FAISS search + +# STORAGE PATHS +# Reference images directory +REFERENCE_IMAGES_DIR=./data/reference_images + +# Temporary storage for downloaded submissions +TEMP_IMAGES_DIR=./data/temp_images + +# Logs directory +#LOGS_DIR=./logs + +# APPLICATION SETTINGS +LOG_LEVEL=INFO + +# Worker concurrency (number of threads) +#WORKER_THREADS=4 + +# Enable performance metrics(ignore) +#ENABLE_METRICS=true + +# DEVELOPMENT SETTINGS +# Set to "development" or "production" +#ENVIRONMENT=development + +#DEBUG=true + + + +# Mock Glific API (for testing without WhatsApp) +#Used to skip WhatsApp delivery in testing mode +MOCK_GLIFIC=true +# ==== TESTING: 2-minute 
resubmission window (comment out for production) ==== +RESUBMISSION_WINDOW_MINUTES=2 + diff --git a/api/api.py b/api/api.py index 0d42d81..e8bc7ff 100644 --- a/api/api.py +++ b/api/api.py @@ -11,6 +11,7 @@ import aio_pika import json import uuid +from dotenv import load_dotenv from dotenv import load_dotenv import os @@ -26,6 +27,8 @@ ) logger = logging.getLogger(__name__) +load_dotenv() + # RabbitMQ Configuration RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "localhost") RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", "5672") @@ -33,6 +36,14 @@ RABBITMQ_USER = os.getenv("RABBITMQ_USER", "admin") RABBITMQ_PASS = os.getenv("RABBITMQ_PASS", "admin123") +#PRINT THE RABBITMQ CONFIG FOR DEBUGGING +# logger.info("###################") +# logger.info(f"RABBITMQ_HOST={RABBITMQ_HOST}") +# logger.info(f"RABBITMQ_PORT={RABBITMQ_PORT}") +# logger.info(f"RABBITMQ_VHOST={RABBITMQ_VHOST}") +# logger.info(f"RABBITMQ_USER={RABBITMQ_USER}") +# logger.info("###################") + SUBMISSION_QUEUE = os.getenv("SUBMISSION_QUEUE", "plagiarism_submissions") FEEDBACK_QUEUE = os.getenv("FEEDBACK_QUEUE", "plagiarism_feedback") diff --git a/app.py b/app.py index b54e76d..344e037 100644 --- a/app.py +++ b/app.py @@ -30,6 +30,10 @@ def validate_configuration(): ] missing = [var for var in required_env_vars if not os.getenv(var)] + #print the required env vars and their values for debugging + # for var in required_env_vars: + # logger.info("###################") + # logger.info(f"{var}={os.getenv(var)}") if missing: logger.error(f"Missing required environment variables: {missing}") raise ValueError(f"Missing required environment variables: {missing}") diff --git a/database/db_manager.py b/database/db_manager.py index 9263f3d..6db7936 100644 --- a/database/db_manager.py +++ b/database/db_manager.py @@ -67,6 +67,17 @@ async def init_pool(self): db_name = os.getenv("POSTGRES_DB") or os.getenv("DB_NAME") db_host = os.getenv("POSTGRES_HOST") or os.getenv("DB_HOST", "localhost") db_port = 
int(os.getenv("POSTGRES_PORT") or os.getenv("DB_PORT", "5432")) + # db_port = 5435 # TEMP OVERRIDE FOR TESTING + + #print the db connection details for debugging + # logger.info("###################") + # logger.info(f"DB Host: {db_host}") + # logger.info(f"DB Port: {db_port}") + # logger.info(f"DB Name: {db_name}") + # logger.info(f"DB User: {db_user}") + # logger.info(f"DB db_password: {db_password}") + # logger.info("###################") + if not all([db_user, db_password, db_name]): raise ValueError("Missing required database environment variables") diff --git a/mq/rmq_client.py b/mq/rmq_client.py index f99877f..6571c0b 100644 --- a/mq/rmq_client.py +++ b/mq/rmq_client.py @@ -103,17 +103,38 @@ async def connect(self): ) logger.info(f"Dead Letter Queue declared: {self.DEAD_LETTER_QUEUE}") - # Declare main submission queue - self.submission_queue = await self.channel.declare_queue( - self.SUBMISSION_QUEUE, durable=True - ) - logger.info(f"Submission queue declared: {self.SUBMISSION_QUEUE}") + try: + # First try passive declaration to check if queue exists + self.submission_queue = await self.channel.declare_queue( + self.SUBMISSION_QUEUE, + durable=True, + passive=True # Only check, don't create + ) + logger.info(f"Submission queue already exists: {self.SUBMISSION_QUEUE}") + except Exception: + # Queue doesn't exist, create it + self.submission_queue = await self.channel.declare_queue( + self.SUBMISSION_QUEUE, + durable=True + ) + logger.info(f"Submission queue created: {self.SUBMISSION_QUEUE}") + + try: + # First try passive declaration to check if queue exists + self.feedback_queue = await self.channel.declare_queue( + self.FEEDBACK_QUEUE, + durable=True, + passive=True # Only check, don't create + ) + logger.info(f"Feedback queue already exists: {self.FEEDBACK_QUEUE}") + except Exception: + # Queue doesn't exist, create it + self.feedback_queue = await self.channel.declare_queue( + self.FEEDBACK_QUEUE, + durable=True + ) + logger.info(f"Feedback queue 
created: {self.FEEDBACK_QUEUE}") - # Declare feedback queue for publishing results - self.feedback_queue = await self.channel.declare_queue( - self.FEEDBACK_QUEUE, durable=True - ) - logger.info(f"Feedback queue declared: {self.FEEDBACK_QUEUE}") logger.info( f"Connected to RabbitMQ with prefetch_count={self.PREFETCH_COUNT}, all queues declared" diff --git a/plag_checker/submissions_checker.py b/plag_checker/submissions_checker.py index 0787003..4fb46a8 100644 --- a/plag_checker/submissions_checker.py +++ b/plag_checker/submissions_checker.py @@ -119,6 +119,8 @@ async def initialize(self): if self.image_worker is None: raise RuntimeError("ImageWorker failed to initialize") + logger.info("Submission Checker initialized successfully") + await self.start_consumer() async def process_submission(self, submission): @@ -282,6 +284,11 @@ async def process_submission(self, submission): data["similarity_score"] = result_text.get("similarity_score") data["is_plagiarized"] = result_text.get("is_plagiarized") data["match_type"] = result_text.get("match_type") + data["assignment_id"] = data.pop("assign_id") + data["is_ai_generated"] = result_text.get("is_ai_generated", False) + data["ai_detection_source"] = result_text.get("ai_detection_source", "") + data["ai_confidence"] = result_text.get("ai_confidence", 0.0) + data["plagiarism_source"] = result_text.get("plagiarism_source", "") publish_data = {k: v for k, v in data.items() if k != "db_record_id"} diff --git a/requirements.txt b/requirements.txt index c2db3ae..37d4505 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,4 +28,8 @@ numpy==2.3.4 # Utilities tqdm==4.67.1 -podman-compose==1.5.0 \ No newline at end of file +podman-compose==1.5.0 + +# Web Framework - FastAPI +fastapi==0.115.0 +uvicorn[standard]==0.34.0 \ No newline at end of file diff --git a/start-dev-env.README.md b/start-dev-env.README.md index c49ad6d..a13e591 100644 --- a/start-dev-env.README.md +++ b/start-dev-env.README.md @@ -1,349 +1,349 @@ -# Local 
Development Setup Scripts - Quick Guide - -## Overview -Automated scripts to set up local development environment for MentorMe Plagiarism Detection System. - ---- - -## Files -- `start-dev-env.sh` - Bash script (Linux/macOS/Git Bash) -- `start-dev-env.ps1` - PowerShell script (Windows) - ---- - -## Quick Start - -### **Option 1: Complete Setup (Recommended)** -Everything in one command - infrastructure + Python environment + dependencies: - -```bash -# Linux/macOS/Git Bash -chmod +x start-dev-env.sh -./start-dev-env.sh --full-setup - -# Windows PowerShell -.\start-dev-env.ps1 --full-setup -``` - -**What it does:** -- Creates PostgreSQL container (port 5432) -- Creates RabbitMQ container (ports 5672, 15672) -- Initializes database schema -- Creates `.env` configuration file -- Creates Python virtual environment -- Installs all dependencies (~5-10 minutes) -- Downloads CLIP model from HuggingFace (~3.5GB) -- Verifies setup - -**Time:** ~10-15 minutes (first run) - ---- - -### **Option 2: Infrastructure Only** (Default) -Just containers + database, manual Python setup: - -```bash -# Linux/macOS/Git Bash -./start-dev-env.sh - -# Windows PowerShell -.\start-dev-env.ps1 -``` - -**What it does:** -- Creates PostgreSQL + RabbitMQ containers -- Initializes database -- Creates `.env` file - -**Then manually:** -```bash -python -m venv venv -source venv/bin/activate # Linux/macOS -# OR -.\venv\Scripts\Activate.ps1 # Windows - -pip install -r requirements.txt -``` - ---- - -### **Option 3: With Wheelhouse** (Offline Installs) -Pre-compile dependencies for faster/offline installs: - -```bash -# Linux/macOS/Git Bash -./start-dev-env.sh --build-wheelhouse - -# Then install offline: -pip install --no-index --find-links=wheelhouse -r requirements.txt -``` - ---- - -## Available Flags - -| Flag | Description | -|------|-------------| -| `--full-setup` | Complete setup: infrastructure + Python + dependencies | -| `--build-wheelhouse` | Build wheelhouse for offline dependency 
installation | -| (no flags) | Infrastructure only (containers + database) | - ---- - -## What Gets Created - -### **Infrastructure** -| Service | Port | Access | Credentials | -|---------|------|--------|-------------| -| PostgreSQL | 5432 | localhost:5432 | postgres/postgres | -| RabbitMQ (AMQP) | 5672 | localhost:5672 | admin/admin123 | -| RabbitMQ (Management UI) | 15672 | http://localhost:15672 | guest/guest | - -### **Files** -- `.env` - Environment configuration (from `.env.example`) -- `venv/` - Python virtual environment (if `--full-setup`) -- `data/` - Data directories (reference_images, temp_images) -- `models/` - Model cache directory -- `logs/` - Application logs directory - -### **Database** -- Database: `plagiarism_db` -- Tables: `submissions`, `reference_images`, `feedback_logs` -- Extension: pgvector -- Indexes: B-tree, HNSW vector indexes - ---- - -## After Setup - -### **Start the Application** - -**Terminal 1 - Worker:** -```bash -source venv/bin/activate # Linux/macOS -# OR -.\venv\Scripts\Activate.ps1 # Windows - -python app.py -``` - -**Terminal 2 - API Server:** -```bash -source venv/bin/activate - -uvicorn api:app --reload --host 0.0.0.0 --port 8000 -``` - -**Access API:** http://localhost:8000/docs - ---- - -## Prerequisites - -### **Required** -- **Podman** (or Docker) - Container runtime -- **Python 3.10+** - Application runtime -- **8GB+ RAM** - For CLIP model -- **10GB+ disk** - For dependencies and models - -### **Optional** -- **CUDA GPU** - For faster CLIP inference (10x speedup) -- **curl** - For health checks - ---- - -## Troubleshooting - -### **"Podman not found"** -```bash -# Install Podman: https://podman.io/getting-started/installation -``` - -### **"Python not found"** -```bash -# Install Python 3.10+: https://www.python.org/downloads/ -``` - -### **"Port already in use"** -```bash -# Stop existing containers -podman stop mentorme-postgres mentorme-rabbitmq -podman rm mentorme-postgres mentorme-rabbitmq - -# Or change ports 
in script (POSTGRES_PORT, RABBITMQ_PORT) -``` - -### **"Database connection failed"** -```bash -# Check PostgreSQL is running -podman ps | grep mentorme-postgres - -# Check logs -podman logs mentorme-postgres - -# Restart container -podman restart mentorme-postgres -``` - -### **"RabbitMQ not ready"** -```bash -# Check RabbitMQ is running -podman ps | grep mentorme-rabbitmq - -# Access management UI -open http://localhost:15672 # guest/guest - -# Restart container -podman restart mentorme-rabbitmq -``` - ---- - -## Container Management - -### **View Logs** -```bash -podman logs mentorme-postgres -podman logs mentorme-rabbitmq -podman logs -f mentorme-postgres # Follow mode -``` - -### **Stop Containers** -```bash -podman stop mentorme-postgres mentorme-rabbitmq -``` - -### **Remove Containers** -```bash -podman rm mentorme-postgres mentorme-rabbitmq -``` - -### **Restart Containers** -```bash -podman restart mentorme-postgres mentorme-rabbitmq -``` - -### **Check Running Containers** -```bash -podman ps -``` - ---- - -## Configuration Override - -### **Edit `.env` After Creation** -Script creates `.env` from `.env.example` with localhost overrides. You can modify: - -```bash -# Example: Use different CLIP model -CLIP_MODEL=ViT-B/32 # Smaller, faster model (512D) - -# Example: Enable GPU -CLIP_DEVICE=cuda - -# Example: Enable pgvector instead of FAISS -USE_PGVECTOR=true -``` - -### **Environment Variables Priority** -1. System environment variables (highest) -2. `.env` file -3. 
`config.py` defaults (lowest) - ---- - -## What This Script Does NOT Do - - **Does not start the application** - You must run `python app.py` and `uvicorn api:app` - **Does not seed reference images** - Use `./seeding/seed-data.sh` or `python seeding/seed_ref_images.py` - **Does not expose port 8000** - Only exposed when API is running - **Does not use Docker Compose** - Uses Podman containers directly - ---- - -## Comparison: Script vs Docker Compose - -| Feature | This Script | Docker Compose | -|---------|-------------|----------------| -| **Tool** | Podman | Docker | -| **Python App** | Runs on host | Runs in container | -| **Development** | Faster (direct edits) | Requires rebuild | -| **Debugging** | Native debugger | Remote debugging | -| **Production** | Not recommended | Best practice | -| **Dependencies** | Installed on host | Isolated in container | - ---- - -## Examples - -### **First-Time Setup** -```bash -# Complete automated setup -./start-dev-env.sh --full-setup - -# Start worker -source venv/bin/activate -python app.py - -# In another terminal, start API -source venv/bin/activate -uvicorn api:app --host 0.0.0.0 --port 8000 -``` - -### **Daily Development** -```bash -# Containers already exist, just start them -podman start mentorme-postgres mentorme-rabbitmq - -# Activate venv and run -source venv/bin/activate -python app.py -``` - -### **Clean Restart** -```bash -# Stop and remove everything -podman stop mentorme-postgres mentorme-rabbitmq -podman rm mentorme-postgres mentorme-rabbitmq - -# Run script again -./start-dev-env.sh --full-setup -``` - ---- - -## Next Steps After Setup - -1. **(Optional) Seed reference images:** - ```bash - ./seeding/seed-data.sh --ref-images - # Or directly: python seeding/seed_ref_images.py --directory data/reference_images - ``` - -2. **Test the system:** - ```bash - python tests/simulation_e2e.py --vm-ip localhost \ - --image https://example.com/test.jpg \ - --student-id ST001 --assign-id A001 - ``` - -3. 
**Access API documentation:** - - OpenAPI: http://localhost:8000/docs - - ReDoc: http://localhost:8000/redoc - -4. **Monitor queues:** - - RabbitMQ UI: http://localhost:15672 - ---- - -## Support - -- **Documentation:** See `DOCUMENTATION.md` for complete system documentation -- **Issues:** Check logs in `logs/` directory -- **Database:** Connect with any PostgreSQL client to `localhost:5432` - ---- - -**Last Updated:** November 6, 2025 -**Version:** 1.0.0 +# Local Development Setup Scripts - Quick Guide + +## Overview +Automated scripts to set up local development environment for MentorMe Plagiarism Detection System. + +--- + +## Files +- `start-dev-env.sh` - Bash script (Linux/macOS/Git Bash) +- `start-dev-env.ps1` - PowerShell script (Windows) + +--- + +## Quick Start + +### **Option 1: Complete Setup (Recommended)** +Everything in one command - infrastructure + Python environment + dependencies: + +```bash +# Linux/macOS/Git Bash +chmod +x start-dev-env.sh +./start-dev-env.sh --full-setup + +# Windows PowerShell +.\start-dev-env.ps1 --full-setup +``` + +**What it does:** +- Creates PostgreSQL container (port 5432) +- Creates RabbitMQ container (ports 5672, 15672) +- Initializes database schema +- Creates `.env` configuration file +- Creates Python virtual environment +- Installs all dependencies (~5-10 minutes) +- Downloads CLIP model from HuggingFace (~3.5GB) +- Verifies setup + +**Time:** ~10-15 minutes (first run) + +--- + +### **Option 2: Infrastructure Only** (Default) +Just containers + database, manual Python setup: + +```bash +# Linux/macOS/Git Bash +./start-dev-env.sh + +# Windows PowerShell +.\start-dev-env.ps1 +``` + +**What it does:** +- Creates PostgreSQL + RabbitMQ containers +- Initializes database +- Creates `.env` file + +**Then manually:** +```bash +python -m venv venv +source venv/bin/activate # Linux/macOS +# OR +.\venv\Scripts\Activate.ps1 # Windows + +pip install -r requirements.txt +``` + +--- + +### **Option 3: With Wheelhouse** (Offline 
Installs) +Pre-compile dependencies for faster/offline installs: + +```bash +# Linux/macOS/Git Bash +./start-dev-env.sh --build-wheelhouse + +# Then install offline: +pip install --no-index --find-links=wheelhouse -r requirements.txt +``` + +--- + +## Available Flags + +| Flag | Description | +|------|-------------| +| `--full-setup` | Complete setup: infrastructure + Python + dependencies | +| `--build-wheelhouse` | Build wheelhouse for offline dependency installation | +| (no flags) | Infrastructure only (containers + database) | + +--- + +## What Gets Created + +### **Infrastructure** +| Service | Port | Access | Credentials | +|---------|------|--------|-------------| +| PostgreSQL | 5432 | localhost:5432 | postgres/postgres | +| RabbitMQ (AMQP) | 5672 | localhost:5672 | admin/admin123 | +| RabbitMQ (Management UI) | 15672 | http://localhost:15672 | guest/guest | + +### **Files** +- `.env` - Environment configuration (from `.env.example`) +- `venv/` - Python virtual environment (if `--full-setup`) +- `data/` - Data directories (reference_images, temp_images) +- `models/` - Model cache directory +- `logs/` - Application logs directory + +### **Database** +- Database: `plagiarism_db` +- Tables: `submissions`, `reference_images`, `feedback_logs` +- Extension: pgvector +- Indexes: B-tree, HNSW vector indexes + +--- + +## After Setup + +### **Start the Application** + +**Terminal 1 - Worker:** +```bash +source venv/bin/activate # Linux/macOS +# OR +.\venv\Scripts\Activate.ps1 # Windows + +python app.py +``` + +**Terminal 2 - API Server:** +```bash +source venv/bin/activate + +uvicorn api:app --reload --host 0.0.0.0 --port 8000 +``` + +**Access API:** http://localhost:8000/docs + +--- + +## Prerequisites + +### **Required** +- **Podman** (or Docker) - Container runtime +- **Python 3.10+** - Application runtime +- **8GB+ RAM** - For CLIP model +- **10GB+ disk** - For dependencies and models + +### **Optional** +- **CUDA GPU** - For faster CLIP inference (10x speedup) 
+- **curl** - For health checks + +--- + +## Troubleshooting + +### **"Podman not found"** +```bash +# Install Podman: https://podman.io/getting-started/installation +``` + +### **"Python not found"** +```bash +# Install Python 3.10+: https://www.python.org/downloads/ +``` + +### **"Port already in use"** +```bash +# Stop existing containers +podman stop mentorme-postgres mentorme-rabbitmq +podman rm mentorme-postgres mentorme-rabbitmq + +# Or change ports in script (POSTGRES_PORT, RABBITMQ_PORT) +``` + +### **"Database connection failed"** +```bash +# Check PostgreSQL is running +podman ps | grep mentorme-postgres + +# Check logs +podman logs mentorme-postgres + +# Restart container +podman restart mentorme-postgres +``` + +### **"RabbitMQ not ready"** +```bash +# Check RabbitMQ is running +podman ps | grep mentorme-rabbitmq + +# Access management UI +open http://localhost:15672 # guest/guest + +# Restart container +podman restart mentorme-rabbitmq +``` + +--- + +## Container Management + +### **View Logs** +```bash +podman logs mentorme-postgres +podman logs mentorme-rabbitmq +podman logs -f mentorme-postgres # Follow mode +``` + +### **Stop Containers** +```bash +podman stop mentorme-postgres mentorme-rabbitmq +``` + +### **Remove Containers** +```bash +podman rm mentorme-postgres mentorme-rabbitmq +``` + +### **Restart Containers** +```bash +podman restart mentorme-postgres mentorme-rabbitmq +``` + +### **Check Running Containers** +```bash +podman ps +``` + +--- + +## Configuration Override + +### **Edit `.env` After Creation** +Script creates `.env` from `.env.example` with localhost overrides. You can modify: + +```bash +# Example: Use different CLIP model +CLIP_MODEL=ViT-B/32 # Smaller, faster model (512D) + +# Example: Enable GPU +CLIP_DEVICE=cuda + +# Example: Enable pgvector instead of FAISS +USE_PGVECTOR=true +``` + +### **Environment Variables Priority** +1. System environment variables (highest) +2. `.env` file +3. 
`config.py` defaults (lowest) + +--- + +## What This Script Does NOT Do + + **Does not start the application** - You must run `python app.py` and `uvicorn api:app` + **Does not seed reference images** - Use `./seeding/seed-data.sh` or `python seeding/seed_ref_images.py` + **Does not expose port 8000** - Only exposed when API is running + **Does not use Docker Compose** - Uses Podman containers directly + +--- + +## Comparison: Script vs Docker Compose + +| Feature | This Script | Docker Compose | +|---------|-------------|----------------| +| **Tool** | Podman | Docker | +| **Python App** | Runs on host | Runs in container | +| **Development** | Faster (direct edits) | Requires rebuild | +| **Debugging** | Native debugger | Remote debugging | +| **Production** | Not recommended | Best practice | +| **Dependencies** | Installed on host | Isolated in container | + +--- + +## Examples + +### **First-Time Setup** +```bash +# Complete automated setup +./start-dev-env.sh --full-setup + +# Start worker +source venv/bin/activate +python app.py + +# In another terminal, start API +source venv/bin/activate +uvicorn api:app --host 0.0.0.0 --port 8000 +``` + +### **Daily Development** +```bash +# Containers already exist, just start them +podman start mentorme-postgres mentorme-rabbitmq + +# Activate venv and run +source venv/bin/activate +python app.py +``` + +### **Clean Restart** +```bash +# Stop and remove everything +podman stop mentorme-postgres mentorme-rabbitmq +podman rm mentorme-postgres mentorme-rabbitmq + +# Run script again +./start-dev-env.sh --full-setup +``` + +--- + +## Next Steps After Setup + +1. **(Optional) Seed reference images:** + ```bash + ./seeding/seed-data.sh --ref-images + # Or directly: python seeding/seed_ref_images.py --directory data/reference_images + ``` + +2. **Test the system:** + ```bash + python tests/simulation_e2e.py --vm-ip localhost \ + --image https://example.com/test.jpg \ + --student-id ST001 --assign-id A001 + ``` + +3. 
**Access API documentation:** + - OpenAPI: http://localhost:8000/docs + - ReDoc: http://localhost:8000/redoc + +4. **Monitor queues:** + - RabbitMQ UI: http://localhost:15672 + +--- + +## Support + +- **Documentation:** See `DOCUMENTATION.md` for complete system documentation +- **Issues:** Check logs in `logs/` directory +- **Database:** Connect with any PostgreSQL client to `localhost:5432` + +--- + +**Last Updated:** November 6, 2025 +**Version:** 1.0.0 diff --git a/start-dev-env.sh b/start-dev-env.sh old mode 100644 new mode 100755 index 298f205..15a9729 --- a/start-dev-env.sh +++ b/start-dev-env.sh @@ -1,472 +1,472 @@ -#!/usr/bin/env bash -# Local Development Startup Script -# Starts PostgreSQL and RabbitMQ containers using docker-compose - -set -e -FULL_SETUP=0 -START_API=0 -COMPOSE_FILE="docker-compose-dev.yml" - -while [[ "$#" -gt 0 ]]; do - case "$1" in - --full-setup) FULL_SETUP=1; shift ;; - --with-api) START_API=1; shift ;; - --prod) COMPOSE_FILE="docker-compose-prod.yml"; shift ;; - --dev) COMPOSE_FILE="docker-compose-dev.yml"; shift ;; - *) break ;; - esac -done - -# Detect platform -OS_TYPE="unknown" -UNAME_OUT=$(uname -s 2>/dev/null || true) -case "${UNAME_OUT}" in - MINGW*|MSYS*|CYGWIN*) OS_TYPE="windows-msys" ;; - Darwin) OS_TYPE="macos" ;; - Linux) OS_TYPE="linux" ;; - *) OS_TYPE="unix" ;; -esac - -# Detect Python executable -PY="" -if [ "${OS_TYPE}" = "windows-msys" ]; then - PY_CANDIDATES=("py" "python3" "python" "python.exe") -else - PY_CANDIDATES=("python3" "python" "py" "python.exe") -fi - -for candidate in "${PY_CANDIDATES[@]}"; do - if [ "$candidate" = "py" ]; then - if command -v py >/dev/null 2>&1; then - if py -3 -c "import sys; sys.stdout.write('Python found!!\n')" 2>/dev/null; then - PY='py -3' - break - fi - fi - continue - fi - - candidate_path=$(command -v "$candidate" 2>/dev/null || true) - if [ -n "$candidate_path" ]; then - case "$candidate_path" in - *WindowsApps*|*windowsapps*) continue ;; - esac - - if "$candidate" -c "import 
sys; sys.stdout.write('ok')" 2>/dev/null; then - PY="$candidate" - break - fi - fi -done - -if [ -z "$PY" ]; then - echo "ERROR: No Python 3.10+ found. Install Python or ensure it's in PATH." >&2 - exit 1 -fi - -# Detect container runtime (Podman or Docker) -CONTAINER_CMD="" -COMPOSE_CMD="" -if command -v podman &> /dev/null; then - CONTAINER_CMD="podman" - if command -v podman-compose &> /dev/null; then - COMPOSE_CMD="podman-compose" - else - echo "ERROR: podman-compose not found. Install it:" >&2 - echo " pip install podman-compose" >&2 - exit 1 - fi -elif command -v docker &> /dev/null; then - CONTAINER_CMD="docker" - if command -v docker-compose &> /dev/null; then - COMPOSE_CMD="docker-compose" - elif docker compose version &> /dev/null; then - COMPOSE_CMD="docker compose" - else - echo "ERROR: docker-compose not found. Install it:" >&2 - echo " https://docs.docker.com/compose/install/" >&2 - exit 1 - fi -else - echo "ERROR: Neither Podman nor Docker found. Install one of them:" >&2 - echo " Podman: https://podman.io/getting-started/installation" >&2 - echo " Docker: https://docs.docker.com/get-docker/" >&2 - exit 1 -fi - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -CYAN='\033[0;36m' -GRAY='\033[0;37m' -NC='\033[0m' - -echo -e "${CYAN}==================================================================" -echo -e " MentorMe Plagiarism Checker - Local Development Setup" -echo -e " Container Runtime: ${CONTAINER_CMD}" -echo -e " Compose File: ${COMPOSE_FILE}" -echo -e "==================================================================${NC}" -echo "" - -# Validate compose file exists -if [ ! 
-f "$COMPOSE_FILE" ]; then - echo -e "${RED}ERROR: $COMPOSE_FILE not found${NC}" - exit 1 -fi - -# Load configuration from .env if it exists -if [ -f ".env" ]; then - set -a - source <(grep -v '^#' .env | grep -v '^$' | sed 's/\r$//') - set +a -fi - -# Configuration (with defaults) -POSTGRES_CONTAINER="mentorme-plagiarism-postgres" -RABBITMQ_CONTAINER="mentorme-plagiarism-rabbitmq" -POSTGRES_PORT="${POSTGRES_PORT:-5432}" -RABBITMQ_PORT="${RABBITMQ_PORT:-5672}" -RABBITMQ_MGMT_PORT="${RABBITMQ_MANAGEMENT_PORT:-15672}" -POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-postgres}" -POSTGRES_DB="${POSTGRES_DB:-plagiarism_db}" -POSTGRES_USER="${POSTGRES_USER:-postgres}" -RABBITMQ_USER="${RABBITMQ_USER:-admin}" -RABBITMQ_PASS="${RABBITMQ_PASS:-admin123}" -CLIP_MODEL_URL="${CLIP_MODEL_URL:-https://huggingface.co/laion/CLIP-ViT-L-14-laion2B-s32B-b82K/resolve/main/open_clip_pytorch_model.bin}" - -stop_existing_containers() { - local containers_exist=false - - # Check if compose stack is running - if $COMPOSE_CMD -f $COMPOSE_FILE ps 2>/dev/null | grep -q "Up\|running"; then - containers_exist=true - fi - - if [ "$containers_exist" = true ]; then - echo -e "${YELLOW}Existing containers found:${NC}" - $COMPOSE_CMD -f $COMPOSE_FILE ps - echo "" - echo -e "${YELLOW}This will stop and remove existing containers.${NC}" - read -p "Continue? (y/N): " -n 1 -r - echo - - if [[ ! 
$REPLY =~ ^[Yy]$ ]]; then - echo -e "${RED}Aborted by user${NC}" - # exit 0 - else - echo -e "${CYAN}Stopping existing containers...${NC}" - $COMPOSE_CMD -f $COMPOSE_FILE down - fi - fi - - echo -e "${GREEN}[OK] Ready to start containers${NC}" -} - -start_containers() { - echo -e "${CYAN}Starting containers with $COMPOSE_FILE...${NC}" - - # Determine which services to start - #local services="postgres rabbitmq pgadmin plagiarism-checker" - local services="postgres rabbitmq plagiarism-checker" - - if [ "$START_API" -eq 1 ]; then - services="$services api" - echo -e "${CYAN}Including API service${NC}" - - # Force rebuild API container to ensure it uses Dockerfile.api (not Dockerfile) - echo -e "${YELLOW}Rebuilding API container with Dockerfile.api...${NC}" - $COMPOSE_CMD -f $COMPOSE_FILE build --no-cache api - - if [ $? -ne 0 ]; then - echo -e "${RED}ERROR: Failed to build API container${NC}" - exit 1 - fi - echo -e "${GREEN}[OK] API container rebuilt${NC}" - fi - - $COMPOSE_CMD -f $COMPOSE_FILE up -d $services - - if [ $? 
-ne 0 ]; then - echo -e "${RED}ERROR: Failed to start containers${NC}" - exit 1 - fi - - echo -e "${GREEN}[OK] Containers started${NC}" -} - -wait_for_postgres() { - echo -e "${YELLOW}Waiting for PostgreSQL...${NC}" - - local max_attempts=30 - local attempt=0 - - while [ $attempt -lt $max_attempts ]; do - attempt=$((attempt + 1)) - - if $CONTAINER_CMD exec $POSTGRES_CONTAINER pg_isready -U $POSTGRES_USER &>/dev/null; then - echo -e "${GREEN}[OK] PostgreSQL ready${NC}" - return 0 - fi - - sleep 2 - done - - echo -e "${RED}ERROR: PostgreSQL timeout${NC}" - exit 1 -} - -wait_for_rabbitmq() { - echo -e "${YELLOW}Waiting for RabbitMQ...${NC}" - - local max_attempts=30 - local attempt=0 - - while [ $attempt -lt $max_attempts ]; do - attempt=$((attempt + 1)) - - if curl -s -f http://localhost:$RABBITMQ_MGMT_PORT &>/dev/null; then - echo -e "${GREEN}[OK] RabbitMQ ready${NC}" - return 0 - fi - - sleep 2 - done - - echo -e "${RED}ERROR: RabbitMQ timeout${NC}" - exit 1 -} - -wait_for_api() { - if [ "$START_API" -ne 1 ]; then - return 0 - fi - - echo -e "${YELLOW}Waiting for API service...${NC}" - - local max_attempts=30 - local attempt=0 - - while [ $attempt -lt $max_attempts ]; do - attempt=$((attempt + 1)) - - if curl -s -f http://localhost:8000/health &>/dev/null; then - echo -e "${GREEN}[OK] API service ready${NC}" - return 0 - fi - - sleep 2 - done - - echo -e "${RED}ERROR: API service timeout${NC}" - echo -e "${YELLOW}Checking API container logs:${NC}" - $COMPOSE_CMD -f $COMPOSE_FILE logs --tail=50 api - exit 1 -} - -initialize_database() { - if [ ! 
-f "database/init.sql" ]; then - echo -e "${YELLOW}WARNING: database/init.sql not found${NC}" - return - fi - - echo -e "${CYAN}Initializing database...${NC}" - - # Run init.sql - if $CONTAINER_CMD exec -i $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB < database/init.sql 2>/dev/null; then - echo -e "${GREEN}[OK] Database schema initialized${NC}" - else - echo -e "${GRAY}Database schema already exists${NC}" - fi - - # Create migrations tracking table if it doesn't exist - $CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -c " - CREATE TABLE IF NOT EXISTS schema_migrations ( - id SERIAL PRIMARY KEY, - migration_name VARCHAR(255) UNIQUE NOT NULL, - applied_at TIMESTAMP DEFAULT NOW() - ); - " 2>/dev/null - - # Run migration scripts - if [ -d "database/migrations" ]; then - local migration_count=0 - for migration_file in database/migrations/*.sql; do - if [ -f "$migration_file" ]; then - local migration_name=$(basename "$migration_file") - - # Check if migration already applied - local already_applied=$($CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -t -c " - SELECT COUNT(*) FROM schema_migrations WHERE migration_name = '$migration_name'; - " 2>/dev/null | tr -d '[:space:]') - - if [ "$already_applied" = "0" ]; then - echo -e "${CYAN}Applying migration: $migration_name${NC}" - - if $CONTAINER_CMD exec -i $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB < "$migration_file" 2>/dev/null; then - # Record migration as applied - $CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -c " - INSERT INTO schema_migrations (migration_name) VALUES ('$migration_name'); - " 2>/dev/null - echo -e "${GREEN}[OK] Applied: $migration_name${NC}" - migration_count=$((migration_count + 1)) - else - echo -e "${YELLOW}WARNING: Failed to apply $migration_name${NC}" - fi - fi - fi - done - - if [ $migration_count -eq 0 ]; then - echo -e "${GRAY}All migrations already applied${NC}" - else 
- echo -e "${GREEN}[OK] Applied $migration_count migration(s)${NC}" - fi - fi -} - -create_env_file() { - if [ -f ".env" ]; then - echo -e "${GREEN}[OK] .env exists${NC}" - return - fi - - if [ ! -f ".env.example" ]; then - echo -e "${RED}ERROR: .env.example not found${NC}" - exit 1 - fi - - echo -e "${CYAN}Creating .env from template...${NC}" - cp .env.example .env - - # Keep service names for docker-compose (containers communicate via service names) - # No transformation needed - .env.example already has correct service names - - echo -e "${GREEN}[OK] .env created${NC}" -} - -show_summary() { - echo -e "\n${CYAN}==================================================================" - echo -e " Environment Ready!" - echo -e "==================================================================${NC}" - echo "" - echo -e "${GREEN}✓ Services Running:${NC}" - echo -e " PostgreSQL: localhost:$POSTGRES_PORT (with pgvector)" - echo -e " RabbitMQ: localhost:$RABBITMQ_PORT" - echo -e " RabbitMQ UI: http://localhost:$RABBITMQ_MGMT_PORT ($RABBITMQ_USER/$RABBITMQ_PASS)" - - if [ "$START_API" -eq 1 ]; then - echo -e " API: http://localhost:8000" - echo -e " API Docs: http://localhost:8000/docs" - fi - - echo "" - echo -e "${CYAN}💡 Quick Start:${NC}" - echo -e " ${YELLOW}./start-dev-env.sh --full-setup${NC} ${GRAY}# Development mode (default)${NC}" - echo -e " ${YELLOW}./start-dev-env.sh --with-api${NC} ${GRAY}# Include API container${NC}" - echo -e " ${YELLOW}./start-dev-env.sh --prod${NC} ${GRAY}# Production mode${NC}" - echo "" - echo -e "${CYAN}📋 Manual Setup:${NC}" - echo -e " 1. ${YELLOW}${PY} -m venv venv${NC}" - - case "${OS_TYPE}" in - windows-msys) echo -e " ${YELLOW}source venv/Scripts/activate${NC}" ;; - *) echo -e " ${YELLOW}source venv/bin/activate${NC}" ;; - esac - - echo -e " 2. ${YELLOW}${PY} -m pip install -r requirements.txt${NC}" - echo -e " 3. 
${YELLOW}${PY} app.py${NC}" - echo "" - echo -e "${CYAN}🔧 Container Commands:${NC}" - echo -e " ${GRAY}Logs: $COMPOSE_CMD -f $COMPOSE_FILE logs -f${NC}" - echo -e " ${GRAY}Stop: $COMPOSE_CMD -f $COMPOSE_FILE stop${NC}" - echo -e " ${GRAY}Remove: $COMPOSE_CMD -f $COMPOSE_FILE down${NC}" - echo "" - echo -e "${CYAN}==================================================================${NC}" -} - -main() { - create_env_file - stop_existing_containers - start_containers - wait_for_postgres - wait_for_rabbitmq - wait_for_api - initialize_database - show_summary - echo -e "\n${GRAY}Containers running in background. Press Ctrl+C to exit this script.${NC}" -} - -setup_python_environment() { - echo -e "\n${CYAN}==================================================================" - echo -e " Full Setup: Python Environment" - echo -e "==================================================================${NC}" - - if [ -d "venv" ]; then - echo -e "${GRAY}Virtual environment exists${NC}" - else - echo -e "${CYAN}Creating virtual environment...${NC}" - $PY -m venv venv - echo -e "${GREEN}[OK] venv created${NC}" - fi - - echo -e "${CYAN}Activating virtual environment...${NC}" - if [ "${OS_TYPE}" = "windows-msys" ]; then - source venv/Scripts/activate - else - source venv/bin/activate - fi - - echo -e "${CYAN}Installing dependencies (5-10 minutes)...${NC}" - python -m pip install --upgrade pip setuptools wheel - python -m pip install -r requirements.txt - echo -e "${GREEN}[OK] Dependencies installed${NC}" - - echo -e "${CYAN}Creating directories...${NC}" - mkdir -p data/reference_images data/models/clip logs - echo -e "${GREEN}[OK] Directories created${NC}" - - # Download CLIP model using curl - echo -e "${CYAN}Checking CLIP model...${NC}" - if [ ! 
-f "data/models/clip/open_clip_pytorch_model.bin" ]; then - echo -e "${YELLOW}Downloading CLIP model (this may take a while)...${NC}" - curl -L -o data/models/clip/open_clip_pytorch_model.bin \ - ${CLIP_MODEL_URL} || { - echo -e "${YELLOW}WARNING: CLIP model download failed, will download on first run${NC}" - } - - if [ -f "data/models/clip/open_clip_pytorch_model.bin" ]; then - echo -e "${GREEN}[OK] CLIP model downloaded${NC}" - fi - else - echo -e "${GRAY}CLIP model already exists${NC}" - fi - - echo -e "${CYAN}Verifying environment...${NC}" - python -c " -import open_clip, asyncpg, aio_pika, PIL, imagehash -print('✓ All imports successful') -" || { - echo -e "${RED}ERROR: Environment verification failed${NC}" - exit 1 - } - - echo -e "\n${CYAN}==================================================================" - echo -e " Setup Complete!" - echo -e "==================================================================${NC}" - echo -e "\n${GREEN}✓ Next Steps:${NC}" - echo -e "\n${CYAN}Terminal 1 - Worker:${NC}" - [ "${OS_TYPE}" = "windows-msys" ] && echo -e " ${YELLOW}source venv/Scripts/activate${NC}" || echo -e " ${YELLOW}source venv/bin/activate${NC}" - echo -e " ${YELLOW}python app.py${NC}" - echo -e "\n${CYAN}Terminal 2 - API:${NC}" - [ "${OS_TYPE}" = "windows-msys" ] && echo -e " ${YELLOW}source venv/Scripts/activate${NC}" || echo -e " ${YELLOW}source venv/bin/activate${NC}" - echo -e " ${YELLOW}cd api && uvicorn api:app --reload --host 0.0.0.0 --port 8000${NC}" - echo -e "\n${CYAN}API Docs:${NC} ${YELLOW}http://localhost:8000/docs${NC}" - echo "" -} - -main - -if [ "$FULL_SETUP" -eq 1 ]; then - setup_python_environment -fi +#!/usr/bin/env bash +# Local Development Startup Script +# Starts PostgreSQL and RabbitMQ containers using docker-compose + +set -e +FULL_SETUP=0 +START_API=0 +COMPOSE_FILE="docker-compose-dev.yml" + +while [[ "$#" -gt 0 ]]; do + case "$1" in + --full-setup) FULL_SETUP=1; shift ;; + --with-api) START_API=1; shift ;; + --prod) 
COMPOSE_FILE="docker-compose-prod.yml"; shift ;; + --dev) COMPOSE_FILE="docker-compose-dev.yml"; shift ;; + *) break ;; + esac +done + +# Detect platform +OS_TYPE="unknown" +UNAME_OUT=$(uname -s 2>/dev/null || true) +case "${UNAME_OUT}" in + MINGW*|MSYS*|CYGWIN*) OS_TYPE="windows-msys" ;; + Darwin) OS_TYPE="macos" ;; + Linux) OS_TYPE="linux" ;; + *) OS_TYPE="unix" ;; +esac + +# Detect Python executable +PY="" +if [ "${OS_TYPE}" = "windows-msys" ]; then + PY_CANDIDATES=("py" "python3" "python" "python.exe") +else + PY_CANDIDATES=("python3" "python" "py" "python.exe") +fi + +for candidate in "${PY_CANDIDATES[@]}"; do + if [ "$candidate" = "py" ]; then + if command -v py >/dev/null 2>&1; then + if py -3 -c "import sys; sys.stdout.write('Python found!!\n')" 2>/dev/null; then + PY='py -3' + break + fi + fi + continue + fi + + candidate_path=$(command -v "$candidate" 2>/dev/null || true) + if [ -n "$candidate_path" ]; then + case "$candidate_path" in + *WindowsApps*|*windowsapps*) continue ;; + esac + + if "$candidate" -c "import sys; sys.stdout.write('ok')" 2>/dev/null; then + PY="$candidate" + break + fi + fi +done + +if [ -z "$PY" ]; then + echo "ERROR: No Python 3.10+ found. Install Python or ensure it's in PATH." >&2 + exit 1 +fi + +# Detect container runtime (Podman or Docker) +CONTAINER_CMD="" +COMPOSE_CMD="" +if command -v podman &> /dev/null; then + CONTAINER_CMD="podman" + if command -v podman-compose &> /dev/null; then + COMPOSE_CMD="podman-compose" + else + echo "ERROR: podman-compose not found. Install it:" >&2 + echo " pip install podman-compose" >&2 + exit 1 + fi +elif command -v docker &> /dev/null; then + CONTAINER_CMD="docker" + if command -v docker-compose &> /dev/null; then + COMPOSE_CMD="docker-compose" + elif docker compose version &> /dev/null; then + COMPOSE_CMD="docker compose" + else + echo "ERROR: docker-compose not found. 
Install it:" >&2 + echo " https://docs.docker.com/compose/install/" >&2 + exit 1 + fi +else + echo "ERROR: Neither Podman nor Docker found. Install one of them:" >&2 + echo " Podman: https://podman.io/getting-started/installation" >&2 + echo " Docker: https://docs.docker.com/get-docker/" >&2 + exit 1 +fi + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +GRAY='\033[0;37m' +NC='\033[0m' + +echo -e "${CYAN}==================================================================" +echo -e " MentorMe Plagiarism Checker - Local Development Setup" +echo -e " Container Runtime: ${CONTAINER_CMD}" +echo -e " Compose File: ${COMPOSE_FILE}" +echo -e "==================================================================${NC}" +echo "" + +# Validate compose file exists +if [ ! -f "$COMPOSE_FILE" ]; then + echo -e "${RED}ERROR: $COMPOSE_FILE not found${NC}" + exit 1 +fi + +# Load configuration from .env if it exists +if [ -f ".env" ]; then + set -a + source <(grep -v '^#' .env | grep -v '^$' | sed 's/\r$//') + set +a +fi + +# Configuration (with defaults) +POSTGRES_CONTAINER="mentorme-plagiarism-postgres" +RABBITMQ_CONTAINER="mentorme-plagiarism-rabbitmq" +POSTGRES_PORT="${POSTGRES_PORT:-5432}" +RABBITMQ_PORT="${RABBITMQ_PORT:-5672}" +RABBITMQ_MGMT_PORT="${RABBITMQ_MANAGEMENT_PORT:-15672}" +POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-postgres}" +POSTGRES_DB="${POSTGRES_DB:-plagiarism_db}" +POSTGRES_USER="${POSTGRES_USER:-postgres}" +RABBITMQ_USER="${RABBITMQ_USER:-admin}" +RABBITMQ_PASS="${RABBITMQ_PASS:-admin123}" +CLIP_MODEL_URL="${CLIP_MODEL_URL:-https://huggingface.co/laion/CLIP-ViT-L-14-laion2B-s32B-b82K/resolve/main/open_clip_pytorch_model.bin}" + +stop_existing_containers() { + local containers_exist=false + + # Check if compose stack is running + if $COMPOSE_CMD -f $COMPOSE_FILE ps 2>/dev/null | grep -q "Up\|running"; then + containers_exist=true + fi + + if [ "$containers_exist" = true ]; then + echo -e "${YELLOW}Existing containers 
found:${NC}" + $COMPOSE_CMD -f $COMPOSE_FILE ps + echo "" + echo -e "${YELLOW}This will stop and remove existing containers.${NC}" + read -p "Continue? (y/N): " -n 1 -r + echo + + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo -e "${RED}Aborted by user${NC}" + # exit 0 + else + echo -e "${CYAN}Stopping existing containers...${NC}" + $COMPOSE_CMD -f $COMPOSE_FILE down + fi + fi + + echo -e "${GREEN}[OK] Ready to start containers${NC}" +} + +start_containers() { + echo -e "${CYAN}Starting containers with $COMPOSE_FILE...${NC}" + + # Determine which services to start + #local services="postgres rabbitmq pgadmin plagiarism-checker" + local services="postgres rabbitmq plagiarism-checker" + + if [ "$START_API" -eq 1 ]; then + services="$services api" + echo -e "${CYAN}Including API service${NC}" + + # Force rebuild API container to ensure it uses Dockerfile.api (not Dockerfile) + echo -e "${YELLOW}Rebuilding API container with Dockerfile.api...${NC}" + $COMPOSE_CMD -f $COMPOSE_FILE build --no-cache api + + if [ $? -ne 0 ]; then + echo -e "${RED}ERROR: Failed to build API container${NC}" + exit 1 + fi + echo -e "${GREEN}[OK] API container rebuilt${NC}" + fi + + $COMPOSE_CMD -f $COMPOSE_FILE up -d $services + + if [ $? 
-ne 0 ]; then + echo -e "${RED}ERROR: Failed to start containers${NC}" + exit 1 + fi + + echo -e "${GREEN}[OK] Containers started${NC}" +} + +wait_for_postgres() { + echo -e "${YELLOW}Waiting for PostgreSQL...${NC}" + + local max_attempts=30 + local attempt=0 + + while [ $attempt -lt $max_attempts ]; do + attempt=$((attempt + 1)) + + if $CONTAINER_CMD exec $POSTGRES_CONTAINER pg_isready -U $POSTGRES_USER &>/dev/null; then + echo -e "${GREEN}[OK] PostgreSQL ready${NC}" + return 0 + fi + + sleep 2 + done + + echo -e "${RED}ERROR: PostgreSQL timeout${NC}" + exit 1 +} + +wait_for_rabbitmq() { + echo -e "${YELLOW}Waiting for RabbitMQ...${NC}" + + local max_attempts=30 + local attempt=0 + + while [ $attempt -lt $max_attempts ]; do + attempt=$((attempt + 1)) + + if curl -s -f http://localhost:$RABBITMQ_MGMT_PORT &>/dev/null; then + echo -e "${GREEN}[OK] RabbitMQ ready${NC}" + return 0 + fi + + sleep 2 + done + + echo -e "${RED}ERROR: RabbitMQ timeout${NC}" + exit 1 +} + +wait_for_api() { + if [ "$START_API" -ne 1 ]; then + return 0 + fi + + echo -e "${YELLOW}Waiting for API service...${NC}" + + local max_attempts=30 + local attempt=0 + + while [ $attempt -lt $max_attempts ]; do + attempt=$((attempt + 1)) + + if curl -s -f http://localhost:8000/health &>/dev/null; then + echo -e "${GREEN}[OK] API service ready${NC}" + return 0 + fi + + sleep 2 + done + + echo -e "${RED}ERROR: API service timeout${NC}" + echo -e "${YELLOW}Checking API container logs:${NC}" + $COMPOSE_CMD -f $COMPOSE_FILE logs --tail=50 api + exit 1 +} + +initialize_database() { + if [ ! 
-f "database/init.sql" ]; then + echo -e "${YELLOW}WARNING: database/init.sql not found${NC}" + return + fi + + echo -e "${CYAN}Initializing database...${NC}" + + # Run init.sql + if $CONTAINER_CMD exec -i $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB < database/init.sql 2>/dev/null; then + echo -e "${GREEN}[OK] Database schema initialized${NC}" + else + echo -e "${GRAY}Database schema already exists${NC}" + fi + + # Create migrations tracking table if it doesn't exist + $CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -c " + CREATE TABLE IF NOT EXISTS schema_migrations ( + id SERIAL PRIMARY KEY, + migration_name VARCHAR(255) UNIQUE NOT NULL, + applied_at TIMESTAMP DEFAULT NOW() + ); + " 2>/dev/null + + # Run migration scripts + if [ -d "database/migrations" ]; then + local migration_count=0 + for migration_file in database/migrations/*.sql; do + if [ -f "$migration_file" ]; then + local migration_name=$(basename "$migration_file") + + # Check if migration already applied + local already_applied=$($CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -t -c " + SELECT COUNT(*) FROM schema_migrations WHERE migration_name = '$migration_name'; + " 2>/dev/null | tr -d '[:space:]') + + if [ "$already_applied" = "0" ]; then + echo -e "${CYAN}Applying migration: $migration_name${NC}" + + if $CONTAINER_CMD exec -i $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB < "$migration_file" 2>/dev/null; then + # Record migration as applied + $CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -c " + INSERT INTO schema_migrations (migration_name) VALUES ('$migration_name'); + " 2>/dev/null + echo -e "${GREEN}[OK] Applied: $migration_name${NC}" + migration_count=$((migration_count + 1)) + else + echo -e "${YELLOW}WARNING: Failed to apply $migration_name${NC}" + fi + fi + fi + done + + if [ $migration_count -eq 0 ]; then + echo -e "${GRAY}All migrations already applied${NC}" + else 
+ echo -e "${GREEN}[OK] Applied $migration_count migration(s)${NC}" + fi + fi +} + +create_env_file() { + if [ -f ".env" ]; then + echo -e "${GREEN}[OK] .env exists${NC}" + return + fi + + if [ ! -f ".env.example" ]; then + echo -e "${RED}ERROR: .env.example not found${NC}" + exit 1 + fi + + echo -e "${CYAN}Creating .env from template...${NC}" + cp .env.example .env + + # Keep service names for docker-compose (containers communicate via service names) + # No transformation needed - .env.example already has correct service names + + echo -e "${GREEN}[OK] .env created${NC}" +} + +show_summary() { + echo -e "\n${CYAN}==================================================================" + echo -e " Environment Ready!" + echo -e "==================================================================${NC}" + echo "" + echo -e "${GREEN}✓ Services Running:${NC}" + echo -e " PostgreSQL: localhost:$POSTGRES_PORT (with pgvector)" + echo -e " RabbitMQ: localhost:$RABBITMQ_PORT" + echo -e " RabbitMQ UI: http://localhost:$RABBITMQ_MGMT_PORT ($RABBITMQ_USER/$RABBITMQ_PASS)" + + if [ "$START_API" -eq 1 ]; then + echo -e " API: http://localhost:8000" + echo -e " API Docs: http://localhost:8000/docs" + fi + + echo "" + echo -e "${CYAN}💡 Quick Start:${NC}" + echo -e " ${YELLOW}./start-dev-env.sh --full-setup${NC} ${GRAY}# Development mode (default)${NC}" + echo -e " ${YELLOW}./start-dev-env.sh --with-api${NC} ${GRAY}# Include API container${NC}" + echo -e " ${YELLOW}./start-dev-env.sh --prod${NC} ${GRAY}# Production mode${NC}" + echo "" + echo -e "${CYAN}📋 Manual Setup:${NC}" + echo -e " 1. ${YELLOW}${PY} -m venv venv${NC}" + + case "${OS_TYPE}" in + windows-msys) echo -e " ${YELLOW}source venv/Scripts/activate${NC}" ;; + *) echo -e " ${YELLOW}source venv/bin/activate${NC}" ;; + esac + + echo -e " 2. ${YELLOW}${PY} -m pip install -r requirements.txt${NC}" + echo -e " 3. 
${YELLOW}${PY} app.py${NC}" + echo "" + echo -e "${CYAN}🔧 Container Commands:${NC}" + echo -e " ${GRAY}Logs: $COMPOSE_CMD -f $COMPOSE_FILE logs -f${NC}" + echo -e " ${GRAY}Stop: $COMPOSE_CMD -f $COMPOSE_FILE stop${NC}" + echo -e " ${GRAY}Remove: $COMPOSE_CMD -f $COMPOSE_FILE down${NC}" + echo "" + echo -e "${CYAN}==================================================================${NC}" +} + +main() { + create_env_file + stop_existing_containers + start_containers + wait_for_postgres + wait_for_rabbitmq + wait_for_api + initialize_database + show_summary + echo -e "\n${GRAY}Containers running in background. Press Ctrl+C to exit this script.${NC}" +} + +setup_python_environment() { + echo -e "\n${CYAN}==================================================================" + echo -e " Full Setup: Python Environment" + echo -e "==================================================================${NC}" + + if [ -d "venv" ]; then + echo -e "${GRAY}Virtual environment exists${NC}" + else + echo -e "${CYAN}Creating virtual environment...${NC}" + $PY -m venv venv + echo -e "${GREEN}[OK] venv created${NC}" + fi + + echo -e "${CYAN}Activating virtual environment...${NC}" + if [ "${OS_TYPE}" = "windows-msys" ]; then + source venv/Scripts/activate + else + source venv/bin/activate + fi + + echo -e "${CYAN}Installing dependencies (5-10 minutes)...${NC}" + python -m pip install --upgrade pip setuptools wheel + python -m pip install -r requirements.txt + echo -e "${GREEN}[OK] Dependencies installed${NC}" + + echo -e "${CYAN}Creating directories...${NC}" + mkdir -p data/reference_images data/models/clip logs + echo -e "${GREEN}[OK] Directories created${NC}" + + # Download CLIP model using curl + echo -e "${CYAN}Checking CLIP model...${NC}" + if [ ! 
-f "data/models/clip/open_clip_pytorch_model.bin" ]; then + echo -e "${YELLOW}Downloading CLIP model (this may take a while)...${NC}" + curl -L -o data/models/clip/open_clip_pytorch_model.bin \ + ${CLIP_MODEL_URL} || { + echo -e "${YELLOW}WARNING: CLIP model download failed, will download on first run${NC}" + } + + if [ -f "data/models/clip/open_clip_pytorch_model.bin" ]; then + echo -e "${GREEN}[OK] CLIP model downloaded${NC}" + fi + else + echo -e "${GRAY}CLIP model already exists${NC}" + fi + + echo -e "${CYAN}Verifying environment...${NC}" + python -c " +import open_clip, asyncpg, aio_pika, PIL, imagehash +print('✓ All imports successful') +" || { + echo -e "${RED}ERROR: Environment verification failed${NC}" + exit 1 + } + + echo -e "\n${CYAN}==================================================================" + echo -e " Setup Complete!" + echo -e "==================================================================${NC}" + echo -e "\n${GREEN}✓ Next Steps:${NC}" + echo -e "\n${CYAN}Terminal 1 - Worker:${NC}" + [ "${OS_TYPE}" = "windows-msys" ] && echo -e " ${YELLOW}source venv/Scripts/activate${NC}" || echo -e " ${YELLOW}source venv/bin/activate${NC}" + echo -e " ${YELLOW}python app.py${NC}" + echo -e "\n${CYAN}Terminal 2 - API:${NC}" + [ "${OS_TYPE}" = "windows-msys" ] && echo -e " ${YELLOW}source venv/Scripts/activate${NC}" || echo -e " ${YELLOW}source venv/bin/activate${NC}" + echo -e " ${YELLOW}cd api && uvicorn api:app --reload --host 0.0.0.0 --port 8000${NC}" + echo -e "\n${CYAN}API Docs:${NC} ${YELLOW}http://localhost:8000/docs${NC}" + echo "" +} + +main + +if [ "$FULL_SETUP" -eq 1 ]; then + setup_python_environment +fi From e87d4fb7c418b29ec4fbffeea5385b307c795bd0 Mon Sep 17 00:00:00 2001 From: Manu Date: Fri, 2 Jan 2026 10:28:09 +0530 Subject: [PATCH 2/7] plg check against reference images --- .gitignore | 2 + config/config.py | 2 +- database/init.sql | 4 +- image_worker/assigment_ref_images.py | 347 +++++++++++++++++++++++++++ image_worker/worker.py | 
87 ++++++- processors/image_processor.py | 3 + 6 files changed, 438 insertions(+), 7 deletions(-) create mode 100644 image_worker/assigment_ref_images.py diff --git a/.gitignore b/.gitignore index 3e1e6d7..897c762 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +scratch* + # Python __pycache__/ *.py[cod] diff --git a/config/config.py b/config/config.py index a9d9572..63060ff 100644 --- a/config/config.py +++ b/config/config.py @@ -93,7 +93,7 @@ class DetectionConfig(BaseSettings): exact_dup_threshold: float = Field(default=0.95, env="EXACT_DUPLICATE_THRESHOLD") near_dup_threshold: float = Field(default=0.90, env="NEAR_DUPLICATE_THRESHOLD") - semantic_threshold: float = Field(default=0.80, env="SEMANTIC_MATCH_THRESHOLD") + semantic_threshold: float = Field(default=0.70, env="SEMANTIC_MATCH_THRESHOLD") # Hash matching thresholds (Hamming distance, 0-64 bits) hash_threshold: int = Field(default=8, env="HASH_MATCH_THRESHOLD") diff --git a/database/init.sql b/database/init.sql index 0448456..265a61b 100644 --- a/database/init.sql +++ b/database/init.sql @@ -72,12 +72,12 @@ USING hnsw (clip_embedding vector_ip_ops); -- Reference images corpus CREATE TABLE IF NOT EXISTS reference_images ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), - reference_id VARCHAR(100) UNIQUE NOT NULL, + reference_id VARCHAR(200) UNIQUE NOT NULL, -- will be used as assignment id when fetching references from assignments image_path TEXT NOT NULL, phash VARCHAR(64) NOT NULL, dhash VARCHAR(64) NOT NULL, ahash VARCHAR(64) NOT NULL, - category VARCHAR(100), + category VARCHAR(200), description TEXT, source VARCHAR(200), faiss_index_position INTEGER, diff --git a/image_worker/assigment_ref_images.py b/image_worker/assigment_ref_images.py new file mode 100644 index 0000000..10617f4 --- /dev/null +++ b/image_worker/assigment_ref_images.py @@ -0,0 +1,347 @@ +import base64 +import io +import os +from PIL import Image +import requests +from typing import List, Dict, Optional +from dotenv import 
load_dotenv +import asyncpg +from datetime import datetime +import logging + +load_dotenv() + +logger = logging.getLogger(__name__) + +# Environment variables +ASSIGNMENT_CACHE_DAYS = int(os.getenv("ASSIGNMENT_CACHE_DAYS", "2")) +ENABLE_CACHE = os.getenv("ENABLE_CACHE", "true").lower() == "true" +PURGE_CACHE = os.getenv("PURGE_CACHE", "false").lower() == "true" + +# Database configuration +DB_CONFIG = { + "host": os.getenv("POSTGRES_HOST", "localhost"), + "port": int(os.getenv("POSTGRES_PORT", 5432)), + "database": os.getenv("POSTGRES_DB", "plagiarism_db"), + "user": os.getenv("POSTGRES_USER", "postgres"), + "password": os.getenv("POSTGRES_PASSWORD", "postgres"), +} + + +async def get_db_connection(): + """Create async database connection.""" + conn_string = ( + f"postgresql://{DB_CONFIG['user']}:{DB_CONFIG['password']}" + f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}" + ) + return await asyncpg.connect(conn_string) + + +async def cleanup_cache(): + """Delete cached assignment reference images older than ASSIGNMENT_CACHE_DAYS.""" + try: + conn = await get_db_connection() + try: + # cutoff_date = datetime.utcnow() - timedelta(days=ASSIGNMENT_CACHE_DAYS) + + # Delete old assignment caches (reference_ids starting with "ASSIGN-") + result = await conn.execute( + """ + DELETE FROM reference_images + WHERE reference_id LIKE 'ASSIGN-%' + """ + ) + + deleted_count = int(result.split()[-1]) if result else 0 + if deleted_count > 0: + logger.info(f"Cleaned up {deleted_count} cached assignment reference images") + + finally: + await conn.close() + + except Exception as e: + logger.error(f"Cache cleanup failed: {e}") + + +async def get_cached_assignment(assignment_id: str) -> Optional[List[Dict]]: + """ + Retrieve cached reference images for an assignment from database. 
+ + Args: + assignment_id: Assignment identifier + + Returns: + List of reference image dicts with precomputed hashes (no embeddings), or None if not cached + """ + try: + conn = await get_db_connection() + try: + # Query for cached images with this assignment_id (no clip_embedding in SELECT) + rows = await conn.fetch( + """ + SELECT reference_id, image_path, phash, dhash, ahash, created_at + FROM reference_images + WHERE reference_id LIKE $1 + ORDER BY reference_id + """, + f"ASSIGN-{assignment_id}-%" + ) + + if not rows: + logger.info(f"No cached reference images found for assignment: {assignment_id}") + return None + + # Check if cache is still valid + oldest_created = min(row['created_at'] for row in rows) + age_days = (datetime.utcnow() - oldest_created).days + + if age_days > ASSIGNMENT_CACHE_DAYS: + logger.info(f"Cache expired for assignment {assignment_id} (age: {age_days} days)") + return None + + logger.info(f"Retrieved {len(rows)} cached reference images for assignment: {assignment_id}") + + # Return just the hashes and name (no embeddings) + images = [] + for row in rows: + images.append({ + "name": row['reference_id'] + row['image_path'], # Image name stored in image_path + "phash": row['phash'], + "dhash": row['dhash'], + "ahash": row['ahash'] + }) + + return images + + finally: + await conn.close() + + except Exception as e: + logger.error(f"Failed to retrieve cached assignment: {e}") + return None + + +async def save_to_cache(assignment_id: str, images: List[Dict]): + """ + Save assignment reference images (hashes + embeddings) to database cache. 
+ + Args: + assignment_id: Assignment identifier + images: List of image dicts with "name", "phash", "dhash", "ahash", and optionally "embedding" + """ + try: + conn = await get_db_connection() + try: + cached_count = 0 + + for idx, img_data in enumerate(images): + try: + # Create unique reference_id for this assignment's reference image + reference_id = f"ASSIGN-{assignment_id}-{idx:03d}" + image_name = img_data.get("name", f"ref_{idx}") + + # Check if hashes are precomputed + if "phash" not in img_data or "dhash" not in img_data or "ahash" not in img_data: + logger.warning(f"Hashes not precomputed for image {idx}, skipping cache") + continue + + # Extract embedding if present + embedding = img_data.get("embedding") + + if embedding is not None: + # Convert numpy array to pgvector format + embedding_str = '[' + ','.join(map(str, embedding.tolist())) + ']' + clip_generated = True + else: + embedding_str = None + clip_generated = False + + # Insert into database with embedding + await conn.execute( + """ + INSERT INTO reference_images + (reference_id, image_path, phash, dhash, ahash, category, description, source, + clip_embedding_generated, clip_embedding) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10::vector) + ON CONFLICT (reference_id) DO UPDATE SET + image_path = EXCLUDED.image_path, + phash = EXCLUDED.phash, + dhash = EXCLUDED.dhash, + ahash = EXCLUDED.ahash, + clip_embedding_generated = EXCLUDED.clip_embedding_generated, + clip_embedding = EXCLUDED.clip_embedding, + updated_at = NOW() + """, + reference_id, + image_name, + img_data["phash"], + img_data["dhash"], + img_data["ahash"], + "assignment_cache", + f"Reference image from {image_name}", + f"assignment_{assignment_id}", + clip_generated, + embedding_str + ) + + cached_count += 1 + + except Exception as img_err: + logger.error(f"Failed to cache image {idx} for assignment {assignment_id}: {img_err}") + continue + + logger.info(f"Cached {cached_count}/{len(images)} reference images for assignment: 
{assignment_id}") + + finally: + await conn.close() + + except Exception as e: + logger.error(f"Failed to save to cache: {e}") + + +async def get_reference_images( + assignment_id: str, + clip_handler=None, + hash_handler=None +) -> Optional[List[Dict]]: + """ + Fetch reference images for an assignment from TAP LMS API with caching support. + + Args: + assignment_id: Assignment identifier + clip_handler: CLIPHandler instance from worker (optional) + hash_handler: HashHandler instance from worker (optional) + + Returns: + List of reference image dictionaries with precomputed hashes (no embeddings in return) + """ + try: + # Cleanup old cache if enabled + if PURGE_CACHE: + await cleanup_cache() + + # Check cache first if enabled + if ENABLE_CACHE: + cached_images = await get_cached_assignment(assignment_id) + if cached_images is not None: + logger.info(f"Using cached reference images for assignment: {assignment_id}") + return cached_images + + # Fetch from API + logger.info(f"Fetching reference images from API for assignment: {assignment_id}") + images = await fetch_from_api(assignment_id, clip_handler, hash_handler) + + # Save to cache if enabled (embeddings will be saved to DB but not returned) + if ENABLE_CACHE and images: + await save_to_cache(assignment_id, images) + + # Remove embeddings from return object + if images: + for img in images: + img.pop("embedding", None) + + return images + + except Exception as e: + logger.error(f"Error fetching reference images: {e}") + return None + + +async def fetch_from_api( + assignment_id: str, + clip_handler=None, + hash_handler=None +) -> Optional[List[Dict]]: + """ + Fetch reference images for an assignment from TAP LMS API and compute hashes + embeddings. 
+ + Args: + assignment_id: Assignment identifier + clip_handler: CLIPHandler instance from worker (optional) + hash_handler: HashHandler instance from worker (optional) + + Returns: + List of reference image dictionaries with precomputed hashes and embeddings + """ + api_key = os.getenv("FRAPPE_API_KEY") + api_secret = os.getenv("FRAPPE_API_SECRET") + base_url = os.getenv("FRAPPE_API_BASE_URL") + + if not all([api_key, api_secret, base_url]): + logger.error("Missing API configuration: FRAPPE_API_KEY, FRAPPE_API_SECRET, or FRAPPE_API_BASE_URL") + return None + + if hash_handler is None: + logger.error("No hash_handler provided, cannot compute hashes") + return None + + assignment_context_endpoint = "api/method/tap_lms.imgana.submission.get_assignment_context" + + headers = { + "Content-Type": "application/json", + "Authorization": f"token {api_key}:{api_secret}" + } + + api_url = f"{base_url.rstrip('/')}/{assignment_context_endpoint.lstrip('/')}" + + try: + # Use synchronous requests in async context (consider aiohttp for true async) + response = requests.post( + api_url, + headers=headers, + json={"assignment_id": assignment_id}, + timeout=30 + ) + + response.raise_for_status() + data = response.json() + + reference_images = data.get("message", {}).get("assignment", {}).get("reference_images", []) + + if not reference_images: + logger.warning(f"No reference images found for assignment: {assignment_id}") + return [] + + # Process images: decode, compute hashes and embeddings, then discard PIL objects + processed_images = [] + for image in reference_images: + try: + decoded_bytes = base64.b64decode(image["content"]) + image_obj = Image.open(io.BytesIO(decoded_bytes)) + + # Compute hashes using passed handler + hashes = hash_handler.compute_hashes(image_obj) + + # Generate CLIP embedding if handler provided + embedding = None + if clip_handler is not None: + try: + embedding = clip_handler.generate_embedding(image_obj) + except Exception as embed_err: + 
logger.error(f"Failed to generate embedding for {image.get('name', 'unknown')}: {embed_err}") + + # Close PIL Image - we don't need it anymore + image_obj.close() + + # Store hashes and embedding (embedding will be saved to DB but removed before return) + processed_images.append({ + "name": image.get("name", f"ref_{len(processed_images)}"), + "phash": hashes["phash"], + "dhash": hashes["dhash"], + "ahash": hashes["ahash"], + "embedding": embedding # Temporary, for save_to_cache + }) + + except Exception as img_err: + logger.error(f"Failed to process image {image.get('name', 'unknown')}: {img_err}") + continue + + logger.info(f"Fetched and processed {len(processed_images)} reference images from API for assignment: {assignment_id}") + return processed_images + + except requests.exceptions.RequestException as e: + logger.error(f"API request failed for assignment {assignment_id}: {e}") + return None + except Exception as e: + logger.error(f"Failed to fetch from API: {e}") + return None \ No newline at end of file diff --git a/image_worker/worker.py b/image_worker/worker.py index 0798bef..012d2e9 100644 --- a/image_worker/worker.py +++ b/image_worker/worker.py @@ -11,6 +11,7 @@ from typing import Dict, Optional, Tuple, Any from urllib.parse import urlparse from dotenv import load_dotenv +from image_worker.assigment_ref_images import get_reference_images from config.config import config from database.db_manager import DatabaseManager @@ -291,7 +292,75 @@ async def _async_compare_self(self, hashes, prev): ) return comparison, prev - async def check_hash_match( + async def check_assignment_reference_hash_match( + self, hashes: dict, assignment_id: str + ) -> Tuple[bool, Optional[str], Optional[float], Optional[str]]: + """ + Check if submission matches any reference image via perceptual hash comparison. + + Uses three hash types (pHash, dHash, aHash) for robust duplicate detection. 
+ + Args: + hashes: Dict containing 'phash', 'dhash', 'ahash' hex strings + + Returns: + Tuple of (is_match, reference_id, similarity_score, image_url) + - is_match: True if hash match found + - reference_id: UUID of matched reference (or None) + - similarity_score: 0.0-1.0 similarity score (or None) + - image_url: URL of matched reference image (or None) + + Raises: + Exception: If database query fails + """ + try: + + references = await get_reference_images(assignment_id, self.clip_handler, self.hash_handler) + if not references: + return False, None, None, None + + # for ref_image in references: + # if ref_image["content"] is not None: + # hashes = self.hash_handler.compute_hashes(ref_image["content"]) + # ref_image['phash'] = hashes['phash'] + # ref_image['dhash'] = hashes['dhash'] + # ref_image['ahash'] = hashes['ahash'] + + tasks = [self._async_compare_ref(hashes, ref) for ref in references] + results = await asyncio.gather(*tasks) + + + best_match = None + best_score = 999 + best_comparison = None + for comparison, ref in results: + if comparison["is_match"] and comparison["avg_distance"] < best_score: + best_score = comparison["avg_distance"] + best_match = ref + best_comparison = comparison + + if best_match and best_comparison: + print("#"*70) + print("Best match found:",best_match["name"]) + print("#"*70) + logger.info("Assignment reference match found") + similarity = 1 - (best_score / 64.0) + return ( + True, + str(best_match["name"]), + similarity, + str(best_match["name"]), + ) + else: + logger.info("No assignment reference match found") + return False, None, None, None + + except Exception as e: + logger.error(f"Hash check failed: {e}", exc_info=True) + raise + + + async def check_db_reference_hash_match( self, hashes: dict ) -> Tuple[bool, Optional[str], Optional[float], Optional[str]]: """ @@ -375,6 +444,11 @@ async def check_clip_match( if not results: return None, 0.0, None + + print("#"*70) + for ref_id, sim, meta in results: + print(f" Ref ID: 
{ref_id}, Similarity: {sim:.4f}, Meta: {meta}") + print("#"*70) matches = [ (ref_id, sim, meta) @@ -540,7 +614,8 @@ async def process_submission(self, data: Dict[str, Any]) -> Optional[str]: student_id, self_result.get("first_submission_date_for_image", None), ) - hash_check_result = await self.check_hash_match(hashes) + # hash_check_result = await self.check_db_reference_hash_match(hashes) + hash_check_result = await self.check_assignment_reference_hash_match(hashes,assign_id) ( hash_match, @@ -1020,6 +1095,9 @@ def _create_stock_image_result( "student_id": student_id, "assignment_id": assign_id, "image_url": image_url, + "is_ai_generated": False, + "ai_detection_source": "None", + "ai_confidence": 0.0, "is_plagiarized": True, "similarity_score": 1.0, "match_type": "stock_image", @@ -1290,8 +1368,9 @@ def format_results( ) message["ai_confidence"] = plagiarism_status.get("ai_confidence", 0.0) - payload_preview = json.dumps(message, indent=2)[:2000] - logger.info(f"Result payload preview (2000 chars):\n{payload_preview}...") + # payload_preview = json.dumps(message, indent=2)[:2000] + # logger.info(f"Result payload preview (2000 chars):\n{payload_preview}...") + return json.dumps(message) diff --git a/processors/image_processor.py b/processors/image_processor.py index 653a66c..02f0035 100644 --- a/processors/image_processor.py +++ b/processors/image_processor.py @@ -65,6 +65,9 @@ async def process(self, data: dict) -> dict: ) return {"error": "Invalid JSON response from worker"} + logger.info("Result payload ") + logger.info(json.dumps(result, indent=2)) + logger.info( f"Successfully processed submission: {data.get('submission_id')}" ) From 2b7ca67e60b9a3fd58d60a4f4a61066d8237ff15 Mon Sep 17 00:00:00 2001 From: manua-glitch Date: Fri, 2 Jan 2026 10:32:25 +0530 Subject: [PATCH 3/7] Delete .env.prod --- .env.prod | 119 ------------------------------------------------------ 1 file changed, 119 deletions(-) delete mode 100644 .env.prod diff --git a/.env.prod 
b/.env.prod deleted file mode 100644 index 101b48d..0000000 --- a/.env.prod +++ /dev/null @@ -1,119 +0,0 @@ -# RABBITMQ CONFIGURATION -RABBITMQ_HOST=armadillo.rmq.cloudamqp.com -RABBITMQ_PORT=5672 -RABBITMQ_USER=fzdqidte -RABBITMQ_PASS=0SMrDogBVcWUcu9brWwp2QhET_kArl59 -RABBITMQ_VHOST=fzdqidte -RABBITMQ_MANAGEMENT_PORT=15672 -RABBITMQ_PREFETCH_COUNT=1 - -# Message retry configuration -# Maximum number of retries before sending to DLQ (prevents poison messages) -MAX_RETRIES=3 - -# Queue Names -SUBMISSION_QUEUE=plagiarism_submissions -FEEDBACK_QUEUE=plagiarism_feedback -# Dead Letter Queue (optional - leave empty to disable) -DEAD_LETTER_QUEUE=plagiarism_failed_submissions - -# POSTGRESQL CONFIGURATION -POSTGRES_HOST=db.example.com -POSTGRES_PORT=5432 -POSTGRES_DB=plagiarism_db -POSTGRES_USER=postgres -POSTGRES_PASSWORD=postgres - -# PGADMIN CONFIGURATION (Optional - for development only) -PGADMIN_EMAIL=admin@admin.com -PGADMIN_PASSWORD=admin123 - -# Connection Pool -POSTGRES_POOL_SIZE=10 -POSTGRES_MAX_OVERFLOW=20 - -# PLAGIARISM DETECTION THRESHOLDS -EXACT_DUPLICATE_THRESHOLD=0.95 -NEAR_DUPLICATE_THRESHOLD=0.90 -SEMANTIC_MATCH_THRESHOLD=0.80 - -# ==== PRODUCTION: Uncomment below for 7-day window ==== -RESUBMISSION_WINDOW_DAYS=14 - -# Hash comparison threshold (Hamming distance) -HASH_MATCH_THRESHOLD=10 - -# IMAGE PROCESSING -# Maximum image size in MB -MAX_IMAGE_SIZE_MB=10 - -# Image download timeout in seconds -IMAGE_DOWNLOAD_TIMEOUT=30 - -# Image validation thresholds -# Min variance to detect blank images (lower = more strict) -IMAGE_MIN_VARIANCE=5.0 -# Min unique colors required -IMAGE_MIN_UNIQUE_COLORS=10 -# Max ratio of dominant color (higher = more permissive) -IMAGE_MAX_SOLID_COLOR_RATIO=0.95 - -# CLIP Model Configuration -CLIP_MODEL=ViT-L/14 -CLIP_DEVICE=cpu -CLIP_PRETRAINED=laion2B-s32B-b82K - -# Local Model Path (Optional - use pre-downloaded models) -# If set, the system will load the model from this path instead of downloading from HuggingFace -# 
Example: CLIP_LOCAL_MODEL_PATH=./models/clip/open_clip_pytorch_model.bin -# Download models from: https://huggingface.co/laion/CLIP-ViT-L-14-laion2B-s32B-b82K -CLIP_LOCAL_MODEL_PATH=/app/models/clip/open_clip_pytorch_model.bin - -# Disable SSL verification for HuggingFace downloads (for corporate proxy/self-signed certs) -# Set to "true" only if you encounter SSL certificate errors -DISABLE_SSL_VERIFY=true -PYTHONHTTPSVERIFY=0 - -# VECTOR SEARCH CONFIGURATION -# Use pgvector (PostgreSQL) or FAISS for vector similarity search -USE_PGVECTOR=true - -# FAISS Configuration -FAISS_INDEX_PATH=/app/data/faiss_index.bin -FAISS_METADATA_PATH=/app/data/faiss_metadata.json -FAISS_DIMENSION=768 -FAISS_TOP_K=4 # Number of top candidates to retrieve from FAISS search - -# STORAGE PATHS -# Reference images directory -REFERENCE_IMAGES_DIR=./data/reference_images - -# Temporary storage for downloaded submissions -TEMP_IMAGES_DIR=./data/temp_images - -# Logs directory -#LOGS_DIR=./logs - -# APPLICATION SETTINGS -LOG_LEVEL=INFO - -# Worker concurrency (number of threads) -#WORKER_THREADS=4 - -# Enable performance metrics(ignore) -#ENABLE_METRICS=true - -# DEVELOPMENT SETTINGS -# Set to "development" or "production" -#ENVIRONMENT=development - -#DEBUG=true - - - -# Mock Glific API (for testing without WhatsApp) -#Used to skip WhatsApp delivery in testing mode -MOCK_GLIFIC=true -# ==== TESTING: 2-minute resubmission window (comment out for production) ==== -RESUBMISSION_WINDOW_MINUTES=2 - From 598574cf710e6fca8978318d6d914f99ec6092f5 Mon Sep 17 00:00:00 2001 From: Manu Date: Fri, 2 Jan 2026 11:11:54 +0530 Subject: [PATCH 4/7] podman-compose changes --- docker-compose-prod.yml | 57 ++++- docs/DOCUMENTATION.md | 6 +- requirements.txt | 61 +++++- start-dev-env.ps1 | 4 +- start-dev-env.sh | 4 +- start-prod-env.sh | 445 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 554 insertions(+), 23 deletions(-) create mode 100755 start-prod-env.sh diff --git a/docker-compose-prod.yml 
b/docker-compose-prod.yml index 89a21cb..3e2e144 100644 --- a/docker-compose-prod.yml +++ b/docker-compose-prod.yml @@ -1,6 +1,42 @@ version: '3.8' services: + # =================================== + # POSTGRESQL - Database + # =================================== + postgres: + image: pgvector/pgvector:pg16 + container_name: plg-postgres + ports: + - "5432:5432" + environment: + POSTGRES_DB: ${POSTGRES_DB} + POSTGRES_USER: ${POSTGRES_USER} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} + POSTGRES_INITDB_ARGS: "-E UTF8" + POSTGRES_MAX_CONNECTIONS: 20 + PGDATA: /var/lib/postgresql/data/pgdata + volumes: + - postgres_data:/var/lib/postgresql/data + - ./database/init.sql:/docker-entrypoint-initdb.d/init.sql + healthcheck: + test: ["CMD-SHELL", "pg_isready -U $$POSTGRES_USER -d $$POSTGRES_DB"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + deploy: + resources: + limits: + cpus: '0.5' + memory: 512M + reservations: + cpus: '0.25' + memory: 256M + networks: + - plg-network + restart: unless-stopped + # =================================== # PLAGIARISM CHECKER SERVICE # =================================== @@ -8,32 +44,37 @@ services: build: context: . 
dockerfile: Dockerfile - container_name: mentorme-plagiarism-checker + container_name: plg-checker env_file: - .env volumes: - ./data:/app/data - ./logs:/app/logs depends_on: - rabbitmq: - condition: service_healthy postgres: condition: service_healthy deploy: resources: limits: - cpus: '1.0' - memory: 4G + cpus: '8.0' # Increased from 4.0 - allows up to 8 CPU cores + memory: 8G reservations: - cpus: '0.5' + cpus: '2' # Increased from 1 - guarantees 2 cores minimum memory: 2G restart: unless-stopped networks: - - mentorme-plagiarism-network + - plg-network + +# =================================== +# VOLUMES +# =================================== +volumes: + postgres_data: + driver: local # =================================== # NETWORKS # =================================== networks: - mentorme-plagiarism-network: + plg-network: driver: bridge diff --git a/docs/DOCUMENTATION.md b/docs/DOCUMENTATION.md index 9bae9d4..9747212 100644 --- a/docs/DOCUMENTATION.md +++ b/docs/DOCUMENTATION.md @@ -776,11 +776,11 @@ asyncio.run(test()) #### Build Docker Image ```bash # Standard build (model downloaded on first run) -docker build -t mentorme-plagiarism:latest . +docker build -t plg:latest . # With HuggingFace token for model prefetch during build (optional) # This pre-downloads the CLIP model into the Docker image -docker build -t mentorme-plagiarism:latest \ +docker build -t plg:latest \ --build-arg HUGGINGFACE_HUB_TOKEN=your_token_here . 
# Note: HuggingFace token is optional - public models can be downloaded without authentication @@ -804,7 +804,7 @@ docker-compose down # docker-compose.yml snippet services: worker: - image: mentorme-plagiarism:latest + image: plg:latest environment: - POSTGRES_HOST=postgres - RABBITMQ_HOST=rabbitmq diff --git a/requirements.txt b/requirements.txt index 37d4505..4e590f1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,28 +8,73 @@ pgvector==0.4.1 # HTTP Client aiohttp==3.13.2 +aiohappyeyeballs==2.6.1 +aiosignal==1.4.0 requests==2.32.3 # Configuration Management (Pydantic v2 compatible) -pydantic<2.0.0,>=1.10.0 +pydantic==1.10.24 python-dotenv==1.2.1 # Image Processing pillow==12.0.0 -imagehash==4.3.2 +ImageHash==4.3.2 # Machine Learning - CLIP & FAISS torch==2.9.0 torchvision==0.24.0 -open-clip-torch==3.2.0 +open_clip_torch==3.2.0 faiss-cpu==1.12.0 numpy==2.3.4 +timm==1.0.22 +safetensors==0.7.0 +huggingface_hub==1.2.1 +filelock==3.20.0 +fsspec==2025.12.0 +ftfy==6.3.1 +regex==2025.11.3 +scipy==1.16.3 +mpmath==1.3.0 +sympy==1.14.0 +networkx==3.6 +PyWavelets==1.9.0 + +# Web Framework - FastAPI +fastapi==0.115.0 +uvicorn==0.34.0 +starlette==0.38.6 +anyio==4.12.0 +h11==0.16.0 +httptools==0.7.1 +uvloop==0.22.1 +watchfiles==1.1.1 +websockets==15.0.1 +click==8.3.1 +typer-slim==0.20.0 +shellingham==1.5.4 +httpcore==1.0.9 +httpx==0.28.1 +hf-xet==1.2.0 # Utilities tqdm==4.67.1 - podman-compose==1.5.0 - -# Web Framework - FastAPI -fastapi==0.115.0 -uvicorn[standard]==0.34.0 \ No newline at end of file +annotated-doc==0.0.4 +attrs==25.4.0 +certifi==2025.11.12 +charset-normalizer==3.4.4 +frozenlist==1.8.0 +idna==3.11 +Jinja2==3.1.6 +MarkupSafe==3.0.3 +multidict==6.7.0 +packaging==25.0 +pamqp==3.3.0 +propcache==0.4.1 +PyYAML==6.0.3 +setuptools==80.9.0 +typing_extensions==4.15.0 +urllib3==2.6.0 +wcwidth==0.2.14 +wheel==0.45.1 +yarl==1.22.0 diff --git a/start-dev-env.ps1 b/start-dev-env.ps1 index 2186b0d..d59d24d 100644 --- a/start-dev-env.ps1 +++ b/start-dev-env.ps1 @@ 
-80,8 +80,8 @@ if (Test-Path ".env") { } # Configuration (with defaults from environment or hardcoded) -$POSTGRES_CONTAINER = "mentorme-plagiarism-postgres" -$RABBITMQ_CONTAINER = "mentorme-plagiarism-rabbitmq" +$POSTGRES_CONTAINER = "plg-postgres" +$RABBITMQ_CONTAINER = "plg-rabbitmq" $POSTGRES_PORT = if ($env:POSTGRES_PORT) { $env:POSTGRES_PORT } else { 5432 } $RABBITMQ_PORT = if ($env:RABBITMQ_PORT) { $env:RABBITMQ_PORT } else { 5672 } $RABBITMQ_MGMT_PORT = if ($env:RABBITMQ_MANAGEMENT_PORT) { $env:RABBITMQ_MANAGEMENT_PORT } else { 15672 } diff --git a/start-dev-env.sh b/start-dev-env.sh index 15a9729..8878a03 100755 --- a/start-dev-env.sh +++ b/start-dev-env.sh @@ -123,8 +123,8 @@ if [ -f ".env" ]; then fi # Configuration (with defaults) -POSTGRES_CONTAINER="mentorme-plagiarism-postgres" -RABBITMQ_CONTAINER="mentorme-plagiarism-rabbitmq" +POSTGRES_CONTAINER="plg-postgres" +RABBITMQ_CONTAINER="plg-rabbitmq" POSTGRES_PORT="${POSTGRES_PORT:-5432}" RABBITMQ_PORT="${RABBITMQ_PORT:-5672}" RABBITMQ_MGMT_PORT="${RABBITMQ_MANAGEMENT_PORT:-15672}" diff --git a/start-prod-env.sh b/start-prod-env.sh new file mode 100755 index 0000000..8f92a20 --- /dev/null +++ b/start-prod-env.sh @@ -0,0 +1,445 @@ +#!/usr/bin/env bash +# Local Development Startup Script +# Starts PostgreSQL and RabbitMQ containers using docker-compose + +set -e +FULL_SETUP=0 +START_API=0 +COMPOSE_FILE="docker-compose-prod.yml" + +while [[ "$#" -gt 0 ]]; do + case "$1" in + --full-setup) FULL_SETUP=1; shift ;; + --with-api) START_API=1; shift ;; + --prod) COMPOSE_FILE="docker-compose-prod.yml"; shift ;; + --dev) COMPOSE_FILE="docker-compose-dev.yml"; shift ;; + *) break ;; + esac +done + +# Detect platform +OS_TYPE="unknown" +UNAME_OUT=$(uname -s 2>/dev/null || true) +case "${UNAME_OUT}" in + MINGW*|MSYS*|CYGWIN*) OS_TYPE="windows-msys" ;; + Darwin) OS_TYPE="macos" ;; + Linux) OS_TYPE="linux" ;; + *) OS_TYPE="unix" ;; +esac + +# Detect Python executable +PY="" +if [ "${OS_TYPE}" = "windows-msys" ]; then + 
PY_CANDIDATES=("py" "python3" "python" "python.exe") +else + PY_CANDIDATES=("python3" "python" "py" "python.exe") +fi + +for candidate in "${PY_CANDIDATES[@]}"; do + if [ "$candidate" = "py" ]; then + if command -v py >/dev/null 2>&1; then + if py -3 -c "import sys; sys.stdout.write('Python found!!\n')" 2>/dev/null; then + PY='py -3' + break + fi + fi + continue + fi + + candidate_path=$(command -v "$candidate" 2>/dev/null || true) + if [ -n "$candidate_path" ]; then + case "$candidate_path" in + *WindowsApps*|*windowsapps*) continue ;; + esac + + if "$candidate" -c "import sys; sys.stdout.write('ok')" 2>/dev/null; then + PY="$candidate" + break + fi + fi +done + +if [ -z "$PY" ]; then + echo "ERROR: No Python 3.10+ found. Install Python or ensure it's in PATH." >&2 + exit 1 +fi + +# Detect container runtime (Podman or Docker) +CONTAINER_CMD="" +COMPOSE_CMD="" +if command -v podman &> /dev/null; then + CONTAINER_CMD="podman" + if command -v podman-compose &> /dev/null; then + COMPOSE_CMD="podman-compose" + else + echo "ERROR: podman-compose not found. Install it:" >&2 + echo " pip install podman-compose" >&2 + exit 1 + fi +elif command -v docker &> /dev/null; then + CONTAINER_CMD="docker" + if command -v docker-compose &> /dev/null; then + COMPOSE_CMD="docker-compose" + elif docker compose version &> /dev/null; then + COMPOSE_CMD="docker compose" + else + echo "ERROR: docker-compose not found. Install it:" >&2 + echo " https://docs.docker.com/compose/install/" >&2 + exit 1 + fi +else + echo "ERROR: Neither Podman nor Docker found. 
Install one of them:" >&2 + echo " Podman: https://podman.io/getting-started/installation" >&2 + echo " Docker: https://docs.docker.com/get-docker/" >&2 + exit 1 +fi + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +GRAY='\033[0;37m' +NC='\033[0m' + +echo -e "${CYAN}==================================================================" +echo -e " Plagiarism Checker - PROD Development Setup" +echo -e " Container Runtime: ${CONTAINER_CMD}" +echo -e " Compose File: ${COMPOSE_FILE}" +echo -e "==================================================================${NC}" +echo "" + +# Validate compose file exists +if [ ! -f "$COMPOSE_FILE" ]; then + echo -e "${RED}ERROR: $COMPOSE_FILE not found${NC}" + exit 1 +fi + +# Load configuration from .env if it exists +if [ -f ".env" ]; then + set -a + source <(grep -v '^#' .env | grep -v '^$' | sed 's/\r$//') + set +a +fi + +# Configuration (with defaults) +POSTGRES_CONTAINER="plg-postgres" +POSTGRES_PORT="${POSTGRES_PORT:-5432}" +POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-postgres}" +POSTGRES_DB="${POSTGRES_DB:-plagiarism_db}" +POSTGRES_USER="${POSTGRES_USER:-postgres}" +CLIP_MODEL_URL="${CLIP_MODEL_URL:-https://huggingface.co/laion/CLIP-ViT-L-14-laion2B-s32B-b82K/resolve/main/open_clip_pytorch_model.bin}" + +stop_existing_containers() { + local containers_exist=false + + # Check if compose stack is running + if $COMPOSE_CMD -f $COMPOSE_FILE ps 2>/dev/null | grep -q "Up\|running"; then + containers_exist=true + fi + + if [ "$containers_exist" = true ]; then + echo -e "${YELLOW}Existing containers found:${NC}" + $COMPOSE_CMD -f $COMPOSE_FILE ps + echo "" + echo -e "${YELLOW}This will stop and remove existing containers.${NC}" + read -p "Continue? (y/N): " -n 1 -r + echo + + if [[ ! 
$REPLY =~ ^[Yy]$ ]]; then + echo -e "${RED}Aborted by user${NC}" + # exit 0 + else + echo -e "${CYAN}Stopping existing containers...${NC}" + $COMPOSE_CMD -f $COMPOSE_FILE down + fi + fi + + echo -e "${GREEN}[OK] Ready to start containers${NC}" +} + +start_containers() { + echo -e "${CYAN}Starting containers with $COMPOSE_FILE...${NC}" + + # Determine which services to start + #local services="postgres rabbitmq pgadmin plagiarism-checker" + local services="postgres plagiarism-checker" + + if [ "$START_API" -eq 1 ]; then + services="$services api" + echo -e "${CYAN}Including API service${NC}" + + # Force rebuild API container to ensure it uses Dockerfile.api (not Dockerfile) + echo -e "${YELLOW}Rebuilding API container with Dockerfile.api...${NC}" + $COMPOSE_CMD -f $COMPOSE_FILE build --no-cache api + + if [ $? -ne 0 ]; then + echo -e "${RED}ERROR: Failed to build API container${NC}" + exit 1 + fi + echo -e "${GREEN}[OK] API container rebuilt${NC}" + fi + + $COMPOSE_CMD -f $COMPOSE_FILE up -d $services + + if [ $? 
-ne 0 ]; then + echo -e "${RED}ERROR: Failed to start containers${NC}" + exit 1 + fi + + echo -e "${GREEN}[OK] Containers started${NC}" +} + +wait_for_postgres() { + echo -e "${YELLOW}Waiting for PostgreSQL...${NC}" + + local max_attempts=30 + local attempt=0 + + while [ $attempt -lt $max_attempts ]; do + attempt=$((attempt + 1)) + + if $CONTAINER_CMD exec $POSTGRES_CONTAINER pg_isready -U $POSTGRES_USER &>/dev/null; then + echo -e "${GREEN}[OK] PostgreSQL ready${NC}" + return 0 + fi + + sleep 2 + done + + echo -e "${RED}ERROR: PostgreSQL timeout${NC}" + exit 1 +} + + +wait_for_api() { + if [ "$START_API" -ne 1 ]; then + return 0 + fi + + echo -e "${YELLOW}Waiting for API service...${NC}" + + local max_attempts=30 + local attempt=0 + + while [ $attempt -lt $max_attempts ]; do + attempt=$((attempt + 1)) + + if curl -s -f http://localhost:8000/health &>/dev/null; then + echo -e "${GREEN}[OK] API service ready${NC}" + return 0 + fi + + sleep 2 + done + + echo -e "${RED}ERROR: API service timeout${NC}" + echo -e "${YELLOW}Checking API container logs:${NC}" + $COMPOSE_CMD -f $COMPOSE_FILE logs --tail=50 api + exit 1 +} + +initialize_database() { + if [ ! 
-f "database/init.sql" ]; then + echo -e "${YELLOW}WARNING: database/init.sql not found${NC}" + return + fi + + echo -e "${CYAN}Initializing database...${NC}" + + # Run init.sql + if $CONTAINER_CMD exec -i $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB < database/init.sql 2>/dev/null; then + echo -e "${GREEN}[OK] Database schema initialized${NC}" + else + echo -e "${GRAY}Database schema already exists${NC}" + fi + + # Create migrations tracking table if it doesn't exist + $CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -c " + CREATE TABLE IF NOT EXISTS schema_migrations ( + id SERIAL PRIMARY KEY, + migration_name VARCHAR(255) UNIQUE NOT NULL, + applied_at TIMESTAMP DEFAULT NOW() + ); + " 2>/dev/null + + # Run migration scripts + if [ -d "database/migrations" ]; then + local migration_count=0 + for migration_file in database/migrations/*.sql; do + if [ -f "$migration_file" ]; then + local migration_name=$(basename "$migration_file") + + # Check if migration already applied + local already_applied=$($CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -t -c " + SELECT COUNT(*) FROM schema_migrations WHERE migration_name = '$migration_name'; + " 2>/dev/null | tr -d '[:space:]') + + if [ "$already_applied" = "0" ]; then + echo -e "${CYAN}Applying migration: $migration_name${NC}" + + if $CONTAINER_CMD exec -i $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB < "$migration_file" 2>/dev/null; then + # Record migration as applied + $CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -c " + INSERT INTO schema_migrations (migration_name) VALUES ('$migration_name'); + " 2>/dev/null + echo -e "${GREEN}[OK] Applied: $migration_name${NC}" + migration_count=$((migration_count + 1)) + else + echo -e "${YELLOW}WARNING: Failed to apply $migration_name${NC}" + fi + fi + fi + done + + if [ $migration_count -eq 0 ]; then + echo -e "${GRAY}All migrations already applied${NC}" + else 
+ echo -e "${GREEN}[OK] Applied $migration_count migration(s)${NC}" + fi + fi +} + +create_env_file() { + if [ -f ".env" ]; then + echo -e "${GREEN}[OK] .env exists${NC}" + return + fi + + if [ ! -f ".env.example" ]; then + echo -e "${RED}ERROR: .env.example not found${NC}" + exit 1 + fi + + echo -e "${CYAN}Creating .env from template...${NC}" + cp .env.example .env + + # Keep service names for docker-compose (containers communicate via service names) + # No transformation needed - .env.example already has correct service names + + echo -e "${GREEN}[OK] .env created${NC}" +} + +show_summary() { + echo -e "\n${CYAN}==================================================================" + echo -e " Environment Ready!" + echo -e "==================================================================${NC}" + echo "" + echo -e "${GREEN}✓ Services Running:${NC}" + echo -e " PostgreSQL: localhost:$POSTGRES_PORT (with pgvector)" + + if [ "$START_API" -eq 1 ]; then + echo -e " API: http://localhost:8000" + echo -e " API Docs: http://localhost:8000/docs" + fi + + echo "" + echo -e "${CYAN}💡 Quick Start:${NC}" + echo -e " ${YELLOW}./start-dev-env.sh --full-setup${NC} ${GRAY}# Development mode (default)${NC}" + echo -e " ${YELLOW}./start-dev-env.sh --with-api${NC} ${GRAY}# Include API container${NC}" + echo -e " ${YELLOW}./start-dev-env.sh --prod${NC} ${GRAY}# Production mode${NC}" + echo "" + echo -e "${CYAN}📋 Manual Setup:${NC}" + echo -e " 1. ${YELLOW}${PY} -m venv venv${NC}" + + case "${OS_TYPE}" in + windows-msys) echo -e " ${YELLOW}source venv/Scripts/activate${NC}" ;; + *) echo -e " ${YELLOW}source venv/bin/activate${NC}" ;; + esac + + echo -e " 2. ${YELLOW}${PY} -m pip install -r requirements.txt${NC}" + echo -e " 3. 
${YELLOW}${PY} app.py${NC}" + echo "" + echo -e "${CYAN}🔧 Container Commands:${NC}" + echo -e " ${GRAY}Logs: $COMPOSE_CMD -f $COMPOSE_FILE logs -f${NC}" + echo -e " ${GRAY}Stop: $COMPOSE_CMD -f $COMPOSE_FILE stop${NC}" + echo -e " ${GRAY}Remove: $COMPOSE_CMD -f $COMPOSE_FILE down${NC}" + echo "" + echo -e "${CYAN}==================================================================${NC}" +} + +main() { + create_env_file + stop_existing_containers + start_containers + wait_for_postgres + wait_for_rabbitmq + wait_for_api + initialize_database + show_summary + echo -e "\n${GRAY}Containers running in background. Press Ctrl+C to exit this script.${NC}" +} + +setup_python_environment() { + echo -e "\n${CYAN}==================================================================" + echo -e " Full Setup: Python Environment" + echo -e "==================================================================${NC}" + + if [ -d "venv" ]; then + echo -e "${GRAY}Virtual environment exists${NC}" + else + echo -e "${CYAN}Creating virtual environment...${NC}" + $PY -m venv venv + echo -e "${GREEN}[OK] venv created${NC}" + fi + + echo -e "${CYAN}Activating virtual environment...${NC}" + if [ "${OS_TYPE}" = "windows-msys" ]; then + source venv/Scripts/activate + else + source venv/bin/activate + fi + + echo -e "${CYAN}Installing dependencies (5-10 minutes)...${NC}" + python -m pip install --upgrade pip setuptools wheel + python -m pip install -r requirements.txt + echo -e "${GREEN}[OK] Dependencies installed${NC}" + + echo -e "${CYAN}Creating directories...${NC}" + mkdir -p data/reference_images data/models/clip logs + echo -e "${GREEN}[OK] Directories created${NC}" + + # Download CLIP model using curl + echo -e "${CYAN}Checking CLIP model...${NC}" + if [ ! 
-f "data/models/clip/open_clip_pytorch_model.bin" ]; then + echo -e "${YELLOW}Downloading CLIP model (this may take a while)...${NC}" + curl -L -o data/models/clip/open_clip_pytorch_model.bin \ + ${CLIP_MODEL_URL} || { + echo -e "${YELLOW}WARNING: CLIP model download failed, will download on first run${NC}" + } + + if [ -f "data/models/clip/open_clip_pytorch_model.bin" ]; then + echo -e "${GREEN}[OK] CLIP model downloaded${NC}" + fi + else + echo -e "${GRAY}CLIP model already exists${NC}" + fi + + echo -e "${CYAN}Verifying environment...${NC}" + python -c " +import open_clip, asyncpg, aio_pika, PIL, imagehash +print('✓ All imports successful') +" || { + echo -e "${RED}ERROR: Environment verification failed${NC}" + exit 1 + } + + echo -e "\n${CYAN}==================================================================" + echo -e " Setup Complete!" + echo -e "==================================================================${NC}" + echo -e "\n${GREEN}✓ Next Steps:${NC}" + echo -e "\n${CYAN}Terminal 1 - Worker:${NC}" + [ "${OS_TYPE}" = "windows-msys" ] && echo -e " ${YELLOW}source venv/Scripts/activate${NC}" || echo -e " ${YELLOW}source venv/bin/activate${NC}" + echo -e " ${YELLOW}python app.py${NC}" + echo -e "\n${CYAN}Terminal 2 - API:${NC}" + [ "${OS_TYPE}" = "windows-msys" ] && echo -e " ${YELLOW}source venv/Scripts/activate${NC}" || echo -e " ${YELLOW}source venv/bin/activate${NC}" + echo -e " ${YELLOW}cd api && uvicorn api:app --reload --host 0.0.0.0 --port 8000${NC}" + echo -e "\n${CYAN}API Docs:${NC} ${YELLOW}http://localhost:8000/docs${NC}" + echo "" +} + +main + +if [ "$FULL_SETUP" -eq 1 ]; then + setup_python_environment +fi From 578cc5dcadc385596227e0316b5e1fad9a2ce3c0 Mon Sep 17 00:00:00 2001 From: Manu Date: Fri, 9 Jan 2026 19:45:45 +0530 Subject: [PATCH 5/7] Reference image --- .gitignore | 1 + Dockerfile | 21 +++---- Dockerfile.api | 20 +++--- database/db_manager.py | 10 +-- docker-compose-prod.yml | 29 +++++++++ image_worker/worker.py | 11 ++-- 
mq/rmq_client.py | 5 ++ scripts/docker-postgres.yml | 53 ++++++++++++++++ scripts/postgres_setup.sh | 118 ++++++++++++++++++++++++++++++++++++ start-prod-env.sh | 5 +- 10 files changed, 236 insertions(+), 37 deletions(-) create mode 100644 scripts/docker-postgres.yml create mode 100644 scripts/postgres_setup.sh diff --git a/.gitignore b/.gitignore index 897c762..cf0db6d 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ build/ # Environment variables .env .env.local +.env.prod # Logs logs/ diff --git a/Dockerfile b/Dockerfile index 5426394..37475e8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,6 +5,7 @@ WORKDIR /app # Install build dependencies in a single layer RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ + vim \ gcc \ g++ \ git \ @@ -18,10 +19,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # Copy only requirements first for better caching COPY requirements.txt . -# Use pip cache and install in parallel -RUN --mount=type=cache,target=/root/.cache/pip \ - python -m pip install --upgrade pip setuptools wheel && \ - pip install -r requirements.txt --user --no-warn-script-location +# Install to explicit location +RUN python -m pip install --no-cache-dir --prefix=/install --upgrade pip setuptools wheel && \ + pip install --no-cache-dir --no-deps --prefix=/install -r requirements.txt + +RUN pip install --prefix=/install -r requirements.txt --no-cache-dir # ============================================ # Final stage - minimal runtime image @@ -37,18 +39,15 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ && rm -rf /var/lib/apt/lists/* # Copy installed packages from builder -COPY --from=builder /root/.local /root/.local - -ENV PATH=/root/.local/bin:$PATH +COPY --from=builder /install /usr/local # Create necessary directories RUN mkdir -p /app/data /app/logs /root/.cache/clip +RUN ls + # Copy application code (do this last for better caching) COPY . . 
-# Lightweight healthcheck -HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ - CMD python -c "import sys; sys.exit(0)" || exit 1 -CMD ["python", "app.py"] +CMD ["python", "app.py"] \ No newline at end of file diff --git a/Dockerfile.api b/Dockerfile.api index 01fcc66..d0b5c03 100644 --- a/Dockerfile.api +++ b/Dockerfile.api @@ -1,4 +1,4 @@ -FROM python:3.13-slim +FROM python:3.13-slim as builder WORKDIR /app @@ -7,9 +7,18 @@ RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf # Copy requirements and install dependencies COPY api/requirements.txt /app/api/requirements.txt -RUN --mount=type=cache,target=/root/.cache/pip \ - python -m pip install --upgrade pip && \ - pip install -r /app/api/requirements.txt +RUN python -m pip install --prefix=/install -r --upgrade pip && \ + pip install --prefix=/install -r /app/api/requirements.txt --no-cache-dir + +# ============================================ +# Final stage - minimal runtime image +# ============================================ +FROM python:3.13-slim + +WORKDIR /app + +# Copy installed packages from builder +COPY --from=builder /install /usr/local # Copy application code COPY api/ /app/api/ @@ -18,9 +27,6 @@ COPY utils/ /app/utils/ # Expose API port EXPOSE 8000 -# Healthcheck (check if uvicorn is responding) -HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \ - CMD curl -f http://localhost:8000/ || exit 1 # Run the API CMD ["uvicorn", "api.api:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/database/db_manager.py b/database/db_manager.py index 6db7936..d77b70a 100644 --- a/database/db_manager.py +++ b/database/db_manager.py @@ -69,14 +69,6 @@ async def init_pool(self): db_port = int(os.getenv("POSTGRES_PORT") or os.getenv("DB_PORT", "5432")) # db_port = 5435 # TEMP OVERRIDE FOR TESTING - #print the db connection details for debugging - # logger.info("###################") - # logger.info(f"DB Host: {db_host}") - # 
logger.info(f"DB Port: {db_port}") - # logger.info(f"DB Name: {db_name}") - # logger.info(f"DB User: {db_user}") - # logger.info(f"DB db_password: {db_password}") - # logger.info("###################") if not all([db_user, db_password, db_name]): @@ -525,7 +517,7 @@ async def fetch_reference_images_by_id(self, reference_id): image_path = await self._fetch( """ SELECT image_path - FROM reference_images where id = $1; + FROM reference_images where reference_id = $1; """, reference_id, ) diff --git a/docker-compose-prod.yml b/docker-compose-prod.yml index 3e2e144..0f28bd6 100644 --- a/docker-compose-prod.yml +++ b/docker-compose-prod.yml @@ -65,6 +65,35 @@ services: networks: - plg-network + # =================================== + # API SERVICE + # =================================== + api: + build: + context: . + dockerfile: Dockerfile.api + container_name: plg-api + env_file: + - .env + volumes: + - ./data:/app/data + - ./logs:/app/logs + depends_on: + postgres: + condition: service_healthy + deploy: + resources: + limits: + cpus: '8.0' # Increased from 4.0 - allows up to 8 CPU cores + memory: 8G + reservations: + cpus: '2' # Increased from 1 - guarantees 2 cores minimum + memory: 2G + restart: unless-stopped + networks: + - plg-network + + # =================================== # VOLUMES # =================================== diff --git a/image_worker/worker.py b/image_worker/worker.py index 012d2e9..a54c630 100644 --- a/image_worker/worker.py +++ b/image_worker/worker.py @@ -340,9 +340,6 @@ async def check_assignment_reference_hash_match( best_comparison = comparison if best_match and best_comparison: - print("#"*70) - print("Best match found:",best_match["name"]) - print("#"*70) logger.info("Assignment reference match found") similarity = 1 - (best_score / 64.0) return ( @@ -445,10 +442,10 @@ async def check_clip_match( if not results: return None, 0.0, None - print("#"*70) - for ref_id, sim, meta in results: - print(f" Ref ID: {ref_id}, Similarity: {sim:.4f}, 
Meta: {meta}") - print("#"*70) + # print("#"*70) + # for ref_id, sim, meta in results: + # print(f" Ref ID: {ref_id}, Similarity: {sim:.4f}, Meta: {meta}") + # print("#"*70) matches = [ (ref_id, sim, meta) diff --git a/mq/rmq_client.py b/mq/rmq_client.py index 6571c0b..a0f0786 100644 --- a/mq/rmq_client.py +++ b/mq/rmq_client.py @@ -102,6 +102,11 @@ async def connect(self): self.DEAD_LETTER_QUEUE, durable=True ) logger.info(f"Dead Letter Queue declared: {self.DEAD_LETTER_QUEUE}") + self.submission_queue = await self.channel.declare_queue( + self.SUBMISSION_QUEUE, + durable=True + ) + logger.info(f"Submission queue created: {self.SUBMISSION_QUEUE}") try: # First try passive declaration to check if queue exists diff --git a/scripts/docker-postgres.yml b/scripts/docker-postgres.yml new file mode 100644 index 0000000..f6dac63 --- /dev/null +++ b/scripts/docker-postgres.yml @@ -0,0 +1,53 @@ +version: '3.8' + +services: + # =================================== + # POSTGRESQL - Database + # =================================== + postgres: + image: pgvector/pgvector:pg16 + container_name: plg-postgres + ports: + - "5432:5432" + environment: + POSTGRES_DB: plagiarism_db + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_INITDB_ARGS: "-E UTF8" + POSTGRES_MAX_CONNECTIONS: 20 + PGDATA: /var/lib/postgresql/data/pgdata + volumes: + - postgres_data:/var/lib/postgresql/data + - ./database/init.sql:/docker-entrypoint-initdb.d/init.sql + healthcheck: + test: ["CMD-SHELL", "pg_isready -U $$POSTGRES_USER -d $$POSTGRES_DB"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + deploy: + resources: + limits: + cpus: '0.5' + memory: 512M + reservations: + cpus: '0.25' + memory: 256M + networks: + - plg-network + restart: always + + +# =================================== +# VOLUMES +# =================================== +volumes: + postgres_data: + driver: local + +# =================================== +# NETWORKS +# =================================== +networks: 
+ plg-network: + driver: bridge diff --git a/scripts/postgres_setup.sh b/scripts/postgres_setup.sh new file mode 100644 index 0000000..92396b9 --- /dev/null +++ b/scripts/postgres_setup.sh @@ -0,0 +1,118 @@ +COMPOSE_FILE="docker-postgres.yml" +COMPOSE_CMD="podman-compose"; CONTAINER_CMD="podman" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +GRAY='\033[0;37m' +NC='\033[0m' + + +# Load configuration from .env if it exists +if [ -f ".env" ]; then + set -a + source <(grep -v '^#' .env | grep -v '^$' | sed 's/\r$//') + set +a +fi + +# Configuration (with defaults) +POSTGRES_CONTAINER="plg-postgres" +POSTGRES_PORT="${POSTGRES_PORT:-5432}" +POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-postgres}" +POSTGRES_DB="${POSTGRES_DB:-plagiarism_db}" +POSTGRES_USER="${POSTGRES_USER:-postgres}" + +$COMPOSE_CMD -f $COMPOSE_FILE up -d + +wait_for_postgres() { + echo -e "${YELLOW}Waiting for PostgreSQL...${NC}" + + local max_attempts=30 + local attempt=0 + + while [ $attempt -lt $max_attempts ]; do + attempt=$((attempt + 1)) + + if $CONTAINER_CMD exec $POSTGRES_CONTAINER pg_isready -U $POSTGRES_USER &>/dev/null; then + echo -e "${GREEN}[OK] PostgreSQL ready${NC}" + return 0 + fi + + sleep 2 + done + + echo -e "${RED}ERROR: PostgreSQL timeout${NC}" + exit 1 +} + +initialize_database() { + if [ ! 
-f "database/init.sql" ]; then + echo -e "${YELLOW}WARNING: database/init.sql not found${NC}" + return + fi + + echo -e "${CYAN}Initializing database...${NC}" + + # Run init.sql + if $CONTAINER_CMD exec -i $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB < database/init.sql 2>/dev/null; then + echo -e "${GREEN}[OK] Database schema initialized${NC}" + else + echo -e "${GRAY}Database schema already exists${NC}" + fi + + # Create migrations tracking table if it doesn't exist + $CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -c " + CREATE TABLE IF NOT EXISTS schema_migrations ( + id SERIAL PRIMARY KEY, + migration_name VARCHAR(255) UNIQUE NOT NULL, + applied_at TIMESTAMP DEFAULT NOW() + ); + " 2>/dev/null + + # Run migration scripts + if [ -d "database/migrations" ]; then + local migration_count=0 + for migration_file in database/migrations/*.sql; do + if [ -f "$migration_file" ]; then + local migration_name=$(basename "$migration_file") + + # Check if migration already applied + local already_applied=$($CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -t -c " + SELECT COUNT(*) FROM schema_migrations WHERE migration_name = '$migration_name'; + " 2>/dev/null | tr -d '[:space:]') + + if [ "$already_applied" = "0" ]; then + echo -e "${CYAN}Applying migration: $migration_name${NC}" + + if $CONTAINER_CMD exec -i $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB < "$migration_file" 2>/dev/null; then + # Record migration as applied + $CONTAINER_CMD exec $POSTGRES_CONTAINER psql -U $POSTGRES_USER -d $POSTGRES_DB -c " + INSERT INTO schema_migrations (migration_name) VALUES ('$migration_name'); + " 2>/dev/null + echo -e "${GREEN}[OK] Applied: $migration_name${NC}" + migration_count=$((migration_count + 1)) + else + echo -e "${YELLOW}WARNING: Failed to apply $migration_name${NC}" + fi + fi + fi + done + + if [ $migration_count -eq 0 ]; then + echo -e "${GRAY}All migrations already applied${NC}" + else 
+ echo -e "${GREEN}[OK] Applied $migration_count migration(s)${NC}" + fi + fi +} + +main() { + wait_for_postgres + initialize_database + echo -e "\n${GRAY}Containers running in background. Press Ctrl+C to exit this script.${NC}" +} + +main +echo -e "${YELLOW}PostgreSQL done...${NC}" \ No newline at end of file diff --git a/start-prod-env.sh b/start-prod-env.sh index 8f92a20..b9b8e0c 100755 --- a/start-prod-env.sh +++ b/start-prod-env.sh @@ -4,12 +4,12 @@ set -e FULL_SETUP=0 -START_API=0 +START_API=1 COMPOSE_FILE="docker-compose-prod.yml" while [[ "$#" -gt 0 ]]; do case "$1" in - --full-setup) FULL_SETUP=1; shift ;; + --full-setup) FULL_SETUP=1; shift ;; --with-api) START_API=1; shift ;; --prod) COMPOSE_FILE="docker-compose-prod.yml"; shift ;; --dev) COMPOSE_FILE="docker-compose-dev.yml"; shift ;; @@ -363,7 +363,6 @@ main() { stop_existing_containers start_containers wait_for_postgres - wait_for_rabbitmq wait_for_api initialize_database show_summary From c1406eb8256c37ed0c3a9cd0be1e2a7596ba1266 Mon Sep 17 00:00:00 2001 From: Manu Date: Fri, 9 Jan 2026 19:48:42 +0530 Subject: [PATCH 6/7] Reference Image --- database/db_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/database/db_manager.py b/database/db_manager.py index d77b70a..fd7fb8d 100644 --- a/database/db_manager.py +++ b/database/db_manager.py @@ -514,7 +514,7 @@ async def fetch_reference_images_by_id(self, reference_id): raise RuntimeError("Database pool not initialized") try: - image_path = await self._fetch( + image_path = await self._fetchval( """ SELECT image_path FROM reference_images where reference_id = $1; From 825065497cdea6f215024a9d12da20a2a34ec4a5 Mon Sep 17 00:00:00 2001 From: Manu Date: Tue, 17 Feb 2026 22:27:33 +0530 Subject: [PATCH 7/7] video media handling --- image_worker/image_validator.py | 38 +++++++++++++++++++++++++++++++ image_worker/worker.py | 40 +++++++++++++++++++++++++++++++-- mq/rmq_client.py | 9 ++++---- 3 files changed, 80 insertions(+), 7 
deletions(-) diff --git a/image_worker/image_validator.py b/image_worker/image_validator.py index 43a7b25..e4d1666 100644 --- a/image_worker/image_validator.py +++ b/image_worker/image_validator.py @@ -53,6 +53,26 @@ class ImageValidator: "vecteezy", ] + + IMAGE_EXTENSIONS = { + "jpg", + "jpeg", + "png", + "gif", + "webp", + "bmp", + "tiff", + "heic", + } + VIDEO_EXTENSIONS = { + "mp4", + "mov", + "webm", + "mkv", + "avi", + "mpeg", + "mpg", + } def __init__( self, min_variance_threshold: float = 5.0, @@ -71,6 +91,24 @@ def __init__( self.min_unique_colors = min_unique_colors self.max_solid_color_ratio = max_solid_color_ratio + + def detect_media_type(self, submission_url: str) -> str: + """Detect media type based on URL/extension.""" + if not submission_url: + return "image" + + url_without_query = submission_url.split("?", 1)[0].lower() + if "." in url_without_query: + ext = url_without_query.rsplit(".", 1)[-1] + if ext in self.IMAGE_EXTENSIONS: + return "image" + if ext in self.VIDEO_EXTENSIONS: + return "video" + + if "video" in url_without_query: + return "video" + return "image" + def check_stock_image_url(self, image_url: str) -> Tuple[bool, Optional[str]]: """ Check if URL is from a known stock image website. 
diff --git a/image_worker/worker.py b/image_worker/worker.py index a54c630..b5d67b1 100644 --- a/image_worker/worker.py +++ b/image_worker/worker.py @@ -356,7 +356,6 @@ async def check_assignment_reference_hash_match( logger.error(f"Hash check failed: {e}", exc_info=True) raise - async def check_db_reference_hash_match( self, hashes: dict ) -> Tuple[bool, Optional[str], Optional[float], Optional[str]]: @@ -550,10 +549,24 @@ async def process_submission(self, data: Dict[str, Any]) -> Optional[str]: try: extracted = self._validate_input(data) - submission_id, student_id, assign_id, image_url, db_record_id = extracted + submission_id, student_id, assign_id, submission_url, db_record_id = extracted logger.info(f"Processing submission: {submission_id}") + # Check for video URLs before attempting to download + media_type = self.image_validator.detect_media_type(submission_url) + if media_type == "video": + logger.warning( + f"Video URL rejected: submission={submission_id}, url={submission_url}" + ) + video_result = self._create_video_url_result( + submission_id, student_id, assign_id, submission_url + ) + processing_time_ms = int((time.time() - start_time) * 1000) + return json.dumps(video_result) + else: + image_url = submission_url + # Check for stock image URLs before downloading is_stock, stock_site = self.image_validator.check_stock_image_url(image_url) if is_stock and stock_site: @@ -1101,6 +1114,29 @@ def _create_stock_image_result( "plagiarism_source": f"stock_image_{stock_site}", "similar_sources": [{"source": stock_site, "url": image_url}], } + + def _create_video_url_result( + self, + submission_id: str, + student_id: str, + assign_id: str, + submission_url: str + ) -> dict: + """Create video URL detection result dictionary.""" + return { + "submission_id": submission_id, + "student_id": student_id, + "assignment_id": assign_id, + "image_url": submission_url, + "is_ai_generated": False, + "ai_detection_source": "None", + "ai_confidence": 0.0, + 
"is_plagiarized": False, + "similarity_score": 1.0, + "match_type": "original", + "plagiarism_source": None, + "similar_sources": None, + } async def _build_reference_result( self, diff --git a/mq/rmq_client.py b/mq/rmq_client.py index a0f0786..1e4f31a 100644 --- a/mq/rmq_client.py +++ b/mq/rmq_client.py @@ -102,11 +102,7 @@ async def connect(self): self.DEAD_LETTER_QUEUE, durable=True ) logger.info(f"Dead Letter Queue declared: {self.DEAD_LETTER_QUEUE}") - self.submission_queue = await self.channel.declare_queue( - self.SUBMISSION_QUEUE, - durable=True - ) - logger.info(f"Submission queue created: {self.SUBMISSION_QUEUE}") + try: # First try passive declaration to check if queue exists @@ -124,6 +120,8 @@ async def connect(self): ) logger.info(f"Submission queue created: {self.SUBMISSION_QUEUE}") + + try: # First try passive declaration to check if queue exists self.feedback_queue = await self.channel.declare_queue( @@ -165,6 +163,7 @@ async def publish_message(self, message_body): logger.info( f"Published submission {message_body.get('submission_id')} for user {message_body.get('student_id')}" ) + logger.info(f"Published message body: {message_body}") except asyncio.CancelledError as e: logger.warning("publish_message CancelledError") raise Exception("publish_message CancelledError") from e