From 742a1b2a4debc34fc3e52b6dd08559683e7fe864 Mon Sep 17 00:00:00 2001 From: Yegor P Date: Mon, 8 Dec 2025 16:50:12 +0000 Subject: [PATCH] Add CR-HyperVR: Cloud Run hypergraph-vector recommender for media discovery - INT8 ONNX MiniLM embeddings (CPU-only, no GPU required) - pgvector similarity search + hyperedge signals - Live Cloud Run endpoints ready to use - Fits Entertainment Discovery track --- apps/cr-hypervr/Dockerfile | 27 ++ apps/cr-hypervr/LICENSE | 21 + apps/cr-hypervr/Makefile | 47 ++ apps/cr-hypervr/README.md | 411 ++++++++++++++++++ apps/cr-hypervr/app/__init__.py | 3 + apps/cr-hypervr/app/api/__init__.py | 1 + apps/cr-hypervr/app/core/config.py | 28 ++ apps/cr-hypervr/app/db/__init__.py | 1 + apps/cr-hypervr/app/db/client.py | 155 +++++++ apps/cr-hypervr/app/main.py | 343 +++++++++++++++ apps/cr-hypervr/app/schemas.py | 79 ++++ apps/cr-hypervr/app/services/__init__.py | 1 + apps/cr-hypervr/app/services/embedder.py | 142 ++++++ apps/cr-hypervr/app/services/reranker.py | 30 ++ apps/cr-hypervr/app/services/scoring.py | 24 + apps/cr-hypervr/cloudbuild.yaml | 10 + apps/cr-hypervr/db/pgvector.sql | 1 + apps/cr-hypervr/db/schema.sql | 56 +++ apps/cr-hypervr/pipeline/__init__.py | 1 + .../__pycache__/__init__.cpython-313.pyc | Bin 0 -> 159 bytes .../__pycache__/triplets.cpython-313.pyc | Bin 0 -> 5858 bytes .../__pycache__/user_profiles.cpython-313.pyc | Bin 0 -> 5810 bytes apps/cr-hypervr/pipeline/netflix_parser.py | 77 ++++ apps/cr-hypervr/pipeline/tmdb_enrich.py | 51 +++ apps/cr-hypervr/pipeline/triplets.py | 97 +++++ apps/cr-hypervr/pipeline/user_profiles.py | 108 +++++ apps/cr-hypervr/requirements.txt | 17 + .../scripts/backfill_embeddings_db.py | 172 ++++++++ apps/cr-hypervr/scripts/build_hyperedges.py | 230 ++++++++++ apps/cr-hypervr/scripts/db_apply_cloudsql.sh | 14 + apps/cr-hypervr/scripts/deploy_cloud_run.sh | 27 ++ .../scripts/deploy_graph_service.sh | 39 ++ apps/cr-hypervr/scripts/deploy_jobs.sh | 193 ++++++++ apps/cr-hypervr/scripts/download_minilm.py | 37 ++ .../scripts/download_movielens_25m.sh | 13 + apps/cr-hypervr/scripts/download_tmdb_full.py | 33 ++ apps/cr-hypervr/scripts/export_openapi.py | 30 ++ apps/cr-hypervr/scripts/gcloud_env.sh | 31 ++ apps/cr-hypervr/scripts/gcp_log.py | 137 ++++++ apps/cr-hypervr/scripts/gcp_verify.sh | 43 ++ apps/cr-hypervr/scripts/join_datasets.py | 108 +++++ apps/cr-hypervr/scripts/migrate_db.py | 61 +++ apps/cr-hypervr/scripts/provision_core.sh | 126 ++++++ .../cr-hypervr/scripts/run_pipeline_phase2.py | 45 ++ .../cr-hypervr/scripts/run_pipeline_phase3.py | 164 +++++++ apps/cr-hypervr/scripts/seed_embeddings.py | 154 +++++++ apps/cr-hypervr/scripts/seed_movies.py | 76 ++++ apps/cr-hypervr/scripts/seed_ratings.py | 60 +++ apps/cr-hypervr/scripts/setup_gcp.sh | 22 + apps/cr-hypervr/scripts/setup_secrets.sh | 36 ++ apps/cr-hypervr/scripts/upload_gcs_assets.sh | 112 +++++ .../cr-hypervr/scripts/validate_hyperedges.py | 180 ++++++++ .../scripts/validate_triplets_coverage.py | 61 +++ apps/cr-hypervr/scripts/verify_gcp_access.py | 124 ++++++ apps/cr-hypervr/training/onnx_export.py | 23 + apps/cr-hypervr/training/quantize_int8.py | 20 + apps/cr-hypervr/training/train_finetune.py | 96 ++++ 57 files changed, 4198 insertions(+) create mode 100644 apps/cr-hypervr/Dockerfile create mode 100644 apps/cr-hypervr/LICENSE create mode 100644 apps/cr-hypervr/Makefile create mode 100644 apps/cr-hypervr/README.md create mode 100644 apps/cr-hypervr/app/__init__.py create mode 100644 apps/cr-hypervr/app/api/__init__.py create mode 100644 
apps/cr-hypervr/app/core/config.py create mode 100644 apps/cr-hypervr/app/db/__init__.py create mode 100644 apps/cr-hypervr/app/db/client.py create mode 100644 apps/cr-hypervr/app/main.py create mode 100644 apps/cr-hypervr/app/schemas.py create mode 100644 apps/cr-hypervr/app/services/__init__.py create mode 100644 apps/cr-hypervr/app/services/embedder.py create mode 100644 apps/cr-hypervr/app/services/reranker.py create mode 100644 apps/cr-hypervr/app/services/scoring.py create mode 100644 apps/cr-hypervr/cloudbuild.yaml create mode 100644 apps/cr-hypervr/db/pgvector.sql create mode 100644 apps/cr-hypervr/db/schema.sql create mode 100644 apps/cr-hypervr/pipeline/__init__.py create mode 100644 apps/cr-hypervr/pipeline/__pycache__/__init__.cpython-313.pyc create mode 100644 apps/cr-hypervr/pipeline/__pycache__/triplets.cpython-313.pyc create mode 100644 apps/cr-hypervr/pipeline/__pycache__/user_profiles.cpython-313.pyc create mode 100644 apps/cr-hypervr/pipeline/netflix_parser.py create mode 100644 apps/cr-hypervr/pipeline/tmdb_enrich.py create mode 100644 apps/cr-hypervr/pipeline/triplets.py create mode 100644 apps/cr-hypervr/pipeline/user_profiles.py create mode 100644 apps/cr-hypervr/requirements.txt create mode 100644 apps/cr-hypervr/scripts/backfill_embeddings_db.py create mode 100644 apps/cr-hypervr/scripts/build_hyperedges.py create mode 100644 apps/cr-hypervr/scripts/db_apply_cloudsql.sh create mode 100644 apps/cr-hypervr/scripts/deploy_cloud_run.sh create mode 100644 apps/cr-hypervr/scripts/deploy_graph_service.sh create mode 100644 apps/cr-hypervr/scripts/deploy_jobs.sh create mode 100644 apps/cr-hypervr/scripts/download_minilm.py create mode 100644 apps/cr-hypervr/scripts/download_movielens_25m.sh create mode 100644 apps/cr-hypervr/scripts/download_tmdb_full.py create mode 100644 apps/cr-hypervr/scripts/export_openapi.py create mode 100644 apps/cr-hypervr/scripts/gcloud_env.sh create mode 100644 apps/cr-hypervr/scripts/gcp_log.py create mode 100644 apps/cr-hypervr/scripts/gcp_verify.sh create mode 100644 apps/cr-hypervr/scripts/join_datasets.py create mode 100644 apps/cr-hypervr/scripts/migrate_db.py create mode 100644 apps/cr-hypervr/scripts/provision_core.sh create mode 100644 apps/cr-hypervr/scripts/run_pipeline_phase2.py create mode 100644 apps/cr-hypervr/scripts/run_pipeline_phase3.py create mode 100644 apps/cr-hypervr/scripts/seed_embeddings.py create mode 100644 apps/cr-hypervr/scripts/seed_movies.py create mode 100644 apps/cr-hypervr/scripts/seed_ratings.py create mode 100644 apps/cr-hypervr/scripts/setup_gcp.sh create mode 100644 apps/cr-hypervr/scripts/setup_secrets.sh create mode 100644 apps/cr-hypervr/scripts/upload_gcs_assets.sh create mode 100644 apps/cr-hypervr/scripts/validate_hyperedges.py create mode 100644 apps/cr-hypervr/scripts/validate_triplets_coverage.py create mode 100644 apps/cr-hypervr/scripts/verify_gcp_access.py create mode 100644 apps/cr-hypervr/training/onnx_export.py create mode 100644 apps/cr-hypervr/training/quantize_int8.py create mode 100644 apps/cr-hypervr/training/train_finetune.py diff --git a/apps/cr-hypervr/Dockerfile b/apps/cr-hypervr/Dockerfile new file mode 100644 index 00000000..317c7aab --- /dev/null +++ b/apps/cr-hypervr/Dockerfile @@ -0,0 +1,27 @@ +FROM python:3.11-slim + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 + +WORKDIR /app + +# System deps (minimal) +RUN apt-get update && apt-get install -y --no-install-recommends build-essential && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt ./ +RUN 
pip install --upgrade pip && pip install -r requirements.txt + +# Preload base model for offline CPU inference (no heredoc for wider builder support) +RUN python -c "from sentence_transformers import SentenceTransformer; import os; os.makedirs('models/base-minilm', exist_ok=True); SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2').save('models/base-minilm'); print('Base MiniLM model cached at models/base-minilm')" + +COPY app ./app +COPY scripts ./scripts +COPY pipeline ./pipeline +COPY training ./training + +EXPOSE 8080 + +ENV MODEL_DIR="" +ENV BASE_MODEL_DIR="models/base-minilm" +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8080"] diff --git a/apps/cr-hypervr/LICENSE b/apps/cr-hypervr/LICENSE new file mode 100644 index 00000000..db48ebd2 --- /dev/null +++ b/apps/cr-hypervr/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 YP + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/apps/cr-hypervr/Makefile b/apps/cr-hypervr/Makefile new file mode 100644 index 00000000..a96e8311 --- /dev/null +++ b/apps/cr-hypervr/Makefile @@ -0,0 +1,47 @@ +PY ?= python3 + +.PHONY: gcp-provision gcp-secrets gcp-verify gcp-build gcp-deploy gcp-deploy-infra gcp-jobs-deploy gcp-job-run-phase2 gcp-job-run-phase3 db-apply-cloudsql export-openapi + +# GCP: Core provisioning (Artifact Registry, bucket, Cloud SQL, SAs, secret) +gcp-provision: + bash scripts/provision_core.sh + +# GCP: Create/update Secret Manager entries (e.g., DATABASE_URL, Kaggle creds) +gcp-secrets: + bash scripts/setup_secrets.sh + +# GCP: Sanity checks (services, AR repo, Cloud SQL, SAs) +gcp-verify: + bash scripts/gcp_verify.sh + +# GCP: Build container in Cloud Build and push to Artifact Registry +gcp-build: + gcloud builds submit --region=$${REGION:-europe-west2} --config=cloudbuild.yaml --substitutions=_REGION=$${REGION:-europe-west2} + +# GCP: Deploy primary API service (embedding-service) +gcp-deploy: + bash scripts/deploy_cloud_run.sh + +# GCP: Deploy infra-service (graph‑focused, POST‑only) +gcp-deploy-infra: + bash scripts/deploy_graph_service.sh + +# GCP: Deploy Cloud Run Jobs for data pipelines and validation +gcp-jobs-deploy: + bash scripts/deploy_jobs.sh + +# Run Phase 2 (join → profiles → hyperedges + validation) as a Cloud Run Job +gcp-job-run-phase2: + PROJECT_ID=$${PROJECT_ID} REGION=$${REGION:-europe-west2} bash -lc 'gcloud beta run jobs execute pipeline-phase2 --region $$REGION' + +# Run Phase 3 (fine‑tune → ONNX → INT8) as a Cloud Run Job +gcp-job-run-phase3: + PROJECT_ID=$${PROJECT_ID} REGION=$${REGION:-europe-west2} bash -lc 'gcloud beta run jobs execute pipeline-phase3 --region $$REGION' + +# Apply schema + pgvector to Cloud SQL (uses Cloud SQL connect) +db-apply-cloudsql: + bash scripts/db_apply_cloudsql.sh + +# Export OpenAPI JSON (writes openapi.json in repo root) +export-openapi: + $(PY) scripts/export_openapi.py diff --git a/apps/cr-hypervr/README.md b/apps/cr-hypervr/README.md new file mode 100644 index 00000000..16ca4b52 --- /dev/null +++ b/apps/cr-hypervr/README.md @@ -0,0 +1,411 @@ +# CR-HyperVR + +**Cloud Run Hypergraph-Vector Recommender** + +CPU-only FastAPI service using INT8 ONNX MiniLM, pgvector similarity, and hyperedge signals. 
+ +## What It Does + +- Recommends films based on descriptions and user rating history +- Creates compact embeddings of movies and user taste profiles +- Searches for semantically similar content via vector similarity +- Enhances results using hypergraph signals (co-watch patterns, shared genres) +- Runs entirely on GCP Cloud Run with no GPU required + +## Key Benefits + +- **Low latency & cost** — INT8-quantized ONNX model runs efficiently on CPU +- **Cold-start friendly** — Hypergraph edges help recommend even with sparse user data +- **Scalable** — Cloud Run auto-scales based on traffic +- **Easy integration** — Simple JSON POST endpoints for any client or agent + +--- + +## Quick Start + +Live endpoints are public and ready to use: + +| Service | URL | +|---------|-----| +| Embedding API | `https://embedding-service-5pgvctvdpq-nw.a.run.app` | +| Graph Service | `https://infra-service-5pgvctvdpq-nw.a.run.app` | + +### Try It Now + +**Embed free text:** +```bash +curl -s -X POST \ + https://embedding-service-5pgvctvdpq-nw.a.run.app/embed/text \ + -H 'Content-Type: application/json' \ + -d '{"text":"neo-noir heist with witty banter"}' +``` + +**Get graph-powered recommendations:** +```bash +curl -s -X POST \ + https://infra-service-5pgvctvdpq-nw.a.run.app/graph/recommend \ + -H 'Content-Type: application/json' \ + -d '{"query":"space opera adventure","top_k":5,"seed_top_k":15,"hops":2}' +``` + +### Full API Examples + +
+Search similar movies + +```bash +curl -s -X POST \ + https://embedding-service-5pgvctvdpq-nw.a.run.app/search/similar \ + -H 'Content-Type: application/json' \ + -d '{ + "text":"grounded space survival drama", + "top_k": 10 + }' +``` +
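The `items` returned by `/search/similar` follow the `SimilarItem` schema from `app/schemas.py` (`movie_id`, `title`, optional `genres`, `score`). A minimal sketch of calling the endpoint programmatically, assuming the `requests` package is available on the client side (it is not part of the service's own requirements):

```python
import requests

BASE = "https://embedding-service-5pgvctvdpq-nw.a.run.app"

resp = requests.post(
    f"{BASE}/search/similar",
    json={"text": "grounded space survival drama", "top_k": 10},
    timeout=30,
)
resp.raise_for_status()
for item in resp.json()["items"]:
    # Each item carries movie_id, title, optional genres, and a cosine-derived score
    print(f"{item['score']:.3f}  {item['title']}")
```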
+ +
+Recommend for a user ID + +```bash +curl -s -X POST \ + https://embedding-service-5pgvctvdpq-nw.a.run.app/search/recommend \ + -H 'Content-Type: application/json' \ + -d '{ + "user_id": 123, + "top_k": 10, + "exclude_movie_ids": [1,2,3] + }' +``` +
+ +
+Embed a movie object + +```bash +curl -s -X POST \ + https://embedding-service-5pgvctvdpq-nw.a.run.app/embed/movie \ + -H 'Content-Type: application/json' \ + -d '{ + "title":"The Grand Budapest Hotel", + "genres":["Comedy","Drama"], + "description":"A whimsical concierge and lobby boy embark on capers across a pastel Europe." + }' +``` +
+ +
+Embed a user taste profile + +```bash +curl -s -X POST \ + https://embedding-service-5pgvctvdpq-nw.a.run.app/embed/user \ + -H 'Content-Type: application/json' \ + -d '{ + "liked_genres":["Sci-Fi","Adventure"], + "liked_movies":["Star Wars","Guardians of the Galaxy"], + "disliked_genres":["Horror"] + }' +``` +
+ +
+Batch embed texts + +```bash +curl -s -X POST \ + https://embedding-service-5pgvctvdpq-nw.a.run.app/embed/batch \ + -H 'Content-Type: application/json' \ + -d '{ + "texts": [ + "gritty detective thriller set in Boston", + "lighthearted family fantasy with magical creatures" + ] + }' +``` +
+ +
+Graph recommendations with weight tuning + +```bash +curl -s -X POST \ + https://infra-service-5pgvctvdpq-nw.a.run.app/graph/recommend \ + -H 'Content-Type: application/json' \ + -d '{ + "query":"A feel-good romantic comedy set in New York City with witty banter and heartfelt moments.", + "top_k": 5, + "seed_top_k": 15, + "hops": 2, + "embed_weight": 1.0, + "cowatch_weight": 0.5, + "genre_weight": 0.25 + }' +``` +
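The three weights in the example above feed a simple normalized linear fusion: each signal (embedding similarity, co-watch, shared genre) is scaled to [0, 1] by its maximum, weighted, and summed, with the seed movies themselves excluded, as described under How It Works below. A condensed sketch of that fusion (the full version lives in the `/graph/recommend` handler in `app/main.py`):

```python
def fuse_scores(
    embed: dict[int, float],
    cowatch: dict[int, float],
    genre: dict[int, float],
    seed_ids: set[int],
    embed_weight: float = 1.0,
    cowatch_weight: float = 0.5,
    genre_weight: float = 0.25,
) -> dict[int, float]:
    """Normalize each signal to [0, 1] and linearly combine per movie_id."""

    def norm(d: dict[int, float]) -> dict[int, float]:
        m = max(d.values(), default=0.0)
        return {k: v / m for k, v in d.items()} if m > 0 else {}

    e, c, g = norm(embed), norm(cowatch), norm(genre)
    combined: dict[int, float] = {}
    for mid in set(e) | set(c) | set(g):
        if mid in seed_ids:
            continue  # recommend beyond the seed set
        score = (
            embed_weight * e.get(mid, 0.0)
            + cowatch_weight * c.get(mid, 0.0)
            + genre_weight * g.get(mid, 0.0)
        )
        if score > 0:
            combined[mid] = score
    return combined
```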
+ +--- + +## Architecture + +Fully managed GCP infrastructure with automatic scaling and pay-per-use billing. + +```mermaid +flowchart TD + subgraph Clients + A[Client / Agent] + end + + subgraph Cloud Run + B((embedding-service)) + C((infra-service)) + end + + subgraph Storage + D[(Cloud SQL)] + E[(GCS)] + end + + subgraph Jobs + F{{Cloud Run Jobs}} + end + + A -->|JSON POST| B + A -->|JSON POST| C + B -->|pgvector| D + C -->|pgvector + hyperedges| D + E -->|ONNX model| B + E -->|datasets| F + F -->|write hyperedges| D +``` + +### Infrastructure Components + +| Component | Purpose | +|-----------|---------| +| **Cloud Run Services** | Auto-scaling API endpoints for embedding and recommendations | +| **Cloud SQL (PostgreSQL 15)** | Stores movie embeddings and hyperedges with pgvector | +| **Cloud Storage** | Hosts datasets, trained models, and pipeline outputs | +| **Cloud Run Jobs** | Executes data pipelines and model training | +| **Secret Manager** | Secures database credentials | + +### How It Works + +1. **Query arrives** → Text is embedded using INT8 ONNX MiniLM +2. **Vector search** → pgvector finds semantically similar movies +3. **Graph expansion** → Hyperedges add co-watch and genre neighbors +4. **Score fusion** → Weighted combination produces final ranking + +```mermaid +flowchart TD + Q[Query text] --> E[Embed vector] + E --> S[Seed candidates via pgvector] + S --> COW[Co-watch neighbors] + S --> GEN[Shared-genre neighbors] + COW --> M[Normalize + weighted sum] + GEN --> M + E -->|as scores| M + M --> R[Top-K ranked list] +``` + +--- + +## API Reference + +### Embedding Service Flow + +```mermaid +flowchart LR + A[POST Request] --> B[embedding-service] + B --> C{Endpoint} + C -->|/embed/text| D[Tokenize + ONNX inference] + C -->|/embed/movie| D + C -->|/embed/user| D + C -->|/embed/batch| D + C -->|/search/similar| E[Embed + pgvector query] + C -->|/search/recommend| F[Load profile + pgvector + rerank] + D --> G[384-dim vector] + E --> H[Ranked movie list] + F --> H +``` + +### Infra Service Flow + +```mermaid +flowchart LR + A[POST /graph/recommend] --> B[infra-service] + B --> C[Embed query text] + C --> D[pgvector seed candidates] + D --> E[Hyperedge expansion] + E --> F[Co-watch edges] + E --> G[Genre edges] + F --> H[Score fusion] + G --> H + H --> I[Top-K results] +``` + +### Embedding Endpoints + +| Endpoint | Description | Response | +|----------|-------------|----------| +| `POST /embed/text` | Embed free text | `{ embedding, dimension, model }` | +| `POST /embed/batch` | Embed multiple texts | Array of embeddings | +| `POST /embed/movie` | Embed from title + genres + description | Embedding object | +| `POST /embed/user` | Embed user taste profile | Embedding object | + +### Search Endpoints + +| Endpoint | Description | +|----------|-------------| +| `POST /search/similar` | Vector search over movie embeddings | +| `POST /search/recommend` | Recommendations from user profile | +| `POST /graph/recommend` | Graph-enhanced recommendations | + +### Health & Metrics + +| Endpoint | Description | +|----------|-------------| +| `GET /healthz` | Health check | +| `GET /ready` | Readiness probe | +| `GET /metrics` | Service metrics | + +**Export OpenAPI spec:** +```bash +make export-openapi +``` + +--- + +## Data Pipeline + +### Data Sources + +- **TMDB** — Movie descriptions from Kaggle dataset `tmdb-movies-dataset-2023-930k-movies` +- **MovieLens 25M** — User ratings for collaborative signals + +### Embedding Model + +- **MiniLM-L6-v2** — Base embedding model from 
sentence-transformers + +### Pipeline Phases + +```mermaid +flowchart TD + A[Phase 2 Outputs] --> B[train_finetune.py] + B --> C[onnx_export.py] + C --> D[quantize_int8.py] + D --> E[(GCS)] + E --> F((Cloud Run)) +``` + +| Phase | Script | Output | +|-------|--------|--------| +| **Join** | `scripts/join_datasets.py` | Merged TMDB + MovieLens Parquet | +| **Train** | `training/train_finetune.py` | Fine-tuned MiniLM model | +| **Export** | `training/onnx_export.py` | `model.onnx` | +| **Quantize** | `training/quantize_int8.py` | `model-int8.onnx` | +| **Validate** | `scripts/validate_hyperedges.py` | DB edge verification | + +--- + +## GCP Deployment + +### Prerequisites + +Enable these GCP APIs: +- Cloud Run +- Cloud Build +- Artifact Registry +- Cloud SQL Admin +- Secret Manager +- VPC Access +- Cloud Storage + +### Provision Infrastructure + +```bash +PROJECT_ID=agentics-foundation25lon-1809 \ +REGION=europe-west2 \ +AR_REPO=embedding-service \ +BUCKET_NAME=${PROJECT_ID}-${REGION}-embeddings \ +SQL_INSTANCE=embeddings-sql-${REGION} \ +DB_NAME=movies DB_USER=app_user \ +make gcp-provision +``` + +### Configure Secrets + +```bash +make gcp-secrets +``` + +### Deploy Services + +**Embedding service:** +```bash +REGION=europe-west2 make gcp-build + +PROJECT_ID=agentics-foundation25lon-1809 REGION=europe-west2 \ + SERVICE_NAME=embedding-service \ + MODEL_GCS_URI=gs://agentics-foundation25lon-1809-europe-west2-models-20251207/models/movie-minilm-v1/model-int8.onnx \ + make gcp-deploy +``` + +**Infra service (graph recommendations):** +```bash +PROJECT_ID=agentics-foundation25lon-1809 REGION=europe-west2 \ + SERVICE_NAME=infra-service make gcp-deploy-infra +``` + +### Deploy Pipeline Jobs + +```bash +# Deploy jobs +PROJECT_ID=agentics-foundation25lon-1809 REGION=europe-west2 \ + AR_REPO=embedding-service make gcp-jobs-deploy + +# Run Phase 2 (embeddings + hyperedges) +PROJECT_ID=agentics-foundation25lon-1809 REGION=europe-west2 make gcp-job-run-phase2 + +# Run Phase 3 (fine-tuning → ONNX → INT8) +PROJECT_ID=agentics-foundation25lon-1809 REGION=europe-west2 make gcp-job-run-phase3 +``` + +--- + +## Configuration + +### Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `DATABASE_URL` | PostgreSQL connection (via Secret Manager) | — | +| `MODEL_GCS_URI` | GCS path to ONNX model | — | +| `EMBEDDING_BACKEND` | Backend selection | `auto` | +| `USE_GRAPH_SCORER` | Enable graph scoring | `false` | +| `USE_RERANKER` | Enable reranking | `false` | +| `VECTOR_DIM` | Embedding dimension | `384` | +| `LOG_LEVEL` | Logging verbosity | `INFO` | +| `ALLOWED_ORIGINS` | CORS origins (comma-separated) | — | + +### Embedding Backends + +| Value | Description | +|-------|-------------| +| `auto` | Auto-detect best available | +| `onnx` | ONNX Runtime (recommended) | +| `st` | Sentence Transformers | +| `hash` | Hash-based fallback | + +--- + +## Roadmap + +- [ ] Curriculum sampling with temperature-controlled hard negatives +- [ ] Weak supervision from genre and co-watch edges during training +- [ ] Portable ONNX with shared tokenizer +- [ ] TinyBERT/MiniLM cross-encoder reranker +- [ ] Nightly retraining with drift detection +- [ ] Canary deployments with automated guardrails + +--- diff --git a/apps/cr-hypervr/app/__init__.py b/apps/cr-hypervr/app/__init__.py new file mode 100644 index 00000000..4e3a74d3 --- /dev/null +++ b/apps/cr-hypervr/app/__init__.py @@ -0,0 +1,3 @@ +__all__ = ["__version__"] +__version__ = "0.1.0" + diff --git 
a/apps/cr-hypervr/app/api/__init__.py b/apps/cr-hypervr/app/api/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/apps/cr-hypervr/app/api/__init__.py @@ -0,0 +1 @@ + diff --git a/apps/cr-hypervr/app/core/config.py b/apps/cr-hypervr/app/core/config.py new file mode 100644 index 00000000..5c77bd39 --- /dev/null +++ b/apps/cr-hypervr/app/core/config.py @@ -0,0 +1,28 @@ +from pydantic import Field +from pydantic_settings import BaseSettings +from typing import List + + +class Settings(BaseSettings): + app_name: str = Field(default="CR-HyperVR") + environment: str = Field(default="dev") + log_level: str = Field(default="INFO") + database_url: str | None = None + model_dir: str = Field(default="models/movie-minilm-v1") + base_model_dir: str = Field(default="models/base-minilm") + model_name: str = Field(default="movie-minilm-v1") + vector_dim: int = Field(default=384) + allowed_origins: List[str] = Field(default_factory=list, description="CORS allowed origins") + # Embedding backend: auto|onnx|st|hash (hash = deterministic lightweight backend for tests/offline) + embedding_backend: str = Field(default="auto") + # Graph scoring options + use_graph_scorer: bool = Field(default=False) + graph_score_weight: float = Field(default=0.05) + # Optional reranker toggle (stub implementation) + use_reranker: bool = Field(default=False) + + class Config: + env_file = ".env" + + +settings = Settings() diff --git a/apps/cr-hypervr/app/db/__init__.py b/apps/cr-hypervr/app/db/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/apps/cr-hypervr/app/db/__init__.py @@ -0,0 +1 @@ + diff --git a/apps/cr-hypervr/app/db/client.py b/apps/cr-hypervr/app/db/client.py new file mode 100644 index 00000000..799bf244 --- /dev/null +++ b/apps/cr-hypervr/app/db/client.py @@ -0,0 +1,155 @@ +from __future__ import annotations + +import asyncio +from typing import Any +import asyncpg +import numpy as np +from app.core.config import settings + + +class DB: + def __init__(self, dsn: str | None = None) -> None: + self._dsn = dsn or settings.database_url + self._pool: asyncpg.Pool | None = None + + async def connect(self) -> None: + if self._pool is None: + if not self._dsn: + import os + env_dsn = os.getenv("DATABASE_URL") + if env_dsn: + self._dsn = env_dsn + else: + raise RuntimeError("DATABASE_URL not configured") + self._pool = await asyncpg.create_pool(self._dsn, min_size=1, max_size=5) + + async def close(self) -> None: + if self._pool is not None: + await self._pool.close() + self._pool = None + + async def fetch_similar(self, query_vec: np.ndarray, top_k: int = 10) -> list[dict[str, Any]]: + await self.connect() + assert self._pool is not None + # Query movies by cosine distance, return movie and score + q = """ + SELECT m.movie_id, m.title, m.genres, 1 - (e.embedding <=> $1::vector) AS score + FROM movie_embeddings e + JOIN movies m USING (movie_id) + ORDER BY e.embedding <=> $1::vector + LIMIT $2 + """ + def _vec_to_pgtext(v: np.ndarray) -> str: + return "[" + ",".join(str(float(x)) for x in v.tolist()) + "]" + vec = _vec_to_pgtext(query_vec.astype(float)) + async with self._pool.acquire() as conn: + rows = await conn.fetch(q, vec, top_k) + return [dict(r) for r in rows] + + async def fetch_user_profile_embedding(self, user_id: int, min_rating: float = 4.0) -> np.ndarray | None: + """Return an average embedding of movies the user rated >= min_rating. + Falls back to None if no vectors exist. 
+ """ + await self.connect() + assert self._pool is not None + q = """ + SELECT e.embedding + FROM user_ratings r + JOIN movie_embeddings e USING (movie_id) + WHERE r.user_id = $1 AND r.rating >= $2 + LIMIT 1000 + """ + async with self._pool.acquire() as conn: + rows = await conn.fetch(q, user_id, min_rating) + if not rows: + return None + def _parse_vec(val: Any) -> np.ndarray: + if isinstance(val, str): + s = val.strip().strip("[]") + parts = [p for p in s.split(",") if p.strip() != ""] + return np.array([float(p) for p in parts], dtype=np.float32) + # assume list-like of floats + return np.array(list(val), dtype=np.float32) + + vecs = [_parse_vec(r["embedding"]) for r in rows] + mean_vec = np.mean(np.stack(vecs, axis=0), axis=0) + # Normalize to unit length for cosine search + n = np.linalg.norm(mean_vec) + if n > 0: + mean_vec = mean_vec / n + return mean_vec.astype(np.float32) + + async def fetch_genre_weights(self, movie_ids: list[int]) -> dict[int, float]: + """Sum of genre-edge weights per movie for simple graph boost.""" + if not movie_ids: + return {} + await self.connect() + assert self._pool is not None + q = """ + SELECT src_id AS movie_id, COALESCE(SUM(weight),0) AS w + FROM hyperedges + WHERE src_kind='movie' AND dst_kind='genre' AND src_id = ANY($1::bigint[]) + GROUP BY src_id + """ + async with self._pool.acquire() as conn: + rows = await conn.fetch(q, movie_ids) + return {int(r["movie_id"]): float(r["w"]) for r in rows} + + async def fetch_neighbors_cowatch(self, movie_ids: list[int], top_k: int = 100) -> dict[int, float]: + """Return co‑watch neighbors aggregated across a set of seed movie_ids. + + Uses hyperedges where (src_kind='movie', dst_kind='movie'). + """ + if not movie_ids: + return {} + await self.connect() + assert self._pool is not None + q = ( + "SELECT dst_id AS movie_id, SUM(weight) AS w " + "FROM hyperedges WHERE src_kind='movie' AND dst_kind='movie' AND src_id = ANY($1::bigint[]) " + "GROUP BY dst_id ORDER BY SUM(weight) DESC LIMIT $2" + ) + async with self._pool.acquire() as conn: + rows = await conn.fetch(q, movie_ids, top_k) + return {int(r["movie_id"]): float(r["w"]) for r in rows} + + async def fetch_neighbors_shared_genre(self, movie_ids: list[int], top_k: int = 200) -> dict[int, float]: + """Return neighbors via shared genres. + + We derive genre nodes from movie->genre edges and then collect other + movies pointing to those genres. Weight is sum of (w_src * w_dst). 
+ """ + if not movie_ids: + return {} + await self.connect() + assert self._pool is not None + q = ( + "SELECT he2.src_id AS movie_id, SUM(he1.weight * he2.weight) AS w " + "FROM hyperedges he1 " + "JOIN hyperedges he2 ON he1.dst_kind='genre' AND he2.dst_kind='genre' AND he2.dst_id=he1.dst_id " + "WHERE he1.src_kind='movie' AND he1.src_id = ANY($1::bigint[]) " + "GROUP BY he2.src_id ORDER BY w DESC LIMIT $2" + ) + async with self._pool.acquire() as conn: + rows = await conn.fetch(q, movie_ids, top_k) + return {int(r["movie_id"]): float(r["w"]) for r in rows} + + async def fetch_movies_by_ids(self, movie_ids: list[int]) -> dict[int, dict]: + if not movie_ids: + return {} + await self.connect() + assert self._pool is not None + q = "SELECT movie_id, title, genres FROM movies WHERE movie_id = ANY($1::int[])" + async with self._pool.acquire() as conn: + rows = await conn.fetch(q, movie_ids) + return {int(r["movie_id"]): {"title": r["title"], "genres": r["genres"]} for r in rows} + + +_db_singleton: DB | None = None + + +def get_db() -> DB: + global _db_singleton + if _db_singleton is None: + _db_singleton = DB() + return _db_singleton diff --git a/apps/cr-hypervr/app/main.py b/apps/cr-hypervr/app/main.py new file mode 100644 index 00000000..01e593f2 --- /dev/null +++ b/apps/cr-hypervr/app/main.py @@ -0,0 +1,343 @@ +from fastapi import FastAPI, Depends, Response +from app.core.config import settings +from app import __version__ +from app.schemas import ( + EmbedTextRequest, + EmbedBatchRequest, + EmbedVectorResponse, + SimilarSearchRequest, + SimilarSearchResponse, + SimilarItem, + RecommendRequest, + RecommendResponse, + MovieEmbedRequest, + UserEmbedRequest, + GraphRecommendRequest, + GraphRecommendResponse, + GraphRecommendItem, +) +from app.services import embedder as embedder_service +from app.services.reranker import get_reranker +from app.services.scoring import combine_scores, reorder_by_scores +from typing import TYPE_CHECKING +import numpy as np +from fastapi.middleware.cors import CORSMiddleware +from fastapi import HTTPException + +# Simple in-memory metrics +_metrics = { + "requests_total": 0, + "embed_text_total": 0, + "embed_batch_total": 0, + "embed_movie_total": 0, + "embed_user_total": 0, + "search_similar_total": 0, + "search_recommend_total": 0, + "graph_recommend_total": 0, +} + + +if TYPE_CHECKING: # for type checkers only + from app.db.client import DB # pragma: no cover + + +def _get_db_dep(): + # Lazy import; tolerate missing asyncpg in environments without DB + try: + from app.db.client import get_db # type: ignore + return get_db() + except Exception: + class _NoDB: + async def connect(self): + raise RuntimeError("DATABASE_URL not configured or driver not installed") + + return _NoDB() + + +def create_app() -> FastAPI: + app = FastAPI(title=settings.app_name, version=__version__) + + # CORS + if settings.allowed_origins: + app.add_middleware( + CORSMiddleware, + allow_origins=settings.allowed_origins, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + + @app.get("/healthz") + async def healthz(): + return {"status": "ok"} + + @app.get("/") + async def root(): + return { + "service": settings.app_name, + "version": __version__, + "environment": settings.environment, + } + + @app.post("/embed/text", response_model=EmbedVectorResponse) + async def embed_text(payload: EmbedTextRequest): + _metrics["requests_total"] += 1 + _metrics["embed_text_total"] += 1 + vec = embedder_service.get_embedder().encode([payload.text])[0] + return { + "embedding": 
vec.tolist(), + "dimension": settings.vector_dim, + "model": settings.model_name, + } + + @app.post("/embed/batch", response_model=list[EmbedVectorResponse]) + async def embed_batch(payload: EmbedBatchRequest): + _metrics["requests_total"] += 1 + _metrics["embed_batch_total"] += 1 + vecs = embedder_service.get_embedder().encode(payload.texts) + return [ + { + "embedding": v.tolist(), + "dimension": settings.vector_dim, + "model": settings.model_name, + } + for v in vecs + ] + + @app.post("/embed/movie", response_model=EmbedVectorResponse) + async def embed_movie(payload: MovieEmbedRequest): + _metrics["requests_total"] += 1 + _metrics["embed_movie_total"] += 1 + genres = ", ".join(payload.genres) if payload.genres else "" + desc = (payload.description or "").strip() + if len(desc) > 500: + desc = desc[:500] + text = f"{payload.title}. {genres}. {desc}" + vec = embedder_service.get_embedder().encode([text])[0] + return { + "embedding": vec.tolist(), + "dimension": settings.vector_dim, + "model": settings.model_name, + } + + @app.post("/embed/user", response_model=EmbedVectorResponse) + async def embed_user(payload: UserEmbedRequest): + _metrics["requests_total"] += 1 + _metrics["embed_user_total"] += 1 + likes = ", ".join(payload.liked_movies) + top_genres = ", ".join(payload.liked_genres) + dislikes = ", ".join(payload.disliked_genres) + text = f"Enjoys {top_genres}. Liked movies such as {likes}. Avoids {dislikes}." + vec = embedder_service.get_embedder().encode([text])[0] + return { + "embedding": vec.tolist(), + "dimension": settings.vector_dim, + "model": settings.model_name, + } + + @app.post("/search/similar", response_model=SimilarSearchResponse) + async def search_similar(payload: SimilarSearchRequest, db = Depends(_get_db_dep)): + _metrics["requests_total"] += 1 + _metrics["search_similar_total"] += 1 + query_vec = embedder_service.get_embedder().encode([payload.text])[0] + items = await db.fetch_similar(query_vec.astype(np.float32), top_k=payload.top_k) + # Optional graph-based scoring using genre hyperedges + if getattr(settings, "use_graph_scorer", False) and items: + mids = [int(i["movie_id"]) for i in items] + gweights = await db.fetch_genre_weights(mids) + base = {int(i["movie_id"]): float(i.get("score", 0.0)) for i in items} + scores = combine_scores(base, gweights, weight=getattr(settings, "graph_score_weight", 0.05)) + items = reorder_by_scores(items, scores) + # Optional rerank + if settings.use_reranker and items: + items = get_reranker().rerank(payload.text, items) + return { + "items": [ + SimilarItem(movie_id=i["movie_id"], title=i["title"], genres=i.get("genres"), score=float(i["score"])) + for i in items + ] + } + + @app.post("/search/recommend", response_model=RecommendResponse) + async def recommend(payload: RecommendRequest, db = Depends(_get_db_dep)): + _metrics["requests_total"] += 1 + _metrics["search_recommend_total"] += 1 + # Prefer DB-derived user profile embedding (avg of liked items) + vec_np = await db.fetch_user_profile_embedding(payload.user_id) + if vec_np is None: + # Fallback: encode a deterministic user token + text = f"user_id:{payload.user_id}" + vec_np = embedder_service.get_embedder().encode([text])[0].astype(np.float32) + vec = vec_np.astype(np.float32) + items = await db.fetch_similar(vec, top_k=payload.top_k + (len(payload.exclude_movie_ids) if payload.exclude_movie_ids else 0)) + exclude = set(payload.exclude_movie_ids or []) + filtered = [i for i in items if i["movie_id"] not in exclude] + # Optional graph-based scoring + if 
getattr(settings, "use_graph_scorer", False) and filtered: + mids = [int(i["movie_id"]) for i in filtered] + gweights = await db.fetch_genre_weights(mids) + base = {int(i["movie_id"]): float(i.get("score", 0.0)) for i in filtered} + scores = combine_scores(base, gweights, weight=getattr(settings, "graph_score_weight", 0.05)) + filtered = reorder_by_scores(filtered, scores) + # Optional rerank + if settings.use_reranker and filtered: + filtered = get_reranker().rerank("user profile", filtered) + filtered = filtered[: payload.top_k] + return { + "items": [ + SimilarItem(movie_id=i["movie_id"], title=i["title"], genres=i.get("genres"), score=float(i["score"])) + for i in filtered + ] + } + + @app.post("/graph/recommend", response_model=GraphRecommendResponse) + async def graph_recommend(payload: GraphRecommendRequest, db = Depends(_get_db_dep)): + """Embed free‑text query, seed via vector search, expand through hypergraph, and return recommendations. + + Expansion rules: + - hops>=1 → co‑watch neighbors (movie→movie) + - hops>=2 → shared‑genre neighbors (movie→genre→movie) + Scores are normalized per‑signal and linearly combined using weights. + """ + _metrics["requests_total"] += 1 + _metrics["graph_recommend_total"] += 1 + + # Defensive: ensure DB has expected API + for need in ("fetch_similar", "fetch_neighbors_cowatch", "fetch_neighbors_shared_genre", "fetch_movies_by_ids"): + if not hasattr(db, need): + raise HTTPException(status_code=503, detail="Database not configured") + + # 1) Seed via vector search + query_vec = embedder_service.get_embedder().encode([payload.query])[0] + seeds = await db.fetch_similar(query_vec.astype(np.float32), top_k=max(payload.seed_top_k, payload.top_k)) + seed_ids = [int(s["movie_id"]) for s in seeds] + embed_scores = {int(s["movie_id"]): float(s.get("score", 0.0)) for s in seeds} + + # 2) Graph expansion signals + cowatch: dict[int, float] = {} + by_genre: dict[int, float] = {} + if payload.hops >= 1 and seed_ids: + cowatch = await db.fetch_neighbors_cowatch(seed_ids, top_k=max(3 * payload.seed_top_k, 200)) + if payload.hops >= 2 and seed_ids: + by_genre = await db.fetch_neighbors_shared_genre(seed_ids, top_k=max(5 * payload.seed_top_k, 400)) + + # 3) Normalize each signal to [0,1] for stable mixing + def _normalize(d: dict[int, float]) -> dict[int, float]: + if not d: + return {} + m = max(d.values()) + if m <= 0: + return {k: 0.0 for k in d} + return {k: float(v) / float(m) for k, v in d.items()} + + embed_n = _normalize(embed_scores) + cowatch_n = _normalize(cowatch) + genre_n = _normalize(by_genre) + + # 4) Aggregate combined scores; exclude seed items for recommendations + combined: dict[int, dict[str, float]] = {} + keys = set(embed_n) | set(cowatch_n) | set(genre_n) + for mid in keys: + if mid in seed_ids: + continue + e = embed_n.get(mid, 0.0) + c = cowatch_n.get(mid, 0.0) + g = genre_n.get(mid, 0.0) + score = payload.embed_weight * e + payload.cowatch_weight * c + payload.genre_weight * g + if score <= 0: + continue + combined[mid] = {"score": score, "e": e, "c": c, "g": g} + + ranked = sorted(combined.items(), key=lambda kv: kv[1]["score"], reverse=True) + ranked = ranked[: payload.top_k] + mids = [mid for mid, _ in ranked] + meta = await db.fetch_movies_by_ids(mids) + + def _sources(sig: dict[str, float]) -> list[str]: + out: list[str] = [] + if sig.get("e", 0) > 0: + out.append("embed") + if sig.get("c", 0) > 0: + out.append("cowatch") + if sig.get("g", 0) > 0: + out.append("genre") + return out + + items = [] + for mid, sig in ranked: + m = 
meta.get(mid, {}) + items.append( + GraphRecommendItem( + movie_id=mid, + title=m.get("title", str(mid)), + genres=m.get("genres"), + score=float(sig["score"]), + sources=_sources(sig) or None, + ) + ) + return {"items": items} + + @app.get("/ready") + async def ready() -> dict: + # Basic readiness check (lightweight) + try: + _ = embedder_service.get_embedder() + return {"ready": True} + except Exception: + return {"ready": False} + + @app.get("/metrics") + async def metrics() -> Response: + lines = [ + "# HELP service_requests_total Total HTTP requests.", + "# TYPE service_requests_total counter", + f"service_requests_total {_metrics['requests_total']}", + "# HELP embed_requests_total Total embed requests by type.", + "# TYPE embed_requests_total counter", + f"embed_requests_total{{type=\"text\"}} {_metrics['embed_text_total']}", + f"embed_requests_total{{type=\"batch\"}} {_metrics['embed_batch_total']}", + f"embed_requests_total{{type=\"movie\"}} {_metrics['embed_movie_total']}", + f"embed_requests_total{{type=\"user\"}} {_metrics['embed_user_total']}", + "# HELP search_requests_total Total search requests by type.", + "# TYPE search_requests_total counter", + f"search_requests_total{{type=\"similar\"}} {_metrics['search_similar_total']}", + f"search_requests_total{{type=\"recommend\"}} {_metrics['search_recommend_total']}", + f"search_requests_total{{type=\"graph_recommend\"}} {_metrics['graph_recommend_total']}", + ] + return Response("\n".join(lines) + "\n", media_type="text/plain; version=0.0.4") + + # Debug endpoints (safe for demos; avoid leaking secrets). These endpoints + # do not raise if DB is unavailable; they return availability flags. + @app.get("/debug/db_counts") + async def db_counts(db = Depends(_get_db_dep)): + try: + # Count movies and embeddings + await db.connect() + conn = db._pool # type: ignore[attr-defined] + assert conn is not None + async with conn.acquire() as c: # type: ignore + m = await c.fetchval("SELECT COUNT(*) FROM movies") + e = await c.fetchval("SELECT COUNT(*) FROM movie_embeddings") + return {"available": True, "movies": int(m or 0), "embeddings": int(e or 0)} + except Exception: + # DB not configured or unreachable + return {"available": False, "movies": 0, "embeddings": 0} + + @app.get("/debug/sample_movie") + async def sample_movie(db = Depends(_get_db_dep)): + try: + await db.connect() + conn = db._pool # type: ignore[attr-defined] + assert conn is not None + async with conn.acquire() as c: # type: ignore + row = await c.fetchrow("SELECT movie_id, title FROM movies LIMIT 1") + if row: + return {"available": True, "movie": {"movie_id": int(row["movie_id"]), "title": row["title"]}} + return {"available": True, "movie": None} + except Exception: + return {"available": False, "movie": None} + + return app + + +app = create_app() diff --git a/apps/cr-hypervr/app/schemas.py b/apps/cr-hypervr/app/schemas.py new file mode 100644 index 00000000..4f4c864a --- /dev/null +++ b/apps/cr-hypervr/app/schemas.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +from pydantic import BaseModel, Field +from typing import List, Optional + + +class EmbedTextRequest(BaseModel): + text: str + + +class EmbedBatchRequest(BaseModel): + texts: List[str] + + +class EmbedVectorResponse(BaseModel): + embedding: List[float] = Field(..., description="384-dimensional embedding") + dimension: int = Field(384, description="Embedding dimension") + model: str = Field("movie-minilm-v1", description="Model identifier") + + +class SimilarSearchRequest(BaseModel): + text: str + 
top_k: int = 10 + + +class SimilarItem(BaseModel): + movie_id: int + title: str + genres: Optional[str] = None + score: float + + +class SimilarSearchResponse(BaseModel): + items: List[SimilarItem] + + +class RecommendRequest(BaseModel): + user_id: int + top_k: int = 10 + exclude_movie_ids: List[int] | None = None + + +class RecommendResponse(BaseModel): + items: List[SimilarItem] + + +class MovieEmbedRequest(BaseModel): + title: str + description: Optional[str] = None + genres: List[str] = Field(default_factory=list) + + +class UserEmbedRequest(BaseModel): + liked_genres: List[str] = Field(default_factory=list) + liked_movies: List[str] = Field(default_factory=list) + disliked_genres: List[str] = Field(default_factory=list) + + +# --- Hypergraph query/recommendation --- +class GraphRecommendRequest(BaseModel): + """Free‑text query that seeds vector search, then expands via hypergraph. + + Fields allow light tuning without overcomplicating the interface. + """ + query: str = Field(..., description="User query to embed and seed the graph search") + top_k: int = Field(10, description="Number of recommendations to return") + seed_top_k: int = Field(20, description="Seed candidates from vector search") + hops: int = Field(2, description="Depth for graph expansion (1=cowatch, 2=+genres)") + embed_weight: float = Field(1.0, description="Weight for base embedding similarity") + cowatch_weight: float = Field(0.5, description="Weight for co‑watch edges") + genre_weight: float = Field(0.25, description="Weight for shared‑genre signal") + + +class GraphRecommendItem(SimilarItem): + sources: list[str] | None = Field(default=None, description="Signals that contributed (embed|cowatch|genre)") + + +class GraphRecommendResponse(BaseModel): + items: List[GraphRecommendItem] diff --git a/apps/cr-hypervr/app/services/__init__.py b/apps/cr-hypervr/app/services/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/apps/cr-hypervr/app/services/__init__.py @@ -0,0 +1 @@ + diff --git a/apps/cr-hypervr/app/services/embedder.py b/apps/cr-hypervr/app/services/embedder.py new file mode 100644 index 00000000..c4f245ba --- /dev/null +++ b/apps/cr-hypervr/app/services/embedder.py @@ -0,0 +1,142 @@ +from __future__ import annotations + +import threading +from functools import lru_cache +import os +import numpy as np + +try: # optional + import onnxruntime as ort # type: ignore +except Exception: # pragma: no cover - optional + ort = None + +from app.core.config import settings + + +class _Embedder: + def __init__(self) -> None: + self._lock = threading.Lock() + self._mode = "st" + self._st_model = None # lazy import to avoid heavy deps during tests + self._ort_session = None + self._prepare() + + def _prepare(self) -> None: + # Optional: pull fine-tuned model artifacts from GCS if requested + try: + self._maybe_pull_model_from_gcs() + except Exception: + # Best-effort; proceed with normal backend selection + pass + # Allow explicit backend override via env or settings + override = os.getenv("EMBEDDING_BACKEND") or getattr(settings, "embedding_backend", "auto") + if override == "hash": + self._mode = "hash" + return + if override == "onnx": + onnx_path = f"{settings.model_dir}/model-int8.onnx" + if ort is not None: + try: + self._ort_session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"]) # type: ignore[arg-type] + self._mode = "onnx" + return + except Exception: + pass + # fallback chain: ST → hash + try: + self._ensure_st() + self._mode = "st" + return + except Exception: + 
self._mode = "hash" + return + # Prefer ONNX if available (auto) + onnx_path = f"{settings.model_dir}/model-int8.onnx" + if ort is not None: + try: + self._ort_session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"]) # type: ignore[arg-type] + self._mode = "onnx" + return + except Exception: + pass + # Fallback chain: ST → hash + try: + self._ensure_st() + self._mode = "st" + return + except Exception: + self._mode = "hash" + return + + def _ensure_st(self) -> None: + if self._st_model is None: + from sentence_transformers import SentenceTransformer # type: ignore + self._st_model = SentenceTransformer(settings.model_dir if settings.model_dir else settings.base_model_dir) + + def _maybe_pull_model_from_gcs(self) -> None: + # If a GCS URI is provided and local ONNX not present, pull down. + gcs_uri = os.getenv("MODEL_GCS_URI") or os.getenv("GCS_MODEL_URI") + if not gcs_uri or not gcs_uri.startswith("gs://"): + # Derive from bucket hint if available + bucket = os.getenv("GCS_MODELS_BUCKET") + if bucket and bucket.startswith("gs://"): + gcs_uri = f"{bucket.rstrip('/')}/models/movie-minilm-v1/model-int8.onnx" + else: + return + # Destination + model_dir = getattr(settings, "model_dir", "models/movie-minilm-v1") or "models/movie-minilm-v1" + onnx_path = os.path.join(model_dir, "model-int8.onnx") + # Already present → nothing to do + if os.path.exists(onnx_path): + return + os.makedirs(model_dir, exist_ok=True) + # Pull file or directory + import fsspec + + fs = fsspec.filesystem("gcs") + if gcs_uri.endswith(".onnx"): + with fs.open(gcs_uri, "rb") as src, open(onnx_path, "wb") as dst: # type: ignore[attr-defined] + dst.write(src.read()) + return + # Otherwise, treat as prefix and sync all files + # Ensure trailing slash for globbing + prefix = gcs_uri.rstrip("/") + "/" + files = [p for p in fs.glob(prefix + "**") if not p.endswith("/")] + for obj in files: + rel = obj[len(prefix) :] + local = os.path.join(model_dir, rel) + os.makedirs(os.path.dirname(local), exist_ok=True) + with fs.open(obj, "rb") as src, open(local, "wb") as dst: # type: ignore[attr-defined] + dst.write(src.read()) + + def _encode_hash(self, texts: list[str]) -> np.ndarray: + dim = int(getattr(settings, "vector_dim", 384)) + out = [] + for t in texts: + seed = int.from_bytes(np.frombuffer(t.encode("utf-8"), dtype=np.uint8).sum().tobytes(), "little", signed=False) ^ (len(t) * 1315423911) + rng = np.random.default_rng(seed) + v = rng.standard_normal(dim) + n = np.linalg.norm(v) + if n == 0: + out.append(np.zeros(dim, dtype=np.float32)) + else: + out.append((v / n).astype(np.float32)) + return np.stack(out, axis=0) + + def encode(self, texts: list[str]) -> np.ndarray: + with self._lock: + if self._mode == "hash": + return self._encode_hash(texts) + if self._mode == "onnx" and self._ort_session is not None: + # For simplicity, use ST encode even when ONNX present (pipeline optimizable later) + self._ensure_st() + vecs = self._st_model.encode(texts, normalize_embeddings=True, convert_to_numpy=True) + else: + self._ensure_st() + vecs = self._st_model.encode(texts, normalize_embeddings=True, convert_to_numpy=True) + return vecs.astype(np.float32) + + +@lru_cache(maxsize=1) +def get_embedder() -> _Embedder: + return _Embedder() diff --git a/apps/cr-hypervr/app/services/reranker.py b/apps/cr-hypervr/app/services/reranker.py new file mode 100644 index 00000000..f1f8576c --- /dev/null +++ b/apps/cr-hypervr/app/services/reranker.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +from functools import lru_cache 
+from typing import Sequence + + +class Reranker: + """ + Tiny, optional reranker stub. Deterministic and lightweight so it can + be enabled in production without heavy deps. Intended as a seam where + a true cross-encoder (TinyBERT, etc.) could be integrated later. + """ + + def rerank(self, query: str, items: Sequence[dict]) -> list[dict]: + # Heuristic: prefer titles that share tokens with the query (case-insensitive), + # stable sort to keep original ranking when scores tie. + q_tokens = {t for t in query.lower().split() if t} + + def score(it: dict) -> int: + title = str(it.get("title", "")).lower() + tokens = set(title.split()) + return len(q_tokens & tokens) + + return sorted(list(items), key=score, reverse=True) + + +@lru_cache(maxsize=1) +def get_reranker() -> Reranker: + return Reranker() + diff --git a/apps/cr-hypervr/app/services/scoring.py b/apps/cr-hypervr/app/services/scoring.py new file mode 100644 index 00000000..37cedd42 --- /dev/null +++ b/apps/cr-hypervr/app/services/scoring.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from typing import Dict, List + + +def combine_scores( + base_scores: Dict[int, float], + genre_weights: Dict[int, float], + weight: float = 0.05, +) -> Dict[int, float]: + """ + Combine cosine similarity scores with simple graph-derived weights. + For now: new = base + weight * genre_weight(movie_id). + """ + out: Dict[int, float] = {} + for mid, s in base_scores.items(): + g = genre_weights.get(mid, 0.0) + out[mid] = float(s) + float(weight) * float(g) + return out + + +def reorder_by_scores(items: List[dict], scores: Dict[int, float]) -> List[dict]: + return sorted(items, key=lambda it: scores.get(int(it.get("movie_id")), 0.0), reverse=True) + diff --git a/apps/cr-hypervr/cloudbuild.yaml b/apps/cr-hypervr/cloudbuild.yaml new file mode 100644 index 00000000..883a3a17 --- /dev/null +++ b/apps/cr-hypervr/cloudbuild.yaml @@ -0,0 +1,10 @@ +steps: + - name: 'gcr.io/cloud-builders/docker' + args: ['build', '-t', '${_IMAGE}', '.'] +images: + - '${_IMAGE}' +substitutions: + # Default to Artifact Registry in-region; override _IMAGE if needed + _IMAGE: '${_REGION}-docker.pkg.dev/$PROJECT_ID/embedding-service/api:latest' +options: + logging: CLOUD_LOGGING_ONLY diff --git a/apps/cr-hypervr/db/pgvector.sql b/apps/cr-hypervr/db/pgvector.sql new file mode 100644 index 00000000..0aa0fc22 --- /dev/null +++ b/apps/cr-hypervr/db/pgvector.sql @@ -0,0 +1 @@ +CREATE EXTENSION IF NOT EXISTS vector; diff --git a/apps/cr-hypervr/db/schema.sql b/apps/cr-hypervr/db/schema.sql new file mode 100644 index 00000000..542690dd --- /dev/null +++ b/apps/cr-hypervr/db/schema.sql @@ -0,0 +1,56 @@ +-- Enable pgvector extension +CREATE EXTENSION IF NOT EXISTS vector; + +-- Movies base table (enriched metadata subset) +CREATE TABLE IF NOT EXISTS movies ( + movie_id INTEGER PRIMARY KEY, + title TEXT NOT NULL, + genres TEXT, + overview TEXT, + release_year INTEGER, + tmdb_id INTEGER +); + +-- Movie embeddings (384-d float32 vectors, unit-normalized for cosine) +CREATE TABLE IF NOT EXISTS movie_embeddings ( + movie_id INTEGER PRIMARY KEY REFERENCES movies(movie_id) ON DELETE CASCADE, + embedding vector(384) +); + +-- HNSW index for fast cosine similarity search +DROP INDEX IF EXISTS idx_movie_embeddings_hnsw; +CREATE INDEX idx_movie_embeddings_hnsw ON movie_embeddings USING hnsw (embedding vector_cosine_ops); + +-- Optional: user cached embeddings +CREATE TABLE IF NOT EXISTS user_embeddings ( + user_id BIGINT PRIMARY KEY, + embedding vector(384), + updated_at TIMESTAMP DEFAULT 
now() +); + +-- User ratings table (MovieLens compatible) +-- Kept minimal for analytics/pipeline joins; raw imports may live in GCS +CREATE TABLE IF NOT EXISTS user_ratings ( + user_id BIGINT NOT NULL, + movie_id INTEGER NOT NULL REFERENCES movies(movie_id) ON DELETE CASCADE, + rating NUMERIC(2,1) NOT NULL CHECK (rating >= 0.5 AND rating <= 5.0), + rated_at TIMESTAMP, + PRIMARY KEY (user_id, movie_id) +); +CREATE INDEX IF NOT EXISTS idx_user_ratings_user ON user_ratings(user_id); +CREATE INDEX IF NOT EXISTS idx_user_ratings_movie ON user_ratings(movie_id); + +-- Hyperedges table to support graph-like relationships (e.g., co-watch, genre-affinity) +-- Flexible JSONB payload for features/weights +CREATE TABLE IF NOT EXISTS hyperedges ( + id BIGSERIAL PRIMARY KEY, + src_kind TEXT NOT NULL, -- e.g., 'user' or 'movie' + src_id BIGINT NOT NULL, + dst_kind TEXT NOT NULL, -- e.g., 'movie' or 'genre' + dst_id BIGINT NOT NULL, + weight REAL DEFAULT 1.0, + payload JSONB, + created_at TIMESTAMP DEFAULT now() +); +CREATE INDEX IF NOT EXISTS idx_hyperedges_src ON hyperedges(src_kind, src_id); +CREATE INDEX IF NOT EXISTS idx_hyperedges_dst ON hyperedges(dst_kind, dst_id); diff --git a/apps/cr-hypervr/pipeline/__init__.py b/apps/cr-hypervr/pipeline/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/apps/cr-hypervr/pipeline/__init__.py @@ -0,0 +1 @@ + diff --git a/apps/cr-hypervr/pipeline/__pycache__/__init__.cpython-313.pyc b/apps/cr-hypervr/pipeline/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57004dc581175fa1b11dd6469ca8c05dea1d9841 GIT binary patch literal 159 zcmey&%ge<81dy?)Syv2Vd}L;1WGrF^vH-!CCKdny literal 0 HcmV?d00001 diff --git a/apps/cr-hypervr/pipeline/__pycache__/triplets.cpython-313.pyc b/apps/cr-hypervr/pipeline/__pycache__/triplets.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2cdc58eb622e94f4120a70d540203739be2782cd GIT binary patch literal 5858 zcmcgwU2qdumcIR?f610COR_CH$lw41Y+-EkV+h#T28^){YIp3EAX*`{MGscD=#~L1 zsTu9g!)RuzCeCD1Av05hXJ=~HTgd}eGY@R-JS3YukZc|#+KEKNQrR8mA#YA#s*&_niA3J#;v%2--}4AMdG0=#ThAYlbR8Jp6wkUPlaK zh%t1Mkca@L$uaUIB~cZc8lxo|`lvC3WURbRlDYD>NS0?XH#KIJtV|t4x1jIPlAWn% z3>C_8mJYXS3)bG75o2UbePp%gYkAC!MdM+tjIEb`25tOfoQxf>^c_N~KkH;1{WA!~ z%m|&W-;Da!77U;ym89CyS(^#PVArrGO0sT36EDo7=cr9+mI%5&!@NO4wZ#NM%)~Oh zD9G?hpN?fNOvH&bN%%!o|A~jEfp{H7^f4CotFOzNKgF5kd*sFA=Qu(FLHuvh|h`hi6E&`az;|EawaBawY<%}rBl3Y|wS1cgafnfYE2jP)J_sq8Z@Q*FI!E*cNe{KDB>vFQxzPH%EH-CI_ zx?pQ7&;vJz;qefd(PKt4K?fLwcz7R7^g3ceC`0s+0`WBSJ=JvN6Q<)3oCbzs&=@I@ zEEzW9RUdD)GayhAi#EnmVVg$R-mLB%Z6$$bY28ts2(h#qJ#Uc75RyqZIr zv{xHgh}NPA^f&4HgAK$q_`$%M7-KzROg%K{<~M=83F7BtZxXNjb_15+A?7B|3L>rDM{ic`hSiKS}t+es&8U*@h>1Rcd!% zB5T%W1kDomorKMy8j^8&R#qLyhnQph9LFrk87`$7Wl_p-36)OHiRV?z$kjMkIl-zy ziU~RcsF7S~;SQ`zM$xjJsUIuzH|{Ly>f)@$MG({D~MF~2ygcz5KF->>)HnS5UHO@Dah zR{Qm{Z=PM=R%+W*Y}@lDantus_rLpZH;sI7rRbY3IHy-Bi-oq(4dTX@Cv=wdB z1$z3cFI=c;hWPScebYlkSX_7P&Q%KD|Mx{5vd<8eRPZnTLoMjtz}6uT{hvk>==V^= z&<^@Nj{(yy)}d|Gds_*3s*T{N=;`p}&Ada8*@#CM^B#` znZoQ9Cg0;sdaB}`+u-o*0Bgq~ed-M#k4ldD37oJOJufyrmYntIerm+(qw8Zn(Tq@J z?Usb7(s{x-n{8;2#379zfOz@V8BLd`$1bc@$&7RlIq6upvVO5JhE9lD!4Sbmv(mmM zM;IdbyjB`((DE5$JU|MNE5dqi(0JeyGU;4qyydW~Hl|V9eVS^DG-!R{Mvbby!@e4= zQ7om&7fc}}m?L1@YFl@0T3Cx_iv;+vVhc+T86pH2aiinJi3hNI>)+hHb=~f*+PWdc zSkDlQ?Eod%X8wb*<6Qz?cpD*133k1&t{;w8rO(0I!!2uSu6yY^f{j?mH*v%u&5fS} zE_=9boq#oxsn>L4Ock5Mjk2*t0$Pv8`%Rt^>Oh~r&r0=dU3iP8TkWl#5x5_}KxiKJ zFE|AkTOWS5#-mZJQ`?i!z&7Ys#Wo9o%}ikk^No#l>2ut*NSkVUn*0a`EtST)wONc) 
z4d(iJ%)=PhZ@@-)^8KbhGd62;hJH(*61zc0Biu@~_)l;P4tmBh273;9g^IQV2t~~Z z9r84wt@B3(P=ZTv_2^KA2}OYiKsARQb!gz(A@l@ShlZG-=Qdp%WQZ*)%V{;LCeyM4 zul&BK`uqSLk4r<*k(3%w#?&B7_)d5des@w)W5COD;{bvmb7vm_Z=%8SoQSs0!!jc> zVY+Tf6BvEP+WDC%k3cE{w~1*2?lz$yKk%&~p|piU#Gs~+OENLUQt6R6(8DPzCRRm_ zVpTbrRG+6LW`K0UWE9HZK!9|FL`A|%XjDq1ai~)|A*IrpVM$g)?cMMi+t?e89fvJx zIh09eq*PLtLL;yvlELYrWJ*$+Go!>6QPeb!CZtF@qwxs40lO1^N)Wnnbi-`RcD`a6 zZ!gw2yjS~f?Q~+cep|kNTefFnf6lfd$M3k(3Edr7jPBu_5e78Q-2^_pi9}QZ+PQ@V zZlQ3Qi*#413G98`k`QQyzzL|Iky()q1NWA;1)Y$&L6bf!a$$=OFFeJvJoJ-fUgXQN z*@%$Hd(pE7g$*Hv^`hQ7usZPv(h4#W4SRWk6%Bw0?PK@UBVW6hg9T(4p@GF7die0p%+YlFb^&4Wd~vI+0-7!pj!X z6sD40(nO2Sj{w>OccxT}rY4}gMEww|M}wLVq8q{DK&k-**0!^3ZRUZonh*vflM_lr zl5sNjycFNKt~nFMFN{bkrqaMvZEt@J-I(Y8wS?QWH9lt6cLdFt-i;U%F=a?0j-C#o z8&p;pRa9x17?m_ur8waesq|4|={gmYH031*9F-FiF-9{PNsb4t!~wb*&z zCH$0sKsSzxj_T~*T+Oze_0jQ;Tc*lcQ(fLvH)WbNt<9U(&aq~T`?`JkEw^vh-SmmO zso-9n-Fd@TI~jgw|AqZi!rO-mzIEB2n@-=gz0Vf>`~Q6ME-hlX?ICoNNCahg|-a5|W%~kX+&+fjp z%s<|7!{M3OJP~{KaF!`LYqROxiqK`|a^v*L%g^WRPi2`~X4_fo8SBL6R~=uPP({t0 z8QGoVec$}E1}#6pe0{^Y{0?F)?rV0}90$*Dl}5PkPun``&{hBHj%xnu6D+_VRU2qu zXYJU)eYBo|o>YSPBKyL7JGum+s}rEE?p+7QD&kQ!8Y&BjxaU`O@7mo3%iEvn9c#LG z$x`=K(yrT7PBBo*aY6R4b1OZU2-r7ekqvfdcoXw2_Xu}4$%>&R`wa-wY!B_3Vp1VHqjPBdhmp#m*vZv@*P-A{~?x& zW?rWY^g$_cW0AE7mQtiyyIbEQdqr=!@%}X;v&bKmlN(6aUE8?;8eFCC5ZodI$(Lu4 z8lcuhuZ~;rLUK^K#h|}KYK?;afw8LlLbM*5Z?^!7SlKDU3ytpJJeWH)xI?DLmQ9vH zfbIaJnKv(Ti?Zi3LpFk|lMSLlVA|S1)Y=Y_e^5RxvI68-JRJ3`v7DpT!tTW;XdGN{ z$TZ=AR;Grk5g4OrgzRi(NeGagm7OEK`h3n~96~{MK+9NyJmFH1nH!~3D&!QznpBc9 zX{3t3*Pp(PEXcfI(EhVVaIIqj`QsoPUpStpP6@l^NF`zu>sx> zs@m(!SKUVFgB&qw6eZ)tSju&vLP@GA3F;8%l8S+tV(HY#u&m%#z=F3!mw+EM*A5Ia zy|*l&x}Yh`K>+TJ%?}c~?w7-K-;%aS)Sx!B=~~=`E;0ibdBW3VjTqxNospxE7Aop! zrX;qD3}F<7ZVQxzVtzao9X=Y5K1o(RR3iGay|DETbw}gq4^1;K{PEP)Qy-1Z_8iFf z9QgFPLxrBhpYA+@!FlPtKs)ZKj{Z+Job@c^uMcO$!v_j3<@F1KmLuKL*>0At}eDQ)7Nwn+EhsGoVCINb!K0 z4vevZwS?I)W(rXKQi4GSLo`G|D-O}aD`^=%n3UBG0a}U=LKS@YFy2HFb1;fG1N5z> zJcX$*!UJ^Lh$VFd%Y+|D%TlmX%gHh2vNnx(&^2r(PV88U#v^)RLh~VFO5pTJ=IAKl zmE@QNH-H*LAqHtUqmB{-*lB8%nA9{>5HS^l!$E{UmQKoqOGqlQbV3%}jibX7vBH{p z#Ke%nl->=n8i=8xh?t?afO)9lZNo{q%vXa}LqJUf#ZwAumsom4Ruy8+q!pdnR#{5S z!+0B@(#jLU#)k+u0v5^uDX^Mw(L{n6=LN*U9_oWmQlBKN#H`Cgp*hyPKf32JU*J4I z`*8-gG{aAM3Vi4|x@$*{z?7q4ZyxW03gcE~{lrtbhR1U5$Fs(w+dpB>8veFo?Imwv z#m4dO8*5rVc%-nVb-Zr@`ieIhI+(j2RN>7sHyqBhk(VQrPk*{;$BetMspFG9$McR< zmc8X$e*W0GV^dWH-}+hKro3-c!PheDYt8#wKlinr>CSe5pL=7=gz|c8!Cw0dZmRv~ z#=L#?XZG4JIplwA&WN1uiQaRyQ{KG0DPOTl^Oa{u&yE&7%g(Pqw|+8N@T{Ho=RKQe zJ&)$$zx7DL6PfiS@}5M&lLYj24eyLz82#DlbG8ZYnyY5A?-DoN@hkJ?)t|ez!x@3~ z-+ji*|D99r(o?>y14hlQM)r#F&SGWt`Q7Js|775dIm=~*qOD@0reLeRX0I%o?PqOg zY`1Dw-l%Js+Vb|PVxZ|g%e$88b?-M{Y`)^Xa(W8aq+gQMar&SbGdDY&fht`E69XJ35z#mUZsvwpm{ z$lGUmf1dYGnm=vXIn!Eb*`0glxf~rjc!Rg(?2S{sQ*G1X54g*{zj0iN=0ZDWOgU@s zXZ$YB%Mf!UG8_dfh=8ww)DDfow=NubF+e}h;eB+#M$HXIG<)VX6Vd{%HjAnIg^vW( zuu7wmGV~(~QJ+v+`7}VmN@9&fhDN|cNRbGpxj8XWQ%og~Vw%bj7VO8EiOTV)g2Qyc z20<+)#Y*4As2>5#8qI)>VwzxJ8q_hZBec&67}5%)eL2(o4l#mfkmM7Z-O#Jb;2t88 zizN@M$D84Fd?Y13iBH2`8iL!+e1n{*IPkN0#fz`vq$J3tDj<+2&Y|abM?M zcdabb&fK;78Ry(KR9*W{)mv4$#_hSs`U_RNiA zF-5O$j-$Y>3iq6W0-(B{Wu-|d^o`bkS+vnH2J!anSb?j$#`{0xtKicDU;Xu5JBQ3} iDE}FE(eC pd.DataFrame: + files = [ + netflix_dir / "combined_data_1.txt", + netflix_dir / "combined_data_2.txt", + netflix_dir / "combined_data_3.txt", + netflix_dir / "combined_data_4.txt", + ] + rows: list[tuple[int, int, int, str]] = [] # movie_id, user_id, rating, date + for f in files: + if not f.exists(): + continue + movie_id = None + with f.open("r", encoding="latin-1") as fh: + for line in fh: + m = MOVIE_HEADER_RE.match(line) + 
if m: + movie_id = int(m.group(1)) + continue + if movie_id is None: + continue + parts = line.strip().split(",") + if len(parts) != 3: + continue + user_id, rating, date = int(parts[0]), int(parts[1]), parts[2] + rows.append((movie_id, user_id, rating, date)) + + df = pd.DataFrame(rows, columns=["movie_id", "user_id", "rating", "date"]) + return df + + +def load_movie_titles(netflix_dir: Path) -> pd.DataFrame: + mt = netflix_dir / "movie_titles.csv" + if not mt.exists(): + raise FileNotFoundError("movie_titles.csv not found") + # movie_id, year, title + df = pd.read_csv(mt, header=None, names=["movie_id", "year", "title"], encoding="latin-1") + return df + + +def build(netflix_dir: str = "data/netflix", out_dir: str = "data/processed") -> None: + netflix = Path(netflix_dir) + out = Path(out_dir) + out.mkdir(parents=True, exist_ok=True) + + print("Parsing Netflix combined data files...") + ratings = parse_combined_files(netflix) + print(f"Parsed {len(ratings):,} ratings") + ratings.to_parquet(out / "ratings.parquet") + + print("Loading movie titles...") + movies = load_movie_titles(netflix) + movies.to_parquet(out / "movies.parquet") + + # Basic validation stats + print( + { + "users": ratings["user_id"].nunique(), + "movies": ratings["movie_id"].nunique(), + "ratings": len(ratings), + } + ) + + +if __name__ == "__main__": + build() + diff --git a/apps/cr-hypervr/pipeline/tmdb_enrich.py b/apps/cr-hypervr/pipeline/tmdb_enrich.py new file mode 100644 index 00000000..7ffe9d5b --- /dev/null +++ b/apps/cr-hypervr/pipeline/tmdb_enrich.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from pathlib import Path +import pandas as pd + + +def normalize_title(t: str) -> str: + return (t or "").strip().lower() + + +def enrich( + processed_dir: str = "data/processed", + tmdb_csv_path: str = "data/tmdb/movies_metadata.csv", + out_path: str = "data/processed/movies_enriched.parquet", +) -> None: + movies_pq = Path(processed_dir) / "movies.parquet" + if not movies_pq.exists(): + raise FileNotFoundError("Run netflix_parser.build() first to generate movies.parquet") + + movies = pd.read_parquet(movies_pq) + movies["title_norm"] = movies["title"].map(normalize_title) + + if Path(tmdb_csv_path).exists(): + tm = pd.read_csv(tmdb_csv_path, low_memory=False) + # Keep relevant fields + keep = [ + "id", + "title", + "overview", + "genres", + "release_date", + "vote_average", + "popularity", + ] + tm = tm[keep] + tm["title_norm"] = tm["title"].map(normalize_title) + + # Naive title-based join (improve later with year-based matching) + merged = movies.merge(tm, on="title_norm", how="left", suffixes=("", "_tmdb")) + merged.rename(columns={"id": "tmdb_id"}, inplace=True) + merged.to_parquet(out_path) + print(f"Enriched movies saved to {out_path}") + else: + # Fallback: save base movies without enrichment + movies.to_parquet(out_path) + print("TMDB CSV not found; saved base movies without enrichment.") + + +if __name__ == "__main__": + enrich() + diff --git a/apps/cr-hypervr/pipeline/triplets.py b/apps/cr-hypervr/pipeline/triplets.py new file mode 100644 index 00000000..26eeab12 --- /dev/null +++ b/apps/cr-hypervr/pipeline/triplets.py @@ -0,0 +1,97 @@ +from __future__ import annotations + +from pathlib import Path +import os +import pandas as pd +import numpy as np +import glob +try: + import gcsfs # noqa: F401 +except Exception: + gcsfs = None + + +def _storage_options(path: str | Path) -> dict | None: + p = str(path) + return {"token": "cloud"} if p.startswith("gs://") else None + + +def 
_list_ratings_parts(processed_dir: str) -> list[str]: + pat = f"{processed_dir}/ratings_enriched-*.parquet" + # Try gcsfs glob + if str(processed_dir).startswith("gs://"): + try: + import gcsfs # type: ignore + fs = gcsfs.GCSFileSystem() + matches = sorted(fs.glob(pat)) + if matches: + return [m if m.startswith("gs://") else ("gs://" + m) for m in matches] + except Exception: + pass + else: + files = sorted(glob.glob(pat)) + if files: + return files + # Fallback: sequential probe + return [f"{processed_dir}/ratings_enriched-{i:05d}.parquet" for i in range(0, 200)] + + +def generate_triplets( + processed_dir: str = os.getenv("GCS_PROCESSED_PREFIX", "data/processed"), + out_dir: str = os.getenv("GCS_TRIPLETS_PREFIX", "data/processed/triplets"), + user_sample: int | None = 10_000, + random_state: int = 42, +) -> None: + # Avoid Path round-tripping for GCS URIs; Path("gs://...") becomes "gs:/..." + out_is_gcs = str(out_dir).startswith("gs://") + if not out_is_gcs: + Path(out_dir).mkdir(parents=True, exist_ok=True) + + # Load movies metadata (small enough) + movies_path = f"{processed_dir}/movies_with_descriptions.parquet" if str(processed_dir).startswith("gs://") else Path(processed_dir) / "movies_with_descriptions.parquet" + movies = pd.read_parquet(movies_path, storage_options=_storage_options(movies_path)) + + # Split positives and negatives + # Build quick genre map if available + # genres may be JSON-like text; keep as raw string match for simplicity + movie_genres = movies.set_index("movieId")["genres"].to_dict() + rng = np.random.default_rng(random_state) + trip_rows: list[tuple[int, int, int]] = [] # (user_id, pos_movie, neg_movie) + + parts = _list_ratings_parts(processed_dir) + if not parts: + # Fallback to single-file + parts = [f"{processed_dir}/ratings_enriched.parquet"] + + for pth in parts: + df = pd.read_parquet(pth, storage_options=_storage_options(pth), columns=["user_id", "movieId", "rating"]) + positives = df[df["rating"] >= 4.0] + negatives = df[df["rating"] <= 2.0] + if positives.empty or negatives.empty: + continue + pos_by_user = positives.groupby("user_id")["movieId"].apply(list).to_dict() + neg_by_user = negatives.groupby("user_id")["movieId"].apply(list).to_dict() + for u, pos_list in pos_by_user.items(): + neg_list = neg_by_user.get(u) + if not neg_list: + continue + p = rng.choice(pos_list) + p_genres = str(movie_genres.get(int(p), "")) + candidates = [n for n in neg_list if any(tok in str(movie_genres.get(int(n), "")) for tok in p_genres.split())] + if not candidates: + candidates = neg_list + n = rng.choice(candidates) + trip_rows.append((int(u), int(p), int(n))) + if user_sample is not None and len(trip_rows) >= user_sample: + break + if user_sample is not None and len(trip_rows) >= user_sample: + break + + df = pd.DataFrame(trip_rows, columns=["user_id", "pos_movie_id", "neg_movie_id"]) + out_path = (f"{out_dir}/triplets_10k.parquet" if out_is_gcs else str(Path(out_dir) / "triplets_10k.parquet")) + df.to_parquet(out_path, storage_options=_storage_options(out_path)) + print(f"Saved {len(df):,} triplets to {out_path}") + + +if __name__ == "__main__": + generate_triplets() diff --git a/apps/cr-hypervr/pipeline/user_profiles.py b/apps/cr-hypervr/pipeline/user_profiles.py new file mode 100644 index 00000000..88ce8209 --- /dev/null +++ b/apps/cr-hypervr/pipeline/user_profiles.py @@ -0,0 +1,108 @@ +from __future__ import annotations + +from pathlib import Path +import os +import pandas as pd +from collections import defaultdict +from typing import Dict, List +import 
glob +try: + import gcsfs # noqa: F401 +except Exception: + gcsfs = None + + +def _storage_options(path: str | Path) -> dict | None: + p = str(path) + return {"token": "cloud"} if p.startswith("gs://") else None + + +def _list_ratings_parts(processed_dir: str) -> List[str]: + pat = f"{processed_dir}/ratings_enriched-*.parquet" + # Try gcsfs glob first + if processed_dir.startswith("gs://"): + try: + import gcsfs # type: ignore + + fs = gcsfs.GCSFileSystem() + matches = sorted(fs.glob(pat)) + if matches: + return [m if m.startswith("gs://") else ("gs://" + m) for m in matches] + except Exception: + pass + else: + files = sorted(glob.glob(pat)) + if files: + return files + # Fallback: sequential probe up to 200 parts + out: List[str] = [] + for i in range(0, 200): + p = f"{processed_dir}/ratings_enriched-{i:05d}.parquet" + # Defer existence check to reader; caller will catch FileNotFoundError + out.append(p) + return out + + +def build_user_profiles( + processed_dir: str = os.getenv("GCS_PROCESSED_PREFIX", "data/processed"), + out_path: str = os.getenv("GCS_PROFILES_PATH", "data/processed/user_profiles.parquet"), + min_ratings: int = 10, +) -> None: + # Stream-friendly aggregation across enriched parts + parts = _list_ratings_parts(processed_dir) + if not parts: + # Fallback to single-file path + single = f"{processed_dir}/ratings_enriched.parquet" + parts = [single] + + counts: Dict[int, int] = defaultdict(int) + pos_titles: Dict[int, List[str]] = defaultdict(list) + neg_titles: Dict[int, List[str]] = defaultdict(list) + + def _cap_append(d: Dict[int, List[str]], k: int, vals: List[str], cap: int = 50) -> None: + if not vals: + return + cur = d[k] + room = cap - len(cur) + if room <= 0: + return + cur.extend([v for v in vals[:room] if isinstance(v, str)]) + + for p in parts: + df = pd.read_parquet(p, storage_options=_storage_options(p), columns=["user_id", "rating", "title"]) + # counts + for uid, n in df.groupby("user_id").size().items(): + counts[int(uid)] += int(n) + # positives + pos = df[df["rating"] >= 4.0] + if not pos.empty: + agg = pos.groupby("user_id")["title"].apply(lambda s: list(s.dropna().astype(str))).to_dict() + for uid, titles in agg.items(): + _cap_append(pos_titles, int(uid), titles) + # negatives + neg = df[df["rating"] <= 2.0] + if not neg.empty: + agg = neg.groupby("user_id")["title"].apply(lambda s: list(s.dropna().astype(str))).to_dict() + for uid, titles in agg.items(): + _cap_append(neg_titles, int(uid), titles) + + # Build final DataFrame + rows = [] + for uid, cnt in counts.items(): + if cnt < min_ratings: + continue + rows.append( + { + "user_id": uid, + "num_ratings": int(cnt), + "liked_titles": ", ".join(pos_titles.get(uid, [])[:50]), + "disliked_titles": ", ".join(neg_titles.get(uid, [])[:50]), + } + ) + profiles = pd.DataFrame(rows) + profiles.to_parquet(out_path, storage_options=_storage_options(out_path), index=False) + print(f"User profiles saved to {out_path} ({len(profiles):,} users)") + + +if __name__ == "__main__": + build_user_profiles() diff --git a/apps/cr-hypervr/requirements.txt b/apps/cr-hypervr/requirements.txt new file mode 100644 index 00000000..ee14c919 --- /dev/null +++ b/apps/cr-hypervr/requirements.txt @@ -0,0 +1,17 @@ +fastapi>=0.109.0 +uvicorn[standard]>=0.27.0 +sentence-transformers>=3.0.0 +onnxruntime>=1.16.0 +onnx>=1.14.0 +asyncpg>=0.29.0 +pgvector>=0.2.0 +numpy>=1.26.0 +pydantic>=2.5.0 +pydantic-settings>=2.2.0 +pandas>=2.0.0 +pyarrow>=14.0.0 +tqdm>=4.66.0 +torch>=2.1.0 +python-dotenv>=1.0.0 +httpx>=0.27.0 +gcsfs>=2023.6.0 diff 
--git a/apps/cr-hypervr/scripts/backfill_embeddings_db.py b/apps/cr-hypervr/scripts/backfill_embeddings_db.py new file mode 100644 index 00000000..8a10191c --- /dev/null +++ b/apps/cr-hypervr/scripts/backfill_embeddings_db.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +""" +Backfill movie embeddings directly from the Cloud SQL database. + +Reads rows from `movies` that are missing an entry in `movie_embeddings`, +builds a simple text representation (title/genres/overview), encodes with the +service embedder (SentenceTransformer or hash backend), and upserts into DB. + +Environment: +- DATABASE_URL (required) +- BATCH_SIZE (optional, default 256) +- LIMIT (optional, limit number of rows for test runs) +- EMBEDDING_BACKEND (optional, e.g., st|hash|auto) + +Intended to run inside the same container image used by the API and Cloud Run +Jobs, so it has the same dependencies and cached base MiniLM model. +""" + +import asyncio +import os +from typing import Iterable, List, Tuple +import time +import requests + +import numpy as np + + +def build_movie_text(title: str | None, genres: str | None, overview: str | None) -> str: + return ( + f"Title: {title or ''}\n" + f"Genres: {genres or ''}\n" + f"Overview: {overview or ''}" + ) + + +def _fetch_id_token(audience: str) -> str | None: + tok = os.getenv("ID_TOKEN") + if tok: + return tok + try: + resp = requests.get( + "http://metadata/computeMetadata/v1/instance/service-accounts/default/identity", + params={"audience": audience, "format": "full"}, + headers={"Metadata-Flavor": "Google"}, + timeout=3, + ) + if resp.status_code == 200 and resp.text: + return resp.text.strip() + except Exception: + pass + return None + + +def _encode(texts: List[str]) -> np.ndarray: + service_url = os.getenv("SERVICE_URL") + if not service_url: + if os.getenv("ALLOW_LOCAL_FALLBACK", "").lower() in ("1", "true", "yes"): + from app.services.embedder import get_embedder # type: ignore + + vecs = get_embedder().encode(texts) + vecs = vecs.astype(np.float32) + norms = np.linalg.norm(vecs, axis=1, keepdims=True) + norms[norms == 0] = 1.0 + return vecs / norms + raise SystemExit("SERVICE_URL not set; Cloud Run embedding service required") + token = _fetch_id_token(service_url) + headers = {"Content-Type": "application/json"} + if token: + headers["Authorization"] = f"Bearer {token}" + sess = requests.Session() + out: list[np.ndarray] = [] + batch = int(os.getenv("BATCH_EMBED_SIZE", "256")) + for i in range(0, len(texts), batch): + chunk = texts[i : i + batch] + for attempt in range(4): + try: + r = sess.post( + f"{service_url.rstrip('/')}/embed/batch", + json={"texts": chunk}, + headers=headers, + timeout=30, + ) + if r.status_code >= 500 and attempt < 3: + time.sleep(1.5 * (attempt + 1)) + continue + r.raise_for_status() + payload = r.json() + vecs = [np.array(item["embedding"], dtype=np.float32) for item in payload] + out.extend(vecs) + break + except Exception: + if attempt >= 3: + raise + time.sleep(1.5 * (attempt + 1)) + continue + arr = np.stack(out, axis=0) + norms = np.linalg.norm(arr, axis=1, keepdims=True) + norms[norms == 0] = 1.0 + return (arr / norms).astype(np.float32) + + +def _vec_to_pg(v: Iterable[float]) -> str: + return "[" + ",".join(str(float(x)) for x in v) + "]" + + +async def _fetch_missing(conn, limit: int | None) -> List[Tuple[int, str | None, str | None, str | None]]: + q = ( + "SELECT m.movie_id, m.title, m.genres, m.overview " + "FROM movies m LEFT JOIN movie_embeddings e USING (movie_id) " + "WHERE e.movie_id 
IS NULL ORDER BY m.movie_id" + ) + if limit and limit > 0: + q += " LIMIT $1" + rows = await conn.fetch(q, int(limit)) + else: + rows = await conn.fetch(q) + return [(int(r["movie_id"]), r["title"], r["genres"], r["overview"]) for r in rows] + + +async def _upsert(conn, mids: List[int], vecs: np.ndarray) -> None: + await conn.executemany( + ( + "INSERT INTO movie_embeddings (movie_id, embedding) " + "VALUES ($1, $2) ON CONFLICT (movie_id) DO UPDATE SET embedding=EXCLUDED.embedding" + ), + [(int(mid), _vec_to_pg(vec.tolist())) for mid, vec in zip(mids, vecs)], + ) + + +async def backfill(database_url: str, batch_size: int = 256, limit: int | None = None) -> int: + import asyncpg # lazy import to keep import-time deps light for tests + + conn = await asyncpg.connect(database_url) + processed = 0 + try: + pending = await _fetch_missing(conn, limit) + if not pending: + print("No missing embeddings found.") + return 0 + print(f"Missing embeddings: {len(pending)}") + # Process in batches + for i in range(0, len(pending), batch_size): + batch = pending[i : i + batch_size] + mids = [mid for (mid, _t, _g, _o) in batch] + texts = [build_movie_text(t, g, o) for (_mid, t, g, o) in batch] + vecs = _encode(texts) + await _upsert(conn, mids, vecs) + processed += len(batch) + print(f"Upserted {processed}/{len(pending)} embeddings...") + return processed + finally: + await conn.close() + + +def main() -> int: + db_url = os.getenv("DATABASE_URL") + if not db_url: + print("Set DATABASE_URL", flush=True) + return 2 + batch_size = int(os.getenv("BATCH_SIZE", "256")) + limit_env = os.getenv("LIMIT") + limit = int(limit_env) if limit_env else None + print(f"Starting backfill: batch_size={batch_size}, limit={limit}") + processed = asyncio.run(backfill(db_url, batch_size=batch_size, limit=limit)) + print(f"Backfill complete. Processed: {processed}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/apps/cr-hypervr/scripts/build_hyperedges.py b/apps/cr-hypervr/scripts/build_hyperedges.py new file mode 100644 index 00000000..6e22e5a2 --- /dev/null +++ b/apps/cr-hypervr/scripts/build_hyperedges.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import os +from pathlib import Path +import sys +import gc +from collections import defaultdict +import itertools +import glob +import pandas as pd + +try: + import gcsfs # type: ignore +except Exception: + gcsfs = None + + +def _storage_options(path: str | Path) -> dict | None: + p = str(path) + return {"token": "cloud"} if p.startswith("gs://") else None + + +def _list_ratings_parts(processed_dir: str) -> list[str]: + pat = f"{processed_dir}/ratings_enriched-*.parquet" + if str(processed_dir).startswith("gs://"): + try: + import gcsfs # type: ignore + fs = gcsfs.GCSFileSystem() + matches = sorted(fs.glob(pat)) + if matches: + return [m if m.startswith("gs://") else ("gs://" + m) for m in matches] + except Exception: + pass + else: + files = sorted(glob.glob(pat)) + if files: + return files + # Fallback: sequential probe (reader will fail fast if missing) + return [f"{processed_dir}/ratings_enriched-{i:05d}.parquet" for i in range(0, 200)] + + +def co_watch_edges( + df: pd.DataFrame, + min_rating: float = 4.0, + max_movies_per_user: int = 20, + min_pair_count: int = 3, + top_edges_per_movie: int = 50, +) -> list[tuple[str, int, str, int, float]]: + """Compute simple co-watch hyperedges between movies watched positively by same user. + Returns list of (src_kind, src_id, dst_kind, dst_id, weight). 
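+
+    Illustrative example (hypothetical data, not taken from the pipeline): if users 1
+    and 2 both rate movies 101 and 202 at >= min_rating and min_pair_count=2, the pair
+    (101, 202) is counted twice and two symmetric edges are emitted:
+    ("movie", 101, "movie", 202, 2.0) and ("movie", 202, "movie", 101, 2.0).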
+ """ + pos = df[df["rating"] >= min_rating] + by_user = pos.groupby("user_id")["movie_id"].apply(list) + counts: dict[tuple[int, int], int] = defaultdict(int) + for movies in by_user: + uniq = list(sorted(set(int(m) for m in movies))) + if len(uniq) > max_movies_per_user: + uniq = uniq[:max_movies_per_user] + for a, b in itertools.combinations(uniq, 2): + counts[(a, b)] += 1 + filtered = {k: v for k, v in counts.items() if v >= min_pair_count} + per_src: dict[int, list[tuple[int, float]]] = defaultdict(list) + for (a, b), c in filtered.items(): + per_src[a].append((b, float(c))) + per_src[b].append((a, float(c))) + edges: list[tuple[str, int, str, int, float]] = [] + for src, lst in per_src.items(): + lst.sort(key=lambda x: x[1], reverse=True) + for dst, w in lst[:top_edges_per_movie]: + edges.append(("movie", src, "movie", dst, w)) + return edges + + +def genre_affinity_edges(movies: pd.DataFrame) -> list[tuple[str, int, str, int, float]]: + """Create edges from movie->genre tokens for lightweight hypergraph support.""" + edges: list[tuple[str, int, str, int, float]] = [] + genre_ids: dict[str, int] = {} + next_gid = 1_000_000 # avoid collision with movie ids + for row in movies.itertuples(index=False): + mid = int(getattr(row, "movieId", getattr(row, "movie_id", 0))) + genres = str(getattr(row, "genres", "")).split("|") if "|" in str(getattr(row, "genres", "")) else str(getattr(row, "genres", "")).split(",") + for g in [g.strip() for g in genres if g and isinstance(g, str)]: + gid = genre_ids.setdefault(g, next_gid) + if gid == next_gid: + next_gid += 1 + edges.append(("movie", mid, "genre", gid, 1.0)) + return edges + + +def write_to_db(db_url: str, edges: list[tuple[str, int, str, int, float]]) -> None: + import asyncpg, asyncio + + async def run() -> None: + conn = await asyncpg.connect(db_url) + try: + await conn.executemany( + """ + INSERT INTO hyperedges (src_kind, src_id, dst_kind, dst_id, weight) + VALUES ($1,$2,$3,$4,$5) + """, + edges, + ) + finally: + await conn.close() + + asyncio.run(run()) + + +def main() -> None: + # Accept either PROCESSED_PREFIX or legacy GCS_PROCESSED_PREFIX for consistency with other jobs + processed = os.getenv("PROCESSED_PREFIX") or os.getenv("GCS_PROCESSED_PREFIX") or "data/processed" + ratings_parts = _list_ratings_parts(processed) + if not ratings_parts: + raise FileNotFoundError("ratings_enriched-*.parquet not found in processed dir") + movies_path = f"{processed}/movies_with_descriptions.parquet" if str(processed).startswith("gs://") else Path(processed) / "movies_with_descriptions.parquet" + movies = pd.read_parquet(movies_path, storage_options=_storage_options(movies_path)) + + # Stream through parts to build co-watch edges without loading all rows in memory + min_rating = float(os.getenv("MIN_RATING", "4.0")) + max_movies_per_user = int(os.getenv("MAX_MOVIES_PER_USER", "20")) + min_pair_count = int(os.getenv("MIN_PAIR_COUNT", "3")) + top_edges_per_movie = int(os.getenv("TOP_EDGES_PER_MOVIE", "50")) + max_parts = int(os.getenv("MAX_PARTS", "0")) # 0 = all + total_rows = 0 + # Prepare per-part edges output to reduce memory and aid retries + parts_dir = f"{processed}/hyperedges_parts" if str(processed).startswith("gs://") else str(Path(processed) / "hyperedges_parts") + if not str(parts_dir).startswith("gs://"): + Path(parts_dir).mkdir(parents=True, exist_ok=True) + + # Pass 1: generate bounded edges per part and persist + for idx, p in enumerate(ratings_parts): + if max_parts and idx >= max_parts: + break + try: + part_df = pd.read_parquet(p, 
storage_options=_storage_options(p)).copy() + except FileNotFoundError: + continue + if "user_id" not in part_df.columns and "userId" in part_df.columns: + part_df = part_df.rename(columns={"userId": "user_id"}) + if "movie_id" not in part_df.columns and "movieId" in part_df.columns: + part_df = part_df.rename(columns={"movieId": "movie_id"}) + total_rows += len(part_df) + part_edges = co_watch_edges( + part_df, + min_rating=min_rating, + max_movies_per_user=max_movies_per_user, + min_pair_count=min_pair_count, + top_edges_per_movie=top_edges_per_movie, + ) + # Persist part edges + edf = pd.DataFrame(part_edges, columns=["src_kind", "src_id", "dst_kind", "dst_id", "weight"]) if part_edges else pd.DataFrame(columns=["src_kind", "src_id", "dst_kind", "dst_id", "weight"]) + outp = f"{parts_dir}/edges_part_{idx:05d}.parquet" if str(parts_dir).startswith("gs://") else str(Path(parts_dir) / f"edges_part_{idx:05d}.parquet") + edf.to_parquet(outp, storage_options=_storage_options(outp), index=False) + print(f"Wrote edges for part {idx:05d}: {len(edf):,} -> {outp}") + sys.stdout.flush() + # Free memory between parts + del part_df, part_edges, edf + gc.collect() + + # Pass 2: aggregate per-part edges into bounded top-K per source + from glob import glob as _glob + parts_list: list[str] + if str(parts_dir).startswith("gs://"): + try: + import gcsfs # type: ignore + fs = gcsfs.GCSFileSystem() + parts_list = sorted(fs.glob(f"{parts_dir}/edges_part_*.parquet")) + parts_list = [p if p.startswith("gs://") else ("gs://" + p) for p in parts_list] + except Exception: + parts_list = [] + else: + parts_list = sorted(_glob(str(Path(parts_dir) / "edges_part_*.parquet"))) + + per_src_global: dict[int, dict[int, float]] = defaultdict(dict) + for j, ep in enumerate(parts_list): + try: + e = pd.read_parquet(ep, storage_options=_storage_options(ep), columns=["src_id", "dst_id", "weight"]) # type: ignore[arg-type] + except Exception: + continue + for row in e.itertuples(index=False): + src = int(getattr(row, "src_id")) + dst = int(getattr(row, "dst_id")) + w = float(getattr(row, "weight")) + d = per_src_global[src] + d[dst] = d.get(dst, 0.0) + w + # Prune per-src maps to keep bounded size + for src, d in list(per_src_global.items()): + if len(d) > top_edges_per_movie: + top = sorted(d.items(), key=lambda kv: kv[1], reverse=True)[:top_edges_per_movie] + per_src_global[src] = dict(top) + if (j + 1) % 10 == 0: + approx_edges = sum(len(d) for d in per_src_global.values()) + print(f"Aggregated {j+1}/{len(parts_list)} edge parts, approx edges {approx_edges:,}") + sys.stdout.flush() + + # Emit final edges + edges: list[tuple[str, int, str, int, float]] = [] + for src, d in per_src_global.items(): + for dst, w in d.items(): + edges.append(("movie", src, "movie", dst, float(w))) + edges += genre_affinity_edges(movies) + + # Write optional parquet artifact (local or GCS) + out_path = f"{processed}/hyperedges.parquet" if str(processed).startswith("gs://") else str(Path(processed) / "hyperedges.parquet") + pd.DataFrame(edges, columns=["src_kind", "src_id", "dst_kind", "dst_id", "weight"]).to_parquet(out_path, storage_options=_storage_options(out_path), index=False) + print(f"Hyperedges written to {out_path} ({len(edges):,} rows)") + + db_url = os.getenv("DATABASE_URL") + if db_url: + # Chunked DB insert to avoid long blocking executemany + CHUNK = int(os.getenv("EDGE_DB_CHUNK", "5000")) + import asyncpg, asyncio + async def run() -> None: + conn = await asyncpg.connect(db_url) + try: + q = "INSERT INTO hyperedges (src_kind, 
src_id, dst_kind, dst_id, weight) VALUES ($1,$2,$3,$4,$5)" + for i in range(0, len(edges), CHUNK): + await conn.executemany(q, edges[i:i+CHUNK]) + print(f"Inserted edges {min(i+CHUNK, len(edges))}/{len(edges)} into DB...") + sys.stdout.flush() + finally: + await conn.close() + asyncio.run(run()) + print("Also inserted into database hyperedges table.") + sys.stdout.flush() + + +if __name__ == "__main__": + main() diff --git a/apps/cr-hypervr/scripts/db_apply_cloudsql.sh b/apps/cr-hypervr/scripts/db_apply_cloudsql.sh new file mode 100644 index 00000000..fedc9639 --- /dev/null +++ b/apps/cr-hypervr/scripts/db_apply_cloudsql.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +set -euo pipefail + +SQL_INSTANCE=${SQL_INSTANCE:-embeddings-sql-europe-west2} +DB_NAME=${DB_NAME:-movies} + +echo "Applying pgvector extension as postgres..." +gcloud sql connect "$SQL_INSTANCE" --user=postgres --database="$DB_NAME" --quiet < db/pgvector.sql + +echo "Applying schema as postgres..." +gcloud sql connect "$SQL_INSTANCE" --user=postgres --database="$DB_NAME" --quiet < db/schema.sql + +echo "Schema applied." + diff --git a/apps/cr-hypervr/scripts/deploy_cloud_run.sh b/apps/cr-hypervr/scripts/deploy_cloud_run.sh new file mode 100644 index 00000000..982110dd --- /dev/null +++ b/apps/cr-hypervr/scripts/deploy_cloud_run.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Default to repo-local Cloud SDK config to avoid $HOME perms issues +export CLOUDSDK_CONFIG="${CLOUDSDK_CONFIG:-$(pwd)/.gcloud}" +mkdir -p "$CLOUDSDK_CONFIG" + +SERVICE_NAME=${SERVICE_NAME:-embedding-service} +PROJECT_ID=${PROJECT_ID:?set PROJECT_ID} +REGION=${REGION:-europe-west2} +AR_REPO=${AR_REPO:-embedding-service} +IMAGE=${IMAGE:-$REGION-docker.pkg.dev/$PROJECT_ID/$AR_REPO/api:latest} + +INSTANCE_CONNECTION_NAME=${INSTANCE_CONNECTION_NAME:-$(gcloud sql instances describe embeddings-sql-$REGION --format='value(connectionName)' 2>/dev/null || true)} + +gcloud run deploy "$SERVICE_NAME" \ + --image "$IMAGE" \ + --project "$PROJECT_ID" \ + --region "$REGION" \ + --platform managed \ + --allow-unauthenticated \ + --cpu 2 --memory 2Gi --max-instances 10 \ + --port 8080 \ + --add-cloudsql-instances "$INSTANCE_CONNECTION_NAME" \ + --set-secrets DATABASE_URL=database-url:latest \ + --set-env-vars ENVIRONMENT=prod,BASE_MODEL_DIR=models/base-minilm${MODEL_GCS_URI:+,MODEL_GCS_URI=${MODEL_GCS_URI}} \ + ${EXTRA_ARGS:-} diff --git a/apps/cr-hypervr/scripts/deploy_graph_service.sh b/apps/cr-hypervr/scripts/deploy_graph_service.sh new file mode 100644 index 00000000..db90dfb5 --- /dev/null +++ b/apps/cr-hypervr/scripts/deploy_graph_service.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Thin wrapper to deploy a dedicated Cloud Run service focused on graph recommendations. +# It reuses the same container image but sets a distinct service name and enables +# graph-related settings by default. 
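+#
+# Example invocation (illustrative; the project and service-account values are
+# placeholders, not defined in this repo):
+#
+#   PROJECT_ID=my-project REGION=europe-west2 SERVICE_NAME=graph-service \
+#   USE_GRAPH_SCORER=true USE_RERANKER=false \
+#   EXTRA_FLAGS="--service-account=embedding-service@my-project.iam.gserviceaccount.com" \
+#   bash scripts/deploy_graph_service.sh
+#
+# All settings are forwarded to deploy_cloud_run.sh; only the env-var defaults differ.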
+ +: "${PROJECT_ID:?set PROJECT_ID}" +REGION=${REGION:-europe-west2} +SERVICE_NAME=${SERVICE_NAME:-infra-service} +AR_REPO=${AR_REPO:-embedding-service} +MODEL_GCS_URI=${MODEL_GCS_URI:-} + +# Default to repo-local Cloud SDK config if not provided +export CLOUDSDK_CONFIG="${CLOUDSDK_CONFIG:-$(pwd)/.gcloud}" +mkdir -p "$CLOUDSDK_CONFIG" + +EXTRA_ARGS=( + --set-env-vars USE_RERANKER=${USE_RERANKER:-false} + --set-env-vars USE_GRAPH_SCORER=${USE_GRAPH_SCORER:-true} +) + +if [[ -n "${EXTRA_SET_VARS:-}" ]]; then + # Allow callers to pass additional comma-separated vars like KEY=V,FOO=BAR + EXTRA_ARGS+=( --set-env-vars "${EXTRA_SET_VARS}" ) +fi + +# Allow callers to pass through arbitrary additional flags (e.g., --service-account=...) +if [[ -n "${EXTRA_FLAGS:-}" ]]; then + # Word-split intentionally to support multiple flags + # shellcheck disable=SC2206 + EXTRA_ARGS+=( ${EXTRA_FLAGS} ) +fi + +SERVICE_NAME=${SERVICE_NAME} PROJECT_ID=${PROJECT_ID} REGION=${REGION} AR_REPO=${AR_REPO} \ +EXTRA_ARGS="${EXTRA_ARGS[*]}" MODEL_GCS_URI="${MODEL_GCS_URI}" \ +bash "$(dirname "$0")/deploy_cloud_run.sh" + +echo "Deployed Cloud Run service: ${SERVICE_NAME} (project=${PROJECT_ID}, region=${REGION})" diff --git a/apps/cr-hypervr/scripts/deploy_jobs.sh b/apps/cr-hypervr/scripts/deploy_jobs.sh new file mode 100644 index 00000000..67efdabd --- /dev/null +++ b/apps/cr-hypervr/scripts/deploy_jobs.sh @@ -0,0 +1,193 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Deploy Cloud Run Jobs for data join and phase2 pipeline. +# Requirements: image pushed to Artifact Registry and service account for jobs. +# Usage: +# PROJECT_ID=... REGION=europe-west2 AR_REPO=embedding-service JOBS_SA=embedding-jobs \ +# DATA_PREFIX=gs:///data PROCESSED_PREFIX=gs:///data/processed \ +# TRIPLETS_PREFIX=gs:///triplets PROFILES_PATH=gs:///data/processed/user_profiles.parquet \ +# ./scripts/deploy_jobs.sh + +PROJECT_ID=${PROJECT_ID:?set PROJECT_ID} +REGION=${REGION:-europe-west2} +AR_REPO=${AR_REPO:-embedding-service} +IMAGE=${IMAGE:-$REGION-docker.pkg.dev/$PROJECT_ID/$AR_REPO/api:latest} +JOBS_SA=${JOBS_SA:-embedding-jobs@${PROJECT_ID}.iam.gserviceaccount.com} +INSTANCE_CONNECTION_NAME=${INSTANCE_CONNECTION_NAME:-$(gcloud sql instances describe ${SQL_INSTANCE:-embeddings-sql-${REGION}} --format='value(connectionName)' 2>/dev/null || true)} + +# Default to stable bucket envs if explicit prefixes not provided +DATA_PREFIX=${DATA_PREFIX:-${GCS_DATA_BUCKET}/data} +PROCESSED_PREFIX=${PROCESSED_PREFIX:-${GCS_EMB_BUCKET}/data/processed} +TRIPLETS_PREFIX=${TRIPLETS_PREFIX:-${GCS_EMB_BUCKET}/triplets} +PROFILES_PATH=${PROFILES_PATH:-${GCS_EMB_BUCKET}/data/processed/user_profiles.parquet} +SERVICE_URL=${SERVICE_URL:-$(gcloud run services describe embedding-service --project "$PROJECT_ID" --region="$REGION" --format='value(status.url)' 2>/dev/null || true)} + +echo "Image: $IMAGE" +echo "Jobs SA: $JOBS_SA" +echo "Cloud SQL: ${INSTANCE_CONNECTION_NAME:-[unset]}" +echo "Data prefix: ${DATA_PREFIX:-[unset]}" +echo "Processed: ${PROCESSED_PREFIX:-[unset]}" +echo "Triplets: ${TRIPLETS_PREFIX:-[unset]}" +echo "Profiles: ${PROFILES_PATH:-[unset]}" +echo "Service URL: ${SERVICE_URL:-[unset]}" + +common_env=( + --set-env-vars PYTHONPATH=/app + --set-env-vars GCS_DATA_PREFIX=${DATA_PREFIX} + --set-env-vars GCS_PROCESSED_PREFIX=${PROCESSED_PREFIX} + --set-env-vars GCS_TRIPLETS_PREFIX=${TRIPLETS_PREFIX} + --set-env-vars GCS_PROFILES_PATH=${PROFILES_PATH} + --set-env-vars GCS_MODELS_BUCKET=${GCS_MODELS_BUCKET:-} + --set-env-vars 
TRIPLET_USER_SAMPLE=${TRIPLET_USER_SAMPLE:-all} +) + +echo "Deploying job: data-join" +gcloud run jobs deploy data-join \ + --image "$IMAGE" \ + --project "$PROJECT_ID" \ + --region "$REGION" \ + --service-account "$JOBS_SA" \ + --cpu 2 --memory 4Gi \ + --task-timeout 3600 \ + --max-retries 1 \ + --command python \ + --args scripts/join_datasets.py \ + "${common_env[@]}" + +echo "Deploying job: pipeline-phase2" +gcloud run jobs deploy pipeline-phase2 \ + --image "$IMAGE" \ + --project "$PROJECT_ID" \ + --region "$REGION" \ + --service-account "$JOBS_SA" \ + --cpu 4 --memory 8Gi \ + --task-timeout 14400 \ + --max-retries 1 \ + ${INSTANCE_CONNECTION_NAME:+--set-cloudsql-instances "$INSTANCE_CONNECTION_NAME"} \ + --set-secrets DATABASE_URL=database-url:latest \ + --command python \ + --args scripts/run_pipeline_phase2.py \ + --set-env-vars PROCESSED_PREFIX=${PROCESSED_PREFIX} \ + "${common_env[@]}" + +echo "Deploying job: pipeline-phase3" +gcloud run jobs deploy pipeline-phase3 \ + --image "$IMAGE" \ + --project "$PROJECT_ID" \ + --region "$REGION" \ + --service-account "$JOBS_SA" \ + --cpu 4 --memory 8Gi \ + --task-timeout 21600 \ + --max-retries 1 \ + --set-env-vars BASE_MODEL_DIR=${BASE_MODEL_DIR:-models/base-minilm} \ + --set-env-vars OUTPUT_DIR=${OUTPUT_DIR:-models/movie-minilm-v1} \ + --set-env-vars EPOCHS=${EPOCHS:-1} \ + --set-env-vars BATCH_SIZE=${BATCH_SIZE:-64} \ + --set-env-vars RUN_PHASE2_IF_MISSING=${RUN_PHASE2_IF_MISSING:-false} \ + --command python \ + --args scripts/run_pipeline_phase3.py \ + "${common_env[@]}" + +echo "Jobs deployed. Use: gcloud run jobs run --region=$REGION --wait" + +echo "Deploying job: seed-movies" +gcloud run jobs deploy seed-movies \ + --image "$IMAGE" \ + --project "$PROJECT_ID" \ + --region "$REGION" \ + --service-account "$JOBS_SA" \ + --cpu 2 --memory 2Gi \ + --task-timeout 3600 \ + --max-retries 1 \ + ${INSTANCE_CONNECTION_NAME:+--set-cloudsql-instances "$INSTANCE_CONNECTION_NAME"} \ + --set-secrets DATABASE_URL=database-url:latest \ + --set-env-vars PROCESSED_PREFIX=${PROCESSED_PREFIX} \ + --set-env-vars GCS_PROCESSED_PREFIX=${PROCESSED_PREFIX} \ + --command python \ + --args scripts/seed_movies.py + +echo "Deploying job: seed-embeddings" +gcloud run jobs deploy seed-embeddings \ + --image "$IMAGE" \ + --project "$PROJECT_ID" \ + --region "$REGION" \ + --service-account "$JOBS_SA" \ + --cpu 4 --memory 8Gi \ + --task-timeout 14400 \ + --max-retries 1 \ + ${INSTANCE_CONNECTION_NAME:+--set-cloudsql-instances "$INSTANCE_CONNECTION_NAME"} \ + --set-secrets DATABASE_URL=database-url:latest \ + --set-env-vars PROCESSED_PREFIX=${PROCESSED_PREFIX} \ + --set-env-vars GCS_PROCESSED_PREFIX=${PROCESSED_PREFIX} \ + --set-env-vars SERVICE_URL=${SERVICE_URL} \ + --set-env-vars BATCH_EMBED_SIZE=${BATCH_EMBED_SIZE:-256} \ + --set-env-vars UPSERT_CHUNK_SIZE=${UPSERT_CHUNK_SIZE:-1000} \ + --set-env-vars MOVIES_ROW_CHUNK=${MOVIES_ROW_CHUNK:-5000} \ + --set-env-vars MODEL_DIR=models/base-minilm \ + --set-env-vars EMBEDDING_BACKEND=st \ + --command python \ + --args scripts/seed_embeddings.py + +echo "Deploying job: backfill-embeddings-db" +gcloud run jobs deploy backfill-embeddings-db \ + --image "$IMAGE" \ + --project "$PROJECT_ID" \ + --region "$REGION" \ + --service-account "$JOBS_SA" \ + --cpu 2 --memory 4Gi \ + --task-timeout 14400 \ + --max-retries 1 \ + ${INSTANCE_CONNECTION_NAME:+--set-cloudsql-instances "$INSTANCE_CONNECTION_NAME"} \ + --set-secrets DATABASE_URL=database-url:latest \ + --set-env-vars SERVICE_URL=${SERVICE_URL} \ + --set-env-vars 
EMBEDDING_BACKEND=st \ + --command python \ + --args scripts/backfill_embeddings_db.py + +echo "Deploying job: validate-triplets" +gcloud run jobs deploy validate-triplets \ + --image "$IMAGE" \ + --project "$PROJECT_ID" \ + --region "$REGION" \ + --service-account "$JOBS_SA" \ + --cpu 1 --memory 1Gi \ + --task-timeout 1800 \ + --max-retries 1 \ + ${INSTANCE_CONNECTION_NAME:+--set-cloudsql-instances "$INSTANCE_CONNECTION_NAME"} \ + --set-secrets DATABASE_URL=database-url:latest \ + --set-env-vars GCS_TRIPLETS_PREFIX=${TRIPLETS_PREFIX} \ + --command python \ + --args scripts/validate_triplets_coverage.py + +echo "Deploying job: validate-hyperedges" +gcloud run jobs deploy validate-hyperedges \ + --image "$IMAGE" \ + --project "$PROJECT_ID" \ + --region "$REGION" \ + --service-account "$JOBS_SA" \ + --cpu 1 --memory 2Gi \ + --task-timeout 3600 \ + --max-retries 1 \ + ${INSTANCE_CONNECTION_NAME:+--set-cloudsql-instances "$INSTANCE_CONNECTION_NAME"} \ + --set-secrets DATABASE_URL=database-url:latest \ + --set-env-vars PROCESSED_PREFIX=${PROCESSED_PREFIX} \ + --set-env-vars GCS_PROCESSED_PREFIX=${PROCESSED_PREFIX} \ + --command python \ + --args scripts/validate_hyperedges.py + +echo "Deploying job: build-hyperedges" +gcloud run jobs deploy build-hyperedges \ + --image "$IMAGE" \ + --project "$PROJECT_ID" \ + --region "$REGION" \ + --service-account "$JOBS_SA" \ + --cpu 4 --memory 8Gi \ + --task-timeout 10800 \ + --max-retries 1 \ + ${INSTANCE_CONNECTION_NAME:+--set-cloudsql-instances "$INSTANCE_CONNECTION_NAME"} \ + --set-secrets DATABASE_URL=database-url:latest \ + --set-env-vars PROCESSED_PREFIX=${PROCESSED_PREFIX} \ + --set-env-vars GCS_PROCESSED_PREFIX=${PROCESSED_PREFIX} \ + --command python \ + --args scripts/build_hyperedges.py diff --git a/apps/cr-hypervr/scripts/download_minilm.py b/apps/cr-hypervr/scripts/download_minilm.py new file mode 100644 index 00000000..06cc0853 --- /dev/null +++ b/apps/cr-hypervr/scripts/download_minilm.py @@ -0,0 +1,37 @@ +import os +import sys +import subprocess + + +def main(): + target_dir = os.environ.get("BASE_MODEL_DIR", "models/base-minilm") + os.makedirs(target_dir, exist_ok=True) + # Prefer sentence-transformers quick download path + code = subprocess.call( + [ + sys.executable, + "-c", + "from sentence_transformers import SentenceTransformer; SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2').save('{}')".format( + target_dir + ), + ] + ) + if code != 0: + print("Falling back to huggingface-cli download...") + code = subprocess.call( + [ + "bash", + "-lc", + f"huggingface-cli download sentence-transformers/all-MiniLM-L6-v2 --local-dir {target_dir}", + ] + ) + if code == 0: + print(f"Model downloaded to {target_dir}") + else: + print("Failed to download model. Ensure git-lfs and huggingface-cli are available.") + sys.exit(1) + + +if __name__ == "__main__": + main() + diff --git a/apps/cr-hypervr/scripts/download_movielens_25m.sh b/apps/cr-hypervr/scripts/download_movielens_25m.sh new file mode 100644 index 00000000..6b6bede5 --- /dev/null +++ b/apps/cr-hypervr/scripts/download_movielens_25m.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROJECT_ROOT=$(cd "$(dirname "$0")"/.. && pwd) +mkdir -p "$PROJECT_ROOT/data/movielens" + +cd "$PROJECT_ROOT/data/movielens" +echo "Downloading MovieLens 25M..." +curl -fL https://files.grouplens.org/datasets/movielens/ml-25m.zip -o ml-25m.zip +echo "Extracting..." 
+unzip -o ml-25m.zip +echo "MovieLens ready in data/movielens/ml-25m/" + diff --git a/apps/cr-hypervr/scripts/download_tmdb_full.py b/apps/cr-hypervr/scripts/download_tmdb_full.py new file mode 100644 index 00000000..146dad0f --- /dev/null +++ b/apps/cr-hypervr/scripts/download_tmdb_full.py @@ -0,0 +1,33 @@ +import os +import subprocess +from pathlib import Path + + +def main(): + project_root = Path(__file__).parent.parent + os.chdir(project_root) + os.environ.setdefault("KAGGLE_CONFIG_DIR", str(project_root / ".kaggle")) + + out_dir = project_root / "data/tmdb" + out_dir.mkdir(parents=True, exist_ok=True) + zip_path = out_dir / "tmdb-movies-dataset-2023-930k-movies.zip" + + print("Downloading TMDB 2024 dataset via Kaggle...") + cmd = [ + "bash", + "-lc", + f"pip install -q kaggle && kaggle datasets download -d asaniczka/tmdb-movies-dataset-2023-930k-movies -p {out_dir} --force", + ] + code = subprocess.call(cmd) + if code != 0: + raise SystemExit("Kaggle download failed. Ensure KAGGLE_CONFIG_DIR and credentials are set.") + + # Unzip + print("Extracting TMDB zip...") + subprocess.check_call(["bash", "-lc", f"cd {out_dir} && unzip -o *.zip"]) # extracts TMDB_movie_dataset_v11.csv + print("TMDB dataset ready in data/tmdb/") + + +if __name__ == "__main__": + main() + diff --git a/apps/cr-hypervr/scripts/export_openapi.py b/apps/cr-hypervr/scripts/export_openapi.py new file mode 100644 index 00000000..fc6d6005 --- /dev/null +++ b/apps/cr-hypervr/scripts/export_openapi.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +from pathlib import Path + +from fastapi.openapi.utils import get_openapi +from app.main import app + + +def main() -> None: + p = argparse.ArgumentParser() + p.add_argument("--out", default="docs/openapi.json", help="Output file path") + args = p.parse_args() + schema = get_openapi( + title=app.title, + version=app.version, + routes=app.routes, + description="Movie Embedding Service OpenAPI", + ) + out = Path(args.out) + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(json.dumps(schema, indent=2)) + print(f"Wrote {out}") + + +if __name__ == "__main__": + main() + diff --git a/apps/cr-hypervr/scripts/gcloud_env.sh b/apps/cr-hypervr/scripts/gcloud_env.sh new file mode 100644 index 00000000..7384c396 --- /dev/null +++ b/apps/cr-hypervr/scripts/gcloud_env.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Use a repo-local Cloud SDK config to avoid $HOME permission issues. +export CLOUDSDK_CONFIG="${CLOUDSDK_CONFIG:-$(pwd)/.gcloud}" +mkdir -p "$CLOUDSDK_CONFIG" + +if [[ -n "${PROJECT_ID:-}" ]]; then + gcloud config set core/project "$PROJECT_ID" >/dev/null +fi +if [[ -n "${REGION:-}" ]]; then + gcloud config set compute/region "$REGION" >/dev/null +fi + +echo "CLOUDSDK_CONFIG=$CLOUDSDK_CONFIG" +gcloud config list 2>/dev/null || true + +# Stable bucket envs (point to existing dated buckets by default; no reupload) +# Users may export these to override. 
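+#
+# Example (illustrative; "my-project" is a placeholder project id): with
+# PROJECT_ID=my-project, REGION=europe-west2 and the default DATE_SUFFIX=20251207,
+# the exports below resolve to
+#   GCS_DATA_BUCKET=gs://my-project-europe-west2-datasets-20251207
+#   GCS_MODELS_BUCKET=gs://my-project-europe-west2-models-20251207
+#   GCS_EMB_BUCKET=gs://my-project-europe-west2-embeddings-20251207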
+PROJECT_ID=${PROJECT_ID:-$(gcloud config get-value core/project 2>/dev/null)} +REGION=${REGION:-$(gcloud config get-value compute/region 2>/dev/null)} +DATE_SUFFIX=${DATE_SUFFIX:-20251207} + +export GCS_DATA_BUCKET=${GCS_DATA_BUCKET:-gs://${PROJECT_ID}-${REGION}-datasets-${DATE_SUFFIX}} +export GCS_MODELS_BUCKET=${GCS_MODELS_BUCKET:-gs://${PROJECT_ID}-${REGION}-models-${DATE_SUFFIX}} +export GCS_EMB_BUCKET=${GCS_EMB_BUCKET:-gs://${PROJECT_ID}-${REGION}-embeddings-${DATE_SUFFIX}} + +echo "Buckets:" +echo " GCS_DATA_BUCKET=$GCS_DATA_BUCKET" +echo " GCS_MODELS_BUCKET=$GCS_MODELS_BUCKET" +echo " GCS_EMB_BUCKET=$GCS_EMB_BUCKET" diff --git a/apps/cr-hypervr/scripts/gcp_log.py b/apps/cr-hypervr/scripts/gcp_log.py new file mode 100644 index 00000000..cc9f9d4c --- /dev/null +++ b/apps/cr-hypervr/scripts/gcp_log.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import os +import shlex +import subprocess +import sys +from datetime import datetime, timezone +from pathlib import Path + + +def iso_utc_now() -> str: + return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + +def write_log( + log_path: Path, + *, + timestamp: str, + executor: str, + purpose: str, + commands: str, + result: str, + stdout: str | None = None, + stderr: str | None = None, + exit_code: int | None = None, +) -> None: + log_path.parent.mkdir(parents=True, exist_ok=True) + with log_path.open("a", encoding="utf-8") as f: + f.write("- Timestamp (UTC): " + timestamp + "\n") + f.write("- Executor: " + executor + "\n") + f.write("- Command(s): " + commands.strip() + "\n") + f.write("- Purpose: " + purpose.strip() + "\n") + if exit_code is not None: + f.write(f"- Exit code: {exit_code}\n") + f.write("- Result: " + result + "\n") + if stdout: + f.write("- Stdout (truncated):\n") + f.write("```\n") + f.write(_truncate(stdout)) + f.write("\n````\n") + if stderr: + f.write("- Stderr (truncated):\n") + f.write("```\n") + f.write(_truncate(stderr)) + f.write("\n````\n") + f.write("\n") + + +def _truncate(s: str, limit: int = 2000) -> str: + if len(s) <= limit: + return s + head = s[: limit - 20] + return head + "\n...[truncated]..." 
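+
+# Example usage (illustrative; the job name and purpose text are placeholders):
+#
+#   python scripts/gcp_log.py \
+#     --run "gcloud run jobs run data-join --region=europe-west2 --wait" \
+#     --executor Agent --purpose "Run the data-join Cloud Run Job"
+#
+# runs the command and appends a timestamped entry (command, purpose, exit code,
+# truncated stdout/stderr) to gcp-activity-log.md. Without --run, use
+# --commands/--result/--details to append an entry without executing anything.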
+ + +def run_and_log(args: argparse.Namespace) -> int: + ts = iso_utc_now() + log_file = Path(os.getenv("GCP_ACTIVITY_LOG_FILE", "gcp-activity-log.md")) + executor = args.executor or os.getenv("GCP_LOG_EXECUTOR", "Agent") + purpose = args.purpose or "Unspecified" + if args.run: + # Execute the command in a login shell for compatibility + cmd = args.run + # Show a normalized form in the log for readability + display_cmd = cmd + try: + proc = subprocess.run(cmd, shell=True, capture_output=True, text=True) + rc = proc.returncode + result = "success" if rc == 0 else "error" + write_log( + log_file, + timestamp=ts, + executor=executor, + purpose=purpose, + commands=display_cmd, + result=result, + stdout=proc.stdout.strip(), + stderr=proc.stderr.strip(), + exit_code=rc, + ) + return rc + except Exception as e: # pragma: no cover + write_log( + log_file, + timestamp=ts, + executor=executor, + purpose=purpose, + commands=display_cmd, + result="error", + stdout="", + stderr=str(e), + exit_code=-1, + ) + return 1 + else: + # Append-only mode (no command execution) + result = args.result or "success" + commands = args.commands or "n/a" + details = args.details or "" + write_log( + log_file, + timestamp=ts, + executor=executor, + purpose=purpose, + commands=commands, + result=result, + stdout=details, + stderr=None, + exit_code=None, + ) + return 0 + + +def build_parser() -> argparse.ArgumentParser: + p = argparse.ArgumentParser(description="Append structured entries to gcp-activity-log.md") + mode = p.add_mutually_exclusive_group(required=False) + mode.add_argument("--run", help="Shell command to execute and log result") + p.add_argument("--executor", help="Executor label (User/Agent/CI)") + p.add_argument("--purpose", help="Purpose of the action") + # Append-only fields + p.add_argument("--commands", help="Commands text when not using --run") + p.add_argument("--result", choices=["success", "error"], help="Result when not using --run") + p.add_argument("--details", help="Additional details text for append-only mode") + return p + + +def main(argv: list[str] | None = None) -> int: + parser = build_parser() + args = parser.parse_args(argv) + return run_and_log(args) + + +if __name__ == "__main__": + sys.exit(main()) + diff --git a/apps/cr-hypervr/scripts/gcp_verify.sh b/apps/cr-hypervr/scripts/gcp_verify.sh new file mode 100644 index 00000000..c7ad701b --- /dev/null +++ b/apps/cr-hypervr/scripts/gcp_verify.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Default to repo-local Cloud SDK config to avoid $HOME perms issues +export CLOUDSDK_CONFIG="${CLOUDSDK_CONFIG:-$(pwd)/.gcloud}" +mkdir -p "$CLOUDSDK_CONFIG" + +PROJECT_ID=${PROJECT_ID:-$(gcloud config get-value core/project 2>/dev/null)} +REGION=${REGION:-$(gcloud config get-value compute/region 2>/dev/null)} + +echo "Project: ${PROJECT_ID}" +echo "Region: ${REGION}" + +echo "== Enabled services (key set) ==" +for s in run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com sqladmin.googleapis.com secretmanager.googleapis.com vpcaccess.googleapis.com storage.googleapis.com compute.googleapis.com; do + printf "%-35s : " "$s"; gcloud services list --enabled --filter="NAME:$s" --format='value(NAME)' || true +done + +echo "== Artifact Registry (europe-west2) ==" +gcloud artifacts repositories list --location=europe-west2 --format='table(name,format,location)' + +echo "== Service Accounts ==" +gcloud iam service-accounts list --format='table(displayName,email)' + +echo "== Bucket exists? 
==" +# Default to new datasets bucket with 20251207 suffix unless BUCKET_NAME provided +BUCKET_NAME=${BUCKET_NAME:-${PROJECT_ID}-europe-west2-datasets-20251207} +if gsutil ls -b gs://$BUCKET_NAME >/dev/null 2>&1; then + echo "YES: gs://$BUCKET_NAME" +else + echo "NO: (expected: gs://$BUCKET_NAME)" +fi + +echo "== Cloud SQL instance ==" +SQL_INSTANCE=${SQL_INSTANCE:-embeddings-sql-europe-west2} +gcloud sql instances describe "$SQL_INSTANCE" --format='table(name,region,state,backendType)' || true +echo "== Databases ==" +gcloud sql databases list --instance="$SQL_INSTANCE" --format='table(name)' +echo "== Users ==" +gcloud sql users list --instance="$SQL_INSTANCE" --format='table(name,type)' + +echo "== Secrets ==" +gcloud secrets list --format='table(name)' | sed -n '1,200p' diff --git a/apps/cr-hypervr/scripts/join_datasets.py b/apps/cr-hypervr/scripts/join_datasets.py new file mode 100644 index 00000000..63b5f839 --- /dev/null +++ b/apps/cr-hypervr/scripts/join_datasets.py @@ -0,0 +1,108 @@ +from __future__ import annotations + +from pathlib import Path +import os +import pandas as pd +import math + + +def _storage_options(path: str | Path) -> dict | None: + p = str(path) + return {"token": "cloud"} if p.startswith("gs://") else None + + +def main(): + project_root = Path(__file__).parent.parent + data_prefix = os.getenv("DATA_PREFIX") or os.getenv("GCS_DATA_PREFIX") + processed_prefix = os.getenv("PROCESSED_PREFIX") or os.getenv("GCS_PROCESSED_PREFIX") + + if data_prefix: + if str(data_prefix).startswith("gs://"): + tmdb_csv = f"{data_prefix}/tmdb/TMDB_movie_dataset_v11.csv" + links_csv = f"{data_prefix}/movielens/ml-25m/links.csv" + ratings_csv = f"{data_prefix}/movielens/ml-25m/ratings.csv" + else: + dp = Path(str(data_prefix)) + tmdb_csv = dp / "tmdb/TMDB_movie_dataset_v11.csv" + links_csv = dp / "movielens/ml-25m/links.csv" + ratings_csv = dp / "movielens/ml-25m/ratings.csv" + else: + tmdb_csv = project_root / "data/tmdb/TMDB_movie_dataset_v11.csv" + links_csv = project_root / "data/movielens/ml-25m/links.csv" + ratings_csv = project_root / "data/movielens/ml-25m/ratings.csv" + + if processed_prefix: + out_dir = processed_prefix + else: + out_dir = project_root / "data/processed" + Path(out_dir).mkdir(parents=True, exist_ok=True) + + print("Loading TMDB (filtered columns)...") + tmdb = pd.read_csv( + tmdb_csv, + storage_options=_storage_options(tmdb_csv), + usecols=[c for c in ["imdb_id", "status", "overview", "title", "genres", "vote_average", "release_date"] if True], + ) + tmdb = tmdb[tmdb["status"] == "Released"] + # Keep modest descriptions locally; production datasets will far exceed this + tmdb = tmdb[tmdb["overview"].notna() & (tmdb["overview"].astype(str).str.len() > 10)] + tmdb["imdb_id_clean"] = tmdb["imdb_id"].astype(str).str.replace("tt", "", regex=False) + tmdb["imdb_id_clean"] = pd.to_numeric(tmdb["imdb_id_clean"], errors="coerce") + tmdb = tmdb.dropna(subset=["imdb_id_clean"]) # keep rows with parsed imdb + + print("Loading MovieLens links (small) and preparing mapping...") + links = pd.read_csv(links_csv, storage_options=_storage_options(links_csv)) + links["imdbId"] = pd.to_numeric(links["imdbId"], errors="coerce") + links = links.dropna(subset=["imdbId"]) # keep joinable + + print("Joining TMDB -> MovieLens (movies metadata only)...") + movies_joined = pd.merge( + tmdb[["imdb_id_clean", "title", "overview", "genres", "vote_average", "release_date"]], + links[["movieId", "imdbId"]], + left_on="imdb_id_clean", + right_on="imdbId", + how="inner", + ) + + keep_cols = 
["movieId", "title", "overview", "genres", "vote_average", "release_date"] + movies_keep = movies_joined[[c for c in keep_cols if c in movies_joined.columns]].copy() + + # Write movies metadata + if isinstance(out_dir, str) and str(out_dir).startswith("gs://"): + movies_keep.to_parquet(f"{out_dir}/movies_with_descriptions.parquet", storage_options=_storage_options(out_dir)) + else: + Path(out_dir).mkdir(parents=True, exist_ok=True) + Path(out_dir, "triplets").mkdir(exist_ok=True) + movies_keep.to_parquet(Path(out_dir) / "movies_with_descriptions.parquet") + + # Stream ratings in chunks to avoid OOM; write partitioned enriched chunks + print("Streaming ratings -> enriched parquet parts (chunked)...") + part = 0 + chunksize = int(os.getenv("JOIN_RATINGS_CHUNKSIZE", "1000000")) + usecols = ["userId", "movieId", "rating"] + reader = pd.read_csv(ratings_csv, storage_options=_storage_options(ratings_csv), usecols=usecols, chunksize=chunksize) + total_rows = 0 + for chunk in reader: + total_rows += len(chunk) + chunk = chunk[["userId", "movieId", "rating"]] + enriched = chunk.merge(movies_keep[["movieId", "title", "genres"]], on="movieId", how="inner") + enriched = enriched.rename(columns={"userId": "user_id"}) + if isinstance(out_dir, str) and str(out_dir).startswith("gs://"): + outp = f"{out_dir}/ratings_enriched-{part:05d}.parquet" + enriched.to_parquet(outp, storage_options=_storage_options(outp), index=False) + else: + outp = Path(out_dir) / f"ratings_enriched-{part:05d}.parquet" + enriched.to_parquet(outp, index=False) + print(f"Wrote part {part:05d} with {len(enriched):,} rows -> {outp}") + part += 1 + + print({ + "tmdb_descriptions": len(tmdb), + "movies_matched": len(movies_keep), + "ratings_rows_processed": total_rows, + "ratings_parts": part, + }) + + +if __name__ == "__main__": + main() diff --git a/apps/cr-hypervr/scripts/migrate_db.py b/apps/cr-hypervr/scripts/migrate_db.py new file mode 100644 index 00000000..468120a1 --- /dev/null +++ b/apps/cr-hypervr/scripts/migrate_db.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import asyncio +import os +import sys +from typing import Optional + +import asyncpg + + +SQL_FILES = [ + "${GCS_EMB_BUCKET}/db/pgvector.sql", + "${GCS_EMB_BUCKET}/db/schema.sql", +] + + +async def run_sql(conn: asyncpg.Connection, sql_text: str) -> None: + # asyncpg can execute multiple statements in one call + await conn.execute(sql_text) + + +async def load_gcs_text(path: str) -> str: + import gcsfs # lazy import + + fs = gcsfs.GCSFileSystem() + with fs.open(path, "r") as f: + return f.read() + + +async def main() -> int: + db_url = os.getenv("DATABASE_URL") + gcs_bucket = os.getenv("GCS_EMB_BUCKET") + if not db_url or not gcs_bucket: + print("Missing env: DATABASE_URL or GCS_EMB_BUCKET", file=sys.stderr) + return 2 + + # Resolve file paths with env substitution + files = [p.replace("${GCS_EMB_BUCKET}", gcs_bucket) for p in SQL_FILES] + print("Applying SQL files:", files) + conn: Optional[asyncpg.Connection] = None + try: + conn = await asyncpg.connect(dsn=db_url) + for p in files: + try: + sql_text = await load_gcs_text(p) + print(f"-- Executing: {p} ({len(sql_text)} bytes)") + await run_sql(conn, sql_text) + print(f"OK: {p}") + except Exception as e: + print(f"ERROR executing {p}: {e}", file=sys.stderr) + return 1 + return 0 + finally: + if conn: + await conn.close() + + +if __name__ == "__main__": + sys.exit(asyncio.run(main())) + diff --git a/apps/cr-hypervr/scripts/provision_core.sh 
b/apps/cr-hypervr/scripts/provision_core.sh new file mode 100644 index 00000000..e74cadb7 --- /dev/null +++ b/apps/cr-hypervr/scripts/provision_core.sh @@ -0,0 +1,126 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Default to repo-local Cloud SDK config to avoid $HOME perms issues +export CLOUDSDK_CONFIG="${CLOUDSDK_CONFIG:-$(pwd)/.gcloud}" +mkdir -p "$CLOUDSDK_CONFIG" + +# Creates core infra: bucket, service accounts, Cloud SQL instance/db/user, database-url secret. +# Usage: +# PROJECT_ID=... REGION=europe-west2 BUCKET_NAME=... SQL_INSTANCE=... DB_NAME=movies DB_USER=app_user ./scripts/provision_core.sh +# Optional: +# DB_PASSWORD (auto-generated if empty), AR_REPO (defaults embedding-service), RUNTIME_SA, JOBS_SA + +PROJECT_ID=${PROJECT_ID:-$(gcloud config get-value core/project 2>/dev/null)} +REGION=${REGION:-$(gcloud config get-value compute/region 2>/dev/null)} +AR_REPO=${AR_REPO:-embedding-service} +BUCKET_NAME=${BUCKET_NAME:-${PROJECT_ID}-${REGION}-embeddings} +SQL_INSTANCE=${SQL_INSTANCE:-embeddings-sql-${REGION}} +DB_NAME=${DB_NAME:-movies} +DB_USER=${DB_USER:-app_user} +DB_PASSWORD=${DB_PASSWORD:-} +RUNTIME_SA_NAME=${RUNTIME_SA:-embedding-service} +JOBS_SA_NAME=${JOBS_SA:-embedding-jobs} + +if [[ -z "$PROJECT_ID" || -z "$REGION" ]]; then + echo "PROJECT_ID/REGION not set. Set env vars or run 'gcloud config set project/region'." >&2 + exit 1 +fi + +RUNTIME_SA_EMAIL="${RUNTIME_SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com" +JOBS_SA_EMAIL="${JOBS_SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com" + +echo "== Summary ==" +echo "Project: $PROJECT_ID" +echo "Region: $REGION" +echo "AR Repo: $AR_REPO" +echo "Bucket: $BUCKET_NAME" +echo "SQL Instance: $SQL_INSTANCE" +echo "DB: $DB_NAME" +echo "DB User: $DB_USER" +echo "Runtime SA: $RUNTIME_SA_EMAIL" +echo "Jobs SA: $JOBS_SA_EMAIL" + +if [[ -z "$DB_PASSWORD" ]]; then + if command -v openssl >/dev/null 2>&1; then + DB_PASSWORD=$(openssl rand -base64 20 | tr -d '=+' | cut -c1-20) + else + DB_PASSWORD=$(head -c 24 /dev/urandom | base64 | tr -d '=+' | cut -c1-20) + fi + echo "Generated DB password (not printed)." 
+fi + +echo "== Creating service accounts (idempotent) ==" +gcloud iam service-accounts describe "$RUNTIME_SA_EMAIL" >/dev/null 2>&1 || \ + gcloud iam service-accounts create "$RUNTIME_SA_NAME" --display-name="Embedding Service Runtime" +gcloud iam service-accounts describe "$JOBS_SA_EMAIL" >/dev/null 2>&1 || \ + gcloud iam service-accounts create "$JOBS_SA_NAME" --display-name="Embedding Jobs" + +echo "== Granting roles ==" +gcloud projects add-iam-policy-binding "$PROJECT_ID" \ + --member="serviceAccount:$RUNTIME_SA_EMAIL" --role="roles/cloudsql.client" --quiet >/dev/null +gcloud projects add-iam-policy-binding "$PROJECT_ID" \ + --member="serviceAccount:$RUNTIME_SA_EMAIL" --role="roles/secretmanager.secretAccessor" --quiet >/dev/null +gcloud projects add-iam-policy-binding "$PROJECT_ID" \ + --member="serviceAccount:$RUNTIME_SA_EMAIL" --role="roles/artifactregistry.reader" --quiet >/dev/null + +gcloud projects add-iam-policy-binding "$PROJECT_ID" \ + --member="serviceAccount:$JOBS_SA_EMAIL" --role="roles/cloudsql.client" --quiet >/dev/null +gcloud projects add-iam-policy-binding "$PROJECT_ID" \ + --member="serviceAccount:$JOBS_SA_EMAIL" --role="roles/secretmanager.secretAccessor" --quiet >/dev/null +gcloud projects add-iam-policy-binding "$PROJECT_ID" \ + --member="serviceAccount:$JOBS_SA_EMAIL" --role="roles/storage.admin" --quiet >/dev/null + +echo "== Creating GCS bucket (idempotent) ==" +if gsutil ls -b "gs://$BUCKET_NAME" >/dev/null 2>&1; then + echo "Bucket exists: gs://$BUCKET_NAME" +else + gsutil mb -l "$REGION" "gs://$BUCKET_NAME" +fi +gsutil -q cp /dev/null "gs://$BUCKET_NAME/models/.keep" || true +gsutil -q cp /dev/null "gs://$BUCKET_NAME/data/netflix/.keep" || true +gsutil -q cp /dev/null "gs://$BUCKET_NAME/data/tmdb/.keep" || true +gsutil -q cp /dev/null "gs://$BUCKET_NAME/data/processed/.keep" || true +gsutil -q cp /dev/null "gs://$BUCKET_NAME/embeddings/.keep" || true + +echo "== Creating Cloud SQL instance/database/user (idempotent) ==" +if gcloud sql instances describe "$SQL_INSTANCE" --project "$PROJECT_ID" >/dev/null 2>&1; then + echo "SQL instance exists: $SQL_INSTANCE" +else + gcloud sql instances create "$SQL_INSTANCE" \ + --database-version=POSTGRES_15 --cpu=2 --memory=7680MB \ + --region="$REGION" --availability-type=ZONAL --quiet +fi + +if gcloud sql databases describe "$DB_NAME" --instance "$SQL_INSTANCE" >/dev/null 2>&1; then + echo "Database exists: $DB_NAME" +else + gcloud sql databases create "$DB_NAME" --instance "$SQL_INSTANCE" --quiet +fi + +if gcloud sql users list --instance "$SQL_INSTANCE" --format="value(name)" | grep -qx "$DB_USER"; then + echo "User exists: $DB_USER (updating password)" + gcloud sql users set-password "$DB_USER" --instance "$SQL_INSTANCE" --password "$DB_PASSWORD" --quiet +else + gcloud sql users create "$DB_USER" --instance "$SQL_INSTANCE" --password "$DB_PASSWORD" --quiet +fi + +INSTANCE_CONNECTION_NAME=$(gcloud sql instances describe "$SQL_INSTANCE" --format='value(connectionName)') +DB_URL="postgresql://$DB_USER:$DB_PASSWORD@/$DB_NAME?host=/cloudsql/$INSTANCE_CONNECTION_NAME" + +echo "== Writing database-url secret (idempotent) ==" +if gcloud secrets describe database-url >/dev/null 2>&1; then + echo -n "$DB_URL" | gcloud secrets versions add database-url --data-file=- >/dev/null +else + gcloud secrets create database-url >/dev/null + echo -n "$DB_URL" | gcloud secrets versions add database-url --data-file=- >/dev/null +fi + +echo "== Summary Outputs ==" +echo "BUCKET_NAME=$BUCKET_NAME" +echo "SQL_INSTANCE=$SQL_INSTANCE" +echo 
"INSTANCE_CONNECTION_NAME=$INSTANCE_CONNECTION_NAME" +echo "DB_NAME=$DB_NAME" +echo "DB_USER=$DB_USER" +echo "DB_PASSWORD=[REDACTED]" +echo "DATABASE_URL stored in Secret Manager: database-url (latest)" diff --git a/apps/cr-hypervr/scripts/run_pipeline_phase2.py b/apps/cr-hypervr/scripts/run_pipeline_phase2.py new file mode 100644 index 00000000..c54a99cd --- /dev/null +++ b/apps/cr-hypervr/scripts/run_pipeline_phase2.py @@ -0,0 +1,45 @@ +from pipeline.user_profiles import build_user_profiles +from pipeline.triplets import generate_triplets +import subprocess +import sys +import os + + +def main(): + # Assume data-join job produced movies_with_descriptions + ratings_enriched-*.parquet + # If not present, attempt a join on a small chunk by setting a tiny chunksize to reduce memory. + processed = os.getenv("GCS_PROCESSED_PREFIX", "data/processed") + data_prefix = os.getenv("GCS_DATA_PREFIX") + # Heuristic: if movies parquet is missing, run join (chunked) + need_join = False + import pandas as pd + from pathlib import Path + movies_path = f"{processed}/movies_with_descriptions.parquet" if str(processed).startswith("gs://") else Path(processed) / "movies_with_descriptions.parquet" + try: + pd.read_parquet(movies_path, storage_options={"token": "cloud"} if str(movies_path).startswith("gs://") else None) + except Exception: + need_join = True + if need_join: + os.environ.setdefault("JOIN_RATINGS_CHUNKSIZE", "250000") + subprocess.check_call([sys.executable, "scripts/join_datasets.py"]) # chunked join + + profiles_path = os.getenv( + "GCS_PROFILES_PATH", + (processed + "/user_profiles.parquet") if str(processed).startswith("gs://") else "data/processed/user_profiles.parquet", + ) + min_ratings = int(os.getenv("MIN_RATINGS", "10")) + build_user_profiles(processed_dir=processed, out_path=profiles_path, min_ratings=min_ratings) + triplets_out = os.getenv("GCS_TRIPLETS_PREFIX", "data/processed/triplets") + # Allow overriding triplet sample size. Set TRIPLET_USER_SAMPLE=all for full dataset. + samp_env = os.getenv("TRIPLET_USER_SAMPLE", "10000") + user_sample = None if str(samp_env).lower() in ("all", "none", "0", "-1") else int(samp_env) + generate_triplets(processed_dir=processed, out_dir=triplets_out, user_sample=user_sample) + + # Always build and validate hyperedges at the end of Phase 2 + os.environ.setdefault("PROCESSED_PREFIX", processed) + subprocess.check_call([sys.executable, "scripts/build_hyperedges.py"]) # writes parquet (+optional DB insert) + subprocess.check_call([sys.executable, "scripts/validate_hyperedges.py"]) # exits non-zero on mismatch + + +if __name__ == "__main__": + main() diff --git a/apps/cr-hypervr/scripts/run_pipeline_phase3.py b/apps/cr-hypervr/scripts/run_pipeline_phase3.py new file mode 100644 index 00000000..ec5e1287 --- /dev/null +++ b/apps/cr-hypervr/scripts/run_pipeline_phase3.py @@ -0,0 +1,164 @@ +from __future__ import annotations + +import os +import subprocess +import sys +from pathlib import Path + + +def _is_gcs(p: str) -> bool: + return str(p).startswith("gs://") + + +def _parquet_exists(path: str) -> bool: + try: + import pandas as pd # noqa: F401 + # We rely on fsspec to resolve gs://. Just try a metadata read. + # Using pyarrow, this will error fast if missing. 
+ pd.read_parquet(path, columns=[], engine="pyarrow") # type: ignore[arg-type] + return True + except Exception: + return False + + +def _require_phase2_outputs(processed: str) -> None: + # Accept either name for the movies parquet (pipeline writes the first) + movies_candidates = [ + f"{processed}/movies_with_descriptions.parquet", + f"{processed}/movies_enriched.parquet", + ] + profiles = f"{processed}/user_profiles.parquet" + triplets = f"{processed}/triplets/triplets_10k.parquet" + + has_movies = any(_parquet_exists(m) for m in movies_candidates) + has_profiles = _parquet_exists(profiles) + has_triplets = _parquet_exists(triplets) + + if has_movies and has_profiles and has_triplets: + return + + # Optionally run Phase 2 to produce missing outputs + if os.getenv("RUN_PHASE2_IF_MISSING", "").lower() in ("1", "true", "yes"): + print("Phase 2 outputs missing — invoking scripts/run_pipeline_phase2.py ...") + env = os.environ.copy() + # Respect GCS_* envs if user set them + subprocess.check_call(["python", "scripts/run_pipeline_phase2.py"], env=env) + # Re-check + has_movies = any(_parquet_exists(m) for m in movies_candidates) + has_profiles = _parquet_exists(profiles) + has_triplets = _parquet_exists(triplets) + if has_movies and has_profiles and has_triplets: + return + + missing = [] + if not has_movies: + missing.append("movies_with_descriptions.parquet") + if not has_profiles: + missing.append("user_profiles.parquet") + if not has_triplets: + missing.append("triplets/triplets_10k.parquet") + raise SystemExit( + "Phase 3 requires Phase 2 outputs. Missing: " + ", ".join(missing) + ) + + +def main() -> None: + # Locations + processed = ( + os.getenv("GCS_PROCESSED_PREFIX") + or os.getenv("PROCESSED_DIR") + or os.getenv("PROCESSED_PREFIX") + or "data/processed" + ) + base_model_dir = os.getenv("BASE_MODEL_DIR", "models/base-minilm") + output_dir = os.getenv("OUTPUT_DIR", "models/movie-minilm-v1") + + # Validate inputs; if using local FS, ensure directories exist + if not _is_gcs(processed): + Path(processed).mkdir(parents=True, exist_ok=True) + if not _is_gcs(base_model_dir): + Path(base_model_dir).mkdir(parents=True, exist_ok=True) + Path(output_dir).mkdir(parents=True, exist_ok=True) + + # Ensure Phase 2 outputs are present (or produce them if allowed) + _require_phase2_outputs(processed) + + # Training config + epochs = os.getenv("EPOCHS", "1") + batch_size = os.getenv("BATCH_SIZE", "64") + use_triplet = os.getenv("USE_TRIPLET", os.getenv("USE_TRIPLET_LOSS", "0")) + + env = os.environ.copy() + env.update( + { + "BASE_MODEL_DIR": base_model_dir, + "PROCESSED_DIR": processed, + "OUTPUT_DIR": output_dir, + "EPOCHS": str(epochs), + "BATCH_SIZE": str(batch_size), + "USE_TRIPLET": str(use_triplet or "0"), + } + ) + + if os.getenv("SKIP_TRAIN", "").lower() in ("1", "true", "yes"): + print("[Phase 3] Skipping fine-tuning per SKIP_TRAIN") + else: + print("[Phase 3] Starting fine-tuning…") + subprocess.check_call([sys.executable, "training/train_finetune.py"], env=env) + + if os.getenv("SKIP_ONNX_EXPORT", "").lower() not in ("1", "true", "yes"): + print("[Phase 3] Exporting ONNX…") + subprocess.check_call([sys.executable, "training/onnx_export.py"], env=env) + else: + print("[Phase 3] Skipping ONNX export per SKIP_ONNX_EXPORT") + + if os.getenv("SKIP_QUANTIZE", "").lower() not in ("1", "true", "yes"): + print("[Phase 3] Quantizing ONNX to INT8…") + subprocess.check_call([sys.executable, "training/quantize_int8.py"], env=env) + else: + print("[Phase 3] Skipping quantization per SKIP_QUANTIZE") + + 
print("[Phase 3] Completed. Artifacts under:") + print(f" - output_dir = {output_dir}") + print(f" - model.onnx and model-int8.onnx if export/quantize enabled") + + # Optional: upload artifacts to GCS + upload_uri = ( + os.getenv("MODEL_UPLOAD_URI") + or os.getenv("GCS_MODEL_UPLOAD_URI") + or ( + f"{os.getenv('GCS_MODELS_BUCKET').rstrip('/')}/models/movie-minilm-v1" + if os.getenv("GCS_MODELS_BUCKET") + else None + ) + ) + if upload_uri and upload_uri.startswith("gs://"): + try: + import fsspec + from pathlib import PurePosixPath + + fs = fsspec.filesystem("gcs") + print(f"[Phase 3] Uploading artifacts to {upload_uri} …") + base = Path(output_dir) + for local in base.rglob("*"): + if local.is_dir(): + continue + rel = local.relative_to(base) + tgt = str(PurePosixPath(upload_uri.strip("/")) / str(rel)) + # Ensure parent dir on GCS + parent = str(PurePosixPath(tgt).parent) + try: + fs.mkdir(parent) + except Exception: + pass + with open(local, "rb") as fsrc, fs.open(tgt, "wb") as fdst: # type: ignore[attr-defined] + fdst.write(fsrc.read()) + print("[Phase 3] Upload complete.") + except Exception as e: + print(f"[Phase 3] WARN: Upload to GCS failed: {e}") + elif upload_uri: + print(f"[Phase 3] WARN: Unsupported upload URI: {upload_uri}") + + +if __name__ == "__main__": + main() diff --git a/apps/cr-hypervr/scripts/seed_embeddings.py b/apps/cr-hypervr/scripts/seed_embeddings.py new file mode 100644 index 00000000..756f11d1 --- /dev/null +++ b/apps/cr-hypervr/scripts/seed_embeddings.py @@ -0,0 +1,154 @@ +from __future__ import annotations + +import asyncio +import os +from pathlib import Path +import time +from typing import List +import numpy as np +import pandas as pd +import requests + + +EMBED_DIM = 384 +DEFAULT_BATCH = int(os.getenv("BATCH_EMBED_SIZE", "256")) +UPSERT_CHUNK = int(os.getenv("UPSERT_CHUNK_SIZE", "1000")) +DRY_RUN = os.getenv("DRY_RUN", "").lower() in ("1", "true", "yes") + + +def build_movie_text(row: pd.Series) -> str: + return f"Title: {row.get('title','')}\nGenres: {row.get('genres','')}\nOverview: {row.get('overview','')}" + + +async def seed_db(database_url: str, movie_ids: list[int], vectors: np.ndarray) -> None: + import asyncpg + + conn = await asyncpg.connect(database_url) + try: + # Upsert movies and embeddings + def _vec_to_pgtext(v: list[float]) -> str: + return "[" + ",".join(str(float(x)) for x in v) + "]" + + q = ( + "INSERT INTO movie_embeddings (movie_id, embedding) " + "VALUES ($1, $2) ON CONFLICT (movie_id) DO UPDATE SET embedding=EXCLUDED.embedding" + ) + total = len(movie_ids) + for i in range(0, total, UPSERT_CHUNK): + mids = movie_ids[i : i + UPSERT_CHUNK] + vecs = vectors[i : i + UPSERT_CHUNK] + rows = [(int(mid), _vec_to_pgtext(vec.tolist())) for mid, vec in zip(mids, vecs)] + await conn.executemany(q, rows) + print(f"Upserted {min(i+UPSERT_CHUNK,total)}/{total} embeddings to DB...") + finally: + await conn.close() + + +def _fetch_id_token(audience: str) -> str | None: + tok = os.getenv("ID_TOKEN") + if tok: + return tok + try: + resp = requests.get( + "http://metadata/computeMetadata/v1/instance/service-accounts/default/identity", + params={"audience": audience, "format": "full"}, + headers={"Metadata-Flavor": "Google"}, + timeout=3, + ) + if resp.status_code == 200 and resp.text: + return resp.text.strip() + except Exception: + pass + return None + + +def _encode_vectors_via_service(texts: List[str], batch_size: int, timeout: float = 30.0) -> np.ndarray: + service_url = os.getenv("SERVICE_URL") + if not service_url: + if 
os.getenv("ALLOW_LOCAL_FALLBACK", "").lower() in ("1", "true", "yes"): + from app.services.embedder import get_embedder # type: ignore + + return get_embedder().encode(texts) + raise SystemExit("SERVICE_URL not set; Cloud Run embedding service required") + token = _fetch_id_token(service_url) + headers = {"Content-Type": "application/json"} + if token: + headers["Authorization"] = f"Bearer {token}" + sess = requests.Session() + out: list[np.ndarray] = [] + for i in range(0, len(texts), batch_size): + chunk = texts[i : i + batch_size] + for attempt in range(4): + try: + r = sess.post( + f"{service_url.rstrip('/')}/embed/batch", + json={"texts": chunk}, + headers=headers, + timeout=timeout, + ) + if r.status_code >= 500 and attempt < 3: + time.sleep(1.5 * (attempt + 1)) + continue + r.raise_for_status() + payload = r.json() + vecs = [np.array(item["embedding"], dtype=np.float32) for item in payload] + out.extend(vecs) + break + except Exception: + if attempt >= 3: + raise + time.sleep(1.5 * (attempt + 1)) + continue + arr = np.stack(out, axis=0) + n = np.linalg.norm(arr, axis=1, keepdims=True) + n[n == 0] = 1.0 + return (arr / n).astype(np.float32) + + +def _is_gcs(path: str | Path | str) -> bool: + return str(path).startswith("gs://") + + +def main(): + processed_env = os.getenv("PROCESSED_PREFIX", "data/processed") + if _is_gcs(processed_env): + movies_path = f"{processed_env}/movies_with_descriptions.parquet" + movies = pd.read_parquet(movies_path, storage_options={"token": "cloud"}) + else: + processed_dir = Path(processed_env) + movies_path = processed_dir / "movies_with_descriptions.parquet" + if not movies_path.exists(): + raise FileNotFoundError("movies_with_descriptions.parquet not found. Run Phase 2 first.") + movies = pd.read_parquet(movies_path) + # Align column names + if "movie_id" not in movies.columns and "movieId" in movies.columns: + movies = movies.rename(columns={"movieId": "movie_id"}) + + # Stream encode + upsert in row chunks to avoid long blocking DB operations + db_url = os.getenv("DATABASE_URL") + if not DRY_RUN and not db_url: + raise SystemExit("DATABASE_URL not set; expected to upsert into movie_embeddings table") + + ROW_CHUNK = int(os.getenv("MOVIES_ROW_CHUNK", "5000")) + total = len(movies) + print(f"Processing {total} movies in chunks of {ROW_CHUNK}...") + processed = 0 + for start in range(0, total, ROW_CHUNK): + end = min(start + ROW_CHUNK, total) + chunk = movies.iloc[start:end] + texts = chunk.apply(build_movie_text, axis=1).tolist() + mids = chunk["movie_id"].astype(int).tolist() + print(f"Encoding {len(texts)} movies [{start}:{end}] via service...") + vecs = _encode_vectors_via_service(texts, batch_size=DEFAULT_BATCH) + if DRY_RUN: + print(f"[DRY_RUN] Encoded {len(texts)} embeddings; skipping DB upsert.") + else: + print("Upserting chunk to DB...") + asyncio.run(seed_db(db_url, mids, vecs)) + processed = end + print(f"Progress: {processed}/{total}") + print(f"Completed seeding embeddings for {total} movies.") + + +if __name__ == "__main__": + main() diff --git a/apps/cr-hypervr/scripts/seed_movies.py b/apps/cr-hypervr/scripts/seed_movies.py new file mode 100644 index 00000000..bf8cfcf4 --- /dev/null +++ b/apps/cr-hypervr/scripts/seed_movies.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +import asyncio +import os +from pathlib import Path +import pandas as pd + + +def _is_gcs(path: str | Path) -> bool: + return str(path).startswith("gs://") + + +async def main(): + db_url = os.getenv("DATABASE_URL") + if not db_url: + raise SystemExit("Set 
DATABASE_URL") + import asyncpg + + processed_env = os.getenv("PROCESSED_PREFIX", "data/processed") + if _is_gcs(processed_env): + movies_path = f"{processed_env}/movies_with_descriptions.parquet" + df = pd.read_parquet(movies_path, storage_options={"token": "cloud"}) + else: + processed = Path(processed_env) + movies_path = processed / "movies_with_descriptions.parquet" + if not movies_path.exists(): + raise FileNotFoundError("data/processed/movies_with_descriptions.parquet not found") + df = pd.read_parquet(movies_path) + cols = ["movie_id", "title", "genres", "overview", "release_year", "tmdb_id"] + # Conform schema: derive release_year from release_date where present + if "movieId" in df.columns: + df["movie_id"] = df["movieId"].astype(int) + if "release_year" not in df.columns: + if "release_date" in df.columns: + df["release_year"] = pd.to_datetime(df["release_date"], errors="coerce").dt.year + else: + df["release_year"] = None + # Normalize dtypes and nulls for DB insert + df["release_year"] = pd.to_numeric(df["release_year"], errors="coerce").astype("Int64") + df["tmdb_id"] = pd.to_numeric(df.get("tmdbId", df.get("tmdb_id", None)), errors="coerce").astype("Int64") if ("tmdbId" in df.columns or "tmdb_id" in df.columns) else df.get("tmdb_id", None) + # ensure strings + df["title"] = df["title"].astype(str) + df["genres"] = df.get("genres", "").astype(str) + df["overview"] = df.get("overview", "").astype(str) + for col in cols: + if col not in df.columns: + df[col] = None + + # Convert pandas NA to Python None + def _py(v): + if hasattr(pd, "isna") and pd.isna(v): + return None + return v + rows = [(_py(a), _py(b), _py(c), _py(d), _py(e), _py(f)) for a,b,c,d,e,f in df[cols].itertuples(index=False, name=None)] + conn = await asyncpg.connect(db_url) + try: + await conn.executemany( + """ + INSERT INTO movies (movie_id, title, genres, overview, release_year, tmdb_id) + VALUES ($1,$2,$3,$4,$5,$6) + ON CONFLICT (movie_id) DO UPDATE SET + title=EXCLUDED.title, + genres=EXCLUDED.genres, + overview=EXCLUDED.overview, + release_year=EXCLUDED.release_year, + tmdb_id=EXCLUDED.tmdb_id + """, + rows, + ) + finally: + await conn.close() + print(f"Upserted {len(rows)} movies") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/apps/cr-hypervr/scripts/seed_ratings.py b/apps/cr-hypervr/scripts/seed_ratings.py new file mode 100644 index 00000000..06d46066 --- /dev/null +++ b/apps/cr-hypervr/scripts/seed_ratings.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import asyncio +import os +from pathlib import Path +import pandas as pd + + +async def seed(database_url: str, df: pd.DataFrame) -> None: + import asyncpg + + conn = await asyncpg.connect(database_url) + try: + norm = [] + for row in df[["user_id", "movie_id", "rating", "rated_at"]].itertuples(index=False, name=None): + uid, mid, rating, ts = row + ts_norm = None + if not pd.isna(ts): + try: + ts_norm = pd.to_datetime(ts).to_pydatetime() + except Exception: + ts_norm = None + norm.append((int(uid), int(mid), float(rating), ts_norm)) + await conn.executemany( + """ + INSERT INTO user_ratings (user_id, movie_id, rating, rated_at) + VALUES ($1,$2,$3,$4) + ON CONFLICT (user_id, movie_id) DO UPDATE SET rating=EXCLUDED.rating, rated_at=COALESCE(EXCLUDED.rated_at, user_ratings.rated_at) + """, + norm, + ) + finally: + await conn.close() + + +def main() -> None: + db_url = os.getenv("DATABASE_URL") + if not db_url: + raise SystemExit("Set DATABASE_URL") + processed = Path(os.getenv("PROCESSED_PREFIX", 
"data/sample-processed")) + parts = sorted(processed.glob("ratings_enriched-*.parquet")) + if not parts: + raise FileNotFoundError(f"No ratings_enriched-*.parquet in {processed}") + frames = [] + for p in parts: + df = pd.read_parquet(p) + if "userId" in df.columns: + df = df.rename(columns={"userId": "user_id"}) + if "movieId" in df.columns: + df = df.rename(columns={"movieId": "movie_id"}) + df["rated_at"] = pd.NaT + frames.append(df[["user_id", "movie_id", "rating", "rated_at"]]) + merged = pd.concat(frames, ignore_index=True) + asyncio.run(seed(db_url, merged)) + print(f"Upserted {len(merged):,} user_ratings rows") + + +if __name__ == "__main__": + main() diff --git a/apps/cr-hypervr/scripts/setup_gcp.sh b/apps/cr-hypervr/scripts/setup_gcp.sh new file mode 100644 index 00000000..366292da --- /dev/null +++ b/apps/cr-hypervr/scripts/setup_gcp.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -euo pipefail + +PROJECT_ID=${PROJECT_ID:?set PROJECT_ID} +REGION=${REGION:-europe-west2} + +echo "Setting project and region..." +gcloud config set project "$PROJECT_ID" +gcloud config set compute/region "$REGION" + +echo "Enabling required services..." +gcloud services enable \ + run.googleapis.com \ + cloudbuild.googleapis.com \ + artifactregistry.googleapis.com \ + sqladmin.googleapis.com \ + secretmanager.googleapis.com \ + vpcaccess.googleapis.com \ + storage.googleapis.com \ + compute.googleapis.com + +echo "Setup complete. Create bucket/SQL and deploy per mainPRD.md." diff --git a/apps/cr-hypervr/scripts/setup_secrets.sh b/apps/cr-hypervr/scripts/setup_secrets.sh new file mode 100644 index 00000000..ee202b1e --- /dev/null +++ b/apps/cr-hypervr/scripts/setup_secrets.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Creates/updates Kaggle (and optionally TMDB) secrets. +# Usage: +# KAGGLE_JSON=$HOME/.kaggle/kaggle.json ./scripts/setup_secrets.sh +# Optional (not required with TMDB Kaggle dataset): +# TMDB_API_KEY=... ./scripts/setup_secrets.sh + +if [[ -n "${TMDB_API_KEY:-}" ]]; then + if gcloud secrets describe tmdb-api-key >/dev/null 2>&1; then + echo -n "$TMDB_API_KEY" | gcloud secrets versions add tmdb-api-key --data-file=- >/dev/null + else + gcloud secrets create tmdb-api-key >/dev/null + echo -n "$TMDB_API_KEY" | gcloud secrets versions add tmdb-api-key --data-file=- >/dev/null + fi + echo "Updated secret: tmdb-api-key" +else + echo "TMDB_API_KEY not set; skipping (not required if using TMDB Kaggle dataset)." +fi + +if [[ -n "${KAGGLE_JSON:-}" ]]; then + if [[ ! -f "$KAGGLE_JSON" ]]; then + echo "KAGGLE_JSON path does not exist: $KAGGLE_JSON" >&2 + exit 1 + fi + if gcloud secrets describe kaggle-credentials >/dev/null 2>&1; then + gcloud secrets versions add kaggle-credentials --data-file="$KAGGLE_JSON" >/dev/null + else + gcloud secrets create kaggle-credentials >/dev/null + gcloud secrets versions add kaggle-credentials --data-file="$KAGGLE_JSON" >/dev/null + fi + echo "Updated secret: kaggle-credentials" +else + echo "KAGGLE_JSON not set; skipping." +fi diff --git a/apps/cr-hypervr/scripts/upload_gcs_assets.sh b/apps/cr-hypervr/scripts/upload_gcs_assets.sh new file mode 100644 index 00000000..b5199f87 --- /dev/null +++ b/apps/cr-hypervr/scripts/upload_gcs_assets.sh @@ -0,0 +1,112 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Robust uploader with verification and detailed logs. 
+# Usage (pass explicit buckets; defaults still point to ${PROJECT_ID}-${REGION}-datasets/models): +# PROJECT_ID=agentics-foundation25lon-1809 REGION=europe-west2 \ +# DATA_BUCKET=gs://agentics-foundation25lon-1809-europe-west2-datasets-20251207 \ +# MODEL_BUCKET=gs://agentics-foundation25lon-1809-europe-west2-models-20251207 \ +# bash scripts/upload_gcs_assets.sh + +ts() { date -u +"%Y-%m-%dT%H:%M:%SZ"; } + +PROJECT_ID=${PROJECT_ID:-agentics-foundation25lon-1809} +REGION=${REGION:-europe-west2} +DATA_BUCKET=${DATA_BUCKET:-gs://${PROJECT_ID}-${REGION}-datasets} +MODEL_BUCKET=${MODEL_BUCKET:-gs://${PROJECT_ID}-${REGION}-models} + +GSDBG=${GSDBG:-0} +GSFLAGS=(-m) +if [[ "$GSDBG" == "1" ]]; then + GSFLAGS=(-m -D) +fi + +echo "[$(ts)] Start upload to GCS" +echo "Project: $PROJECT_ID" +echo "Region: $REGION" +echo "Data bkt: $DATA_BUCKET" +echo "Model bkt: $MODEL_BUCKET" + +ensure_bucket() { + local B=$1 + if gsutil ls -b "$B" >/dev/null 2>&1; then + echo "[$(ts)] Bucket exists: $B" + else + echo "[$(ts)] Creating bucket: $B" + gsutil mb -l "$REGION" "$B" + fi +} + +ensure_bucket "$DATA_BUCKET" +ensure_bucket "$MODEL_BUCKET" + +local_count_size() { + local P=$1 + local cnt size + cnt=$(find "$P" -type f | wc -l | tr -d ' ') + size=$(du -sk "$P" | awk '{print $1}') # KiB + echo "$cnt files, ${size}KiB" +} + +remote_count_size() { + local U=$1 + local cnt size + cnt=$(gsutil ls -r "$U" 2>/dev/null | grep -v '/$' | wc -l | tr -d ' ') + size=$(gsutil du -s "$U" 2>/dev/null | awk '{print $1}') + echo "$cnt objects, ${size}B" +} + +upload_dir() { + local SRC=$1 DST=$2 + echo "[$(ts)] Uploading directory: $SRC -> $DST" + echo " Local: $(local_count_size "$SRC")" + gsutil "${GSFLAGS[@]}" rsync -r -c "$SRC" "$DST" + echo " Remote: $(remote_count_size "$DST")" +} + +upload_file_verify() { + local SRC=$1 DST=$2 # DST ends with / or object path + local base=$(basename "$SRC") + local OBJ=$DST + if [[ "$DST" =~ /$ ]]; then OBJ="${DST}${base}"; fi + echo "[$(ts)] Uploading file: $SRC -> $OBJ" + gsutil "${GSFLAGS[@]}" cp -n "$SRC" "$OBJ" + # Verify MD5 if available + if command -v openssl >/dev/null 2>&1; then + local lmd5; lmd5=$(openssl md5 -binary "$SRC" | base64 | tr -d '[:space:]') + local rmd5; rmd5=$(gsutil stat "$OBJ" | awk -F": " '/Hash \(md5\)/{print $2}' | tr -d '[:space:]') + echo " Local MD5: $lmd5" + echo " Remote MD5: $rmd5" + if [[ -n "$rmd5" && "$lmd5" == "$rmd5" ]]; then + echo " Verify: OK" + else + echo " Verify: MISMATCH or unavailable" >&2 + return 1 + fi + else + echo " MD5 verify skipped (openssl not found)" + fi +} + +# Upload TMDB CSV +if [[ -f data/tmdb/TMDB_movie_dataset_v11.csv ]]; then + upload_file_verify data/tmdb/TMDB_movie_dataset_v11.csv "$DATA_BUCKET/data/tmdb/" +else + echo "[$(ts)] WARN: TMDB CSV missing locally; skipping" >&2 +fi + +# Upload MovieLens directory +if [[ -d data/movielens/ml-25m ]]; then + upload_dir data/movielens/ml-25m "$DATA_BUCKET/data/movielens/ml-25m/" +else + echo "[$(ts)] WARN: MovieLens directory missing; skipping" >&2 +fi + +# Upload base MiniLM model directory +if [[ -d models/base-minilm ]]; then + upload_dir models/base-minilm "$MODEL_BUCKET/models/base-minilm/" +else + echo "[$(ts)] WARN: base-minilm directory missing; skipping" >&2 +fi + +echo "[$(ts)] Upload completed" diff --git a/apps/cr-hypervr/scripts/validate_hyperedges.py b/apps/cr-hypervr/scripts/validate_hyperedges.py new file mode 100644 index 00000000..8329bfbc --- /dev/null +++ b/apps/cr-hypervr/scripts/validate_hyperedges.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 +from 
__future__ import annotations + +import os +import sys +from pathlib import Path +from typing import Iterable, Iterator + +import pandas as pd + +# Optional heavy deps are imported lazily where possible to keep startup light + + +def _storage_options(path: str | Path) -> dict | None: + p = str(path) + return {"token": "cloud"} if p.startswith("gs://") else None + + +async def _ensure_tmp_table(conn) -> None: + # Create temp table for set comparison (preserve rows across implicit commits) + await conn.execute( + """ + CREATE TEMP TABLE IF NOT EXISTS tmp_edges ( + src_kind TEXT, + src_id BIGINT, + dst_kind TEXT, + dst_id BIGINT, + weight REAL + ) ON COMMIT PRESERVE ROWS + """ + ) + # Helpful index for JOIN/NOT EXISTS performance on large edge sets + await conn.execute( + "CREATE INDEX IF NOT EXISTS tmp_edges_idx ON tmp_edges(src_kind,src_id,dst_kind,dst_id)" + ) + + +async def _load_chunk(conn, df: pd.DataFrame) -> int: + if df is None or df.empty: + return 0 + part = df[["src_kind", "src_id", "dst_kind", "dst_id", "weight"]].copy() + rows = list( + zip( + part["src_kind"].astype(str), + part["src_id"].astype(int), + part["dst_kind"].astype(str), + part["dst_id"].astype(int), + part["weight"].astype(float), + ) + ) + if rows: + await conn.executemany( + "INSERT INTO tmp_edges (src_kind, src_id, dst_kind, dst_id, weight) VALUES ($1,$2,$3,$4,$5)", + rows, + ) + return len(rows) + + +def _iter_parquet_batches(parquet_path: str | Path, batch_size: int = 200_000) -> Iterator[pd.DataFrame]: + """Yield DataFrames with only required columns from a Parquet file. + + Uses pyarrow + fsspec for efficient row-group iteration and low memory use. + """ + import pyarrow.parquet as pq + try: + import fsspec # provided transitively by gcsfs + except Exception: # pragma: no cover + fsspec = None # type: ignore + + cols = ["src_kind", "src_id", "dst_kind", "dst_id", "weight"] + path_str = str(parquet_path) + if path_str.startswith("gs://") and fsspec is not None: + with fsspec.open(path_str, "rb") as f: # type: ignore + pf = pq.ParquetFile(f) + for batch in pf.iter_batches(columns=cols, batch_size=batch_size): + yield batch.to_pandas(types_mapper=None) + else: + pf = pq.ParquetFile(path_str) + for batch in pf.iter_batches(columns=cols, batch_size=batch_size): + yield batch.to_pandas(types_mapper=None) + + +async def validate(parquet_path: str, database_url: str, weight_tol: float = 1e-6) -> int: + import asyncpg + + # Stream parquet into temp table to avoid OOM on large files + # Also validates required columns exist in the first batch + + conn = await asyncpg.connect(database_url) + try: + await _ensure_tmp_table(conn) + + total = 0 + first = True + for df in _iter_parquet_batches(parquet_path): + if first: + first = False + need = {"src_kind", "src_id", "dst_kind", "dst_id", "weight"} + if not need.issubset(df.columns): + raise RuntimeError(f"Parquet missing columns: {need - set(df.columns)}") + total += await _load_chunk(conn, df) + + # Count matches/missing with float-tolerant comparison + # IMPORTANT: use EXISTS to avoid overcount when DB has duplicate rows for a given edge + q_matched_exists = ( + "SELECT COUNT(*) FROM tmp_edges t WHERE EXISTS (" + " SELECT 1 FROM hyperedges h WHERE h.src_kind=t.src_kind AND h.src_id=t.src_id " + " AND h.dst_kind=t.dst_kind AND h.dst_id=t.dst_id AND ABS(h.weight - t.weight) < $1" + ")" + ) + q_missing = ( + "SELECT COUNT(*) FROM tmp_edges t " + " WHERE NOT EXISTS (" + " SELECT 1 FROM hyperedges h WHERE h.src_kind=t.src_kind AND h.src_id=t.src_id " + " AND 
h.dst_kind=t.dst_kind AND h.dst_id=t.dst_id AND ABS(h.weight - t.weight) < $1" + " )" + ) + # Optional: extras present in DB but not in parquet (for debugging) + q_extra = ( + "SELECT COUNT(*) FROM hyperedges h WHERE NOT EXISTS (" + " SELECT 1 FROM tmp_edges t WHERE h.src_kind=t.src_kind AND h.src_id=t.src_id " + " AND h.dst_kind=t.dst_kind AND h.dst_id=t.dst_id AND ABS(h.weight - t.weight) < $1" + ")" + ) + matched = await conn.fetchval(q_matched_exists, weight_tol) + missing = await conn.fetchval(q_missing, weight_tol) + extra = await conn.fetchval(q_extra, weight_tol) + print({ + "parquet_edges": int(total), + "db_matched": int(matched or 0), + "db_missing": int(missing or 0), + "db_extra": int(extra or 0), + }) + return 0 if int(missing or 0) == 0 and int(matched or 0) == int(total) else 1 + finally: + try: + await conn.execute("DROP TABLE IF EXISTS tmp_edges") + finally: + await conn.close() + + +def main() -> int: + # Support both local and GCS-style envs + processed = os.getenv("PROCESSED_PREFIX") or os.getenv("GCS_PROCESSED_PREFIX") or "data/processed" + parquet_path = ( + f"{processed}/hyperedges.parquet" if str(processed).startswith("gs://") else str(Path(processed) / "hyperedges.parquet") + ) + db_url = os.getenv("DATABASE_URL") + if not db_url: + print("Set DATABASE_URL") + return 2 + # Allow weight tolerance override for float comparisons + try: + weight_tol = float(os.getenv("WEIGHT_TOL", "1e-6")) + except ValueError: + weight_tol = 1e-6 + import asyncio + + # Quick existence check for clearer error (local or GCS) + pstr = str(parquet_path) + if not pstr.startswith("gs://"): + if not Path(pstr).exists(): + print(f"Missing parquet at {pstr}") + return 3 + else: + try: + import gcsfs # type: ignore + fs = gcsfs.GCSFileSystem(token="cloud") + if not fs.exists(pstr): + print(f"Missing parquet at {pstr}") + return 3 + except Exception as e: # pragma: no cover + # If we cannot check existence, proceed and let the reader raise an error + print(f"Warning: couldn't verify GCS path existence for {pstr}: {e}") + + return asyncio.run(validate(parquet_path, db_url, weight_tol=weight_tol)) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/apps/cr-hypervr/scripts/validate_triplets_coverage.py b/apps/cr-hypervr/scripts/validate_triplets_coverage.py new file mode 100644 index 00000000..922f17bc --- /dev/null +++ b/apps/cr-hypervr/scripts/validate_triplets_coverage.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import asyncio +import os +from pathlib import Path +from typing import Iterable, Set + +import pandas as pd + + +def _storage_options(path: str | Path) -> dict | None: + p = str(path) + return {"token": "cloud"} if p.startswith("gs://") else None + + +async def _fetch_existing(conn, movie_ids: Iterable[int]) -> tuple[Set[int], Set[int]]: + mids = list(set(int(x) for x in movie_ids)) + if not mids: + return set(), set() + rows1 = await conn.fetch("SELECT movie_id FROM movies WHERE movie_id = ANY($1::int[])", mids) + rows2 = await conn.fetch("SELECT movie_id FROM movie_embeddings WHERE movie_id = ANY($1::int[])", mids) + have_movies = {int(r["movie_id"]) for r in rows1} + have_embs = {int(r["movie_id"]) for r in rows2} + return have_movies, have_embs + + +async def main() -> int: + db_url = os.getenv("DATABASE_URL") + triplets_dir = os.getenv("GCS_TRIPLETS_PREFIX", "data/processed/triplets") + if not db_url: + print("Set DATABASE_URL") + return 2 + trip_path = f"{triplets_dir}/triplets_10k.parquet" + df = pd.read_parquet(trip_path, 
storage_options=_storage_options(trip_path)) + needed: Set[int] = set(map(int, df["pos_movie_id"].tolist())) | set(map(int, df["neg_movie_id"].tolist())) + + import asyncpg # lazy import + + conn = await asyncpg.connect(db_url) + try: + have_movies, have_embs = await _fetch_existing(conn, needed) + finally: + await conn.close() + + missing_movies = needed - have_movies + missing_embs = needed - have_embs + print(f"Triplets movies referenced: {len(needed):,}") + print(f"Present in movies table: {len(have_movies):,} (missing {len(missing_movies):,})") + print(f"With embeddings present: {len(have_embs):,} (missing {len(missing_embs):,})") + if missing_movies: + print(f"Missing in movies table (sample): {sorted(list(missing_movies))[:10]}") + if missing_embs: + print(f"Missing embeddings (sample): {sorted(list(missing_embs))[:10]}") + # Non-zero exit if any gaps + return 0 if not missing_movies and not missing_embs else 1 + + +if __name__ == "__main__": + raise SystemExit(asyncio.run(main())) + diff --git a/apps/cr-hypervr/scripts/verify_gcp_access.py b/apps/cr-hypervr/scripts/verify_gcp_access.py new file mode 100644 index 00000000..13b50d54 --- /dev/null +++ b/apps/cr-hypervr/scripts/verify_gcp_access.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +""" +Lightweight GCP access verifier used by Makefile target `gcp-verify-py`. + +Checks (non-destructive): +- Active gcloud account +- Project and region config +- Cloud Run API access +- Cloud Storage bucket reachability (optional) +- Cloud SQL instance visibility (optional) +- Secret Manager listing + +Environment: +- CLOUDSDK_CONFIG respected (set to a repo-local path to avoid $HOME perms issues) +- Optional: GCP_PROJECT_ID, GCP_REGION, GCP_BUCKET, GCP_SQL_INSTANCE +""" +from __future__ import annotations + +import os +import subprocess +import sys +from dataclasses import dataclass + + +def run(cmd: str) -> tuple[int, str, str]: + p = subprocess.run(cmd, shell=True, capture_output=True, text=True) + return p.returncode, p.stdout.strip(), p.stderr.strip() + + +@dataclass +class Ctx: + project: str | None + region: str | None + bucket: str | None + sql_instance: str | None + cloudsdk_config: str | None + + +def ctx_from_env() -> Ctx: + return Ctx( + project=os.getenv("GCP_PROJECT_ID") or os.getenv("PROJECT_ID"), + region=os.getenv("GCP_REGION") or os.getenv("REGION"), + bucket=os.getenv("GCP_BUCKET"), + sql_instance=os.getenv("GCP_SQL_INSTANCE") or os.getenv("SQL_INSTANCE"), + cloudsdk_config=os.getenv("CLOUDSDK_CONFIG"), + ) + + +def main() -> int: + ctx = ctx_from_env() + print("== GCP Access Verification ==") + if ctx.cloudsdk_config: + print(f"CLOUDSDK_CONFIG={ctx.cloudsdk_config}") + + # 1) Active account + rc, out, err = run("gcloud auth list --filter=status:ACTIVE --format='value(account)'") + ok_auth = rc == 0 and bool(out) + print(("✓" if ok_auth else "✗") + f" Active account: {out or err}") + + # 2) Project / region + if not ctx.project: + _rc, p_out, _ = run("gcloud config get-value core/project") + ctx.project = p_out or None + if not ctx.region: + _rc, r_out, _ = run("gcloud config get-value compute/region") + ctx.region = r_out or None + print(("✓" if ctx.project else "✗") + f" Project: {ctx.project or '[unset]'}") + print(("✓" if ctx.region else "✗") + f" Region: {ctx.region or '[unset]'}") + + # 3) Core APIs / access + apis = [ + "run.googleapis.com", + "cloudbuild.googleapis.com", + "artifactregistry.googleapis.com", + "sqladmin.googleapis.com", + "secretmanager.googleapis.com", + "vpcaccess.googleapis.com", + 
"storage.googleapis.com", + "compute.googleapis.com", + ] + apis_ok = True + for s in apis: + rc, out, _ = run(f"gcloud services list --enabled --filter=NAME:{s} --format='value(NAME)'") + ok = rc == 0 and s in out + apis_ok = apis_ok and ok + print(("✓" if ok else "✗") + f" API enabled: {s}") + + # 4) Cloud Run access + rc, _, _ = run("gcloud run services list --limit=1 2>/dev/null") + print(("✓" if rc == 0 else "✗") + " Cloud Run access") + + # 5) Cloud Storage (optional) + if ctx.bucket: + rc, out, _ = run(f"gsutil ls -b gs://{ctx.bucket} 2>/dev/null") + print(("✓" if rc == 0 else "✗") + f" Bucket exists: gs://{ctx.bucket}") + else: + print("○ Bucket not provided; skip") + + # 6) Cloud SQL (optional) + if ctx.sql_instance: + rc, out, _ = run( + f"gcloud sql instances describe {ctx.sql_instance} --format='value(name)' 2>/dev/null" + ) + print(("✓" if rc == 0 and out else "✗") + f" Cloud SQL: {ctx.sql_instance}") + else: + print("○ SQL instance not provided; skip") + + # 7) Secret Manager + rc, _, _ = run("gcloud secrets list --limit=1 2>/dev/null") + print(("✓" if rc == 0 else "✗") + " Secret Manager access") + + # Summary exit code + critical_ok = ok_auth and bool(ctx.project) and bool(ctx.region) and apis_ok + if critical_ok: + print("\n✅ GCP access verified. Ready to proceed.") + return 0 + else: + print("\n❌ GCP access verification failed. Check credentials and permissions.") + return 2 + + +if __name__ == "__main__": + sys.exit(main()) + diff --git a/apps/cr-hypervr/training/onnx_export.py b/apps/cr-hypervr/training/onnx_export.py new file mode 100644 index 00000000..cf9da886 --- /dev/null +++ b/apps/cr-hypervr/training/onnx_export.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +import os +from pathlib import Path +from sentence_transformers import SentenceTransformer + + +def export_onnx(model_dir: str = "models/movie-minilm-v1", onnx_out: str = "models/movie-minilm-v1/model.onnx") -> None: + model = SentenceTransformer(model_dir) + Path(Path(onnx_out).parent).mkdir(parents=True, exist_ok=True) + # Export using built-in utility (available in sentence-transformers>=3) + model.export( + export_path=onnx_out, + format="onnx", + quantize=False, + optimize=True, + ) + print("Exported ONNX to", onnx_out) + + +if __name__ == "__main__": + export_onnx() + diff --git a/apps/cr-hypervr/training/quantize_int8.py b/apps/cr-hypervr/training/quantize_int8.py new file mode 100644 index 00000000..83d6800e --- /dev/null +++ b/apps/cr-hypervr/training/quantize_int8.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +from pathlib import Path +from onnxruntime.quantization import quantize_dynamic, QuantType + + +def quantize(onnx_in: str = "models/movie-minilm-v1/model.onnx", onnx_out: str = "models/movie-minilm-v1/model-int8.onnx") -> None: + Path(Path(onnx_out).parent).mkdir(parents=True, exist_ok=True) + quantize_dynamic( + model_input=onnx_in, + model_output=onnx_out, + weight_type=QuantType.QInt8, + optimize_model=True, + ) + print("Quantized INT8 ONNX saved to", onnx_out) + + +if __name__ == "__main__": + quantize() + diff --git a/apps/cr-hypervr/training/train_finetune.py b/apps/cr-hypervr/training/train_finetune.py new file mode 100644 index 00000000..6767190e --- /dev/null +++ b/apps/cr-hypervr/training/train_finetune.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +import os +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable + +import pandas as pd +from sentence_transformers import InputExample, SentenceTransformer, losses 
+from torch.utils.data import DataLoader + + +@dataclass +class TrainConfig: + base_model_dir: str = os.getenv("BASE_MODEL_DIR", "models/base-minilm") + output_dir: str = os.getenv("OUTPUT_DIR", "models/movie-minilm-v1") + processed_dir: str = os.getenv("PROCESSED_DIR", "data/processed") + epochs: int = int(os.getenv("EPOCHS", 1)) + batch_size: int = int(os.getenv("BATCH_SIZE", 64)) + use_triplet_loss: bool = bool(int(os.getenv("USE_TRIPLET", "0"))) + + +def _read_parquet(path: str): + storage = {"token": "cloud"} if str(path).startswith("gs://") else None + return pd.read_parquet(path, storage_options=storage) + + +def build_examples(processed_dir: str) -> Iterable[InputExample]: + trips_path = f"{processed_dir}/triplets/triplets_10k.parquet" + users_path = f"{processed_dir}/user_profiles.parquet" + # Prefer movies_with_descriptions; allow legacy movies_enriched name + movies_primary = f"{processed_dir}/movies_with_descriptions.parquet" + movies_fallback = f"{processed_dir}/movies_enriched.parquet" + + trips = _read_parquet(trips_path) + users = _read_parquet(users_path)[ + ["user_id", "liked_titles", "disliked_titles"] + ] + try: + movies = _read_parquet(movies_primary)[["movie_id", "title", "overview", "genres"]] + except Exception: + movies = _read_parquet(movies_fallback)[["movie_id", "title", "overview", "genres"]] + + u = users.set_index("user_id") + m = movies.set_index("movie_id") + + def movie_text(mid: int) -> str: + row = m.loc[mid] + return f"Title: {row['title']}\nGenres: {row.get('genres', '')}\nOverview: {row.get('overview', '')}" + + for row in trips.itertuples(index=False): + user_id = int(row.user_id) + pos = int(row.pos_movie_id) + neg = int(row.neg_movie_id) + up = u.loc[user_id] + anchor = f"User likes: {up.get('liked_titles', '')} \nDislikes: {up.get('disliked_titles', '')}" + pos_txt = movie_text(pos) + neg_txt = movie_text(neg) + yield InputExample(texts=[anchor, pos_txt, neg_txt]) + + +def main(): + cfg = TrainConfig() + Path(cfg.output_dir).mkdir(parents=True, exist_ok=True) + + print("Loading base model from", cfg.base_model_dir) + model = SentenceTransformer(cfg.base_model_dir) + + examples = list(build_examples(cfg.processed_dir)) + if not examples: + raise RuntimeError("No training examples found. Ensure Phase 2 outputs exist.") + + if cfg.use_triplet_loss: + # Use explicit triplet loss with anchor-pos-neg + from sentence_transformers.losses import TripletLoss + + train_dataloader = DataLoader(examples, shuffle=True, batch_size=cfg.batch_size) + train_loss = TripletLoss(model) + else: + # Use MNR: Only anchor and positive are used; implicit in-batch negatives apply. + mnr_examples = [InputExample(texts=e.texts[:2]) for e in examples] + train_dataloader = DataLoader(mnr_examples, shuffle=True, batch_size=cfg.batch_size) + train_loss = losses.MultipleNegativesRankingLoss(model) + + print(f"Training for {cfg.epochs} epoch(s) with batch size {cfg.batch_size}") + model.fit( + train_objectives=[(train_dataloader, train_loss)], + epochs=cfg.epochs, + output_path=cfg.output_dir, + show_progress_bar=True, + ) + print("Model saved to", cfg.output_dir) + + +if __name__ == "__main__": + main()
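
Note on the Phase 3 artifacts: the patch itself ships no standalone sanity check for the quantized encoder outside the Cloud Run service, so the sketch below is one way to smoke-test model-int8.onnx on CPU after training/quantize_int8.py has run. It assumes the tokenizer files are saved alongside the model in models/movie-minilm-v1, that the exported graph's first output is the token-level hidden states (the usual layout for a MiniLM export), and that inputs are int64 — none of these details are taken from app/services/embedder.py, so treat the snippet as an illustration rather than the service's actual code path.

    # smoke_test_int8.py — hedged sketch, not part of the patch.
    # Assumed paths/shapes: models/movie-minilm-v1, 384-dim MiniLM, output[0] = last hidden states.
    from pathlib import Path

    import numpy as np
    import onnxruntime as ort
    from transformers import AutoTokenizer

    MODEL_DIR = Path("models/movie-minilm-v1")      # assumed Phase 3 output dir
    ONNX_PATH = MODEL_DIR / "model-int8.onnx"       # written by training/quantize_int8.py

    def encode(texts: list[str]) -> np.ndarray:
        tokenizer = AutoTokenizer.from_pretrained(str(MODEL_DIR))
        session = ort.InferenceSession(str(ONNX_PATH), providers=["CPUExecutionProvider"])
        enc = tokenizer(texts, padding=True, truncation=True, max_length=256, return_tensors="np")
        # Feed only the inputs the exported graph actually declares (MiniLM exports may
        # or may not include token_type_ids), cast to int64 for the ONNX graph.
        feeds = {i.name: enc[i.name].astype(np.int64) for i in session.get_inputs() if i.name in enc}
        last_hidden = session.run(None, feeds)[0]   # assumed shape: (batch, seq, 384)
        # Mean-pool over non-padding tokens, then L2-normalise, mirroring the
        # normalisation applied by scripts/seed_embeddings.py before upsert.
        mask = enc["attention_mask"][:, :, None].astype(np.float32)
        pooled = (last_hidden * mask).sum(axis=1) / np.clip(mask.sum(axis=1), 1e-9, None)
        norms = np.linalg.norm(pooled, axis=1, keepdims=True)
        return (pooled / np.clip(norms, 1e-12, None)).astype(np.float32)

    if __name__ == "__main__":
        vecs = encode(["Title: Heat\nGenres: Crime\nOverview: A heist crew is hunted by an obsessive detective."])
        print(vecs.shape, float(np.linalg.norm(vecs[0])))  # expect (1, 384) and a norm close to 1.0

A reasonable follow-up under the same assumptions is to encode the same text with the fp32 model.onnx and confirm the cosine similarity between the two vectors stays high before pointing the seeding jobs at the INT8 artifact.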