diff --git a/apps/cr-hypervr/Dockerfile b/apps/cr-hypervr/Dockerfile
new file mode 100644
index 00000000..317c7aab
--- /dev/null
+++ b/apps/cr-hypervr/Dockerfile
@@ -0,0 +1,27 @@
+FROM python:3.11-slim
+
+ENV PYTHONDONTWRITEBYTECODE=1 \
+ PYTHONUNBUFFERED=1 \
+ PIP_NO_CACHE_DIR=1
+
+WORKDIR /app
+
+# System deps (minimal)
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt ./
+RUN pip install --upgrade pip && pip install -r requirements.txt
+
+# Preload base model for offline CPU inference (no heredoc for wider builder support)
+RUN python -c "from sentence_transformers import SentenceTransformer; import os; os.makedirs('models/base-minilm', exist_ok=True); SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2').save('models/base-minilm'); print('Base MiniLM model cached at models/base-minilm')"
+
+COPY app ./app
+COPY scripts ./scripts
+COPY pipeline ./pipeline
+COPY training ./training
+
+EXPOSE 8080
+
+ENV MODEL_DIR=""
+ENV BASE_MODEL_DIR="models/base-minilm"
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8080"]
diff --git a/apps/cr-hypervr/LICENSE b/apps/cr-hypervr/LICENSE
new file mode 100644
index 00000000..db48ebd2
--- /dev/null
+++ b/apps/cr-hypervr/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 YP
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/apps/cr-hypervr/Makefile b/apps/cr-hypervr/Makefile
new file mode 100644
index 00000000..a96e8311
--- /dev/null
+++ b/apps/cr-hypervr/Makefile
@@ -0,0 +1,47 @@
+PY ?= python3
+
+.PHONY: gcp-provision gcp-secrets gcp-verify gcp-build gcp-deploy gcp-deploy-infra gcp-jobs-deploy gcp-job-run-phase2 gcp-job-run-phase3 db-apply-cloudsql export-openapi
+
+# GCP: Core provisioning (Artifact Registry, bucket, Cloud SQL, SAs, secret)
+gcp-provision:
+ bash scripts/provision_core.sh
+
+# GCP: Create/update Secret Manager entries (e.g., DATABASE_URL, Kaggle creds)
+gcp-secrets:
+ bash scripts/setup_secrets.sh
+
+# GCP: Sanity checks (services, AR repo, Cloud SQL, SAs)
+gcp-verify:
+ bash scripts/gcp_verify.sh
+
+# GCP: Build container in Cloud Build and push to Artifact Registry
+gcp-build:
+ gcloud builds submit --region=$${REGION:-europe-west2} --config=cloudbuild.yaml --substitutions=_REGION=$${REGION:-europe-west2}
+
+# GCP: Deploy primary API service (embedding-service)
+gcp-deploy:
+ bash scripts/deploy_cloud_run.sh
+
+# GCP: Deploy infra-service (graph‑focused, POST‑only)
+gcp-deploy-infra:
+ bash scripts/deploy_graph_service.sh
+
+# GCP: Deploy Cloud Run Jobs for data pipelines and validation
+gcp-jobs-deploy:
+ bash scripts/deploy_jobs.sh
+
+# Run Phase 2 (join → profiles → hyperedges + validation) as a Cloud Run Job
+gcp-job-run-phase2:
+ PROJECT_ID=$${PROJECT_ID} REGION=$${REGION:-europe-west2} bash -lc 'gcloud beta run jobs execute pipeline-phase2 --region $$REGION'
+
+# Run Phase 3 (fine‑tune → ONNX → INT8) as a Cloud Run Job
+gcp-job-run-phase3:
+ PROJECT_ID=$${PROJECT_ID} REGION=$${REGION:-europe-west2} bash -lc 'gcloud beta run jobs execute pipeline-phase3 --region $$REGION'
+
+# Apply schema + pgvector to Cloud SQL (uses Cloud SQL connect)
+db-apply-cloudsql:
+ bash scripts/db_apply_cloudsql.sh
+
+# Export OpenAPI JSON (writes openapi.json in repo root)
+export-openapi:
+ $(PY) scripts/export_openapi.py
diff --git a/apps/cr-hypervr/README.md b/apps/cr-hypervr/README.md
new file mode 100644
index 00000000..16ca4b52
--- /dev/null
+++ b/apps/cr-hypervr/README.md
@@ -0,0 +1,411 @@
+# CR-HyperVR
+
+**Cloud Run Hypergraph-Vector Recommender**
+
+CPU-only FastAPI service using INT8 ONNX MiniLM, pgvector similarity, and hyperedge signals.
+
+## What It Does
+
+- Recommends films based on descriptions and user rating history
+- Creates compact embeddings of movies and user taste profiles
+- Searches for semantically similar content via vector similarity
+- Enhances results using hypergraph signals (co-watch patterns, shared genres)
+- Runs entirely on GCP Cloud Run with no GPU required
+
+## Key Benefits
+
+- **Low latency & cost** — INT8-quantized ONNX model runs efficiently on CPU
+- **Cold-start friendly** — Hypergraph edges help recommend even with sparse user data
+- **Scalable** — Cloud Run auto-scales based on traffic
+- **Easy integration** — Simple JSON POST endpoints for any client or agent
+
+---
+
+## Quick Start
+
+Live endpoints are public and ready to use:
+
+| Service | URL |
+|---------|-----|
+| Embedding API | `https://embedding-service-5pgvctvdpq-nw.a.run.app` |
+| Graph Service | `https://infra-service-5pgvctvdpq-nw.a.run.app` |
+
+### Try It Now
+
+**Embed free text:**
+```bash
+curl -s -X POST \
+ https://embedding-service-5pgvctvdpq-nw.a.run.app/embed/text \
+ -H 'Content-Type: application/json' \
+ -d '{"text":"neo-noir heist with witty banter"}'
+```
+
+**Get graph-powered recommendations:**
+```bash
+curl -s -X POST \
+ https://infra-service-5pgvctvdpq-nw.a.run.app/graph/recommend \
+ -H 'Content-Type: application/json' \
+ -d '{"query":"space opera adventure","top_k":5,"seed_top_k":15,"hops":2}'
+```
+
+### Full API Examples
+
+**Search similar movies:**
+
+```bash
+curl -s -X POST \
+ https://embedding-service-5pgvctvdpq-nw.a.run.app/search/similar \
+ -H 'Content-Type: application/json' \
+ -d '{
+ "text":"grounded space survival drama",
+ "top_k": 10
+ }'
+```
+
+**Recommend for a user ID:**
+
+```bash
+curl -s -X POST \
+ https://embedding-service-5pgvctvdpq-nw.a.run.app/search/recommend \
+ -H 'Content-Type: application/json' \
+ -d '{
+ "user_id": 123,
+ "top_k": 10,
+ "exclude_movie_ids": [1,2,3]
+ }'
+```
+
+**Embed a movie object:**
+
+```bash
+curl -s -X POST \
+ https://embedding-service-5pgvctvdpq-nw.a.run.app/embed/movie \
+ -H 'Content-Type: application/json' \
+ -d '{
+ "title":"The Grand Budapest Hotel",
+ "genres":["Comedy","Drama"],
+ "description":"A whimsical concierge and lobby boy embark on capers across a pastel Europe."
+ }'
+```
+
+**Embed a user taste profile:**
+
+```bash
+curl -s -X POST \
+ https://embedding-service-5pgvctvdpq-nw.a.run.app/embed/user \
+ -H 'Content-Type: application/json' \
+ -d '{
+ "liked_genres":["Sci-Fi","Adventure"],
+ "liked_movies":["Star Wars","Guardians of the Galaxy"],
+ "disliked_genres":["Horror"]
+ }'
+```
+
+**Batch embed texts:**
+
+```bash
+curl -s -X POST \
+ https://embedding-service-5pgvctvdpq-nw.a.run.app/embed/batch \
+ -H 'Content-Type: application/json' \
+ -d '{
+ "texts": [
+ "gritty detective thriller set in Boston",
+ "lighthearted family fantasy with magical creatures"
+ ]
+ }'
+```
+
+**Graph recommendations with weight tuning:**
+
+```bash
+curl -s -X POST \
+ https://infra-service-5pgvctvdpq-nw.a.run.app/graph/recommend \
+ -H 'Content-Type: application/json' \
+ -d '{
+ "query":"A feel-good romantic comedy set in New York City with witty banter and heartfelt moments.",
+ "top_k": 5,
+ "seed_top_k": 15,
+ "hops": 2,
+ "embed_weight": 1.0,
+ "cowatch_weight": 0.5,
+ "genre_weight": 0.25
+ }'
+```
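+
+**Call from Python (httpx):**
+
+The same endpoints work from any HTTP client. A minimal `httpx` sketch against the live URLs above (response fields as documented in the API reference below):
+
+```python
+import httpx
+
+EMBED_BASE = "https://embedding-service-5pgvctvdpq-nw.a.run.app"
+GRAPH_BASE = "https://infra-service-5pgvctvdpq-nw.a.run.app"
+
+with httpx.Client(timeout=30.0) as client:
+    # 384-dim embedding for free text
+    emb = client.post(f"{EMBED_BASE}/embed/text", json={"text": "neo-noir heist with witty banter"}).json()
+    print(len(emb["embedding"]), emb["model"])
+
+    # Graph-enhanced recommendations
+    recs = client.post(
+        f"{GRAPH_BASE}/graph/recommend",
+        json={"query": "space opera adventure", "top_k": 5, "seed_top_k": 15, "hops": 2},
+    ).json()
+    for item in recs["items"]:
+        print(item["movie_id"], item["title"], round(item["score"], 3))
+```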
+
+
+---
+
+## Architecture
+
+Fully managed GCP infrastructure with automatic scaling and pay-per-use billing.
+
+```mermaid
+flowchart TD
+ subgraph Clients
+ A[Client / Agent]
+ end
+
+ subgraph Cloud Run
+ B((embedding-service))
+ C((infra-service))
+ end
+
+ subgraph Storage
+ D[(Cloud SQL)]
+ E[(GCS)]
+ end
+
+ subgraph Jobs
+ F{{Cloud Run Jobs}}
+ end
+
+ A -->|JSON POST| B
+ A -->|JSON POST| C
+ B -->|pgvector| D
+ C -->|pgvector + hyperedges| D
+ E -->|ONNX model| B
+ E -->|datasets| F
+ F -->|write hyperedges| D
+```
+
+### Infrastructure Components
+
+| Component | Purpose |
+|-----------|---------|
+| **Cloud Run Services** | Auto-scaling API endpoints for embedding and recommendations |
+| **Cloud SQL (PostgreSQL 15)** | Stores movie embeddings and hyperedges with pgvector |
+| **Cloud Storage** | Hosts datasets, trained models, and pipeline outputs |
+| **Cloud Run Jobs** | Executes data pipelines and model training |
+| **Secret Manager** | Secures database credentials |
+
+### How It Works
+
+1. **Query arrives** → Text is embedded using INT8 ONNX MiniLM
+2. **Vector search** → pgvector finds semantically similar movies
+3. **Graph expansion** → Hyperedges add co-watch and genre neighbors
+4. **Score fusion** → Weighted combination produces final ranking
+
+```mermaid
+flowchart TD
+ Q[Query text] --> E[Embed vector]
+ E --> S[Seed candidates via pgvector]
+ S --> COW[Co-watch neighbors]
+ S --> GEN[Shared-genre neighbors]
+ COW --> M[Normalize + weighted sum]
+ GEN --> M
+ E -->|as scores| M
+ M --> R[Top-K ranked list]
+```
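+
+In code, the fusion step (step 4 above) amounts to max-normalizing each signal and taking a weighted sum over candidate movies, excluding the seed set. A minimal Python sketch, with defaults matching `/graph/recommend`:
+
+```python
+def normalize(scores: dict[int, float]) -> dict[int, float]:
+    """Scale a signal to [0, 1] by its maximum so the weights mix comparably."""
+    m = max(scores.values(), default=0.0)
+    return {k: v / m for k, v in scores.items()} if m > 0 else {k: 0.0 for k in scores}
+
+
+def fuse(
+    embed: dict[int, float],    # movie_id -> cosine similarity of seed candidates
+    cowatch: dict[int, float],  # movie_id -> co-watch edge weight
+    genre: dict[int, float],    # movie_id -> shared-genre weight
+    seeds: set[int],
+    w_embed: float = 1.0,
+    w_cowatch: float = 0.5,
+    w_genre: float = 0.25,
+    top_k: int = 10,
+) -> list[tuple[int, float]]:
+    e, c, g = normalize(embed), normalize(cowatch), normalize(genre)
+    combined: dict[int, float] = {}
+    for mid in set(e) | set(c) | set(g):
+        if mid in seeds:
+            continue  # recommend beyond what vector search already surfaced
+        s = w_embed * e.get(mid, 0.0) + w_cowatch * c.get(mid, 0.0) + w_genre * g.get(mid, 0.0)
+        if s > 0:
+            combined[mid] = s
+    return sorted(combined.items(), key=lambda kv: kv[1], reverse=True)[:top_k]
+```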
+
+---
+
+## API Reference
+
+### Embedding Service Flow
+
+```mermaid
+flowchart LR
+ A[POST Request] --> B[embedding-service]
+ B --> C{Endpoint}
+ C -->|/embed/text| D[Tokenize + ONNX inference]
+ C -->|/embed/movie| D
+ C -->|/embed/user| D
+ C -->|/embed/batch| D
+ C -->|/search/similar| E[Embed + pgvector query]
+ C -->|/search/recommend| F[Load profile + pgvector + rerank]
+ D --> G[384-dim vector]
+ E --> H[Ranked movie list]
+ F --> H
+```
+
+### Infra Service Flow
+
+```mermaid
+flowchart LR
+ A[POST /graph/recommend] --> B[infra-service]
+ B --> C[Embed query text]
+ C --> D[pgvector seed candidates]
+ D --> E[Hyperedge expansion]
+ E --> F[Co-watch edges]
+ E --> G[Genre edges]
+ F --> H[Score fusion]
+ G --> H
+ H --> I[Top-K results]
+```
+
+### Embedding Endpoints
+
+| Endpoint | Description | Response |
+|----------|-------------|----------|
+| `POST /embed/text` | Embed free text | `{ embedding, dimension, model }` |
+| `POST /embed/batch` | Embed multiple texts | Array of embeddings |
+| `POST /embed/movie` | Embed from title + genres + description | Embedding object |
+| `POST /embed/user` | Embed user taste profile | Embedding object |
+
+### Search Endpoints
+
+| Endpoint | Description |
+|----------|-------------|
+| `POST /search/similar` | Vector search over movie embeddings |
+| `POST /search/recommend` | Recommendations from user profile |
+| `POST /graph/recommend` | Graph-enhanced recommendations |
+
+### Health & Metrics
+
+| Endpoint | Description |
+|----------|-------------|
+| `GET /healthz` | Health check |
+| `GET /ready` | Readiness probe |
+| `GET /metrics` | Service metrics |
+
+**Export OpenAPI spec:**
+```bash
+make export-openapi
+```
+
+---
+
+## Data Pipeline
+
+### Data Sources
+
+- **TMDB** — Movie descriptions from Kaggle dataset `tmdb-movies-dataset-2023-930k-movies`
+- **MovieLens 25M** — User ratings for collaborative signals
+
+### Embedding Model
+
+- **MiniLM-L6-v2** — Base embedding model from sentence-transformers
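+
+The base model is cached into the container image at build time (see the Dockerfile preload step); conceptually it reduces to a sketch like:
+
+```python
+from sentence_transformers import SentenceTransformer
+
+# Download once and save locally so the service can run offline on CPU
+model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+model.save("models/base-minilm")
+
+# Embeddings are 384-dimensional and unit-normalized for cosine search
+vec = model.encode(["grounded space survival drama"], normalize_embeddings=True)[0]
+print(vec.shape)  # (384,)
+```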
+
+### Pipeline Phases
+
+```mermaid
+flowchart TD
+ A[Phase 2 Outputs] --> B[train_finetune.py]
+ B --> C[onnx_export.py]
+ C --> D[quantize_int8.py]
+ D --> E[(GCS)]
+ E --> F((Cloud Run))
+```
+
+| Phase | Script | Output |
+|-------|--------|--------|
+| **Join** | `scripts/join_datasets.py` | Merged TMDB + MovieLens Parquet |
+| **Train** | `training/train_finetune.py` | Fine-tuned MiniLM model |
+| **Export** | `training/onnx_export.py` | `model.onnx` |
+| **Quantize** | `training/quantize_int8.py` | `model-int8.onnx` |
+| **Validate** | `scripts/validate_hyperedges.py` | DB edge verification |
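+
+The Cloud Run Jobs wrap these scripts, but individual phases can also be invoked directly from Python using the repo's pipeline modules. A sketch (the bucket prefix is a placeholder):
+
+```python
+from pipeline.user_profiles import build_user_profiles
+from pipeline.triplets import generate_triplets
+
+# Aggregate liked/disliked titles per user from the enriched ratings parts
+build_user_profiles(
+    processed_dir="gs://YOUR_BUCKET/processed",   # placeholder prefix
+    out_path="gs://YOUR_BUCKET/processed/user_profiles.parquet",
+    min_ratings=10,
+)
+
+# Sample (user, positive, negative) triplets for fine-tuning
+generate_triplets(
+    processed_dir="gs://YOUR_BUCKET/processed",
+    out_dir="gs://YOUR_BUCKET/processed/triplets",
+    user_sample=10_000,
+)
+```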
+
+---
+
+## GCP Deployment
+
+### Prerequisites
+
+Enable these GCP APIs:
+- Cloud Run
+- Cloud Build
+- Artifact Registry
+- Cloud SQL Admin
+- Secret Manager
+- VPC Access
+- Cloud Storage
+
+### Provision Infrastructure
+
+```bash
+PROJECT_ID=agentics-foundation25lon-1809 \
+REGION=europe-west2 \
+AR_REPO=embedding-service \
+BUCKET_NAME=${PROJECT_ID}-${REGION}-embeddings \
+SQL_INSTANCE=embeddings-sql-${REGION} \
+DB_NAME=movies DB_USER=app_user \
+make gcp-provision
+```
+
+### Configure Secrets
+
+```bash
+make gcp-secrets
+```
+
+### Deploy Services
+
+**Embedding service:**
+```bash
+REGION=europe-west2 make gcp-build
+
+PROJECT_ID=agentics-foundation25lon-1809 REGION=europe-west2 \
+ SERVICE_NAME=embedding-service \
+ MODEL_GCS_URI=gs://agentics-foundation25lon-1809-europe-west2-models-20251207/models/movie-minilm-v1/model-int8.onnx \
+ make gcp-deploy
+```
+
+**Infra service (graph recommendations):**
+```bash
+PROJECT_ID=agentics-foundation25lon-1809 REGION=europe-west2 \
+ SERVICE_NAME=infra-service make gcp-deploy-infra
+```
+
+### Deploy Pipeline Jobs
+
+```bash
+# Deploy jobs
+PROJECT_ID=agentics-foundation25lon-1809 REGION=europe-west2 \
+ AR_REPO=embedding-service make gcp-jobs-deploy
+
+# Run Phase 2 (embeddings + hyperedges)
+PROJECT_ID=agentics-foundation25lon-1809 REGION=europe-west2 make gcp-job-run-phase2
+
+# Run Phase 3 (fine-tuning → ONNX → INT8)
+PROJECT_ID=agentics-foundation25lon-1809 REGION=europe-west2 make gcp-job-run-phase3
+```
+
+---
+
+## Configuration
+
+### Environment Variables
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `DATABASE_URL` | PostgreSQL connection (via Secret Manager) | — |
+| `MODEL_GCS_URI` | GCS path to ONNX model | — |
+| `EMBEDDING_BACKEND` | Backend selection | `auto` |
+| `USE_GRAPH_SCORER` | Enable graph scoring | `false` |
+| `USE_RERANKER` | Enable reranking | `false` |
+| `VECTOR_DIM` | Embedding dimension | `384` |
+| `LOG_LEVEL` | Logging verbosity | `INFO` |
+| `ALLOWED_ORIGINS` | CORS origins (comma-separated) | — |
+
+### Embedding Backends
+
+| Value | Description |
+|-------|-------------|
+| `auto` | Auto-detect best available |
+| `onnx` | ONNX Runtime (recommended) |
+| `st` | Sentence Transformers |
+| `hash` | Hash-based fallback |
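+
+For offline smoke tests, the `hash` backend avoids any model download. A minimal sketch, assuming it runs from the service directory so `app` is importable:
+
+```python
+import os
+
+# Select the deterministic hash backend before the embedder is first constructed
+os.environ["EMBEDDING_BACKEND"] = "hash"
+
+from app.services.embedder import get_embedder
+
+vecs = get_embedder().encode(["neo-noir heist", "space opera adventure"])
+print(vecs.shape)  # (2, 384), unit-normalized rows
+```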
+
+---
+
+## Roadmap
+
+- [ ] Curriculum sampling with temperature-controlled hard negatives
+- [ ] Weak supervision from genre and co-watch edges during training
+- [ ] Portable ONNX with shared tokenizer
+- [ ] TinyBERT/MiniLM cross-encoder reranker
+- [ ] Nightly retraining with drift detection
+- [ ] Canary deployments with automated guardrails
+
+---
diff --git a/apps/cr-hypervr/app/__init__.py b/apps/cr-hypervr/app/__init__.py
new file mode 100644
index 00000000..4e3a74d3
--- /dev/null
+++ b/apps/cr-hypervr/app/__init__.py
@@ -0,0 +1,3 @@
+__all__ = ["__version__"]
+__version__ = "0.1.0"
+
diff --git a/apps/cr-hypervr/app/api/__init__.py b/apps/cr-hypervr/app/api/__init__.py
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/apps/cr-hypervr/app/api/__init__.py
@@ -0,0 +1 @@
+
diff --git a/apps/cr-hypervr/app/core/config.py b/apps/cr-hypervr/app/core/config.py
new file mode 100644
index 00000000..5c77bd39
--- /dev/null
+++ b/apps/cr-hypervr/app/core/config.py
@@ -0,0 +1,28 @@
+from pydantic import Field
+from pydantic_settings import BaseSettings, SettingsConfigDict
+from typing import List
+
+
+class Settings(BaseSettings):
+ app_name: str = Field(default="CR-HyperVR")
+ environment: str = Field(default="dev")
+ log_level: str = Field(default="INFO")
+ database_url: str | None = None
+ model_dir: str = Field(default="models/movie-minilm-v1")
+ base_model_dir: str = Field(default="models/base-minilm")
+ model_name: str = Field(default="movie-minilm-v1")
+ vector_dim: int = Field(default=384)
+ allowed_origins: List[str] = Field(default_factory=list, description="CORS allowed origins")
+ # Embedding backend: auto|onnx|st|hash (hash = deterministic lightweight backend for tests/offline)
+ embedding_backend: str = Field(default="auto")
+ # Graph scoring options
+ use_graph_scorer: bool = Field(default=False)
+ graph_score_weight: float = Field(default=0.05)
+ # Optional reranker toggle (stub implementation)
+ use_reranker: bool = Field(default=False)
+
+    model_config = SettingsConfigDict(env_file=".env")
+
+
+settings = Settings()
diff --git a/apps/cr-hypervr/app/db/__init__.py b/apps/cr-hypervr/app/db/__init__.py
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/apps/cr-hypervr/app/db/__init__.py
@@ -0,0 +1 @@
+
diff --git a/apps/cr-hypervr/app/db/client.py b/apps/cr-hypervr/app/db/client.py
new file mode 100644
index 00000000..799bf244
--- /dev/null
+++ b/apps/cr-hypervr/app/db/client.py
@@ -0,0 +1,155 @@
+from __future__ import annotations
+
+from typing import Any
+import asyncpg
+import numpy as np
+from app.core.config import settings
+
+
+class DB:
+ def __init__(self, dsn: str | None = None) -> None:
+ self._dsn = dsn or settings.database_url
+ self._pool: asyncpg.Pool | None = None
+
+ async def connect(self) -> None:
+ if self._pool is None:
+ if not self._dsn:
+ import os
+ env_dsn = os.getenv("DATABASE_URL")
+ if env_dsn:
+ self._dsn = env_dsn
+ else:
+ raise RuntimeError("DATABASE_URL not configured")
+ self._pool = await asyncpg.create_pool(self._dsn, min_size=1, max_size=5)
+
+ async def close(self) -> None:
+ if self._pool is not None:
+ await self._pool.close()
+ self._pool = None
+
+ async def fetch_similar(self, query_vec: np.ndarray, top_k: int = 10) -> list[dict[str, Any]]:
+ await self.connect()
+ assert self._pool is not None
+ # Query movies by cosine distance, return movie and score
+ q = """
+ SELECT m.movie_id, m.title, m.genres, 1 - (e.embedding <=> $1::vector) AS score
+ FROM movie_embeddings e
+ JOIN movies m USING (movie_id)
+ ORDER BY e.embedding <=> $1::vector
+ LIMIT $2
+ """
+ def _vec_to_pgtext(v: np.ndarray) -> str:
+ return "[" + ",".join(str(float(x)) for x in v.tolist()) + "]"
+ vec = _vec_to_pgtext(query_vec.astype(float))
+ async with self._pool.acquire() as conn:
+ rows = await conn.fetch(q, vec, top_k)
+ return [dict(r) for r in rows]
+
+ async def fetch_user_profile_embedding(self, user_id: int, min_rating: float = 4.0) -> np.ndarray | None:
+ """Return an average embedding of movies the user rated >= min_rating.
+ Falls back to None if no vectors exist.
+ """
+ await self.connect()
+ assert self._pool is not None
+ q = """
+ SELECT e.embedding
+ FROM user_ratings r
+ JOIN movie_embeddings e USING (movie_id)
+ WHERE r.user_id = $1 AND r.rating >= $2
+ LIMIT 1000
+ """
+ async with self._pool.acquire() as conn:
+ rows = await conn.fetch(q, user_id, min_rating)
+ if not rows:
+ return None
+ def _parse_vec(val: Any) -> np.ndarray:
+ if isinstance(val, str):
+ s = val.strip().strip("[]")
+ parts = [p for p in s.split(",") if p.strip() != ""]
+ return np.array([float(p) for p in parts], dtype=np.float32)
+ # assume list-like of floats
+ return np.array(list(val), dtype=np.float32)
+
+ vecs = [_parse_vec(r["embedding"]) for r in rows]
+ mean_vec = np.mean(np.stack(vecs, axis=0), axis=0)
+ # Normalize to unit length for cosine search
+ n = np.linalg.norm(mean_vec)
+ if n > 0:
+ mean_vec = mean_vec / n
+ return mean_vec.astype(np.float32)
+
+ async def fetch_genre_weights(self, movie_ids: list[int]) -> dict[int, float]:
+ """Sum of genre-edge weights per movie for simple graph boost."""
+ if not movie_ids:
+ return {}
+ await self.connect()
+ assert self._pool is not None
+ q = """
+ SELECT src_id AS movie_id, COALESCE(SUM(weight),0) AS w
+ FROM hyperedges
+ WHERE src_kind='movie' AND dst_kind='genre' AND src_id = ANY($1::bigint[])
+ GROUP BY src_id
+ """
+ async with self._pool.acquire() as conn:
+ rows = await conn.fetch(q, movie_ids)
+ return {int(r["movie_id"]): float(r["w"]) for r in rows}
+
+ async def fetch_neighbors_cowatch(self, movie_ids: list[int], top_k: int = 100) -> dict[int, float]:
+ """Return co‑watch neighbors aggregated across a set of seed movie_ids.
+
+ Uses hyperedges where (src_kind='movie', dst_kind='movie').
+ """
+ if not movie_ids:
+ return {}
+ await self.connect()
+ assert self._pool is not None
+ q = (
+ "SELECT dst_id AS movie_id, SUM(weight) AS w "
+ "FROM hyperedges WHERE src_kind='movie' AND dst_kind='movie' AND src_id = ANY($1::bigint[]) "
+ "GROUP BY dst_id ORDER BY SUM(weight) DESC LIMIT $2"
+ )
+ async with self._pool.acquire() as conn:
+ rows = await conn.fetch(q, movie_ids, top_k)
+ return {int(r["movie_id"]): float(r["w"]) for r in rows}
+
+ async def fetch_neighbors_shared_genre(self, movie_ids: list[int], top_k: int = 200) -> dict[int, float]:
+ """Return neighbors via shared genres.
+
+ We derive genre nodes from movie->genre edges and then collect other
+ movies pointing to those genres. Weight is sum of (w_src * w_dst).
+ """
+ if not movie_ids:
+ return {}
+ await self.connect()
+ assert self._pool is not None
+ q = (
+ "SELECT he2.src_id AS movie_id, SUM(he1.weight * he2.weight) AS w "
+ "FROM hyperedges he1 "
+ "JOIN hyperedges he2 ON he1.dst_kind='genre' AND he2.dst_kind='genre' AND he2.dst_id=he1.dst_id "
+ "WHERE he1.src_kind='movie' AND he1.src_id = ANY($1::bigint[]) "
+ "GROUP BY he2.src_id ORDER BY w DESC LIMIT $2"
+ )
+ async with self._pool.acquire() as conn:
+ rows = await conn.fetch(q, movie_ids, top_k)
+ return {int(r["movie_id"]): float(r["w"]) for r in rows}
+
+ async def fetch_movies_by_ids(self, movie_ids: list[int]) -> dict[int, dict]:
+ if not movie_ids:
+ return {}
+ await self.connect()
+ assert self._pool is not None
+ q = "SELECT movie_id, title, genres FROM movies WHERE movie_id = ANY($1::int[])"
+ async with self._pool.acquire() as conn:
+ rows = await conn.fetch(q, movie_ids)
+ return {int(r["movie_id"]): {"title": r["title"], "genres": r["genres"]} for r in rows}
+
+
+_db_singleton: DB | None = None
+
+
+def get_db() -> DB:
+ global _db_singleton
+ if _db_singleton is None:
+ _db_singleton = DB()
+ return _db_singleton
diff --git a/apps/cr-hypervr/app/main.py b/apps/cr-hypervr/app/main.py
new file mode 100644
index 00000000..01e593f2
--- /dev/null
+++ b/apps/cr-hypervr/app/main.py
@@ -0,0 +1,343 @@
+from fastapi import FastAPI, Depends, HTTPException, Response
+from app.core.config import settings
+from app import __version__
+from app.schemas import (
+ EmbedTextRequest,
+ EmbedBatchRequest,
+ EmbedVectorResponse,
+ SimilarSearchRequest,
+ SimilarSearchResponse,
+ SimilarItem,
+ RecommendRequest,
+ RecommendResponse,
+ MovieEmbedRequest,
+ UserEmbedRequest,
+ GraphRecommendRequest,
+ GraphRecommendResponse,
+ GraphRecommendItem,
+)
+from app.services import embedder as embedder_service
+from app.services.reranker import get_reranker
+from app.services.scoring import combine_scores, reorder_by_scores
+from typing import TYPE_CHECKING
+import numpy as np
+from fastapi.middleware.cors import CORSMiddleware
+
+# Simple in-memory metrics
+_metrics = {
+ "requests_total": 0,
+ "embed_text_total": 0,
+ "embed_batch_total": 0,
+ "embed_movie_total": 0,
+ "embed_user_total": 0,
+ "search_similar_total": 0,
+ "search_recommend_total": 0,
+ "graph_recommend_total": 0,
+}
+
+
+if TYPE_CHECKING: # for type checkers only
+ from app.db.client import DB # pragma: no cover
+
+
+def _get_db_dep():
+ # Lazy import; tolerate missing asyncpg in environments without DB
+ try:
+ from app.db.client import get_db # type: ignore
+ return get_db()
+ except Exception:
+ class _NoDB:
+ async def connect(self):
+ raise RuntimeError("DATABASE_URL not configured or driver not installed")
+
+ return _NoDB()
+
+
+def create_app() -> FastAPI:
+ app = FastAPI(title=settings.app_name, version=__version__)
+
+ # CORS
+ if settings.allowed_origins:
+ app.add_middleware(
+ CORSMiddleware,
+ allow_origins=settings.allowed_origins,
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"],
+ )
+
+ @app.get("/healthz")
+ async def healthz():
+ return {"status": "ok"}
+
+ @app.get("/")
+ async def root():
+ return {
+ "service": settings.app_name,
+ "version": __version__,
+ "environment": settings.environment,
+ }
+
+ @app.post("/embed/text", response_model=EmbedVectorResponse)
+ async def embed_text(payload: EmbedTextRequest):
+ _metrics["requests_total"] += 1
+ _metrics["embed_text_total"] += 1
+ vec = embedder_service.get_embedder().encode([payload.text])[0]
+ return {
+ "embedding": vec.tolist(),
+ "dimension": settings.vector_dim,
+ "model": settings.model_name,
+ }
+
+ @app.post("/embed/batch", response_model=list[EmbedVectorResponse])
+ async def embed_batch(payload: EmbedBatchRequest):
+ _metrics["requests_total"] += 1
+ _metrics["embed_batch_total"] += 1
+ vecs = embedder_service.get_embedder().encode(payload.texts)
+ return [
+ {
+ "embedding": v.tolist(),
+ "dimension": settings.vector_dim,
+ "model": settings.model_name,
+ }
+ for v in vecs
+ ]
+
+ @app.post("/embed/movie", response_model=EmbedVectorResponse)
+ async def embed_movie(payload: MovieEmbedRequest):
+ _metrics["requests_total"] += 1
+ _metrics["embed_movie_total"] += 1
+ genres = ", ".join(payload.genres) if payload.genres else ""
+ desc = (payload.description or "").strip()
+ if len(desc) > 500:
+ desc = desc[:500]
+ text = f"{payload.title}. {genres}. {desc}"
+ vec = embedder_service.get_embedder().encode([text])[0]
+ return {
+ "embedding": vec.tolist(),
+ "dimension": settings.vector_dim,
+ "model": settings.model_name,
+ }
+
+ @app.post("/embed/user", response_model=EmbedVectorResponse)
+ async def embed_user(payload: UserEmbedRequest):
+ _metrics["requests_total"] += 1
+ _metrics["embed_user_total"] += 1
+ likes = ", ".join(payload.liked_movies)
+ top_genres = ", ".join(payload.liked_genres)
+ dislikes = ", ".join(payload.disliked_genres)
+ text = f"Enjoys {top_genres}. Liked movies such as {likes}. Avoids {dislikes}."
+ vec = embedder_service.get_embedder().encode([text])[0]
+ return {
+ "embedding": vec.tolist(),
+ "dimension": settings.vector_dim,
+ "model": settings.model_name,
+ }
+
+ @app.post("/search/similar", response_model=SimilarSearchResponse)
+ async def search_similar(payload: SimilarSearchRequest, db = Depends(_get_db_dep)):
+ _metrics["requests_total"] += 1
+ _metrics["search_similar_total"] += 1
+ query_vec = embedder_service.get_embedder().encode([payload.text])[0]
+ items = await db.fetch_similar(query_vec.astype(np.float32), top_k=payload.top_k)
+ # Optional graph-based scoring using genre hyperedges
+ if getattr(settings, "use_graph_scorer", False) and items:
+ mids = [int(i["movie_id"]) for i in items]
+ gweights = await db.fetch_genre_weights(mids)
+ base = {int(i["movie_id"]): float(i.get("score", 0.0)) for i in items}
+ scores = combine_scores(base, gweights, weight=getattr(settings, "graph_score_weight", 0.05))
+ items = reorder_by_scores(items, scores)
+ # Optional rerank
+ if settings.use_reranker and items:
+ items = get_reranker().rerank(payload.text, items)
+ return {
+ "items": [
+ SimilarItem(movie_id=i["movie_id"], title=i["title"], genres=i.get("genres"), score=float(i["score"]))
+ for i in items
+ ]
+ }
+
+ @app.post("/search/recommend", response_model=RecommendResponse)
+ async def recommend(payload: RecommendRequest, db = Depends(_get_db_dep)):
+ _metrics["requests_total"] += 1
+ _metrics["search_recommend_total"] += 1
+ # Prefer DB-derived user profile embedding (avg of liked items)
+ vec_np = await db.fetch_user_profile_embedding(payload.user_id)
+ if vec_np is None:
+ # Fallback: encode a deterministic user token
+ text = f"user_id:{payload.user_id}"
+ vec_np = embedder_service.get_embedder().encode([text])[0].astype(np.float32)
+ vec = vec_np.astype(np.float32)
+ items = await db.fetch_similar(vec, top_k=payload.top_k + (len(payload.exclude_movie_ids) if payload.exclude_movie_ids else 0))
+ exclude = set(payload.exclude_movie_ids or [])
+ filtered = [i for i in items if i["movie_id"] not in exclude]
+ # Optional graph-based scoring
+ if getattr(settings, "use_graph_scorer", False) and filtered:
+ mids = [int(i["movie_id"]) for i in filtered]
+ gweights = await db.fetch_genre_weights(mids)
+ base = {int(i["movie_id"]): float(i.get("score", 0.0)) for i in filtered}
+ scores = combine_scores(base, gweights, weight=getattr(settings, "graph_score_weight", 0.05))
+ filtered = reorder_by_scores(filtered, scores)
+ # Optional rerank
+ if settings.use_reranker and filtered:
+ filtered = get_reranker().rerank("user profile", filtered)
+ filtered = filtered[: payload.top_k]
+ return {
+ "items": [
+ SimilarItem(movie_id=i["movie_id"], title=i["title"], genres=i.get("genres"), score=float(i["score"]))
+ for i in filtered
+ ]
+ }
+
+ @app.post("/graph/recommend", response_model=GraphRecommendResponse)
+ async def graph_recommend(payload: GraphRecommendRequest, db = Depends(_get_db_dep)):
+ """Embed free‑text query, seed via vector search, expand through hypergraph, and return recommendations.
+
+ Expansion rules:
+ - hops>=1 → co‑watch neighbors (movie→movie)
+ - hops>=2 → shared‑genre neighbors (movie→genre→movie)
+ Scores are normalized per‑signal and linearly combined using weights.
+ """
+ _metrics["requests_total"] += 1
+ _metrics["graph_recommend_total"] += 1
+
+ # Defensive: ensure DB has expected API
+ for need in ("fetch_similar", "fetch_neighbors_cowatch", "fetch_neighbors_shared_genre", "fetch_movies_by_ids"):
+ if not hasattr(db, need):
+ raise HTTPException(status_code=503, detail="Database not configured")
+
+ # 1) Seed via vector search
+ query_vec = embedder_service.get_embedder().encode([payload.query])[0]
+ seeds = await db.fetch_similar(query_vec.astype(np.float32), top_k=max(payload.seed_top_k, payload.top_k))
+ seed_ids = [int(s["movie_id"]) for s in seeds]
+ embed_scores = {int(s["movie_id"]): float(s.get("score", 0.0)) for s in seeds}
+
+ # 2) Graph expansion signals
+ cowatch: dict[int, float] = {}
+ by_genre: dict[int, float] = {}
+ if payload.hops >= 1 and seed_ids:
+ cowatch = await db.fetch_neighbors_cowatch(seed_ids, top_k=max(3 * payload.seed_top_k, 200))
+ if payload.hops >= 2 and seed_ids:
+ by_genre = await db.fetch_neighbors_shared_genre(seed_ids, top_k=max(5 * payload.seed_top_k, 400))
+
+ # 3) Normalize each signal to [0,1] for stable mixing
+ def _normalize(d: dict[int, float]) -> dict[int, float]:
+ if not d:
+ return {}
+ m = max(d.values())
+ if m <= 0:
+ return {k: 0.0 for k in d}
+ return {k: float(v) / float(m) for k, v in d.items()}
+
+ embed_n = _normalize(embed_scores)
+ cowatch_n = _normalize(cowatch)
+ genre_n = _normalize(by_genre)
+
+ # 4) Aggregate combined scores; exclude seed items for recommendations
+ combined: dict[int, dict[str, float]] = {}
+ keys = set(embed_n) | set(cowatch_n) | set(genre_n)
+ for mid in keys:
+ if mid in seed_ids:
+ continue
+ e = embed_n.get(mid, 0.0)
+ c = cowatch_n.get(mid, 0.0)
+ g = genre_n.get(mid, 0.0)
+ score = payload.embed_weight * e + payload.cowatch_weight * c + payload.genre_weight * g
+ if score <= 0:
+ continue
+ combined[mid] = {"score": score, "e": e, "c": c, "g": g}
+
+ ranked = sorted(combined.items(), key=lambda kv: kv[1]["score"], reverse=True)
+ ranked = ranked[: payload.top_k]
+ mids = [mid for mid, _ in ranked]
+ meta = await db.fetch_movies_by_ids(mids)
+
+ def _sources(sig: dict[str, float]) -> list[str]:
+ out: list[str] = []
+ if sig.get("e", 0) > 0:
+ out.append("embed")
+ if sig.get("c", 0) > 0:
+ out.append("cowatch")
+ if sig.get("g", 0) > 0:
+ out.append("genre")
+ return out
+
+ items = []
+ for mid, sig in ranked:
+ m = meta.get(mid, {})
+ items.append(
+ GraphRecommendItem(
+ movie_id=mid,
+ title=m.get("title", str(mid)),
+ genres=m.get("genres"),
+ score=float(sig["score"]),
+ sources=_sources(sig) or None,
+ )
+ )
+ return {"items": items}
+
+ @app.get("/ready")
+ async def ready() -> dict:
+ # Basic readiness check (lightweight)
+ try:
+ _ = embedder_service.get_embedder()
+ return {"ready": True}
+ except Exception:
+ return {"ready": False}
+
+ @app.get("/metrics")
+ async def metrics() -> Response:
+ lines = [
+ "# HELP service_requests_total Total HTTP requests.",
+ "# TYPE service_requests_total counter",
+ f"service_requests_total {_metrics['requests_total']}",
+ "# HELP embed_requests_total Total embed requests by type.",
+ "# TYPE embed_requests_total counter",
+ f"embed_requests_total{{type=\"text\"}} {_metrics['embed_text_total']}",
+ f"embed_requests_total{{type=\"batch\"}} {_metrics['embed_batch_total']}",
+ f"embed_requests_total{{type=\"movie\"}} {_metrics['embed_movie_total']}",
+ f"embed_requests_total{{type=\"user\"}} {_metrics['embed_user_total']}",
+ "# HELP search_requests_total Total search requests by type.",
+ "# TYPE search_requests_total counter",
+ f"search_requests_total{{type=\"similar\"}} {_metrics['search_similar_total']}",
+ f"search_requests_total{{type=\"recommend\"}} {_metrics['search_recommend_total']}",
+ f"search_requests_total{{type=\"graph_recommend\"}} {_metrics['graph_recommend_total']}",
+ ]
+ return Response("\n".join(lines) + "\n", media_type="text/plain; version=0.0.4")
+
+ # Debug endpoints (safe for demos; avoid leaking secrets). These endpoints
+ # do not raise if DB is unavailable; they return availability flags.
+ @app.get("/debug/db_counts")
+ async def db_counts(db = Depends(_get_db_dep)):
+ try:
+ # Count movies and embeddings
+ await db.connect()
+ conn = db._pool # type: ignore[attr-defined]
+ assert conn is not None
+ async with conn.acquire() as c: # type: ignore
+ m = await c.fetchval("SELECT COUNT(*) FROM movies")
+ e = await c.fetchval("SELECT COUNT(*) FROM movie_embeddings")
+ return {"available": True, "movies": int(m or 0), "embeddings": int(e or 0)}
+ except Exception:
+ # DB not configured or unreachable
+ return {"available": False, "movies": 0, "embeddings": 0}
+
+ @app.get("/debug/sample_movie")
+ async def sample_movie(db = Depends(_get_db_dep)):
+ try:
+ await db.connect()
+ conn = db._pool # type: ignore[attr-defined]
+ assert conn is not None
+ async with conn.acquire() as c: # type: ignore
+ row = await c.fetchrow("SELECT movie_id, title FROM movies LIMIT 1")
+ if row:
+ return {"available": True, "movie": {"movie_id": int(row["movie_id"]), "title": row["title"]}}
+ return {"available": True, "movie": None}
+ except Exception:
+ return {"available": False, "movie": None}
+
+ return app
+
+
+app = create_app()
diff --git a/apps/cr-hypervr/app/schemas.py b/apps/cr-hypervr/app/schemas.py
new file mode 100644
index 00000000..4f4c864a
--- /dev/null
+++ b/apps/cr-hypervr/app/schemas.py
@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+from pydantic import BaseModel, Field
+from typing import List, Optional
+
+
+class EmbedTextRequest(BaseModel):
+ text: str
+
+
+class EmbedBatchRequest(BaseModel):
+ texts: List[str]
+
+
+class EmbedVectorResponse(BaseModel):
+ embedding: List[float] = Field(..., description="384-dimensional embedding")
+ dimension: int = Field(384, description="Embedding dimension")
+ model: str = Field("movie-minilm-v1", description="Model identifier")
+
+
+class SimilarSearchRequest(BaseModel):
+ text: str
+ top_k: int = 10
+
+
+class SimilarItem(BaseModel):
+ movie_id: int
+ title: str
+ genres: Optional[str] = None
+ score: float
+
+
+class SimilarSearchResponse(BaseModel):
+ items: List[SimilarItem]
+
+
+class RecommendRequest(BaseModel):
+ user_id: int
+ top_k: int = 10
+ exclude_movie_ids: List[int] | None = None
+
+
+class RecommendResponse(BaseModel):
+ items: List[SimilarItem]
+
+
+class MovieEmbedRequest(BaseModel):
+ title: str
+ description: Optional[str] = None
+ genres: List[str] = Field(default_factory=list)
+
+
+class UserEmbedRequest(BaseModel):
+ liked_genres: List[str] = Field(default_factory=list)
+ liked_movies: List[str] = Field(default_factory=list)
+ disliked_genres: List[str] = Field(default_factory=list)
+
+
+# --- Hypergraph query/recommendation ---
+class GraphRecommendRequest(BaseModel):
+ """Free‑text query that seeds vector search, then expands via hypergraph.
+
+ Fields allow light tuning without overcomplicating the interface.
+ """
+ query: str = Field(..., description="User query to embed and seed the graph search")
+ top_k: int = Field(10, description="Number of recommendations to return")
+ seed_top_k: int = Field(20, description="Seed candidates from vector search")
+ hops: int = Field(2, description="Depth for graph expansion (1=cowatch, 2=+genres)")
+ embed_weight: float = Field(1.0, description="Weight for base embedding similarity")
+ cowatch_weight: float = Field(0.5, description="Weight for co‑watch edges")
+ genre_weight: float = Field(0.25, description="Weight for shared‑genre signal")
+
+
+class GraphRecommendItem(SimilarItem):
+ sources: list[str] | None = Field(default=None, description="Signals that contributed (embed|cowatch|genre)")
+
+
+class GraphRecommendResponse(BaseModel):
+ items: List[GraphRecommendItem]
diff --git a/apps/cr-hypervr/app/services/__init__.py b/apps/cr-hypervr/app/services/__init__.py
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/apps/cr-hypervr/app/services/__init__.py
@@ -0,0 +1 @@
+
diff --git a/apps/cr-hypervr/app/services/embedder.py b/apps/cr-hypervr/app/services/embedder.py
new file mode 100644
index 00000000..c4f245ba
--- /dev/null
+++ b/apps/cr-hypervr/app/services/embedder.py
@@ -0,0 +1,142 @@
+from __future__ import annotations
+
+import threading
+from functools import lru_cache
+import os
+import numpy as np
+
+try: # optional
+ import onnxruntime as ort # type: ignore
+except Exception: # pragma: no cover - optional
+ ort = None
+
+from app.core.config import settings
+
+
+class _Embedder:
+ def __init__(self) -> None:
+ self._lock = threading.Lock()
+ self._mode = "st"
+ self._st_model = None # lazy import to avoid heavy deps during tests
+ self._ort_session = None
+ self._prepare()
+
+ def _prepare(self) -> None:
+ # Optional: pull fine-tuned model artifacts from GCS if requested
+ try:
+ self._maybe_pull_model_from_gcs()
+ except Exception:
+ # Best-effort; proceed with normal backend selection
+ pass
+ # Allow explicit backend override via env or settings
+ override = os.getenv("EMBEDDING_BACKEND") or getattr(settings, "embedding_backend", "auto")
+ if override == "hash":
+ self._mode = "hash"
+ return
+ if override == "onnx":
+ onnx_path = f"{settings.model_dir}/model-int8.onnx"
+ if ort is not None:
+ try:
+ self._ort_session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"]) # type: ignore[arg-type]
+ self._mode = "onnx"
+ return
+ except Exception:
+ pass
+ # fallback chain: ST → hash
+ try:
+ self._ensure_st()
+ self._mode = "st"
+ return
+ except Exception:
+ self._mode = "hash"
+ return
+ # Prefer ONNX if available (auto)
+ onnx_path = f"{settings.model_dir}/model-int8.onnx"
+ if ort is not None:
+ try:
+ self._ort_session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"]) # type: ignore[arg-type]
+ self._mode = "onnx"
+ return
+ except Exception:
+ pass
+ # Fallback chain: ST → hash
+ try:
+ self._ensure_st()
+ self._mode = "st"
+ return
+ except Exception:
+ self._mode = "hash"
+ return
+
+ def _ensure_st(self) -> None:
+ if self._st_model is None:
+ from sentence_transformers import SentenceTransformer # type: ignore
+ self._st_model = SentenceTransformer(settings.model_dir if settings.model_dir else settings.base_model_dir)
+
+ def _maybe_pull_model_from_gcs(self) -> None:
+ # If a GCS URI is provided and local ONNX not present, pull down.
+ gcs_uri = os.getenv("MODEL_GCS_URI") or os.getenv("GCS_MODEL_URI")
+ if not gcs_uri or not gcs_uri.startswith("gs://"):
+ # Derive from bucket hint if available
+ bucket = os.getenv("GCS_MODELS_BUCKET")
+ if bucket and bucket.startswith("gs://"):
+ gcs_uri = f"{bucket.rstrip('/')}/models/movie-minilm-v1/model-int8.onnx"
+ else:
+ return
+ # Destination
+ model_dir = getattr(settings, "model_dir", "models/movie-minilm-v1") or "models/movie-minilm-v1"
+ onnx_path = os.path.join(model_dir, "model-int8.onnx")
+ # Already present → nothing to do
+ if os.path.exists(onnx_path):
+ return
+ os.makedirs(model_dir, exist_ok=True)
+ # Pull file or directory
+ import fsspec
+
+ fs = fsspec.filesystem("gcs")
+ if gcs_uri.endswith(".onnx"):
+ with fs.open(gcs_uri, "rb") as src, open(onnx_path, "wb") as dst: # type: ignore[attr-defined]
+ dst.write(src.read())
+ return
+ # Otherwise, treat as prefix and sync all files
+ # Ensure trailing slash for globbing
+ prefix = gcs_uri.rstrip("/") + "/"
+ files = [p for p in fs.glob(prefix + "**") if not p.endswith("/")]
+ for obj in files:
+ rel = obj[len(prefix) :]
+ local = os.path.join(model_dir, rel)
+ os.makedirs(os.path.dirname(local), exist_ok=True)
+ with fs.open(obj, "rb") as src, open(local, "wb") as dst: # type: ignore[attr-defined]
+ dst.write(src.read())
+
+ def _encode_hash(self, texts: list[str]) -> np.ndarray:
+ dim = int(getattr(settings, "vector_dim", 384))
+ out = []
+ for t in texts:
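+            # Derive a deterministic per-text seed from the UTF-8 byte sum and the text length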
+ seed = int.from_bytes(np.frombuffer(t.encode("utf-8"), dtype=np.uint8).sum().tobytes(), "little", signed=False) ^ (len(t) * 1315423911)
+ rng = np.random.default_rng(seed)
+ v = rng.standard_normal(dim)
+ n = np.linalg.norm(v)
+ if n == 0:
+ out.append(np.zeros(dim, dtype=np.float32))
+ else:
+ out.append((v / n).astype(np.float32))
+ return np.stack(out, axis=0)
+
+ def encode(self, texts: list[str]) -> np.ndarray:
+ with self._lock:
+ if self._mode == "hash":
+ return self._encode_hash(texts)
+ if self._mode == "onnx" and self._ort_session is not None:
+ # For simplicity, use ST encode even when ONNX present (pipeline optimizable later)
+ self._ensure_st()
+ vecs = self._st_model.encode(texts, normalize_embeddings=True, convert_to_numpy=True)
+ else:
+ self._ensure_st()
+ vecs = self._st_model.encode(texts, normalize_embeddings=True, convert_to_numpy=True)
+ return vecs.astype(np.float32)
+
+
+@lru_cache(maxsize=1)
+def get_embedder() -> _Embedder:
+ return _Embedder()
diff --git a/apps/cr-hypervr/app/services/reranker.py b/apps/cr-hypervr/app/services/reranker.py
new file mode 100644
index 00000000..f1f8576c
--- /dev/null
+++ b/apps/cr-hypervr/app/services/reranker.py
@@ -0,0 +1,30 @@
+from __future__ import annotations
+
+from functools import lru_cache
+from typing import Sequence
+
+
+class Reranker:
+ """
+ Tiny, optional reranker stub. Deterministic and lightweight so it can
+ be enabled in production without heavy deps. Intended as a seam where
+ a true cross-encoder (TinyBERT, etc.) could be integrated later.
+ """
+
+ def rerank(self, query: str, items: Sequence[dict]) -> list[dict]:
+ # Heuristic: prefer titles that share tokens with the query (case-insensitive),
+ # stable sort to keep original ranking when scores tie.
+ q_tokens = {t for t in query.lower().split() if t}
+
+ def score(it: dict) -> int:
+ title = str(it.get("title", "")).lower()
+ tokens = set(title.split())
+ return len(q_tokens & tokens)
+
+ return sorted(list(items), key=score, reverse=True)
+
+
+@lru_cache(maxsize=1)
+def get_reranker() -> Reranker:
+ return Reranker()
+
diff --git a/apps/cr-hypervr/app/services/scoring.py b/apps/cr-hypervr/app/services/scoring.py
new file mode 100644
index 00000000..37cedd42
--- /dev/null
+++ b/apps/cr-hypervr/app/services/scoring.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from typing import Dict, List
+
+
+def combine_scores(
+ base_scores: Dict[int, float],
+ genre_weights: Dict[int, float],
+ weight: float = 0.05,
+) -> Dict[int, float]:
+ """
+ Combine cosine similarity scores with simple graph-derived weights.
+ For now: new = base + weight * genre_weight(movie_id).
+ """
+ out: Dict[int, float] = {}
+ for mid, s in base_scores.items():
+ g = genre_weights.get(mid, 0.0)
+ out[mid] = float(s) + float(weight) * float(g)
+ return out
+
+
+def reorder_by_scores(items: List[dict], scores: Dict[int, float]) -> List[dict]:
+ return sorted(items, key=lambda it: scores.get(int(it.get("movie_id")), 0.0), reverse=True)
+
diff --git a/apps/cr-hypervr/cloudbuild.yaml b/apps/cr-hypervr/cloudbuild.yaml
new file mode 100644
index 00000000..883a3a17
--- /dev/null
+++ b/apps/cr-hypervr/cloudbuild.yaml
@@ -0,0 +1,10 @@
+steps:
+ - name: 'gcr.io/cloud-builders/docker'
+ args: ['build', '-t', '${_IMAGE}', '.']
+images:
+ - '${_IMAGE}'
+substitutions:
+ # Default to Artifact Registry in-region; override _IMAGE if needed
+ _IMAGE: '${_REGION}-docker.pkg.dev/$PROJECT_ID/embedding-service/api:latest'
+options:
+ logging: CLOUD_LOGGING_ONLY
diff --git a/apps/cr-hypervr/db/pgvector.sql b/apps/cr-hypervr/db/pgvector.sql
new file mode 100644
index 00000000..0aa0fc22
--- /dev/null
+++ b/apps/cr-hypervr/db/pgvector.sql
@@ -0,0 +1 @@
+CREATE EXTENSION IF NOT EXISTS vector;
diff --git a/apps/cr-hypervr/db/schema.sql b/apps/cr-hypervr/db/schema.sql
new file mode 100644
index 00000000..542690dd
--- /dev/null
+++ b/apps/cr-hypervr/db/schema.sql
@@ -0,0 +1,56 @@
+-- Enable pgvector extension
+CREATE EXTENSION IF NOT EXISTS vector;
+
+-- Movies base table (enriched metadata subset)
+CREATE TABLE IF NOT EXISTS movies (
+ movie_id INTEGER PRIMARY KEY,
+ title TEXT NOT NULL,
+ genres TEXT,
+ overview TEXT,
+ release_year INTEGER,
+ tmdb_id INTEGER
+);
+
+-- Movie embeddings (384-d float32 vectors, unit-normalized for cosine)
+CREATE TABLE IF NOT EXISTS movie_embeddings (
+ movie_id INTEGER PRIMARY KEY REFERENCES movies(movie_id) ON DELETE CASCADE,
+ embedding vector(384)
+);
+
+-- HNSW index for fast cosine similarity search
+DROP INDEX IF EXISTS idx_movie_embeddings_hnsw;
+CREATE INDEX idx_movie_embeddings_hnsw ON movie_embeddings USING hnsw (embedding vector_cosine_ops);
+
+-- Optional: user cached embeddings
+CREATE TABLE IF NOT EXISTS user_embeddings (
+ user_id BIGINT PRIMARY KEY,
+ embedding vector(384),
+ updated_at TIMESTAMP DEFAULT now()
+);
+
+-- User ratings table (MovieLens compatible)
+-- Kept minimal for analytics/pipeline joins; raw imports may live in GCS
+CREATE TABLE IF NOT EXISTS user_ratings (
+ user_id BIGINT NOT NULL,
+ movie_id INTEGER NOT NULL REFERENCES movies(movie_id) ON DELETE CASCADE,
+ rating NUMERIC(2,1) NOT NULL CHECK (rating >= 0.5 AND rating <= 5.0),
+ rated_at TIMESTAMP,
+ PRIMARY KEY (user_id, movie_id)
+);
+CREATE INDEX IF NOT EXISTS idx_user_ratings_user ON user_ratings(user_id);
+CREATE INDEX IF NOT EXISTS idx_user_ratings_movie ON user_ratings(movie_id);
+
+-- Hyperedges table to support graph-like relationships (e.g., co-watch, genre-affinity)
+-- Flexible JSONB payload for features/weights
+CREATE TABLE IF NOT EXISTS hyperedges (
+ id BIGSERIAL PRIMARY KEY,
+ src_kind TEXT NOT NULL, -- e.g., 'user' or 'movie'
+ src_id BIGINT NOT NULL,
+ dst_kind TEXT NOT NULL, -- e.g., 'movie' or 'genre'
+ dst_id BIGINT NOT NULL,
+ weight REAL DEFAULT 1.0,
+ payload JSONB,
+ created_at TIMESTAMP DEFAULT now()
+);
+CREATE INDEX IF NOT EXISTS idx_hyperedges_src ON hyperedges(src_kind, src_id);
+CREATE INDEX IF NOT EXISTS idx_hyperedges_dst ON hyperedges(dst_kind, dst_id);
diff --git a/apps/cr-hypervr/pipeline/__init__.py b/apps/cr-hypervr/pipeline/__init__.py
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/apps/cr-hypervr/pipeline/__init__.py
@@ -0,0 +1 @@
+
diff --git a/apps/cr-hypervr/pipeline/__pycache__/__init__.cpython-313.pyc b/apps/cr-hypervr/pipeline/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 00000000..57004dc5
Binary files /dev/null and b/apps/cr-hypervr/pipeline/__pycache__/__init__.cpython-313.pyc differ
diff --git a/apps/cr-hypervr/pipeline/__pycache__/triplets.cpython-313.pyc b/apps/cr-hypervr/pipeline/__pycache__/triplets.cpython-313.pyc
new file mode 100644
index 00000000..2cdc58eb
Binary files /dev/null and b/apps/cr-hypervr/pipeline/__pycache__/triplets.cpython-313.pyc differ
diff --git a/apps/cr-hypervr/pipeline/__pycache__/user_profiles.cpython-313.pyc b/apps/cr-hypervr/pipeline/__pycache__/user_profiles.cpython-313.pyc
new file mode 100644
index 00000000..accd59cb
Binary files /dev/null and b/apps/cr-hypervr/pipeline/__pycache__/user_profiles.cpython-313.pyc differ
diff --git a/apps/cr-hypervr/pipeline/netflix_parser.py b/apps/cr-hypervr/pipeline/netflix_parser.py
new file mode 100644
index 00000000..ce7afd85
--- /dev/null
+++ b/apps/cr-hypervr/pipeline/netflix_parser.py
@@ -0,0 +1,77 @@
+from __future__ import annotations
+
+import re
+from pathlib import Path
+import pandas as pd
+from tqdm import tqdm
+
+
+MOVIE_HEADER_RE = re.compile(r"^(\d+):\s*$")
+
+
+def parse_combined_files(netflix_dir: Path) -> pd.DataFrame:
+ files = [
+ netflix_dir / "combined_data_1.txt",
+ netflix_dir / "combined_data_2.txt",
+ netflix_dir / "combined_data_3.txt",
+ netflix_dir / "combined_data_4.txt",
+ ]
+ rows: list[tuple[int, int, int, str]] = [] # movie_id, user_id, rating, date
+ for f in files:
+ if not f.exists():
+ continue
+ movie_id = None
+ with f.open("r", encoding="latin-1") as fh:
+            for line in tqdm(fh, desc=f.name, unit=" lines", leave=False):
+ m = MOVIE_HEADER_RE.match(line)
+ if m:
+ movie_id = int(m.group(1))
+ continue
+ if movie_id is None:
+ continue
+ parts = line.strip().split(",")
+ if len(parts) != 3:
+ continue
+ user_id, rating, date = int(parts[0]), int(parts[1]), parts[2]
+ rows.append((movie_id, user_id, rating, date))
+
+ df = pd.DataFrame(rows, columns=["movie_id", "user_id", "rating", "date"])
+ return df
+
+
+def load_movie_titles(netflix_dir: Path) -> pd.DataFrame:
+ mt = netflix_dir / "movie_titles.csv"
+ if not mt.exists():
+ raise FileNotFoundError("movie_titles.csv not found")
+ # movie_id, year, title
+ df = pd.read_csv(mt, header=None, names=["movie_id", "year", "title"], encoding="latin-1")
+ return df
+
+
+def build(netflix_dir: str = "data/netflix", out_dir: str = "data/processed") -> None:
+ netflix = Path(netflix_dir)
+ out = Path(out_dir)
+ out.mkdir(parents=True, exist_ok=True)
+
+ print("Parsing Netflix combined data files...")
+ ratings = parse_combined_files(netflix)
+ print(f"Parsed {len(ratings):,} ratings")
+ ratings.to_parquet(out / "ratings.parquet")
+
+ print("Loading movie titles...")
+ movies = load_movie_titles(netflix)
+ movies.to_parquet(out / "movies.parquet")
+
+ # Basic validation stats
+ print(
+ {
+ "users": ratings["user_id"].nunique(),
+ "movies": ratings["movie_id"].nunique(),
+ "ratings": len(ratings),
+ }
+ )
+
+
+if __name__ == "__main__":
+ build()
+
diff --git a/apps/cr-hypervr/pipeline/tmdb_enrich.py b/apps/cr-hypervr/pipeline/tmdb_enrich.py
new file mode 100644
index 00000000..7ffe9d5b
--- /dev/null
+++ b/apps/cr-hypervr/pipeline/tmdb_enrich.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from pathlib import Path
+import pandas as pd
+
+
+def normalize_title(t: str) -> str:
+ return (t or "").strip().lower()
+
+
+def enrich(
+ processed_dir: str = "data/processed",
+ tmdb_csv_path: str = "data/tmdb/movies_metadata.csv",
+ out_path: str = "data/processed/movies_enriched.parquet",
+) -> None:
+ movies_pq = Path(processed_dir) / "movies.parquet"
+ if not movies_pq.exists():
+ raise FileNotFoundError("Run netflix_parser.build() first to generate movies.parquet")
+
+ movies = pd.read_parquet(movies_pq)
+ movies["title_norm"] = movies["title"].map(normalize_title)
+
+ if Path(tmdb_csv_path).exists():
+ tm = pd.read_csv(tmdb_csv_path, low_memory=False)
+ # Keep relevant fields
+ keep = [
+ "id",
+ "title",
+ "overview",
+ "genres",
+ "release_date",
+ "vote_average",
+ "popularity",
+ ]
+ tm = tm[keep]
+ tm["title_norm"] = tm["title"].map(normalize_title)
+
+ # Naive title-based join (improve later with year-based matching)
+ merged = movies.merge(tm, on="title_norm", how="left", suffixes=("", "_tmdb"))
+ merged.rename(columns={"id": "tmdb_id"}, inplace=True)
+ merged.to_parquet(out_path)
+ print(f"Enriched movies saved to {out_path}")
+ else:
+ # Fallback: save base movies without enrichment
+ movies.to_parquet(out_path)
+ print("TMDB CSV not found; saved base movies without enrichment.")
+
+
+if __name__ == "__main__":
+ enrich()
+
diff --git a/apps/cr-hypervr/pipeline/triplets.py b/apps/cr-hypervr/pipeline/triplets.py
new file mode 100644
index 00000000..26eeab12
--- /dev/null
+++ b/apps/cr-hypervr/pipeline/triplets.py
@@ -0,0 +1,97 @@
+from __future__ import annotations
+
+from pathlib import Path
+import os
+import pandas as pd
+import numpy as np
+import glob
+try:
+ import gcsfs # noqa: F401
+except Exception:
+ gcsfs = None
+
+
+def _storage_options(path: str | Path) -> dict | None:
+ p = str(path)
+ return {"token": "cloud"} if p.startswith("gs://") else None
+
+
+def _list_ratings_parts(processed_dir: str) -> list[str]:
+ pat = f"{processed_dir}/ratings_enriched-*.parquet"
+ # Try gcsfs glob
+ if str(processed_dir).startswith("gs://"):
+ try:
+ import gcsfs # type: ignore
+ fs = gcsfs.GCSFileSystem()
+ matches = sorted(fs.glob(pat))
+ if matches:
+ return [m if m.startswith("gs://") else ("gs://" + m) for m in matches]
+ except Exception:
+ pass
+ else:
+ files = sorted(glob.glob(pat))
+ if files:
+ return files
+ # Fallback: sequential probe
+ return [f"{processed_dir}/ratings_enriched-{i:05d}.parquet" for i in range(0, 200)]
+
+
+def generate_triplets(
+ processed_dir: str = os.getenv("GCS_PROCESSED_PREFIX", "data/processed"),
+ out_dir: str = os.getenv("GCS_TRIPLETS_PREFIX", "data/processed/triplets"),
+ user_sample: int | None = 10_000,
+ random_state: int = 42,
+) -> None:
+ # Avoid Path round-tripping for GCS URIs; Path("gs://...") becomes "gs:/..."
+ out_is_gcs = str(out_dir).startswith("gs://")
+ if not out_is_gcs:
+ Path(out_dir).mkdir(parents=True, exist_ok=True)
+
+ # Load movies metadata (small enough)
+ movies_path = f"{processed_dir}/movies_with_descriptions.parquet" if str(processed_dir).startswith("gs://") else Path(processed_dir) / "movies_with_descriptions.parquet"
+ movies = pd.read_parquet(movies_path, storage_options=_storage_options(movies_path))
+
+ # Split positives and negatives
+ # Build quick genre map if available
+ # genres may be JSON-like text; keep as raw string match for simplicity
+ movie_genres = movies.set_index("movieId")["genres"].to_dict()
+ rng = np.random.default_rng(random_state)
+ trip_rows: list[tuple[int, int, int]] = [] # (user_id, pos_movie, neg_movie)
+
+ parts = _list_ratings_parts(processed_dir)
+ if not parts:
+ # Fallback to single-file
+ parts = [f"{processed_dir}/ratings_enriched.parquet"]
+
+    for pth in parts:
+        try:
+            df = pd.read_parquet(pth, storage_options=_storage_options(pth), columns=["user_id", "movieId", "rating"])
+        except FileNotFoundError:
+            # Sequentially probed part files may not exist; skip missing parts
+            continue
+ positives = df[df["rating"] >= 4.0]
+ negatives = df[df["rating"] <= 2.0]
+ if positives.empty or negatives.empty:
+ continue
+ pos_by_user = positives.groupby("user_id")["movieId"].apply(list).to_dict()
+ neg_by_user = negatives.groupby("user_id")["movieId"].apply(list).to_dict()
+ for u, pos_list in pos_by_user.items():
+ neg_list = neg_by_user.get(u)
+ if not neg_list:
+ continue
+ p = rng.choice(pos_list)
+ p_genres = str(movie_genres.get(int(p), ""))
+ candidates = [n for n in neg_list if any(tok in str(movie_genres.get(int(n), "")) for tok in p_genres.split())]
+ if not candidates:
+ candidates = neg_list
+ n = rng.choice(candidates)
+ trip_rows.append((int(u), int(p), int(n)))
+ if user_sample is not None and len(trip_rows) >= user_sample:
+ break
+ if user_sample is not None and len(trip_rows) >= user_sample:
+ break
+
+ df = pd.DataFrame(trip_rows, columns=["user_id", "pos_movie_id", "neg_movie_id"])
+ out_path = (f"{out_dir}/triplets_10k.parquet" if out_is_gcs else str(Path(out_dir) / "triplets_10k.parquet"))
+ df.to_parquet(out_path, storage_options=_storage_options(out_path))
+ print(f"Saved {len(df):,} triplets to {out_path}")
+
+
+if __name__ == "__main__":
+ generate_triplets()
diff --git a/apps/cr-hypervr/pipeline/user_profiles.py b/apps/cr-hypervr/pipeline/user_profiles.py
new file mode 100644
index 00000000..88ce8209
--- /dev/null
+++ b/apps/cr-hypervr/pipeline/user_profiles.py
@@ -0,0 +1,108 @@
+from __future__ import annotations
+
+from pathlib import Path
+import os
+import pandas as pd
+from collections import defaultdict
+from typing import Dict, List
+import glob
+try:
+ import gcsfs # noqa: F401
+except Exception:
+ gcsfs = None
+
+
+def _storage_options(path: str | Path) -> dict | None:
+ p = str(path)
+ return {"token": "cloud"} if p.startswith("gs://") else None
+
+
+def _list_ratings_parts(processed_dir: str) -> List[str]:
+ pat = f"{processed_dir}/ratings_enriched-*.parquet"
+ # Try gcsfs glob first
+ if processed_dir.startswith("gs://"):
+ try:
+ import gcsfs # type: ignore
+
+ fs = gcsfs.GCSFileSystem()
+ matches = sorted(fs.glob(pat))
+ if matches:
+ return [m if m.startswith("gs://") else ("gs://" + m) for m in matches]
+ except Exception:
+ pass
+ else:
+ files = sorted(glob.glob(pat))
+ if files:
+ return files
+ # Fallback: sequential probe up to 200 parts
+ out: List[str] = []
+ for i in range(0, 200):
+ p = f"{processed_dir}/ratings_enriched-{i:05d}.parquet"
+ # Defer existence check to reader; caller will catch FileNotFoundError
+ out.append(p)
+ return out
+
+
+def build_user_profiles(
+ processed_dir: str = os.getenv("GCS_PROCESSED_PREFIX", "data/processed"),
+ out_path: str = os.getenv("GCS_PROFILES_PATH", "data/processed/user_profiles.parquet"),
+ min_ratings: int = 10,
+) -> None:
+ # Stream-friendly aggregation across enriched parts
+ parts = _list_ratings_parts(processed_dir)
+ if not parts:
+ # Fallback to single-file path
+ single = f"{processed_dir}/ratings_enriched.parquet"
+ parts = [single]
+
+ counts: Dict[int, int] = defaultdict(int)
+ pos_titles: Dict[int, List[str]] = defaultdict(list)
+ neg_titles: Dict[int, List[str]] = defaultdict(list)
+
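+ # Cap per-user title lists (50 each) so the streaming aggregation stays bounded in memory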
+ def _cap_append(d: Dict[int, List[str]], k: int, vals: List[str], cap: int = 50) -> None:
+ if not vals:
+ return
+ cur = d[k]
+ room = cap - len(cur)
+ if room <= 0:
+ return
+ cur.extend([v for v in vals[:room] if isinstance(v, str)])
+
+ for p in parts:
+ try:
+ df = pd.read_parquet(p, storage_options=_storage_options(p), columns=["user_id", "rating", "title"])
+ except FileNotFoundError:
+ # Parts from the sequential probe may not exist; skip them (see _list_ratings_parts)
+ continue
+ # counts
+ for uid, n in df.groupby("user_id").size().items():
+ counts[int(uid)] += int(n)
+ # positives
+ pos = df[df["rating"] >= 4.0]
+ if not pos.empty:
+ agg = pos.groupby("user_id")["title"].apply(lambda s: list(s.dropna().astype(str))).to_dict()
+ for uid, titles in agg.items():
+ _cap_append(pos_titles, int(uid), titles)
+ # negatives
+ neg = df[df["rating"] <= 2.0]
+ if not neg.empty:
+ agg = neg.groupby("user_id")["title"].apply(lambda s: list(s.dropna().astype(str))).to_dict()
+ for uid, titles in agg.items():
+ _cap_append(neg_titles, int(uid), titles)
+
+ # Build final DataFrame
+ rows = []
+ for uid, cnt in counts.items():
+ if cnt < min_ratings:
+ continue
+ rows.append(
+ {
+ "user_id": uid,
+ "num_ratings": int(cnt),
+ "liked_titles": ", ".join(pos_titles.get(uid, [])[:50]),
+ "disliked_titles": ", ".join(neg_titles.get(uid, [])[:50]),
+ }
+ )
+ profiles = pd.DataFrame(rows)
+ profiles.to_parquet(out_path, storage_options=_storage_options(out_path), index=False)
+ print(f"User profiles saved to {out_path} ({len(profiles):,} users)")
+
+
+if __name__ == "__main__":
+ build_user_profiles()
diff --git a/apps/cr-hypervr/requirements.txt b/apps/cr-hypervr/requirements.txt
new file mode 100644
index 00000000..ee14c919
--- /dev/null
+++ b/apps/cr-hypervr/requirements.txt
@@ -0,0 +1,17 @@
+fastapi>=0.109.0
+uvicorn[standard]>=0.27.0
+sentence-transformers>=3.0.0
+onnxruntime>=1.16.0
+onnx>=1.14.0
+asyncpg>=0.29.0
+pgvector>=0.2.0
+numpy>=1.26.0
+pydantic>=2.5.0
+pydantic-settings>=2.2.0
+pandas>=2.0.0
+pyarrow>=14.0.0
+tqdm>=4.66.0
+torch>=2.1.0
+python-dotenv>=1.0.0
+httpx>=0.27.0
+requests>=2.31.0
+gcsfs>=2023.6.0
diff --git a/apps/cr-hypervr/scripts/backfill_embeddings_db.py b/apps/cr-hypervr/scripts/backfill_embeddings_db.py
new file mode 100644
index 00000000..8a10191c
--- /dev/null
+++ b/apps/cr-hypervr/scripts/backfill_embeddings_db.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+"""
+Backfill movie embeddings directly from the Cloud SQL database.
+
+Reads rows from `movies` that are missing an entry in `movie_embeddings`,
+builds a simple text representation (title/genres/overview), encodes with the
+service embedder (SentenceTransformer or hash backend), and upserts into DB.
+
+Environment:
+- DATABASE_URL (required)
+- BATCH_SIZE (optional, default 256)
+- LIMIT (optional, limit number of rows for test runs)
+- EMBEDDING_BACKEND (optional, e.g., st|hash|auto)
+
+Intended to run inside the same container image used by the API and Cloud Run
+Jobs, so it has the same dependencies and cached base MiniLM model.
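+
+Example (illustrative values; on Cloud Run Jobs DATABASE_URL is injected from Secret Manager):
+  LIMIT=100 BATCH_SIZE=64 python scripts/backfill_embeddings_db.py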
+"""
+
+import asyncio
+import os
+from typing import Iterable, List, Tuple
+import time
+import requests
+
+import numpy as np
+
+
+def build_movie_text(title: str | None, genres: str | None, overview: str | None) -> str:
+ return (
+ f"Title: {title or ''}\n"
+ f"Genres: {genres or ''}\n"
+ f"Overview: {overview or ''}"
+ )
+
+
+def _fetch_id_token(audience: str) -> str | None:
+ tok = os.getenv("ID_TOKEN")
+ if tok:
+ return tok
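+ # On GCE/Cloud Run, the metadata server mints an ID token for the given audience
+ # (used to call IAM-protected Cloud Run services)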
+ try:
+ resp = requests.get(
+ "http://metadata/computeMetadata/v1/instance/service-accounts/default/identity",
+ params={"audience": audience, "format": "full"},
+ headers={"Metadata-Flavor": "Google"},
+ timeout=3,
+ )
+ if resp.status_code == 200 and resp.text:
+ return resp.text.strip()
+ except Exception:
+ pass
+ return None
+
+
+def _encode(texts: List[str]) -> np.ndarray:
+ service_url = os.getenv("SERVICE_URL")
+ if not service_url:
+ if os.getenv("ALLOW_LOCAL_FALLBACK", "").lower() in ("1", "true", "yes"):
+ from app.services.embedder import get_embedder # type: ignore
+
+ vecs = get_embedder().encode(texts)
+ vecs = vecs.astype(np.float32)
+ norms = np.linalg.norm(vecs, axis=1, keepdims=True)
+ norms[norms == 0] = 1.0
+ return vecs / norms
+ raise SystemExit("SERVICE_URL not set; Cloud Run embedding service required")
+ token = _fetch_id_token(service_url)
+ headers = {"Content-Type": "application/json"}
+ if token:
+ headers["Authorization"] = f"Bearer {token}"
+ sess = requests.Session()
+ out: list[np.ndarray] = []
+ batch = int(os.getenv("BATCH_EMBED_SIZE", "256"))
+ for i in range(0, len(texts), batch):
+ chunk = texts[i : i + batch]
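+ # Retry each chunk up to 4 times with linear backoff on 5xx responses or transport errors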
+ for attempt in range(4):
+ try:
+ r = sess.post(
+ f"{service_url.rstrip('/')}/embed/batch",
+ json={"texts": chunk},
+ headers=headers,
+ timeout=30,
+ )
+ if r.status_code >= 500 and attempt < 3:
+ time.sleep(1.5 * (attempt + 1))
+ continue
+ r.raise_for_status()
+ payload = r.json()
+ vecs = [np.array(item["embedding"], dtype=np.float32) for item in payload]
+ out.extend(vecs)
+ break
+ except Exception:
+ if attempt >= 3:
+ raise
+ time.sleep(1.5 * (attempt + 1))
+ continue
+ arr = np.stack(out, axis=0)
+ norms = np.linalg.norm(arr, axis=1, keepdims=True)
+ norms[norms == 0] = 1.0
+ return (arr / norms).astype(np.float32)
+
+
+def _vec_to_pg(v: Iterable[float]) -> str:
+ return "[" + ",".join(str(float(x)) for x in v) + "]"
+
+
+async def _fetch_missing(conn, limit: int | None) -> List[Tuple[int, str | None, str | None, str | None]]:
+ q = (
+ "SELECT m.movie_id, m.title, m.genres, m.overview "
+ "FROM movies m LEFT JOIN movie_embeddings e USING (movie_id) "
+ "WHERE e.movie_id IS NULL ORDER BY m.movie_id"
+ )
+ if limit and limit > 0:
+ q += " LIMIT $1"
+ rows = await conn.fetch(q, int(limit))
+ else:
+ rows = await conn.fetch(q)
+ return [(int(r["movie_id"]), r["title"], r["genres"], r["overview"]) for r in rows]
+
+
+async def _upsert(conn, mids: List[int], vecs: np.ndarray) -> None:
+ await conn.executemany(
+ (
+ "INSERT INTO movie_embeddings (movie_id, embedding) "
+ "VALUES ($1, $2) ON CONFLICT (movie_id) DO UPDATE SET embedding=EXCLUDED.embedding"
+ ),
+ [(int(mid), _vec_to_pg(vec.tolist())) for mid, vec in zip(mids, vecs)],
+ )
+
+
+async def backfill(database_url: str, batch_size: int = 256, limit: int | None = None) -> int:
+ import asyncpg # lazy import to keep import-time deps light for tests
+
+ conn = await asyncpg.connect(database_url)
+ processed = 0
+ try:
+ pending = await _fetch_missing(conn, limit)
+ if not pending:
+ print("No missing embeddings found.")
+ return 0
+ print(f"Missing embeddings: {len(pending)}")
+ # Process in batches
+ for i in range(0, len(pending), batch_size):
+ batch = pending[i : i + batch_size]
+ mids = [mid for (mid, _t, _g, _o) in batch]
+ texts = [build_movie_text(t, g, o) for (_mid, t, g, o) in batch]
+ vecs = _encode(texts)
+ await _upsert(conn, mids, vecs)
+ processed += len(batch)
+ print(f"Upserted {processed}/{len(pending)} embeddings...")
+ return processed
+ finally:
+ await conn.close()
+
+
+def main() -> int:
+ db_url = os.getenv("DATABASE_URL")
+ if not db_url:
+ print("Set DATABASE_URL", flush=True)
+ return 2
+ batch_size = int(os.getenv("BATCH_SIZE", "256"))
+ limit_env = os.getenv("LIMIT")
+ limit = int(limit_env) if limit_env else None
+ print(f"Starting backfill: batch_size={batch_size}, limit={limit}")
+ processed = asyncio.run(backfill(db_url, batch_size=batch_size, limit=limit))
+ print(f"Backfill complete. Processed: {processed}")
+ return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/apps/cr-hypervr/scripts/build_hyperedges.py b/apps/cr-hypervr/scripts/build_hyperedges.py
new file mode 100644
index 00000000..6e22e5a2
--- /dev/null
+++ b/apps/cr-hypervr/scripts/build_hyperedges.py
@@ -0,0 +1,230 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import os
+from pathlib import Path
+import sys
+import gc
+from collections import defaultdict
+import itertools
+import glob
+import pandas as pd
+
+try:
+ import gcsfs # type: ignore
+except Exception:
+ gcsfs = None
+
+
+def _storage_options(path: str | Path) -> dict | None:
+ p = str(path)
+ return {"token": "cloud"} if p.startswith("gs://") else None
+
+
+def _list_ratings_parts(processed_dir: str) -> list[str]:
+ pat = f"{processed_dir}/ratings_enriched-*.parquet"
+ if str(processed_dir).startswith("gs://"):
+ try:
+ import gcsfs # type: ignore
+ fs = gcsfs.GCSFileSystem()
+ matches = sorted(fs.glob(pat))
+ if matches:
+ return [m if m.startswith("gs://") else ("gs://" + m) for m in matches]
+ except Exception:
+ pass
+ else:
+ files = sorted(glob.glob(pat))
+ if files:
+ return files
+ # Fallback: sequential probe (reader will fail fast if missing)
+ return [f"{processed_dir}/ratings_enriched-{i:05d}.parquet" for i in range(0, 200)]
+
+
+def co_watch_edges(
+ df: pd.DataFrame,
+ min_rating: float = 4.0,
+ max_movies_per_user: int = 20,
+ min_pair_count: int = 3,
+ top_edges_per_movie: int = 50,
+) -> list[tuple[str, int, str, int, float]]:
+ """Compute simple co-watch hyperedges between movies watched positively by same user.
+ Returns list of (src_kind, src_id, dst_kind, dst_id, weight).
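+ Pairs co-watched by fewer than min_pair_count users are dropped, and each movie
+ keeps only its top_edges_per_movie strongest partners.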
+ """
+ pos = df[df["rating"] >= min_rating]
+ by_user = pos.groupby("user_id")["movie_id"].apply(list)
+ counts: dict[tuple[int, int], int] = defaultdict(int)
+ for movies in by_user:
+ uniq = list(sorted(set(int(m) for m in movies)))
+ if len(uniq) > max_movies_per_user:
+ uniq = uniq[:max_movies_per_user]
+ for a, b in itertools.combinations(uniq, 2):
+ counts[(a, b)] += 1
+ filtered = {k: v for k, v in counts.items() if v >= min_pair_count}
+ per_src: dict[int, list[tuple[int, float]]] = defaultdict(list)
+ for (a, b), c in filtered.items():
+ per_src[a].append((b, float(c)))
+ per_src[b].append((a, float(c)))
+ edges: list[tuple[str, int, str, int, float]] = []
+ for src, lst in per_src.items():
+ lst.sort(key=lambda x: x[1], reverse=True)
+ for dst, w in lst[:top_edges_per_movie]:
+ edges.append(("movie", src, "movie", dst, w))
+ return edges
+
+
+def genre_affinity_edges(movies: pd.DataFrame) -> list[tuple[str, int, str, int, float]]:
+ """Create edges from movie->genre tokens for lightweight hypergraph support."""
+ edges: list[tuple[str, int, str, int, float]] = []
+ genre_ids: dict[str, int] = {}
+ next_gid = 1_000_000 # avoid collision with movie ids
+ for row in movies.itertuples(index=False):
+ mid = int(getattr(row, "movieId", getattr(row, "movie_id", 0)))
+ raw_genres = str(getattr(row, "genres", ""))
+ genres = raw_genres.split("|") if "|" in raw_genres else raw_genres.split(",")
+ for g in (tok.strip() for tok in genres if tok.strip()):
+ if g not in genre_ids:
+ genre_ids[g] = next_gid
+ next_gid += 1
+ edges.append(("movie", mid, "genre", genre_ids[g], 1.0))
+ return edges
+
+
+def write_to_db(db_url: str, edges: list[tuple[str, int, str, int, float]]) -> None:
+ import asyncpg, asyncio
+
+ async def run() -> None:
+ conn = await asyncpg.connect(db_url)
+ try:
+ await conn.executemany(
+ """
+ INSERT INTO hyperedges (src_kind, src_id, dst_kind, dst_id, weight)
+ VALUES ($1,$2,$3,$4,$5)
+ """,
+ edges,
+ )
+ finally:
+ await conn.close()
+
+ asyncio.run(run())
+
+
+def main() -> None:
+ # Accept either PROCESSED_PREFIX or legacy GCS_PROCESSED_PREFIX for consistency with other jobs
+ processed = os.getenv("PROCESSED_PREFIX") or os.getenv("GCS_PROCESSED_PREFIX") or "data/processed"
+ ratings_parts = _list_ratings_parts(processed)
+ if not ratings_parts:
+ raise FileNotFoundError("ratings_enriched-*.parquet not found in processed dir")
+ movies_path = f"{processed}/movies_with_descriptions.parquet" if str(processed).startswith("gs://") else Path(processed) / "movies_with_descriptions.parquet"
+ movies = pd.read_parquet(movies_path, storage_options=_storage_options(movies_path))
+
+ # Stream through parts to build co-watch edges without loading all rows in memory
+ min_rating = float(os.getenv("MIN_RATING", "4.0"))
+ max_movies_per_user = int(os.getenv("MAX_MOVIES_PER_USER", "20"))
+ min_pair_count = int(os.getenv("MIN_PAIR_COUNT", "3"))
+ top_edges_per_movie = int(os.getenv("TOP_EDGES_PER_MOVIE", "50"))
+ max_parts = int(os.getenv("MAX_PARTS", "0")) # 0 = all
+ total_rows = 0
+ # Prepare per-part edges output to reduce memory and aid retries
+ parts_dir = f"{processed}/hyperedges_parts" if str(processed).startswith("gs://") else str(Path(processed) / "hyperedges_parts")
+ if not str(parts_dir).startswith("gs://"):
+ Path(parts_dir).mkdir(parents=True, exist_ok=True)
+
+ # Pass 1: generate bounded edges per part and persist
+ for idx, p in enumerate(ratings_parts):
+ if max_parts and idx >= max_parts:
+ break
+ try:
+ part_df = pd.read_parquet(p, storage_options=_storage_options(p)).copy()
+ except FileNotFoundError:
+ continue
+ if "user_id" not in part_df.columns and "userId" in part_df.columns:
+ part_df = part_df.rename(columns={"userId": "user_id"})
+ if "movie_id" not in part_df.columns and "movieId" in part_df.columns:
+ part_df = part_df.rename(columns={"movieId": "movie_id"})
+ total_rows += len(part_df)
+ part_edges = co_watch_edges(
+ part_df,
+ min_rating=min_rating,
+ max_movies_per_user=max_movies_per_user,
+ min_pair_count=min_pair_count,
+ top_edges_per_movie=top_edges_per_movie,
+ )
+ # Persist part edges
+ edf = pd.DataFrame(part_edges, columns=["src_kind", "src_id", "dst_kind", "dst_id", "weight"]) if part_edges else pd.DataFrame(columns=["src_kind", "src_id", "dst_kind", "dst_id", "weight"])
+ outp = f"{parts_dir}/edges_part_{idx:05d}.parquet" if str(parts_dir).startswith("gs://") else str(Path(parts_dir) / f"edges_part_{idx:05d}.parquet")
+ edf.to_parquet(outp, storage_options=_storage_options(outp), index=False)
+ print(f"Wrote edges for part {idx:05d}: {len(edf):,} -> {outp}")
+ sys.stdout.flush()
+ # Free memory between parts
+ del part_df, part_edges, edf
+ gc.collect()
+
+ # Pass 2: aggregate per-part edges into bounded top-K per source
+ from glob import glob as _glob
+ parts_list: list[str]
+ if str(parts_dir).startswith("gs://"):
+ try:
+ import gcsfs # type: ignore
+ fs = gcsfs.GCSFileSystem()
+ parts_list = sorted(fs.glob(f"{parts_dir}/edges_part_*.parquet"))
+ parts_list = [p if p.startswith("gs://") else ("gs://" + p) for p in parts_list]
+ except Exception:
+ parts_list = []
+ else:
+ parts_list = sorted(_glob(str(Path(parts_dir) / "edges_part_*.parquet")))
+
+ per_src_global: dict[int, dict[int, float]] = defaultdict(dict)
+ for j, ep in enumerate(parts_list):
+ try:
+ e = pd.read_parquet(ep, storage_options=_storage_options(ep), columns=["src_id", "dst_id", "weight"]) # type: ignore[arg-type]
+ except Exception:
+ continue
+ for row in e.itertuples(index=False):
+ src = int(getattr(row, "src_id"))
+ dst = int(getattr(row, "dst_id"))
+ w = float(getattr(row, "weight"))
+ d = per_src_global[src]
+ d[dst] = d.get(dst, 0.0) + w
+ # Prune per-src maps to keep bounded size
+ for src, d in list(per_src_global.items()):
+ if len(d) > top_edges_per_movie:
+ top = sorted(d.items(), key=lambda kv: kv[1], reverse=True)[:top_edges_per_movie]
+ per_src_global[src] = dict(top)
+ if (j + 1) % 10 == 0:
+ approx_edges = sum(len(d) for d in per_src_global.values())
+ print(f"Aggregated {j+1}/{len(parts_list)} edge parts, approx edges {approx_edges:,}")
+ sys.stdout.flush()
+
+ # Emit final edges
+ edges: list[tuple[str, int, str, int, float]] = []
+ for src, d in per_src_global.items():
+ for dst, w in d.items():
+ edges.append(("movie", src, "movie", dst, float(w)))
+ edges += genre_affinity_edges(movies)
+
+ # Write optional parquet artifact (local or GCS)
+ out_path = f"{processed}/hyperedges.parquet" if str(processed).startswith("gs://") else str(Path(processed) / "hyperedges.parquet")
+ pd.DataFrame(edges, columns=["src_kind", "src_id", "dst_kind", "dst_id", "weight"]).to_parquet(out_path, storage_options=_storage_options(out_path), index=False)
+ print(f"Hyperedges written to {out_path} ({len(edges):,} rows)")
+
+ db_url = os.getenv("DATABASE_URL")
+ if db_url:
+ # Chunked DB insert to avoid long blocking executemany
+ CHUNK = int(os.getenv("EDGE_DB_CHUNK", "5000"))
+ import asyncpg, asyncio
+ async def run() -> None:
+ conn = await asyncpg.connect(db_url)
+ try:
+ q = "INSERT INTO hyperedges (src_kind, src_id, dst_kind, dst_id, weight) VALUES ($1,$2,$3,$4,$5)"
+ for i in range(0, len(edges), CHUNK):
+ await conn.executemany(q, edges[i:i+CHUNK])
+ print(f"Inserted edges {min(i+CHUNK, len(edges))}/{len(edges)} into DB...")
+ sys.stdout.flush()
+ finally:
+ await conn.close()
+ asyncio.run(run())
+ print("Also inserted into database hyperedges table.")
+ sys.stdout.flush()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/apps/cr-hypervr/scripts/db_apply_cloudsql.sh b/apps/cr-hypervr/scripts/db_apply_cloudsql.sh
new file mode 100644
index 00000000..fedc9639
--- /dev/null
+++ b/apps/cr-hypervr/scripts/db_apply_cloudsql.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SQL_INSTANCE=${SQL_INSTANCE:-embeddings-sql-europe-west2}
+DB_NAME=${DB_NAME:-movies}
+
+echo "Applying pgvector extension as postgres..."
+gcloud sql connect "$SQL_INSTANCE" --user=postgres --database="$DB_NAME" --quiet < db/pgvector.sql
+
+echo "Applying schema as postgres..."
+gcloud sql connect "$SQL_INSTANCE" --user=postgres --database="$DB_NAME" --quiet < db/schema.sql
+
+echo "Schema applied."
+
diff --git a/apps/cr-hypervr/scripts/deploy_cloud_run.sh b/apps/cr-hypervr/scripts/deploy_cloud_run.sh
new file mode 100644
index 00000000..982110dd
--- /dev/null
+++ b/apps/cr-hypervr/scripts/deploy_cloud_run.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Default to repo-local Cloud SDK config to avoid $HOME perms issues
+export CLOUDSDK_CONFIG="${CLOUDSDK_CONFIG:-$(pwd)/.gcloud}"
+mkdir -p "$CLOUDSDK_CONFIG"
+
+SERVICE_NAME=${SERVICE_NAME:-embedding-service}
+PROJECT_ID=${PROJECT_ID:?set PROJECT_ID}
+REGION=${REGION:-europe-west2}
+AR_REPO=${AR_REPO:-embedding-service}
+IMAGE=${IMAGE:-$REGION-docker.pkg.dev/$PROJECT_ID/$AR_REPO/api:latest}
+
+INSTANCE_CONNECTION_NAME=${INSTANCE_CONNECTION_NAME:-$(gcloud sql instances describe embeddings-sql-$REGION --format='value(connectionName)' 2>/dev/null || true)}
+
+gcloud run deploy "$SERVICE_NAME" \
+ --image "$IMAGE" \
+ --project "$PROJECT_ID" \
+ --region "$REGION" \
+ --platform managed \
+ --allow-unauthenticated \
+ --cpu 2 --memory 2Gi --max-instances 10 \
+ --port 8080 \
+ ${INSTANCE_CONNECTION_NAME:+--add-cloudsql-instances "$INSTANCE_CONNECTION_NAME"} \
+ --set-secrets DATABASE_URL=database-url:latest \
+ --set-env-vars ENVIRONMENT=prod,BASE_MODEL_DIR=models/base-minilm${MODEL_GCS_URI:+,MODEL_GCS_URI=${MODEL_GCS_URI}} \
+ ${EXTRA_ARGS:-}
diff --git a/apps/cr-hypervr/scripts/deploy_graph_service.sh b/apps/cr-hypervr/scripts/deploy_graph_service.sh
new file mode 100644
index 00000000..db90dfb5
--- /dev/null
+++ b/apps/cr-hypervr/scripts/deploy_graph_service.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Thin wrapper to deploy a dedicated Cloud Run service focused on graph recommendations.
+# It reuses the same container image but sets a distinct service name and enables
+# graph-related settings by default.
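+#
+# Example (placeholder project id):
+#   PROJECT_ID=my-project REGION=europe-west2 bash scripts/deploy_graph_service.sh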
+
+: "${PROJECT_ID:?set PROJECT_ID}"
+REGION=${REGION:-europe-west2}
+SERVICE_NAME=${SERVICE_NAME:-infra-service}
+AR_REPO=${AR_REPO:-embedding-service}
+MODEL_GCS_URI=${MODEL_GCS_URI:-}
+
+# Default to repo-local Cloud SDK config if not provided
+export CLOUDSDK_CONFIG="${CLOUDSDK_CONFIG:-$(pwd)/.gcloud}"
+mkdir -p "$CLOUDSDK_CONFIG"
+
+EXTRA_ARGS=(
+ --set-env-vars USE_RERANKER=${USE_RERANKER:-false}
+ --set-env-vars USE_GRAPH_SCORER=${USE_GRAPH_SCORER:-true}
+)
+
+if [[ -n "${EXTRA_SET_VARS:-}" ]]; then
+ # Allow callers to pass additional comma-separated vars like KEY=V,FOO=BAR
+ EXTRA_ARGS+=( --set-env-vars "${EXTRA_SET_VARS}" )
+fi
+
+# Allow callers to pass through arbitrary additional flags (e.g., --service-account=...)
+if [[ -n "${EXTRA_FLAGS:-}" ]]; then
+ # Word-split intentionally to support multiple flags
+ # shellcheck disable=SC2206
+ EXTRA_ARGS+=( ${EXTRA_FLAGS} )
+fi
+
+SERVICE_NAME=${SERVICE_NAME} PROJECT_ID=${PROJECT_ID} REGION=${REGION} AR_REPO=${AR_REPO} \
+EXTRA_ARGS="${EXTRA_ARGS[*]}" MODEL_GCS_URI="${MODEL_GCS_URI}" \
+bash "$(dirname "$0")/deploy_cloud_run.sh"
+
+echo "Deployed Cloud Run service: ${SERVICE_NAME} (project=${PROJECT_ID}, region=${REGION})"
diff --git a/apps/cr-hypervr/scripts/deploy_jobs.sh b/apps/cr-hypervr/scripts/deploy_jobs.sh
new file mode 100644
index 00000000..67efdabd
--- /dev/null
+++ b/apps/cr-hypervr/scripts/deploy_jobs.sh
@@ -0,0 +1,193 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Deploy Cloud Run Jobs for data join and phase2 pipeline.
+# Requirements: image pushed to Artifact Registry and service account for jobs.
+# Usage:
+# PROJECT_ID=... REGION=europe-west2 AR_REPO=embedding-service JOBS_SA=embedding-jobs \
+# DATA_PREFIX=gs://<bucket>/data PROCESSED_PREFIX=gs://<bucket>/data/processed \
+# TRIPLETS_PREFIX=gs://<bucket>/triplets PROFILES_PATH=gs://<bucket>/data/processed/user_profiles.parquet \
+# ./scripts/deploy_jobs.sh
+
+PROJECT_ID=${PROJECT_ID:?set PROJECT_ID}
+REGION=${REGION:-europe-west2}
+AR_REPO=${AR_REPO:-embedding-service}
+IMAGE=${IMAGE:-$REGION-docker.pkg.dev/$PROJECT_ID/$AR_REPO/api:latest}
+JOBS_SA=${JOBS_SA:-embedding-jobs@${PROJECT_ID}.iam.gserviceaccount.com}
+INSTANCE_CONNECTION_NAME=${INSTANCE_CONNECTION_NAME:-$(gcloud sql instances describe ${SQL_INSTANCE:-embeddings-sql-${REGION}} --format='value(connectionName)' 2>/dev/null || true)}
+
+# Default to stable bucket envs if explicit prefixes not provided
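+# GCS_DATA_BUCKET / GCS_EMB_BUCKET must already be exported (e.g., via "source scripts/gcloud_env.sh");
+# with set -u the script aborts if they are missing.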
+DATA_PREFIX=${DATA_PREFIX:-${GCS_DATA_BUCKET}/data}
+PROCESSED_PREFIX=${PROCESSED_PREFIX:-${GCS_EMB_BUCKET}/data/processed}
+TRIPLETS_PREFIX=${TRIPLETS_PREFIX:-${GCS_EMB_BUCKET}/triplets}
+PROFILES_PATH=${PROFILES_PATH:-${GCS_EMB_BUCKET}/data/processed/user_profiles.parquet}
+SERVICE_URL=${SERVICE_URL:-$(gcloud run services describe embedding-service --project "$PROJECT_ID" --region="$REGION" --format='value(status.url)' 2>/dev/null || true)}
+
+echo "Image: $IMAGE"
+echo "Jobs SA: $JOBS_SA"
+echo "Cloud SQL: ${INSTANCE_CONNECTION_NAME:-[unset]}"
+echo "Data prefix: ${DATA_PREFIX:-[unset]}"
+echo "Processed: ${PROCESSED_PREFIX:-[unset]}"
+echo "Triplets: ${TRIPLETS_PREFIX:-[unset]}"
+echo "Profiles: ${PROFILES_PATH:-[unset]}"
+echo "Service URL: ${SERVICE_URL:-[unset]}"
+
+common_env=(
+ --set-env-vars PYTHONPATH=/app
+ --set-env-vars GCS_DATA_PREFIX=${DATA_PREFIX}
+ --set-env-vars GCS_PROCESSED_PREFIX=${PROCESSED_PREFIX}
+ --set-env-vars GCS_TRIPLETS_PREFIX=${TRIPLETS_PREFIX}
+ --set-env-vars GCS_PROFILES_PATH=${PROFILES_PATH}
+ --set-env-vars GCS_MODELS_BUCKET=${GCS_MODELS_BUCKET:-}
+ --set-env-vars TRIPLET_USER_SAMPLE=${TRIPLET_USER_SAMPLE:-all}
+)
+
+echo "Deploying job: data-join"
+gcloud run jobs deploy data-join \
+ --image "$IMAGE" \
+ --project "$PROJECT_ID" \
+ --region "$REGION" \
+ --service-account "$JOBS_SA" \
+ --cpu 2 --memory 4Gi \
+ --task-timeout 3600 \
+ --max-retries 1 \
+ --command python \
+ --args scripts/join_datasets.py \
+ "${common_env[@]}"
+
+echo "Deploying job: pipeline-phase2"
+gcloud run jobs deploy pipeline-phase2 \
+ --image "$IMAGE" \
+ --project "$PROJECT_ID" \
+ --region "$REGION" \
+ --service-account "$JOBS_SA" \
+ --cpu 4 --memory 8Gi \
+ --task-timeout 14400 \
+ --max-retries 1 \
+ ${INSTANCE_CONNECTION_NAME:+--set-cloudsql-instances "$INSTANCE_CONNECTION_NAME"} \
+ --set-secrets DATABASE_URL=database-url:latest \
+ --command python \
+ --args scripts/run_pipeline_phase2.py \
+ --set-env-vars PROCESSED_PREFIX=${PROCESSED_PREFIX} \
+ "${common_env[@]}"
+
+echo "Deploying job: pipeline-phase3"
+gcloud run jobs deploy pipeline-phase3 \
+ --image "$IMAGE" \
+ --project "$PROJECT_ID" \
+ --region "$REGION" \
+ --service-account "$JOBS_SA" \
+ --cpu 4 --memory 8Gi \
+ --task-timeout 21600 \
+ --max-retries 1 \
+ --set-env-vars BASE_MODEL_DIR=${BASE_MODEL_DIR:-models/base-minilm} \
+ --set-env-vars OUTPUT_DIR=${OUTPUT_DIR:-models/movie-minilm-v1} \
+ --set-env-vars EPOCHS=${EPOCHS:-1} \
+ --set-env-vars BATCH_SIZE=${BATCH_SIZE:-64} \
+ --set-env-vars RUN_PHASE2_IF_MISSING=${RUN_PHASE2_IF_MISSING:-false} \
+ --command python \
+ --args scripts/run_pipeline_phase3.py \
+ "${common_env[@]}"
+
+echo "Jobs deployed. Use: gcloud run jobs run --region=$REGION --wait"
+
+echo "Deploying job: seed-movies"
+gcloud run jobs deploy seed-movies \
+ --image "$IMAGE" \
+ --project "$PROJECT_ID" \
+ --region "$REGION" \
+ --service-account "$JOBS_SA" \
+ --cpu 2 --memory 2Gi \
+ --task-timeout 3600 \
+ --max-retries 1 \
+ ${INSTANCE_CONNECTION_NAME:+--set-cloudsql-instances "$INSTANCE_CONNECTION_NAME"} \
+ --set-secrets DATABASE_URL=database-url:latest \
+ --set-env-vars PROCESSED_PREFIX=${PROCESSED_PREFIX} \
+ --set-env-vars GCS_PROCESSED_PREFIX=${PROCESSED_PREFIX} \
+ --command python \
+ --args scripts/seed_movies.py
+
+echo "Deploying job: seed-embeddings"
+gcloud run jobs deploy seed-embeddings \
+ --image "$IMAGE" \
+ --project "$PROJECT_ID" \
+ --region "$REGION" \
+ --service-account "$JOBS_SA" \
+ --cpu 4 --memory 8Gi \
+ --task-timeout 14400 \
+ --max-retries 1 \
+ ${INSTANCE_CONNECTION_NAME:+--set-cloudsql-instances "$INSTANCE_CONNECTION_NAME"} \
+ --set-secrets DATABASE_URL=database-url:latest \
+ --set-env-vars PROCESSED_PREFIX=${PROCESSED_PREFIX} \
+ --set-env-vars GCS_PROCESSED_PREFIX=${PROCESSED_PREFIX} \
+ --set-env-vars SERVICE_URL=${SERVICE_URL} \
+ --set-env-vars BATCH_EMBED_SIZE=${BATCH_EMBED_SIZE:-256} \
+ --set-env-vars UPSERT_CHUNK_SIZE=${UPSERT_CHUNK_SIZE:-1000} \
+ --set-env-vars MOVIES_ROW_CHUNK=${MOVIES_ROW_CHUNK:-5000} \
+ --set-env-vars MODEL_DIR=models/base-minilm \
+ --set-env-vars EMBEDDING_BACKEND=st \
+ --command python \
+ --args scripts/seed_embeddings.py
+
+echo "Deploying job: backfill-embeddings-db"
+gcloud run jobs deploy backfill-embeddings-db \
+ --image "$IMAGE" \
+ --project "$PROJECT_ID" \
+ --region "$REGION" \
+ --service-account "$JOBS_SA" \
+ --cpu 2 --memory 4Gi \
+ --task-timeout 14400 \
+ --max-retries 1 \
+ ${INSTANCE_CONNECTION_NAME:+--set-cloudsql-instances "$INSTANCE_CONNECTION_NAME"} \
+ --set-secrets DATABASE_URL=database-url:latest \
+ --set-env-vars SERVICE_URL=${SERVICE_URL} \
+ --set-env-vars EMBEDDING_BACKEND=st \
+ --command python \
+ --args scripts/backfill_embeddings_db.py
+
+echo "Deploying job: validate-triplets"
+gcloud run jobs deploy validate-triplets \
+ --image "$IMAGE" \
+ --project "$PROJECT_ID" \
+ --region "$REGION" \
+ --service-account "$JOBS_SA" \
+ --cpu 1 --memory 1Gi \
+ --task-timeout 1800 \
+ --max-retries 1 \
+ ${INSTANCE_CONNECTION_NAME:+--set-cloudsql-instances "$INSTANCE_CONNECTION_NAME"} \
+ --set-secrets DATABASE_URL=database-url:latest \
+ --set-env-vars GCS_TRIPLETS_PREFIX=${TRIPLETS_PREFIX} \
+ --command python \
+ --args scripts/validate_triplets_coverage.py
+
+echo "Deploying job: validate-hyperedges"
+gcloud run jobs deploy validate-hyperedges \
+ --image "$IMAGE" \
+ --project "$PROJECT_ID" \
+ --region "$REGION" \
+ --service-account "$JOBS_SA" \
+ --cpu 1 --memory 2Gi \
+ --task-timeout 3600 \
+ --max-retries 1 \
+ ${INSTANCE_CONNECTION_NAME:+--set-cloudsql-instances "$INSTANCE_CONNECTION_NAME"} \
+ --set-secrets DATABASE_URL=database-url:latest \
+ --set-env-vars PROCESSED_PREFIX=${PROCESSED_PREFIX} \
+ --set-env-vars GCS_PROCESSED_PREFIX=${PROCESSED_PREFIX} \
+ --command python \
+ --args scripts/validate_hyperedges.py
+
+echo "Deploying job: build-hyperedges"
+gcloud run jobs deploy build-hyperedges \
+ --image "$IMAGE" \
+ --project "$PROJECT_ID" \
+ --region "$REGION" \
+ --service-account "$JOBS_SA" \
+ --cpu 4 --memory 8Gi \
+ --task-timeout 10800 \
+ --max-retries 1 \
+ ${INSTANCE_CONNECTION_NAME:+--set-cloudsql-instances "$INSTANCE_CONNECTION_NAME"} \
+ --set-secrets DATABASE_URL=database-url:latest \
+ --set-env-vars PROCESSED_PREFIX=${PROCESSED_PREFIX} \
+ --set-env-vars GCS_PROCESSED_PREFIX=${PROCESSED_PREFIX} \
+ --command python \
+ --args scripts/build_hyperedges.py
diff --git a/apps/cr-hypervr/scripts/download_minilm.py b/apps/cr-hypervr/scripts/download_minilm.py
new file mode 100644
index 00000000..06cc0853
--- /dev/null
+++ b/apps/cr-hypervr/scripts/download_minilm.py
@@ -0,0 +1,37 @@
+import os
+import sys
+import subprocess
+
+
+def main():
+ target_dir = os.environ.get("BASE_MODEL_DIR", "models/base-minilm")
+ os.makedirs(target_dir, exist_ok=True)
+ # Prefer sentence-transformers quick download path
+ code = subprocess.call(
+ [
+ sys.executable,
+ "-c",
+ "from sentence_transformers import SentenceTransformer; SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2').save('{}')".format(
+ target_dir
+ ),
+ ]
+ )
+ if code != 0:
+ print("Falling back to huggingface-cli download...")
+ code = subprocess.call(
+ [
+ "bash",
+ "-lc",
+ f"huggingface-cli download sentence-transformers/all-MiniLM-L6-v2 --local-dir {target_dir}",
+ ]
+ )
+ if code == 0:
+ print(f"Model downloaded to {target_dir}")
+ else:
+ print("Failed to download model. Ensure git-lfs and huggingface-cli are available.")
+ sys.exit(1)
+
+
+if __name__ == "__main__":
+ main()
+
diff --git a/apps/cr-hypervr/scripts/download_movielens_25m.sh b/apps/cr-hypervr/scripts/download_movielens_25m.sh
new file mode 100644
index 00000000..6b6bede5
--- /dev/null
+++ b/apps/cr-hypervr/scripts/download_movielens_25m.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+PROJECT_ROOT=$(cd "$(dirname "$0")"/.. && pwd)
+mkdir -p "$PROJECT_ROOT/data/movielens"
+
+cd "$PROJECT_ROOT/data/movielens"
+echo "Downloading MovieLens 25M..."
+curl -fL https://files.grouplens.org/datasets/movielens/ml-25m.zip -o ml-25m.zip
+echo "Extracting..."
+unzip -o ml-25m.zip
+echo "MovieLens ready in data/movielens/ml-25m/"
+
diff --git a/apps/cr-hypervr/scripts/download_tmdb_full.py b/apps/cr-hypervr/scripts/download_tmdb_full.py
new file mode 100644
index 00000000..146dad0f
--- /dev/null
+++ b/apps/cr-hypervr/scripts/download_tmdb_full.py
@@ -0,0 +1,33 @@
+import os
+import subprocess
+from pathlib import Path
+
+
+def main():
+ project_root = Path(__file__).parent.parent
+ os.chdir(project_root)
+ os.environ.setdefault("KAGGLE_CONFIG_DIR", str(project_root / ".kaggle"))
+
+ out_dir = project_root / "data/tmdb"
+ out_dir.mkdir(parents=True, exist_ok=True)
+
+ print("Downloading TMDB 2024 dataset via Kaggle...")
+ cmd = [
+ "bash",
+ "-lc",
+ f"pip install -q kaggle && kaggle datasets download -d asaniczka/tmdb-movies-dataset-2023-930k-movies -p {out_dir} --force",
+ ]
+ code = subprocess.call(cmd)
+ if code != 0:
+ raise SystemExit("Kaggle download failed. Ensure KAGGLE_CONFIG_DIR and credentials are set.")
+
+ # Unzip
+ print("Extracting TMDB zip...")
+ subprocess.check_call(["bash", "-lc", f"cd {out_dir} && unzip -o *.zip"]) # extracts TMDB_movie_dataset_v11.csv
+ print("TMDB dataset ready in data/tmdb/")
+
+
+if __name__ == "__main__":
+ main()
+
diff --git a/apps/cr-hypervr/scripts/export_openapi.py b/apps/cr-hypervr/scripts/export_openapi.py
new file mode 100644
index 00000000..fc6d6005
--- /dev/null
+++ b/apps/cr-hypervr/scripts/export_openapi.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
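+# Run with the service root (apps/cr-hypervr) on PYTHONPATH so `app.main` resolves, e.g.:
+#   PYTHONPATH=. python scripts/export_openapi.py --out docs/openapi.json
+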
+import argparse
+import json
+from pathlib import Path
+
+from fastapi.openapi.utils import get_openapi
+from app.main import app
+
+
+def main() -> None:
+ p = argparse.ArgumentParser()
+ p.add_argument("--out", default="docs/openapi.json", help="Output file path")
+ args = p.parse_args()
+ schema = get_openapi(
+ title=app.title,
+ version=app.version,
+ routes=app.routes,
+ description="Movie Embedding Service OpenAPI",
+ )
+ out = Path(args.out)
+ out.parent.mkdir(parents=True, exist_ok=True)
+ out.write_text(json.dumps(schema, indent=2))
+ print(f"Wrote {out}")
+
+
+if __name__ == "__main__":
+ main()
+
diff --git a/apps/cr-hypervr/scripts/gcloud_env.sh b/apps/cr-hypervr/scripts/gcloud_env.sh
new file mode 100644
index 00000000..7384c396
--- /dev/null
+++ b/apps/cr-hypervr/scripts/gcloud_env.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Use a repo-local Cloud SDK config to avoid $HOME permission issues.
+export CLOUDSDK_CONFIG="${CLOUDSDK_CONFIG:-$(pwd)/.gcloud}"
+mkdir -p "$CLOUDSDK_CONFIG"
+
+if [[ -n "${PROJECT_ID:-}" ]]; then
+ gcloud config set core/project "$PROJECT_ID" >/dev/null
+fi
+if [[ -n "${REGION:-}" ]]; then
+ gcloud config set compute/region "$REGION" >/dev/null
+fi
+
+echo "CLOUDSDK_CONFIG=$CLOUDSDK_CONFIG"
+gcloud config list 2>/dev/null || true
+
+# Stable bucket envs (point to existing dated buckets by default; no reupload)
+# Users may export these to override.
+PROJECT_ID=${PROJECT_ID:-$(gcloud config get-value core/project 2>/dev/null)}
+REGION=${REGION:-$(gcloud config get-value compute/region 2>/dev/null)}
+DATE_SUFFIX=${DATE_SUFFIX:-20251207}
+
+export GCS_DATA_BUCKET=${GCS_DATA_BUCKET:-gs://${PROJECT_ID}-${REGION}-datasets-${DATE_SUFFIX}}
+export GCS_MODELS_BUCKET=${GCS_MODELS_BUCKET:-gs://${PROJECT_ID}-${REGION}-models-${DATE_SUFFIX}}
+export GCS_EMB_BUCKET=${GCS_EMB_BUCKET:-gs://${PROJECT_ID}-${REGION}-embeddings-${DATE_SUFFIX}}
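+
+# Note: these exports persist in the calling shell only when this script is sourced
+# (". scripts/gcloud_env.sh"); running it directly just prints the resolved values.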
+
+echo "Buckets:"
+echo " GCS_DATA_BUCKET=$GCS_DATA_BUCKET"
+echo " GCS_MODELS_BUCKET=$GCS_MODELS_BUCKET"
+echo " GCS_EMB_BUCKET=$GCS_EMB_BUCKET"
diff --git a/apps/cr-hypervr/scripts/gcp_log.py b/apps/cr-hypervr/scripts/gcp_log.py
new file mode 100644
index 00000000..cc9f9d4c
--- /dev/null
+++ b/apps/cr-hypervr/scripts/gcp_log.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
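+# Example usage:
+#   python scripts/gcp_log.py --run "gcloud run services list" --purpose "Inventory Cloud Run services"
+#   python scripts/gcp_log.py --purpose "Manual note" --commands "gcloud ..." --result success
+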
+import argparse
+import os
+import subprocess
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+
+def iso_utc_now() -> str:
+ return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+
+def write_log(
+ log_path: Path,
+ *,
+ timestamp: str,
+ executor: str,
+ purpose: str,
+ commands: str,
+ result: str,
+ stdout: str | None = None,
+ stderr: str | None = None,
+ exit_code: int | None = None,
+) -> None:
+ log_path.parent.mkdir(parents=True, exist_ok=True)
+ with log_path.open("a", encoding="utf-8") as f:
+ f.write("- Timestamp (UTC): " + timestamp + "\n")
+ f.write("- Executor: " + executor + "\n")
+ f.write("- Command(s): " + commands.strip() + "\n")
+ f.write("- Purpose: " + purpose.strip() + "\n")
+ if exit_code is not None:
+ f.write(f"- Exit code: {exit_code}\n")
+ f.write("- Result: " + result + "\n")
+ if stdout:
+ f.write("- Stdout (truncated):\n")
+ f.write("```\n")
+ f.write(_truncate(stdout))
+ f.write("\n````\n")
+ if stderr:
+ f.write("- Stderr (truncated):\n")
+ f.write("```\n")
+ f.write(_truncate(stderr))
+ f.write("\n````\n")
+ f.write("\n")
+
+
+def _truncate(s: str, limit: int = 2000) -> str:
+ if len(s) <= limit:
+ return s
+ head = s[: limit - 20]
+ return head + "\n...[truncated]..."
+
+
+def run_and_log(args: argparse.Namespace) -> int:
+ ts = iso_utc_now()
+ log_file = Path(os.getenv("GCP_ACTIVITY_LOG_FILE", "gcp-activity-log.md"))
+ executor = args.executor or os.getenv("GCP_LOG_EXECUTOR", "Agent")
+ purpose = args.purpose or "Unspecified"
+ if args.run:
+ # Execute via the system shell (shell=True) so pipelines and compound commands work
+ cmd = args.run
+ # Show a normalized form in the log for readability
+ display_cmd = cmd
+ try:
+ proc = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+ rc = proc.returncode
+ result = "success" if rc == 0 else "error"
+ write_log(
+ log_file,
+ timestamp=ts,
+ executor=executor,
+ purpose=purpose,
+ commands=display_cmd,
+ result=result,
+ stdout=proc.stdout.strip(),
+ stderr=proc.stderr.strip(),
+ exit_code=rc,
+ )
+ return rc
+ except Exception as e: # pragma: no cover
+ write_log(
+ log_file,
+ timestamp=ts,
+ executor=executor,
+ purpose=purpose,
+ commands=display_cmd,
+ result="error",
+ stdout="",
+ stderr=str(e),
+ exit_code=-1,
+ )
+ return 1
+ else:
+ # Append-only mode (no command execution)
+ result = args.result or "success"
+ commands = args.commands or "n/a"
+ details = args.details or ""
+ write_log(
+ log_file,
+ timestamp=ts,
+ executor=executor,
+ purpose=purpose,
+ commands=commands,
+ result=result,
+ stdout=details,
+ stderr=None,
+ exit_code=None,
+ )
+ return 0
+
+
+def build_parser() -> argparse.ArgumentParser:
+ p = argparse.ArgumentParser(description="Append structured entries to gcp-activity-log.md")
+ mode = p.add_mutually_exclusive_group(required=False)
+ mode.add_argument("--run", help="Shell command to execute and log result")
+ p.add_argument("--executor", help="Executor label (User/Agent/CI)")
+ p.add_argument("--purpose", help="Purpose of the action")
+ # Append-only fields
+ p.add_argument("--commands", help="Commands text when not using --run")
+ p.add_argument("--result", choices=["success", "error"], help="Result when not using --run")
+ p.add_argument("--details", help="Additional details text for append-only mode")
+ return p
+
+
+def main(argv: list[str] | None = None) -> int:
+ parser = build_parser()
+ args = parser.parse_args(argv)
+ return run_and_log(args)
+
+
+if __name__ == "__main__":
+ sys.exit(main())
+
diff --git a/apps/cr-hypervr/scripts/gcp_verify.sh b/apps/cr-hypervr/scripts/gcp_verify.sh
new file mode 100644
index 00000000..c7ad701b
--- /dev/null
+++ b/apps/cr-hypervr/scripts/gcp_verify.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Default to repo-local Cloud SDK config to avoid $HOME perms issues
+export CLOUDSDK_CONFIG="${CLOUDSDK_CONFIG:-$(pwd)/.gcloud}"
+mkdir -p "$CLOUDSDK_CONFIG"
+
+PROJECT_ID=${PROJECT_ID:-$(gcloud config get-value core/project 2>/dev/null)}
+REGION=${REGION:-$(gcloud config get-value compute/region 2>/dev/null)}
+
+echo "Project: ${PROJECT_ID}"
+echo "Region: ${REGION}"
+
+echo "== Enabled services (key set) =="
+for s in run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com sqladmin.googleapis.com secretmanager.googleapis.com vpcaccess.googleapis.com storage.googleapis.com compute.googleapis.com; do
+ printf "%-35s : " "$s"; gcloud services list --enabled --filter="NAME:$s" --format='value(NAME)' || true
+done
+
+echo "== Artifact Registry (europe-west2) =="
+gcloud artifacts repositories list --location=europe-west2 --format='table(name,format,location)'
+
+echo "== Service Accounts =="
+gcloud iam service-accounts list --format='table(displayName,email)'
+
+echo "== Bucket exists? =="
+# Default to new datasets bucket with 20251207 suffix unless BUCKET_NAME provided
+BUCKET_NAME=${BUCKET_NAME:-${PROJECT_ID}-europe-west2-datasets-20251207}
+if gsutil ls -b gs://$BUCKET_NAME >/dev/null 2>&1; then
+ echo "YES: gs://$BUCKET_NAME"
+else
+ echo "NO: (expected: gs://$BUCKET_NAME)"
+fi
+
+echo "== Cloud SQL instance =="
+SQL_INSTANCE=${SQL_INSTANCE:-embeddings-sql-europe-west2}
+gcloud sql instances describe "$SQL_INSTANCE" --format='table(name,region,state,backendType)' || true
+echo "== Databases =="
+gcloud sql databases list --instance="$SQL_INSTANCE" --format='table(name)'
+echo "== Users =="
+gcloud sql users list --instance="$SQL_INSTANCE" --format='table(name,type)'
+
+echo "== Secrets =="
+gcloud secrets list --format='table(name)' | sed -n '1,200p'
diff --git a/apps/cr-hypervr/scripts/join_datasets.py b/apps/cr-hypervr/scripts/join_datasets.py
new file mode 100644
index 00000000..63b5f839
--- /dev/null
+++ b/apps/cr-hypervr/scripts/join_datasets.py
@@ -0,0 +1,108 @@
+from __future__ import annotations
+
+from pathlib import Path
+import os
+import pandas as pd
+
+
+def _storage_options(path: str | Path) -> dict | None:
+ p = str(path)
+ return {"token": "cloud"} if p.startswith("gs://") else None
+
+
+def main():
+ project_root = Path(__file__).parent.parent
+ data_prefix = os.getenv("DATA_PREFIX") or os.getenv("GCS_DATA_PREFIX")
+ processed_prefix = os.getenv("PROCESSED_PREFIX") or os.getenv("GCS_PROCESSED_PREFIX")
+
+ if data_prefix:
+ if str(data_prefix).startswith("gs://"):
+ tmdb_csv = f"{data_prefix}/tmdb/TMDB_movie_dataset_v11.csv"
+ links_csv = f"{data_prefix}/movielens/ml-25m/links.csv"
+ ratings_csv = f"{data_prefix}/movielens/ml-25m/ratings.csv"
+ else:
+ dp = Path(str(data_prefix))
+ tmdb_csv = dp / "tmdb/TMDB_movie_dataset_v11.csv"
+ links_csv = dp / "movielens/ml-25m/links.csv"
+ ratings_csv = dp / "movielens/ml-25m/ratings.csv"
+ else:
+ tmdb_csv = project_root / "data/tmdb/TMDB_movie_dataset_v11.csv"
+ links_csv = project_root / "data/movielens/ml-25m/links.csv"
+ ratings_csv = project_root / "data/movielens/ml-25m/ratings.csv"
+
+ if processed_prefix:
+ out_dir = processed_prefix
+ else:
+ out_dir = project_root / "data/processed"
+ Path(out_dir).mkdir(parents=True, exist_ok=True)
+
+ print("Loading TMDB (filtered columns)...")
+ tmdb = pd.read_csv(
+ tmdb_csv,
+ storage_options=_storage_options(tmdb_csv),
+ usecols=["imdb_id", "status", "overview", "title", "genres", "vote_average", "release_date"],
+ )
+ tmdb = tmdb[tmdb["status"] == "Released"]
+ # Keep only movies with a meaningful overview (more than 10 characters)
+ tmdb = tmdb[tmdb["overview"].notna() & (tmdb["overview"].astype(str).str.len() > 10)]
+ tmdb["imdb_id_clean"] = tmdb["imdb_id"].astype(str).str.replace("tt", "", regex=False)
+ tmdb["imdb_id_clean"] = pd.to_numeric(tmdb["imdb_id_clean"], errors="coerce")
+ tmdb = tmdb.dropna(subset=["imdb_id_clean"]) # keep rows with parsed imdb
+
+ print("Loading MovieLens links (small) and preparing mapping...")
+ links = pd.read_csv(links_csv, storage_options=_storage_options(links_csv))
+ links["imdbId"] = pd.to_numeric(links["imdbId"], errors="coerce")
+ links = links.dropna(subset=["imdbId"]) # keep joinable
+
+ print("Joining TMDB -> MovieLens (movies metadata only)...")
+ movies_joined = pd.merge(
+ tmdb[["imdb_id_clean", "title", "overview", "genres", "vote_average", "release_date"]],
+ links[["movieId", "imdbId"]],
+ left_on="imdb_id_clean",
+ right_on="imdbId",
+ how="inner",
+ )
+
+ keep_cols = ["movieId", "title", "overview", "genres", "vote_average", "release_date"]
+ movies_keep = movies_joined[[c for c in keep_cols if c in movies_joined.columns]].copy()
+
+ # Write movies metadata
+ if isinstance(out_dir, str) and str(out_dir).startswith("gs://"):
+ movies_keep.to_parquet(f"{out_dir}/movies_with_descriptions.parquet", storage_options=_storage_options(out_dir))
+ else:
+ Path(out_dir).mkdir(parents=True, exist_ok=True)
+ Path(out_dir, "triplets").mkdir(exist_ok=True)
+ movies_keep.to_parquet(Path(out_dir) / "movies_with_descriptions.parquet")
+
+ # Stream ratings in chunks to avoid OOM; write partitioned enriched chunks
+ print("Streaming ratings -> enriched parquet parts (chunked)...")
+ part = 0
+ chunksize = int(os.getenv("JOIN_RATINGS_CHUNKSIZE", "1000000"))
+ usecols = ["userId", "movieId", "rating"]
+ reader = pd.read_csv(ratings_csv, storage_options=_storage_options(ratings_csv), usecols=usecols, chunksize=chunksize)
+ total_rows = 0
+ for chunk in reader:
+ total_rows += len(chunk)
+ chunk = chunk[["userId", "movieId", "rating"]]
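+ # Inner join keeps only ratings whose movie has a TMDB description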
+ enriched = chunk.merge(movies_keep[["movieId", "title", "genres"]], on="movieId", how="inner")
+ enriched = enriched.rename(columns={"userId": "user_id"})
+ if isinstance(out_dir, str) and str(out_dir).startswith("gs://"):
+ outp = f"{out_dir}/ratings_enriched-{part:05d}.parquet"
+ enriched.to_parquet(outp, storage_options=_storage_options(outp), index=False)
+ else:
+ outp = Path(out_dir) / f"ratings_enriched-{part:05d}.parquet"
+ enriched.to_parquet(outp, index=False)
+ print(f"Wrote part {part:05d} with {len(enriched):,} rows -> {outp}")
+ part += 1
+
+ print({
+ "tmdb_descriptions": len(tmdb),
+ "movies_matched": len(movies_keep),
+ "ratings_rows_processed": total_rows,
+ "ratings_parts": part,
+ })
+
+
+if __name__ == "__main__":
+ main()
diff --git a/apps/cr-hypervr/scripts/migrate_db.py b/apps/cr-hypervr/scripts/migrate_db.py
new file mode 100644
index 00000000..468120a1
--- /dev/null
+++ b/apps/cr-hypervr/scripts/migrate_db.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import asyncio
+import os
+import sys
+from typing import Optional
+
+import asyncpg
+
+
+SQL_FILES = [
+ "${GCS_EMB_BUCKET}/db/pgvector.sql",
+ "${GCS_EMB_BUCKET}/db/schema.sql",
+]
+
+
+async def run_sql(conn: asyncpg.Connection, sql_text: str) -> None:
+ # asyncpg can execute multiple statements in one call
+ await conn.execute(sql_text)
+
+
+async def load_gcs_text(path: str) -> str:
+ import gcsfs # lazy import
+
+ fs = gcsfs.GCSFileSystem()
+ with fs.open(path, "r") as f:
+ return f.read()
+
+
+async def main() -> int:
+ db_url = os.getenv("DATABASE_URL")
+ gcs_bucket = os.getenv("GCS_EMB_BUCKET")
+ if not db_url or not gcs_bucket:
+ print("Missing env: DATABASE_URL or GCS_EMB_BUCKET", file=sys.stderr)
+ return 2
+
+ # Resolve file paths with env substitution
+ files = [p.replace("${GCS_EMB_BUCKET}", gcs_bucket) for p in SQL_FILES]
+ print("Applying SQL files:", files)
+ conn: Optional[asyncpg.Connection] = None
+ try:
+ conn = await asyncpg.connect(dsn=db_url)
+ for p in files:
+ try:
+ sql_text = await load_gcs_text(p)
+ print(f"-- Executing: {p} ({len(sql_text)} bytes)")
+ await run_sql(conn, sql_text)
+ print(f"OK: {p}")
+ except Exception as e:
+ print(f"ERROR executing {p}: {e}", file=sys.stderr)
+ return 1
+ return 0
+ finally:
+ if conn:
+ await conn.close()
+
+
+if __name__ == "__main__":
+ sys.exit(asyncio.run(main()))
+
diff --git a/apps/cr-hypervr/scripts/provision_core.sh b/apps/cr-hypervr/scripts/provision_core.sh
new file mode 100644
index 00000000..e74cadb7
--- /dev/null
+++ b/apps/cr-hypervr/scripts/provision_core.sh
@@ -0,0 +1,126 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Default to repo-local Cloud SDK config to avoid $HOME perms issues
+export CLOUDSDK_CONFIG="${CLOUDSDK_CONFIG:-$(pwd)/.gcloud}"
+mkdir -p "$CLOUDSDK_CONFIG"
+
+# Creates core infra: bucket, service accounts, Cloud SQL instance/db/user, database-url secret.
+# Usage:
+# PROJECT_ID=... REGION=europe-west2 BUCKET_NAME=... SQL_INSTANCE=... DB_NAME=movies DB_USER=app_user ./scripts/provision_core.sh
+# Optional:
+# DB_PASSWORD (auto-generated if empty), AR_REPO (defaults embedding-service), RUNTIME_SA, JOBS_SA
+
+PROJECT_ID=${PROJECT_ID:-$(gcloud config get-value core/project 2>/dev/null)}
+REGION=${REGION:-$(gcloud config get-value compute/region 2>/dev/null)}
+AR_REPO=${AR_REPO:-embedding-service}
+BUCKET_NAME=${BUCKET_NAME:-${PROJECT_ID}-${REGION}-embeddings}
+SQL_INSTANCE=${SQL_INSTANCE:-embeddings-sql-${REGION}}
+DB_NAME=${DB_NAME:-movies}
+DB_USER=${DB_USER:-app_user}
+DB_PASSWORD=${DB_PASSWORD:-}
+RUNTIME_SA_NAME=${RUNTIME_SA:-embedding-service}
+JOBS_SA_NAME=${JOBS_SA:-embedding-jobs}
+
+if [[ -z "$PROJECT_ID" || -z "$REGION" ]]; then
+ echo "PROJECT_ID/REGION not set. Set env vars or run 'gcloud config set project/region'." >&2
+ exit 1
+fi
+
+RUNTIME_SA_EMAIL="${RUNTIME_SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com"
+JOBS_SA_EMAIL="${JOBS_SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com"
+
+echo "== Summary =="
+echo "Project: $PROJECT_ID"
+echo "Region: $REGION"
+echo "AR Repo: $AR_REPO"
+echo "Bucket: $BUCKET_NAME"
+echo "SQL Instance: $SQL_INSTANCE"
+echo "DB: $DB_NAME"
+echo "DB User: $DB_USER"
+echo "Runtime SA: $RUNTIME_SA_EMAIL"
+echo "Jobs SA: $JOBS_SA_EMAIL"
+
+if [[ -z "$DB_PASSWORD" ]]; then
+ if command -v openssl >/dev/null 2>&1; then
+ DB_PASSWORD=$(openssl rand -base64 20 | tr -d '=+' | cut -c1-20)
+ else
+ DB_PASSWORD=$(head -c 24 /dev/urandom | base64 | tr -d '=+' | cut -c1-20)
+ fi
+ echo "Generated DB password (not printed)."
+fi
+
+echo "== Creating service accounts (idempotent) =="
+gcloud iam service-accounts describe "$RUNTIME_SA_EMAIL" >/dev/null 2>&1 || \
+ gcloud iam service-accounts create "$RUNTIME_SA_NAME" --display-name="Embedding Service Runtime"
+gcloud iam service-accounts describe "$JOBS_SA_EMAIL" >/dev/null 2>&1 || \
+ gcloud iam service-accounts create "$JOBS_SA_NAME" --display-name="Embedding Jobs"
+
+echo "== Granting roles =="
+gcloud projects add-iam-policy-binding "$PROJECT_ID" \
+ --member="serviceAccount:$RUNTIME_SA_EMAIL" --role="roles/cloudsql.client" --quiet >/dev/null
+gcloud projects add-iam-policy-binding "$PROJECT_ID" \
+ --member="serviceAccount:$RUNTIME_SA_EMAIL" --role="roles/secretmanager.secretAccessor" --quiet >/dev/null
+gcloud projects add-iam-policy-binding "$PROJECT_ID" \
+ --member="serviceAccount:$RUNTIME_SA_EMAIL" --role="roles/artifactregistry.reader" --quiet >/dev/null
+
+gcloud projects add-iam-policy-binding "$PROJECT_ID" \
+ --member="serviceAccount:$JOBS_SA_EMAIL" --role="roles/cloudsql.client" --quiet >/dev/null
+gcloud projects add-iam-policy-binding "$PROJECT_ID" \
+ --member="serviceAccount:$JOBS_SA_EMAIL" --role="roles/secretmanager.secretAccessor" --quiet >/dev/null
+gcloud projects add-iam-policy-binding "$PROJECT_ID" \
+ --member="serviceAccount:$JOBS_SA_EMAIL" --role="roles/storage.admin" --quiet >/dev/null
+
+echo "== Creating GCS bucket (idempotent) =="
+if gsutil ls -b "gs://$BUCKET_NAME" >/dev/null 2>&1; then
+ echo "Bucket exists: gs://$BUCKET_NAME"
+else
+ gsutil mb -l "$REGION" "gs://$BUCKET_NAME"
+fi
+gsutil -q cp /dev/null "gs://$BUCKET_NAME/models/.keep" || true
+gsutil -q cp /dev/null "gs://$BUCKET_NAME/data/netflix/.keep" || true
+gsutil -q cp /dev/null "gs://$BUCKET_NAME/data/tmdb/.keep" || true
+gsutil -q cp /dev/null "gs://$BUCKET_NAME/data/processed/.keep" || true
+gsutil -q cp /dev/null "gs://$BUCKET_NAME/embeddings/.keep" || true
+
+echo "== Creating Cloud SQL instance/database/user (idempotent) =="
+if gcloud sql instances describe "$SQL_INSTANCE" --project "$PROJECT_ID" >/dev/null 2>&1; then
+ echo "SQL instance exists: $SQL_INSTANCE"
+else
+ gcloud sql instances create "$SQL_INSTANCE" \
+ --database-version=POSTGRES_15 --cpu=2 --memory=7680MB \
+ --region="$REGION" --availability-type=ZONAL --quiet
+fi
+
+if gcloud sql databases describe "$DB_NAME" --instance "$SQL_INSTANCE" >/dev/null 2>&1; then
+ echo "Database exists: $DB_NAME"
+else
+ gcloud sql databases create "$DB_NAME" --instance "$SQL_INSTANCE" --quiet
+fi
+
+if gcloud sql users list --instance "$SQL_INSTANCE" --format="value(name)" | grep -qx "$DB_USER"; then
+ echo "User exists: $DB_USER (updating password)"
+ gcloud sql users set-password "$DB_USER" --instance "$SQL_INSTANCE" --password "$DB_PASSWORD" --quiet
+else
+ gcloud sql users create "$DB_USER" --instance "$SQL_INSTANCE" --password "$DB_PASSWORD" --quiet
+fi
+
+INSTANCE_CONNECTION_NAME=$(gcloud sql instances describe "$SQL_INSTANCE" --format='value(connectionName)')
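+# Unix-socket DSN used by Cloud Run when the instance is attached via --add-cloudsql-instances / --set-cloudsql-instances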
+DB_URL="postgresql://$DB_USER:$DB_PASSWORD@/$DB_NAME?host=/cloudsql/$INSTANCE_CONNECTION_NAME"
+
+echo "== Writing database-url secret (idempotent) =="
+if gcloud secrets describe database-url >/dev/null 2>&1; then
+ echo -n "$DB_URL" | gcloud secrets versions add database-url --data-file=- >/dev/null
+else
+ gcloud secrets create database-url --replication-policy=automatic >/dev/null
+ echo -n "$DB_URL" | gcloud secrets versions add database-url --data-file=- >/dev/null
+fi
+
+echo "== Summary Outputs =="
+echo "BUCKET_NAME=$BUCKET_NAME"
+echo "SQL_INSTANCE=$SQL_INSTANCE"
+echo "INSTANCE_CONNECTION_NAME=$INSTANCE_CONNECTION_NAME"
+echo "DB_NAME=$DB_NAME"
+echo "DB_USER=$DB_USER"
+echo "DB_PASSWORD=[REDACTED]"
+echo "DATABASE_URL stored in Secret Manager: database-url (latest)"
diff --git a/apps/cr-hypervr/scripts/run_pipeline_phase2.py b/apps/cr-hypervr/scripts/run_pipeline_phase2.py
new file mode 100644
index 00000000..c54a99cd
--- /dev/null
+++ b/apps/cr-hypervr/scripts/run_pipeline_phase2.py
@@ -0,0 +1,45 @@
+from pipeline.user_profiles import build_user_profiles
+from pipeline.triplets import generate_triplets
+import subprocess
+import sys
+import os
+
+
+def main():
+ # Assume data-join job produced movies_with_descriptions + ratings_enriched-*.parquet
+ # If not present, attempt a join on a small chunk by setting a tiny chunksize to reduce memory.
+ processed = os.getenv("GCS_PROCESSED_PREFIX", "data/processed")
+ data_prefix = os.getenv("GCS_DATA_PREFIX")
+ # Heuristic: if movies parquet is missing, run join (chunked)
+ need_join = False
+ import pandas as pd
+ from pathlib import Path
+ movies_path = f"{processed}/movies_with_descriptions.parquet" if str(processed).startswith("gs://") else Path(processed) / "movies_with_descriptions.parquet"
+ try:
+ pd.read_parquet(movies_path, storage_options={"token": "cloud"} if str(movies_path).startswith("gs://") else None)
+ except Exception:
+ need_join = True
+ if need_join:
+ os.environ.setdefault("JOIN_RATINGS_CHUNKSIZE", "250000")
+ subprocess.check_call([sys.executable, "scripts/join_datasets.py"]) # chunked join
+
+ profiles_path = os.getenv(
+ "GCS_PROFILES_PATH",
+ (processed + "/user_profiles.parquet") if str(processed).startswith("gs://") else "data/processed/user_profiles.parquet",
+ )
+ min_ratings = int(os.getenv("MIN_RATINGS", "10"))
+ build_user_profiles(processed_dir=processed, out_path=profiles_path, min_ratings=min_ratings)
+ triplets_out = os.getenv("GCS_TRIPLETS_PREFIX", "data/processed/triplets")
+ # Allow overriding triplet sample size. Set TRIPLET_USER_SAMPLE=all for full dataset.
+ samp_env = os.getenv("TRIPLET_USER_SAMPLE", "10000")
+ user_sample = None if str(samp_env).lower() in ("all", "none", "0", "-1") else int(samp_env)
+ generate_triplets(processed_dir=processed, out_dir=triplets_out, user_sample=user_sample)
+
+ # Always build and validate hyperedges at the end of Phase 2
+ os.environ.setdefault("PROCESSED_PREFIX", processed)
+ subprocess.check_call([sys.executable, "scripts/build_hyperedges.py"]) # writes parquet (+optional DB insert)
+ subprocess.check_call([sys.executable, "scripts/validate_hyperedges.py"]) # exits non-zero on mismatch
+
+
+if __name__ == "__main__":
+ main()
diff --git a/apps/cr-hypervr/scripts/run_pipeline_phase3.py b/apps/cr-hypervr/scripts/run_pipeline_phase3.py
new file mode 100644
index 00000000..ec5e1287
--- /dev/null
+++ b/apps/cr-hypervr/scripts/run_pipeline_phase3.py
@@ -0,0 +1,164 @@
+from __future__ import annotations
+
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+
+def _is_gcs(p: str) -> bool:
+ return str(p).startswith("gs://")
+
+
+def _parquet_exists(path: str) -> bool:
+ try:
+ import pandas as pd # noqa: F401
+ # We rely on fsspec to resolve gs://. Just try a metadata read.
+ # Using pyarrow, this will error fast if missing.
+ pd.read_parquet(path, columns=[], engine="pyarrow") # type: ignore[arg-type]
+ return True
+ except Exception:
+ return False
+
+
+def _require_phase2_outputs(processed: str) -> None:
+ # Accept either name for the movies parquet (pipeline writes the first)
+ movies_candidates = [
+ f"{processed}/movies_with_descriptions.parquet",
+ f"{processed}/movies_enriched.parquet",
+ ]
+ profiles = f"{processed}/user_profiles.parquet"
+ triplets = f"{processed}/triplets/triplets_10k.parquet"
+
+ has_movies = any(_parquet_exists(m) for m in movies_candidates)
+ has_profiles = _parquet_exists(profiles)
+ has_triplets = _parquet_exists(triplets)
+
+ if has_movies and has_profiles and has_triplets:
+ return
+
+ # Optionally run Phase 2 to produce missing outputs
+ if os.getenv("RUN_PHASE2_IF_MISSING", "").lower() in ("1", "true", "yes"):
+ print("Phase 2 outputs missing — invoking scripts/run_pipeline_phase2.py ...")
+ env = os.environ.copy()
+ # Respect GCS_* envs if user set them
+ subprocess.check_call(["python", "scripts/run_pipeline_phase2.py"], env=env)
+ # Re-check
+ has_movies = any(_parquet_exists(m) for m in movies_candidates)
+ has_profiles = _parquet_exists(profiles)
+ has_triplets = _parquet_exists(triplets)
+ if has_movies and has_profiles and has_triplets:
+ return
+
+ missing = []
+ if not has_movies:
+ missing.append("movies_with_descriptions.parquet")
+ if not has_profiles:
+ missing.append("user_profiles.parquet")
+ if not has_triplets:
+ missing.append("triplets/triplets_10k.parquet")
+ raise SystemExit(
+ "Phase 3 requires Phase 2 outputs. Missing: " + ", ".join(missing)
+ )
+
+
+def main() -> None:
+ # Locations
+ processed = (
+ os.getenv("GCS_PROCESSED_PREFIX")
+ or os.getenv("PROCESSED_DIR")
+ or os.getenv("PROCESSED_PREFIX")
+ or "data/processed"
+ )
+ base_model_dir = os.getenv("BASE_MODEL_DIR", "models/base-minilm")
+ output_dir = os.getenv("OUTPUT_DIR", "models/movie-minilm-v1")
+
+ # Validate inputs; if using local FS, ensure directories exist
+ if not _is_gcs(processed):
+ Path(processed).mkdir(parents=True, exist_ok=True)
+ if not _is_gcs(base_model_dir):
+ Path(base_model_dir).mkdir(parents=True, exist_ok=True)
+ Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+ # Ensure Phase 2 outputs are present (or produce them if allowed)
+ _require_phase2_outputs(processed)
+
+ # Training config
+ epochs = os.getenv("EPOCHS", "1")
+ batch_size = os.getenv("BATCH_SIZE", "64")
+ use_triplet = os.getenv("USE_TRIPLET", os.getenv("USE_TRIPLET_LOSS", "0"))
+
+ env = os.environ.copy()
+ env.update(
+ {
+ "BASE_MODEL_DIR": base_model_dir,
+ "PROCESSED_DIR": processed,
+ "OUTPUT_DIR": output_dir,
+ "EPOCHS": str(epochs),
+ "BATCH_SIZE": str(batch_size),
+ "USE_TRIPLET": str(use_triplet or "0"),
+ }
+ )
+
+ if os.getenv("SKIP_TRAIN", "").lower() in ("1", "true", "yes"):
+ print("[Phase 3] Skipping fine-tuning per SKIP_TRAIN")
+ else:
+ print("[Phase 3] Starting fine-tuning…")
+ subprocess.check_call([sys.executable, "training/train_finetune.py"], env=env)
+
+ if os.getenv("SKIP_ONNX_EXPORT", "").lower() not in ("1", "true", "yes"):
+ print("[Phase 3] Exporting ONNX…")
+ subprocess.check_call([sys.executable, "training/onnx_export.py"], env=env)
+ else:
+ print("[Phase 3] Skipping ONNX export per SKIP_ONNX_EXPORT")
+
+ if os.getenv("SKIP_QUANTIZE", "").lower() not in ("1", "true", "yes"):
+ print("[Phase 3] Quantizing ONNX to INT8…")
+ subprocess.check_call([sys.executable, "training/quantize_int8.py"], env=env)
+ else:
+ print("[Phase 3] Skipping quantization per SKIP_QUANTIZE")
+
+ print("[Phase 3] Completed. Artifacts under:")
+ print(f" - output_dir = {output_dir}")
+ print(f" - model.onnx and model-int8.onnx if export/quantize enabled")
+
+ # Optional: upload artifacts to GCS
+ upload_uri = (
+ os.getenv("MODEL_UPLOAD_URI")
+ or os.getenv("GCS_MODEL_UPLOAD_URI")
+ or (
+ f"{os.getenv('GCS_MODELS_BUCKET').rstrip('/')}/models/movie-minilm-v1"
+ if os.getenv("GCS_MODELS_BUCKET")
+ else None
+ )
+ )
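+    # Artifacts are mirrored file-by-file under the upload URI, preserving the
+    # relative layout of output_dir (model weights, tokenizer files, ONNX exports).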
+ if upload_uri and upload_uri.startswith("gs://"):
+ try:
+ import fsspec
+
+            fs = fsspec.filesystem("gcs")
+            print(f"[Phase 3] Uploading artifacts to {upload_uri} …")
+            base = Path(output_dir)
+            prefix = upload_uri.rstrip("/")
+            for local in base.rglob("*"):
+                if local.is_dir():
+                    continue
+                rel = local.relative_to(base).as_posix()
+                # Build the target with plain string joins; PurePosixPath would collapse the "//" in "gs://".
+                tgt = f"{prefix}/{rel}"
+                # GCS has no real directories, so no mkdir is needed before writing objects.
+                with open(local, "rb") as fsrc, fs.open(tgt, "wb") as fdst:  # type: ignore[attr-defined]
+                    fdst.write(fsrc.read())
+ print("[Phase 3] Upload complete.")
+ except Exception as e:
+ print(f"[Phase 3] WARN: Upload to GCS failed: {e}")
+ elif upload_uri:
+ print(f"[Phase 3] WARN: Unsupported upload URI: {upload_uri}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/apps/cr-hypervr/scripts/seed_embeddings.py b/apps/cr-hypervr/scripts/seed_embeddings.py
new file mode 100644
index 00000000..756f11d1
--- /dev/null
+++ b/apps/cr-hypervr/scripts/seed_embeddings.py
@@ -0,0 +1,154 @@
+from __future__ import annotations
+
+import asyncio
+import os
+from pathlib import Path
+import time
+from typing import List
+import numpy as np
+import pandas as pd
+import requests
+
+
+EMBED_DIM = 384
+DEFAULT_BATCH = int(os.getenv("BATCH_EMBED_SIZE", "256"))
+UPSERT_CHUNK = int(os.getenv("UPSERT_CHUNK_SIZE", "1000"))
+DRY_RUN = os.getenv("DRY_RUN", "").lower() in ("1", "true", "yes")
+
+
+def build_movie_text(row: pd.Series) -> str:
+ return f"Title: {row.get('title','')}\nGenres: {row.get('genres','')}\nOverview: {row.get('overview','')}"
+
+
+async def seed_db(database_url: str, movie_ids: list[int], vectors: np.ndarray) -> None:
+ import asyncpg
+
+ conn = await asyncpg.connect(database_url)
+ try:
+ # Upsert movies and embeddings
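+        # Embeddings are sent as pgvector-style text literals (e.g. "[0.12,-0.04,...]"),
+        # assuming movie_embeddings.embedding is a pgvector column that accepts the
+        # text input format on insert.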
+ def _vec_to_pgtext(v: list[float]) -> str:
+ return "[" + ",".join(str(float(x)) for x in v) + "]"
+
+ q = (
+ "INSERT INTO movie_embeddings (movie_id, embedding) "
+ "VALUES ($1, $2) ON CONFLICT (movie_id) DO UPDATE SET embedding=EXCLUDED.embedding"
+ )
+ total = len(movie_ids)
+ for i in range(0, total, UPSERT_CHUNK):
+ mids = movie_ids[i : i + UPSERT_CHUNK]
+ vecs = vectors[i : i + UPSERT_CHUNK]
+ rows = [(int(mid), _vec_to_pgtext(vec.tolist())) for mid, vec in zip(mids, vecs)]
+ await conn.executemany(q, rows)
+ print(f"Upserted {min(i+UPSERT_CHUNK,total)}/{total} embeddings to DB...")
+ finally:
+ await conn.close()
+
+
+def _fetch_id_token(audience: str) -> str | None:
+ tok = os.getenv("ID_TOKEN")
+ if tok:
+ return tok
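+    # Fall back to the GCE/Cloud Run metadata server, which issues an ID token
+    # for the requested audience when a service account is attached.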
+ try:
+ resp = requests.get(
+ "http://metadata/computeMetadata/v1/instance/service-accounts/default/identity",
+ params={"audience": audience, "format": "full"},
+ headers={"Metadata-Flavor": "Google"},
+ timeout=3,
+ )
+ if resp.status_code == 200 and resp.text:
+ return resp.text.strip()
+ except Exception:
+ pass
+ return None
+
+
+def _encode_vectors_via_service(texts: List[str], batch_size: int, timeout: float = 30.0) -> np.ndarray:
+ service_url = os.getenv("SERVICE_URL")
+ if not service_url:
+ if os.getenv("ALLOW_LOCAL_FALLBACK", "").lower() in ("1", "true", "yes"):
+ from app.services.embedder import get_embedder # type: ignore
+
+ return get_embedder().encode(texts)
+ raise SystemExit("SERVICE_URL not set; Cloud Run embedding service required")
+ token = _fetch_id_token(service_url)
+ headers = {"Content-Type": "application/json"}
+ if token:
+ headers["Authorization"] = f"Bearer {token}"
+ sess = requests.Session()
+ out: list[np.ndarray] = []
+ for i in range(0, len(texts), batch_size):
+ chunk = texts[i : i + batch_size]
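+        # Up to 4 attempts per chunk, with linear backoff on 5xx responses or network errors.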
+ for attempt in range(4):
+ try:
+ r = sess.post(
+ f"{service_url.rstrip('/')}/embed/batch",
+ json={"texts": chunk},
+ headers=headers,
+ timeout=timeout,
+ )
+ if r.status_code >= 500 and attempt < 3:
+ time.sleep(1.5 * (attempt + 1))
+ continue
+ r.raise_for_status()
+ payload = r.json()
+ vecs = [np.array(item["embedding"], dtype=np.float32) for item in payload]
+ out.extend(vecs)
+ break
+ except Exception:
+ if attempt >= 3:
+ raise
+ time.sleep(1.5 * (attempt + 1))
+ continue
+ arr = np.stack(out, axis=0)
+ n = np.linalg.norm(arr, axis=1, keepdims=True)
+ n[n == 0] = 1.0
+ return (arr / n).astype(np.float32)
+
+
+def _is_gcs(path: str | Path) -> bool:
+ return str(path).startswith("gs://")
+
+
+def main():
+ processed_env = os.getenv("PROCESSED_PREFIX", "data/processed")
+ if _is_gcs(processed_env):
+ movies_path = f"{processed_env}/movies_with_descriptions.parquet"
+ movies = pd.read_parquet(movies_path, storage_options={"token": "cloud"})
+ else:
+ processed_dir = Path(processed_env)
+ movies_path = processed_dir / "movies_with_descriptions.parquet"
+ if not movies_path.exists():
+ raise FileNotFoundError("movies_with_descriptions.parquet not found. Run Phase 2 first.")
+ movies = pd.read_parquet(movies_path)
+ # Align column names
+ if "movie_id" not in movies.columns and "movieId" in movies.columns:
+ movies = movies.rename(columns={"movieId": "movie_id"})
+
+ # Stream encode + upsert in row chunks to avoid long blocking DB operations
+ db_url = os.getenv("DATABASE_URL")
+ if not DRY_RUN and not db_url:
+ raise SystemExit("DATABASE_URL not set; expected to upsert into movie_embeddings table")
+
+ ROW_CHUNK = int(os.getenv("MOVIES_ROW_CHUNK", "5000"))
+ total = len(movies)
+ print(f"Processing {total} movies in chunks of {ROW_CHUNK}...")
+ processed = 0
+ for start in range(0, total, ROW_CHUNK):
+ end = min(start + ROW_CHUNK, total)
+ chunk = movies.iloc[start:end]
+ texts = chunk.apply(build_movie_text, axis=1).tolist()
+ mids = chunk["movie_id"].astype(int).tolist()
+ print(f"Encoding {len(texts)} movies [{start}:{end}] via service...")
+ vecs = _encode_vectors_via_service(texts, batch_size=DEFAULT_BATCH)
+ if DRY_RUN:
+ print(f"[DRY_RUN] Encoded {len(texts)} embeddings; skipping DB upsert.")
+ else:
+ print("Upserting chunk to DB...")
+ asyncio.run(seed_db(db_url, mids, vecs))
+ processed = end
+ print(f"Progress: {processed}/{total}")
+ print(f"Completed seeding embeddings for {total} movies.")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/apps/cr-hypervr/scripts/seed_movies.py b/apps/cr-hypervr/scripts/seed_movies.py
new file mode 100644
index 00000000..bf8cfcf4
--- /dev/null
+++ b/apps/cr-hypervr/scripts/seed_movies.py
@@ -0,0 +1,76 @@
+from __future__ import annotations
+
+import asyncio
+import os
+from pathlib import Path
+import pandas as pd
+
+
+def _is_gcs(path: str | Path) -> bool:
+ return str(path).startswith("gs://")
+
+
+async def main():
+ db_url = os.getenv("DATABASE_URL")
+ if not db_url:
+ raise SystemExit("Set DATABASE_URL")
+ import asyncpg
+
+ processed_env = os.getenv("PROCESSED_PREFIX", "data/processed")
+ if _is_gcs(processed_env):
+ movies_path = f"{processed_env}/movies_with_descriptions.parquet"
+ df = pd.read_parquet(movies_path, storage_options={"token": "cloud"})
+ else:
+ processed = Path(processed_env)
+ movies_path = processed / "movies_with_descriptions.parquet"
+ if not movies_path.exists():
+ raise FileNotFoundError("data/processed/movies_with_descriptions.parquet not found")
+ df = pd.read_parquet(movies_path)
+ cols = ["movie_id", "title", "genres", "overview", "release_year", "tmdb_id"]
+ # Conform schema: derive release_year from release_date where present
+ if "movieId" in df.columns:
+ df["movie_id"] = df["movieId"].astype(int)
+ if "release_year" not in df.columns:
+ if "release_date" in df.columns:
+ df["release_year"] = pd.to_datetime(df["release_date"], errors="coerce").dt.year
+ else:
+ df["release_year"] = None
+    # Normalize dtypes and nulls for DB insert
+    df["release_year"] = pd.to_numeric(df["release_year"], errors="coerce").astype("Int64")
+    if "tmdbId" in df.columns or "tmdb_id" in df.columns:
+        tmdb_src = df["tmdbId"] if "tmdbId" in df.columns else df["tmdb_id"]
+        df["tmdb_id"] = pd.to_numeric(tmdb_src, errors="coerce").astype("Int64")
+    else:
+        df["tmdb_id"] = None
+    # Ensure string columns exist; df.get() with a scalar default has no .astype()
+    df["title"] = df["title"].astype(str)
+    df["genres"] = df["genres"].astype(str) if "genres" in df.columns else ""
+    df["overview"] = df["overview"].astype(str) if "overview" in df.columns else ""
+ for col in cols:
+ if col not in df.columns:
+ df[col] = None
+
+ # Convert pandas NA to Python None
+ def _py(v):
+ if hasattr(pd, "isna") and pd.isna(v):
+ return None
+ return v
+    rows = [tuple(_py(v) for v in tup) for tup in df[cols].itertuples(index=False, name=None)]
+ conn = await asyncpg.connect(db_url)
+ try:
+ await conn.executemany(
+ """
+ INSERT INTO movies (movie_id, title, genres, overview, release_year, tmdb_id)
+ VALUES ($1,$2,$3,$4,$5,$6)
+ ON CONFLICT (movie_id) DO UPDATE SET
+ title=EXCLUDED.title,
+ genres=EXCLUDED.genres,
+ overview=EXCLUDED.overview,
+ release_year=EXCLUDED.release_year,
+ tmdb_id=EXCLUDED.tmdb_id
+ """,
+ rows,
+ )
+ finally:
+ await conn.close()
+ print(f"Upserted {len(rows)} movies")
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/apps/cr-hypervr/scripts/seed_ratings.py b/apps/cr-hypervr/scripts/seed_ratings.py
new file mode 100644
index 00000000..06d46066
--- /dev/null
+++ b/apps/cr-hypervr/scripts/seed_ratings.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import asyncio
+import os
+from pathlib import Path
+import pandas as pd
+
+
+async def seed(database_url: str, df: pd.DataFrame) -> None:
+ import asyncpg
+
+ conn = await asyncpg.connect(database_url)
+ try:
+ norm = []
+ for row in df[["user_id", "movie_id", "rating", "rated_at"]].itertuples(index=False, name=None):
+ uid, mid, rating, ts = row
+ ts_norm = None
+ if not pd.isna(ts):
+ try:
+ ts_norm = pd.to_datetime(ts).to_pydatetime()
+ except Exception:
+ ts_norm = None
+ norm.append((int(uid), int(mid), float(rating), ts_norm))
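+        # COALESCE keeps the previously stored rated_at when the incoming value is NULL.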
+ await conn.executemany(
+ """
+ INSERT INTO user_ratings (user_id, movie_id, rating, rated_at)
+ VALUES ($1,$2,$3,$4)
+ ON CONFLICT (user_id, movie_id) DO UPDATE SET rating=EXCLUDED.rating, rated_at=COALESCE(EXCLUDED.rated_at, user_ratings.rated_at)
+ """,
+ norm,
+ )
+ finally:
+ await conn.close()
+
+
+def main() -> None:
+ db_url = os.getenv("DATABASE_URL")
+ if not db_url:
+ raise SystemExit("Set DATABASE_URL")
+ processed = Path(os.getenv("PROCESSED_PREFIX", "data/sample-processed"))
+ parts = sorted(processed.glob("ratings_enriched-*.parquet"))
+ if not parts:
+ raise FileNotFoundError(f"No ratings_enriched-*.parquet in {processed}")
+ frames = []
+ for p in parts:
+ df = pd.read_parquet(p)
+ if "userId" in df.columns:
+ df = df.rename(columns={"userId": "user_id"})
+ if "movieId" in df.columns:
+ df = df.rename(columns={"movieId": "movie_id"})
+ df["rated_at"] = pd.NaT
+ frames.append(df[["user_id", "movie_id", "rating", "rated_at"]])
+ merged = pd.concat(frames, ignore_index=True)
+ asyncio.run(seed(db_url, merged))
+ print(f"Upserted {len(merged):,} user_ratings rows")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/apps/cr-hypervr/scripts/setup_gcp.sh b/apps/cr-hypervr/scripts/setup_gcp.sh
new file mode 100644
index 00000000..366292da
--- /dev/null
+++ b/apps/cr-hypervr/scripts/setup_gcp.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+PROJECT_ID=${PROJECT_ID:?set PROJECT_ID}
+REGION=${REGION:-europe-west2}
+
+echo "Setting project and region..."
+gcloud config set project "$PROJECT_ID"
+gcloud config set compute/region "$REGION"
+
+echo "Enabling required services..."
+gcloud services enable \
+ run.googleapis.com \
+ cloudbuild.googleapis.com \
+ artifactregistry.googleapis.com \
+ sqladmin.googleapis.com \
+ secretmanager.googleapis.com \
+ vpcaccess.googleapis.com \
+ storage.googleapis.com \
+ compute.googleapis.com
+
+echo "Setup complete. Create bucket/SQL and deploy per mainPRD.md."
diff --git a/apps/cr-hypervr/scripts/setup_secrets.sh b/apps/cr-hypervr/scripts/setup_secrets.sh
new file mode 100644
index 00000000..ee202b1e
--- /dev/null
+++ b/apps/cr-hypervr/scripts/setup_secrets.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Creates/updates Kaggle (and optionally TMDB) secrets.
+# Usage:
+# KAGGLE_JSON=$HOME/.kaggle/kaggle.json ./scripts/setup_secrets.sh
+# Optional (not required with TMDB Kaggle dataset):
+# TMDB_API_KEY=... ./scripts/setup_secrets.sh
+
+if [[ -n "${TMDB_API_KEY:-}" ]]; then
+ if gcloud secrets describe tmdb-api-key >/dev/null 2>&1; then
+ echo -n "$TMDB_API_KEY" | gcloud secrets versions add tmdb-api-key --data-file=- >/dev/null
+ else
+ gcloud secrets create tmdb-api-key >/dev/null
+ echo -n "$TMDB_API_KEY" | gcloud secrets versions add tmdb-api-key --data-file=- >/dev/null
+ fi
+ echo "Updated secret: tmdb-api-key"
+else
+ echo "TMDB_API_KEY not set; skipping (not required if using TMDB Kaggle dataset)."
+fi
+
+if [[ -n "${KAGGLE_JSON:-}" ]]; then
+ if [[ ! -f "$KAGGLE_JSON" ]]; then
+ echo "KAGGLE_JSON path does not exist: $KAGGLE_JSON" >&2
+ exit 1
+ fi
+ if gcloud secrets describe kaggle-credentials >/dev/null 2>&1; then
+ gcloud secrets versions add kaggle-credentials --data-file="$KAGGLE_JSON" >/dev/null
+ else
+ gcloud secrets create kaggle-credentials >/dev/null
+ gcloud secrets versions add kaggle-credentials --data-file="$KAGGLE_JSON" >/dev/null
+ fi
+ echo "Updated secret: kaggle-credentials"
+else
+ echo "KAGGLE_JSON not set; skipping."
+fi
diff --git a/apps/cr-hypervr/scripts/upload_gcs_assets.sh b/apps/cr-hypervr/scripts/upload_gcs_assets.sh
new file mode 100644
index 00000000..b5199f87
--- /dev/null
+++ b/apps/cr-hypervr/scripts/upload_gcs_assets.sh
@@ -0,0 +1,112 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Robust uploader with verification and detailed logs.
+# Usage (pass explicit buckets; defaults still point to ${PROJECT_ID}-${REGION}-datasets/models):
+# PROJECT_ID=agentics-foundation25lon-1809 REGION=europe-west2 \
+# DATA_BUCKET=gs://agentics-foundation25lon-1809-europe-west2-datasets-20251207 \
+# MODEL_BUCKET=gs://agentics-foundation25lon-1809-europe-west2-models-20251207 \
+# bash scripts/upload_gcs_assets.sh
+
+ts() { date -u +"%Y-%m-%dT%H:%M:%SZ"; }
+
+PROJECT_ID=${PROJECT_ID:-agentics-foundation25lon-1809}
+REGION=${REGION:-europe-west2}
+DATA_BUCKET=${DATA_BUCKET:-gs://${PROJECT_ID}-${REGION}-datasets}
+MODEL_BUCKET=${MODEL_BUCKET:-gs://${PROJECT_ID}-${REGION}-models}
+
+GSDBG=${GSDBG:-0}
+GSFLAGS=(-m)
+if [[ "$GSDBG" == "1" ]]; then
+ GSFLAGS=(-m -D)
+fi
+
+echo "[$(ts)] Start upload to GCS"
+echo "Project: $PROJECT_ID"
+echo "Region: $REGION"
+echo "Data bkt: $DATA_BUCKET"
+echo "Model bkt: $MODEL_BUCKET"
+
+ensure_bucket() {
+ local B=$1
+ if gsutil ls -b "$B" >/dev/null 2>&1; then
+ echo "[$(ts)] Bucket exists: $B"
+ else
+ echo "[$(ts)] Creating bucket: $B"
+ gsutil mb -l "$REGION" "$B"
+ fi
+}
+
+ensure_bucket "$DATA_BUCKET"
+ensure_bucket "$MODEL_BUCKET"
+
+local_count_size() {
+ local P=$1
+ local cnt size
+ cnt=$(find "$P" -type f | wc -l | tr -d ' ')
+ size=$(du -sk "$P" | awk '{print $1}') # KiB
+ echo "$cnt files, ${size}KiB"
+}
+
+remote_count_size() {
+ local U=$1
+ local cnt size
+ cnt=$(gsutil ls -r "$U" 2>/dev/null | grep -v '/$' | wc -l | tr -d ' ')
+ size=$(gsutil du -s "$U" 2>/dev/null | awk '{print $1}')
+ echo "$cnt objects, ${size}B"
+}
+
+upload_dir() {
+ local SRC=$1 DST=$2
+ echo "[$(ts)] Uploading directory: $SRC -> $DST"
+ echo " Local: $(local_count_size "$SRC")"
+ gsutil "${GSFLAGS[@]}" rsync -r -c "$SRC" "$DST"
+ echo " Remote: $(remote_count_size "$DST")"
+}
+
+upload_file_verify() {
+ local SRC=$1 DST=$2 # DST ends with / or object path
+ local base=$(basename "$SRC")
+ local OBJ=$DST
+ if [[ "$DST" =~ /$ ]]; then OBJ="${DST}${base}"; fi
+ echo "[$(ts)] Uploading file: $SRC -> $OBJ"
+ gsutil "${GSFLAGS[@]}" cp -n "$SRC" "$OBJ"
+ # Verify MD5 if available
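+  # gsutil stat reports "Hash (md5)" as a base64-encoded digest, so compare it
+  # against `openssl md5 -binary | base64` of the local file.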
+ if command -v openssl >/dev/null 2>&1; then
+ local lmd5; lmd5=$(openssl md5 -binary "$SRC" | base64 | tr -d '[:space:]')
+ local rmd5; rmd5=$(gsutil stat "$OBJ" | awk -F": " '/Hash \(md5\)/{print $2}' | tr -d '[:space:]')
+ echo " Local MD5: $lmd5"
+ echo " Remote MD5: $rmd5"
+ if [[ -n "$rmd5" && "$lmd5" == "$rmd5" ]]; then
+ echo " Verify: OK"
+ else
+ echo " Verify: MISMATCH or unavailable" >&2
+ return 1
+ fi
+ else
+ echo " MD5 verify skipped (openssl not found)"
+ fi
+}
+
+# Upload TMDB CSV
+if [[ -f data/tmdb/TMDB_movie_dataset_v11.csv ]]; then
+ upload_file_verify data/tmdb/TMDB_movie_dataset_v11.csv "$DATA_BUCKET/data/tmdb/"
+else
+ echo "[$(ts)] WARN: TMDB CSV missing locally; skipping" >&2
+fi
+
+# Upload MovieLens directory
+if [[ -d data/movielens/ml-25m ]]; then
+ upload_dir data/movielens/ml-25m "$DATA_BUCKET/data/movielens/ml-25m/"
+else
+ echo "[$(ts)] WARN: MovieLens directory missing; skipping" >&2
+fi
+
+# Upload base MiniLM model directory
+if [[ -d models/base-minilm ]]; then
+ upload_dir models/base-minilm "$MODEL_BUCKET/models/base-minilm/"
+else
+ echo "[$(ts)] WARN: base-minilm directory missing; skipping" >&2
+fi
+
+echo "[$(ts)] Upload completed"
diff --git a/apps/cr-hypervr/scripts/validate_hyperedges.py b/apps/cr-hypervr/scripts/validate_hyperedges.py
new file mode 100644
index 00000000..8329bfbc
--- /dev/null
+++ b/apps/cr-hypervr/scripts/validate_hyperedges.py
@@ -0,0 +1,180 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import os
+import sys
+from pathlib import Path
+from typing import Iterable, Iterator
+
+import pandas as pd
+
+# Optional heavy deps are imported lazily where possible to keep startup light
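+# Example invocation (placeholder values):
+#   PROCESSED_PREFIX=gs://<bucket>/data/processed DATABASE_URL=postgresql://... \
+#     python scripts/validate_hyperedges.py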
+
+
+def _storage_options(path: str | Path) -> dict | None:
+ p = str(path)
+ return {"token": "cloud"} if p.startswith("gs://") else None
+
+
+async def _ensure_tmp_table(conn) -> None:
+ # Create temp table for set comparison (preserve rows across implicit commits)
+ await conn.execute(
+ """
+ CREATE TEMP TABLE IF NOT EXISTS tmp_edges (
+ src_kind TEXT,
+ src_id BIGINT,
+ dst_kind TEXT,
+ dst_id BIGINT,
+ weight REAL
+ ) ON COMMIT PRESERVE ROWS
+ """
+ )
+ # Helpful index for JOIN/NOT EXISTS performance on large edge sets
+ await conn.execute(
+ "CREATE INDEX IF NOT EXISTS tmp_edges_idx ON tmp_edges(src_kind,src_id,dst_kind,dst_id)"
+ )
+
+
+async def _load_chunk(conn, df: pd.DataFrame) -> int:
+ if df is None or df.empty:
+ return 0
+ part = df[["src_kind", "src_id", "dst_kind", "dst_id", "weight"]].copy()
+ rows = list(
+ zip(
+ part["src_kind"].astype(str),
+ part["src_id"].astype(int),
+ part["dst_kind"].astype(str),
+ part["dst_id"].astype(int),
+ part["weight"].astype(float),
+ )
+ )
+ if rows:
+ await conn.executemany(
+ "INSERT INTO tmp_edges (src_kind, src_id, dst_kind, dst_id, weight) VALUES ($1,$2,$3,$4,$5)",
+ rows,
+ )
+ return len(rows)
+
+
+def _iter_parquet_batches(parquet_path: str | Path, batch_size: int = 200_000) -> Iterator[pd.DataFrame]:
+ """Yield DataFrames with only required columns from a Parquet file.
+
+ Uses pyarrow + fsspec for efficient row-group iteration and low memory use.
+ """
+ import pyarrow.parquet as pq
+ try:
+ import fsspec # provided transitively by gcsfs
+ except Exception: # pragma: no cover
+ fsspec = None # type: ignore
+
+ cols = ["src_kind", "src_id", "dst_kind", "dst_id", "weight"]
+ path_str = str(parquet_path)
+ if path_str.startswith("gs://") and fsspec is not None:
+ with fsspec.open(path_str, "rb") as f: # type: ignore
+ pf = pq.ParquetFile(f)
+ for batch in pf.iter_batches(columns=cols, batch_size=batch_size):
+ yield batch.to_pandas(types_mapper=None)
+ else:
+ pf = pq.ParquetFile(path_str)
+ for batch in pf.iter_batches(columns=cols, batch_size=batch_size):
+ yield batch.to_pandas(types_mapper=None)
+
+
+async def validate(parquet_path: str, database_url: str, weight_tol: float = 1e-6) -> int:
+ import asyncpg
+
+ # Stream parquet into temp table to avoid OOM on large files
+ # Also validates required columns exist in the first batch
+
+ conn = await asyncpg.connect(database_url)
+ try:
+ await _ensure_tmp_table(conn)
+
+ total = 0
+ first = True
+ for df in _iter_parquet_batches(parquet_path):
+ if first:
+ first = False
+ need = {"src_kind", "src_id", "dst_kind", "dst_id", "weight"}
+ if not need.issubset(df.columns):
+ raise RuntimeError(f"Parquet missing columns: {need - set(df.columns)}")
+ total += await _load_chunk(conn, df)
+
+ # Count matches/missing with float-tolerant comparison
+ # IMPORTANT: use EXISTS to avoid overcount when DB has duplicate rows for a given edge
+ q_matched_exists = (
+ "SELECT COUNT(*) FROM tmp_edges t WHERE EXISTS ("
+ " SELECT 1 FROM hyperedges h WHERE h.src_kind=t.src_kind AND h.src_id=t.src_id "
+ " AND h.dst_kind=t.dst_kind AND h.dst_id=t.dst_id AND ABS(h.weight - t.weight) < $1"
+ ")"
+ )
+ q_missing = (
+ "SELECT COUNT(*) FROM tmp_edges t "
+ " WHERE NOT EXISTS ("
+ " SELECT 1 FROM hyperedges h WHERE h.src_kind=t.src_kind AND h.src_id=t.src_id "
+ " AND h.dst_kind=t.dst_kind AND h.dst_id=t.dst_id AND ABS(h.weight - t.weight) < $1"
+ " )"
+ )
+ # Optional: extras present in DB but not in parquet (for debugging)
+ q_extra = (
+ "SELECT COUNT(*) FROM hyperedges h WHERE NOT EXISTS ("
+ " SELECT 1 FROM tmp_edges t WHERE h.src_kind=t.src_kind AND h.src_id=t.src_id "
+ " AND h.dst_kind=t.dst_kind AND h.dst_id=t.dst_id AND ABS(h.weight - t.weight) < $1"
+ ")"
+ )
+ matched = await conn.fetchval(q_matched_exists, weight_tol)
+ missing = await conn.fetchval(q_missing, weight_tol)
+ extra = await conn.fetchval(q_extra, weight_tol)
+ print({
+ "parquet_edges": int(total),
+ "db_matched": int(matched or 0),
+ "db_missing": int(missing or 0),
+ "db_extra": int(extra or 0),
+ })
+ return 0 if int(missing or 0) == 0 and int(matched or 0) == int(total) else 1
+ finally:
+ try:
+ await conn.execute("DROP TABLE IF EXISTS tmp_edges")
+ finally:
+ await conn.close()
+
+
+def main() -> int:
+ # Support both local and GCS-style envs
+ processed = os.getenv("PROCESSED_PREFIX") or os.getenv("GCS_PROCESSED_PREFIX") or "data/processed"
+ parquet_path = (
+ f"{processed}/hyperedges.parquet" if str(processed).startswith("gs://") else str(Path(processed) / "hyperedges.parquet")
+ )
+ db_url = os.getenv("DATABASE_URL")
+ if not db_url:
+ print("Set DATABASE_URL")
+ return 2
+ # Allow weight tolerance override for float comparisons
+ try:
+ weight_tol = float(os.getenv("WEIGHT_TOL", "1e-6"))
+ except ValueError:
+ weight_tol = 1e-6
+ import asyncio
+
+ # Quick existence check for clearer error (local or GCS)
+ pstr = str(parquet_path)
+ if not pstr.startswith("gs://"):
+ if not Path(pstr).exists():
+ print(f"Missing parquet at {pstr}")
+ return 3
+ else:
+ try:
+ import gcsfs # type: ignore
+ fs = gcsfs.GCSFileSystem(token="cloud")
+ if not fs.exists(pstr):
+ print(f"Missing parquet at {pstr}")
+ return 3
+ except Exception as e: # pragma: no cover
+ # If we cannot check existence, proceed and let the reader raise an error
+ print(f"Warning: couldn't verify GCS path existence for {pstr}: {e}")
+
+ return asyncio.run(validate(parquet_path, db_url, weight_tol=weight_tol))
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/apps/cr-hypervr/scripts/validate_triplets_coverage.py b/apps/cr-hypervr/scripts/validate_triplets_coverage.py
new file mode 100644
index 00000000..922f17bc
--- /dev/null
+++ b/apps/cr-hypervr/scripts/validate_triplets_coverage.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import asyncio
+import os
+from pathlib import Path
+from typing import Iterable, Set
+
+import pandas as pd
+
+
+def _storage_options(path: str | Path) -> dict | None:
+ p = str(path)
+ return {"token": "cloud"} if p.startswith("gs://") else None
+
+
+async def _fetch_existing(conn, movie_ids: Iterable[int]) -> tuple[Set[int], Set[int]]:
+ mids = list(set(int(x) for x in movie_ids))
+ if not mids:
+ return set(), set()
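+    # asyncpg binds the Python list directly to int[]; ANY($1::int[]) avoids building a huge IN clause.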
+ rows1 = await conn.fetch("SELECT movie_id FROM movies WHERE movie_id = ANY($1::int[])", mids)
+ rows2 = await conn.fetch("SELECT movie_id FROM movie_embeddings WHERE movie_id = ANY($1::int[])", mids)
+ have_movies = {int(r["movie_id"]) for r in rows1}
+ have_embs = {int(r["movie_id"]) for r in rows2}
+ return have_movies, have_embs
+
+
+async def main() -> int:
+ db_url = os.getenv("DATABASE_URL")
+ triplets_dir = os.getenv("GCS_TRIPLETS_PREFIX", "data/processed/triplets")
+ if not db_url:
+ print("Set DATABASE_URL")
+ return 2
+ trip_path = f"{triplets_dir}/triplets_10k.parquet"
+ df = pd.read_parquet(trip_path, storage_options=_storage_options(trip_path))
+ needed: Set[int] = set(map(int, df["pos_movie_id"].tolist())) | set(map(int, df["neg_movie_id"].tolist()))
+
+ import asyncpg # lazy import
+
+ conn = await asyncpg.connect(db_url)
+ try:
+ have_movies, have_embs = await _fetch_existing(conn, needed)
+ finally:
+ await conn.close()
+
+ missing_movies = needed - have_movies
+ missing_embs = needed - have_embs
+ print(f"Triplets movies referenced: {len(needed):,}")
+ print(f"Present in movies table: {len(have_movies):,} (missing {len(missing_movies):,})")
+ print(f"With embeddings present: {len(have_embs):,} (missing {len(missing_embs):,})")
+ if missing_movies:
+ print(f"Missing in movies table (sample): {sorted(list(missing_movies))[:10]}")
+ if missing_embs:
+ print(f"Missing embeddings (sample): {sorted(list(missing_embs))[:10]}")
+ # Non-zero exit if any gaps
+ return 0 if not missing_movies and not missing_embs else 1
+
+
+if __name__ == "__main__":
+ raise SystemExit(asyncio.run(main()))
+
diff --git a/apps/cr-hypervr/scripts/verify_gcp_access.py b/apps/cr-hypervr/scripts/verify_gcp_access.py
new file mode 100644
index 00000000..13b50d54
--- /dev/null
+++ b/apps/cr-hypervr/scripts/verify_gcp_access.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+"""
+Lightweight GCP access verifier used by Makefile target `gcp-verify-py`.
+
+Checks (non-destructive):
+- Active gcloud account
+- Project and region config
+- Cloud Run API access
+- Cloud Storage bucket reachability (optional)
+- Cloud SQL instance visibility (optional)
+- Secret Manager listing
+
+Environment:
+- CLOUDSDK_CONFIG respected (set to a repo-local path to avoid $HOME perms issues)
+- Optional: GCP_PROJECT_ID, GCP_REGION, GCP_BUCKET, GCP_SQL_INSTANCE
+"""
+from __future__ import annotations
+
+import os
+import subprocess
+import sys
+from dataclasses import dataclass
+
+
+def run(cmd: str) -> tuple[int, str, str]:
+ p = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+ return p.returncode, p.stdout.strip(), p.stderr.strip()
+
+
+@dataclass
+class Ctx:
+ project: str | None
+ region: str | None
+ bucket: str | None
+ sql_instance: str | None
+ cloudsdk_config: str | None
+
+
+def ctx_from_env() -> Ctx:
+ return Ctx(
+ project=os.getenv("GCP_PROJECT_ID") or os.getenv("PROJECT_ID"),
+ region=os.getenv("GCP_REGION") or os.getenv("REGION"),
+ bucket=os.getenv("GCP_BUCKET"),
+ sql_instance=os.getenv("GCP_SQL_INSTANCE") or os.getenv("SQL_INSTANCE"),
+ cloudsdk_config=os.getenv("CLOUDSDK_CONFIG"),
+ )
+
+
+def main() -> int:
+ ctx = ctx_from_env()
+ print("== GCP Access Verification ==")
+ if ctx.cloudsdk_config:
+ print(f"CLOUDSDK_CONFIG={ctx.cloudsdk_config}")
+
+ # 1) Active account
+ rc, out, err = run("gcloud auth list --filter=status:ACTIVE --format='value(account)'")
+ ok_auth = rc == 0 and bool(out)
+ print(("✓" if ok_auth else "✗") + f" Active account: {out or err}")
+
+ # 2) Project / region
+ if not ctx.project:
+ _rc, p_out, _ = run("gcloud config get-value core/project")
+ ctx.project = p_out or None
+ if not ctx.region:
+ _rc, r_out, _ = run("gcloud config get-value compute/region")
+ ctx.region = r_out or None
+ print(("✓" if ctx.project else "✗") + f" Project: {ctx.project or '[unset]'}")
+ print(("✓" if ctx.region else "✗") + f" Region: {ctx.region or '[unset]'}")
+
+ # 3) Core APIs / access
+ apis = [
+ "run.googleapis.com",
+ "cloudbuild.googleapis.com",
+ "artifactregistry.googleapis.com",
+ "sqladmin.googleapis.com",
+ "secretmanager.googleapis.com",
+ "vpcaccess.googleapis.com",
+ "storage.googleapis.com",
+ "compute.googleapis.com",
+ ]
+ apis_ok = True
+ for s in apis:
+ rc, out, _ = run(f"gcloud services list --enabled --filter=NAME:{s} --format='value(NAME)'")
+ ok = rc == 0 and s in out
+ apis_ok = apis_ok and ok
+ print(("✓" if ok else "✗") + f" API enabled: {s}")
+
+ # 4) Cloud Run access
+ rc, _, _ = run("gcloud run services list --limit=1 2>/dev/null")
+ print(("✓" if rc == 0 else "✗") + " Cloud Run access")
+
+ # 5) Cloud Storage (optional)
+ if ctx.bucket:
+ rc, out, _ = run(f"gsutil ls -b gs://{ctx.bucket} 2>/dev/null")
+ print(("✓" if rc == 0 else "✗") + f" Bucket exists: gs://{ctx.bucket}")
+ else:
+ print("○ Bucket not provided; skip")
+
+ # 6) Cloud SQL (optional)
+ if ctx.sql_instance:
+ rc, out, _ = run(
+ f"gcloud sql instances describe {ctx.sql_instance} --format='value(name)' 2>/dev/null"
+ )
+ print(("✓" if rc == 0 and out else "✗") + f" Cloud SQL: {ctx.sql_instance}")
+ else:
+ print("○ SQL instance not provided; skip")
+
+ # 7) Secret Manager
+ rc, _, _ = run("gcloud secrets list --limit=1 2>/dev/null")
+ print(("✓" if rc == 0 else "✗") + " Secret Manager access")
+
+ # Summary exit code
+ critical_ok = ok_auth and bool(ctx.project) and bool(ctx.region) and apis_ok
+ if critical_ok:
+ print("\n✅ GCP access verified. Ready to proceed.")
+ return 0
+ else:
+ print("\n❌ GCP access verification failed. Check credentials and permissions.")
+ return 2
+
+
+if __name__ == "__main__":
+ sys.exit(main())
+
diff --git a/apps/cr-hypervr/training/onnx_export.py b/apps/cr-hypervr/training/onnx_export.py
new file mode 100644
index 00000000..cf9da886
--- /dev/null
+++ b/apps/cr-hypervr/training/onnx_export.py
@@ -0,0 +1,23 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from optimum.onnxruntime import ORTModelForFeatureExtraction
+
+
+def export_onnx(model_dir: str = "models/movie-minilm-v1", onnx_out: str = "models/movie-minilm-v1/model.onnx") -> None:
+    # SentenceTransformer has no .export() method; convert the underlying
+    # transformer checkpoint with optimum's ONNX Runtime exporter instead
+    # (requires the optimum[onnxruntime] package). Pooling and L2-normalization
+    # stay outside the exported graph and must be applied at inference time.
+    out_dir = Path(onnx_out).parent
+    out_dir.mkdir(parents=True, exist_ok=True)
+    ort_model = ORTModelForFeatureExtraction.from_pretrained(model_dir, export=True)
+    ort_model.save_pretrained(out_dir)  # writes model.onnx next to its config
+    print("Exported ONNX to", onnx_out)
+
+
+if __name__ == "__main__":
+ export_onnx()
+
diff --git a/apps/cr-hypervr/training/quantize_int8.py b/apps/cr-hypervr/training/quantize_int8.py
new file mode 100644
index 00000000..83d6800e
--- /dev/null
+++ b/apps/cr-hypervr/training/quantize_int8.py
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+from pathlib import Path
+from onnxruntime.quantization import quantize_dynamic, QuantType
+
+
+def quantize(onnx_in: str = "models/movie-minilm-v1/model.onnx", onnx_out: str = "models/movie-minilm-v1/model-int8.onnx") -> None:
+ Path(Path(onnx_out).parent).mkdir(parents=True, exist_ok=True)
+    # Dynamic quantization: weights are stored as INT8 and activations are
+    # quantized on the fly. optimize_model is deprecated/removed in recent
+    # onnxruntime releases, so it is not passed here.
+    quantize_dynamic(
+        model_input=onnx_in,
+        model_output=onnx_out,
+        weight_type=QuantType.QInt8,
+    )
+ print("Quantized INT8 ONNX saved to", onnx_out)
+
+
+if __name__ == "__main__":
+ quantize()
+
diff --git a/apps/cr-hypervr/training/train_finetune.py b/apps/cr-hypervr/training/train_finetune.py
new file mode 100644
index 00000000..6767190e
--- /dev/null
+++ b/apps/cr-hypervr/training/train_finetune.py
@@ -0,0 +1,96 @@
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable
+
+import pandas as pd
+from sentence_transformers import InputExample, SentenceTransformer, losses
+from torch.utils.data import DataLoader
+
+
+@dataclass
+class TrainConfig:
+ base_model_dir: str = os.getenv("BASE_MODEL_DIR", "models/base-minilm")
+ output_dir: str = os.getenv("OUTPUT_DIR", "models/movie-minilm-v1")
+ processed_dir: str = os.getenv("PROCESSED_DIR", "data/processed")
+ epochs: int = int(os.getenv("EPOCHS", 1))
+ batch_size: int = int(os.getenv("BATCH_SIZE", 64))
+ use_triplet_loss: bool = bool(int(os.getenv("USE_TRIPLET", "0")))
+
+
+def _read_parquet(path: str):
+ storage = {"token": "cloud"} if str(path).startswith("gs://") else None
+ return pd.read_parquet(path, storage_options=storage)
+
+
+def build_examples(processed_dir: str) -> Iterable[InputExample]:
+ trips_path = f"{processed_dir}/triplets/triplets_10k.parquet"
+ users_path = f"{processed_dir}/user_profiles.parquet"
+ # Prefer movies_with_descriptions; allow legacy movies_enriched name
+ movies_primary = f"{processed_dir}/movies_with_descriptions.parquet"
+ movies_fallback = f"{processed_dir}/movies_enriched.parquet"
+
+ trips = _read_parquet(trips_path)
+ users = _read_parquet(users_path)[
+ ["user_id", "liked_titles", "disliked_titles"]
+ ]
+ try:
+ movies = _read_parquet(movies_primary)[["movie_id", "title", "overview", "genres"]]
+ except Exception:
+ movies = _read_parquet(movies_fallback)[["movie_id", "title", "overview", "genres"]]
+
+ u = users.set_index("user_id")
+ m = movies.set_index("movie_id")
+
+ def movie_text(mid: int) -> str:
+ row = m.loc[mid]
+ return f"Title: {row['title']}\nGenres: {row.get('genres', '')}\nOverview: {row.get('overview', '')}"
+
+ for row in trips.itertuples(index=False):
+ user_id = int(row.user_id)
+ pos = int(row.pos_movie_id)
+ neg = int(row.neg_movie_id)
+ up = u.loc[user_id]
+ anchor = f"User likes: {up.get('liked_titles', '')} \nDislikes: {up.get('disliked_titles', '')}"
+ pos_txt = movie_text(pos)
+ neg_txt = movie_text(neg)
+ yield InputExample(texts=[anchor, pos_txt, neg_txt])
+
+
+def main():
+ cfg = TrainConfig()
+ Path(cfg.output_dir).mkdir(parents=True, exist_ok=True)
+
+ print("Loading base model from", cfg.base_model_dir)
+ model = SentenceTransformer(cfg.base_model_dir)
+
+ examples = list(build_examples(cfg.processed_dir))
+ if not examples:
+ raise RuntimeError("No training examples found. Ensure Phase 2 outputs exist.")
+
+ if cfg.use_triplet_loss:
+ # Use explicit triplet loss with anchor-pos-neg
+ from sentence_transformers.losses import TripletLoss
+
+ train_dataloader = DataLoader(examples, shuffle=True, batch_size=cfg.batch_size)
+ train_loss = TripletLoss(model)
+ else:
+ # Use MNR: Only anchor and positive are used; implicit in-batch negatives apply.
+ mnr_examples = [InputExample(texts=e.texts[:2]) for e in examples]
+ train_dataloader = DataLoader(mnr_examples, shuffle=True, batch_size=cfg.batch_size)
+ train_loss = losses.MultipleNegativesRankingLoss(model)
+
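+    # SentenceTransformer.fit() installs its own smart-batching collate_fn on the
+    # DataLoader, so a plain DataLoader over InputExample objects is sufficient here.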
+ print(f"Training for {cfg.epochs} epoch(s) with batch size {cfg.batch_size}")
+ model.fit(
+ train_objectives=[(train_dataloader, train_loss)],
+ epochs=cfg.epochs,
+ output_path=cfg.output_dir,
+ show_progress_bar=True,
+ )
+ print("Model saved to", cfg.output_dir)
+
+
+if __name__ == "__main__":
+ main()