From 3bb96e9ef8b849c7fe0d98f9907b8a5091117d8c Mon Sep 17 00:00:00 2001 From: David Viejo Date: Thu, 19 Feb 2026 09:45:19 +0100 Subject: [PATCH 1/9] feat(providers,backup,core): pg_dump sidecar backup, preset registry, and error hardening - Replace pg_dumpall exec-in-container with a disposable sidecar container running the same image, connected via Docker network; fixes OOM kills (exit 137) under load and supports TimescaleDB databases (--format=custom, streaming gzip, no memory spike) - Wire up all preset providers (Next.js, Vite, Rsbuild, Docusaurus v1/v2, NestJS, Angular, Astro, Dockerfile, Nixpacks) in PresetProviderRegistry::new(); Dockerfile and Nixpacks registered first for highest detection precedence - Convert ServiceRegistry and PluginStateRegistry from Mutex to RwLock for concurrent reads - Harden BackupError: structured NotFound/Internal variants with named fields, remove DatabaseConnectionError and Operation variants, exhaustive From for Problem - Replace hand-rolled CORS middleware helper with tower_http::cors::CorsLayer doc comment - Remove legacy CreateService.tsx and CreateServiceRefactored.tsx frontend pages --- CHANGELOG.md | 9 + Cargo.toml | 3 + .../temps-audit/src/services/audit_service.rs | 5 +- .../src/handlers/backup_handler.rs | 62 +- crates/temps-backup/src/services/backup.rs | 177 ++++-- crates/temps-core/src/plugin.rs | 102 ++-- .../src/handlers/deployments.rs | 18 +- .../tests/public_repo_deployment_test.rs | 12 +- crates/temps-email/src/dns.rs | 10 +- crates/temps-geo/src/ip_address_service.rs | 60 +- crates/temps-presets/src/preset_provider.rs | 42 +- crates/temps-presets/src/providers/app.rs | 58 +- .../src/externalsvc/postgres.rs | 267 +++++++-- .../temps-providers/src/externalsvc/redis.rs | 1 - crates/temps-providers/src/services.rs | 8 - .../src/handler/ip_access_control.rs | 155 +++-- .../tests/integration_test.rs | 14 +- web/package.json | 1 - web/src/pages/CreateService.tsx | 555 ------------------ web/src/pages/CreateServiceRefactored.tsx | 199 ------- 20 files changed, 651 insertions(+), 1107 deletions(-) delete mode 100644 web/src/pages/CreateService.tsx delete mode 100644 web/src/pages/CreateServiceRefactored.tsx diff --git a/CHANGELOG.md b/CHANGELOG.md index 82ddce1e..85c4faa3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- PostgreSQL backup now runs `pg_dump` inside a disposable sidecar container (same image as the service) attached to the shared Docker network, eliminating OOM kills (exit code 137) that occurred when `pg_dumpall` was exec'd inside the live service container; TimescaleDB databases are supported via `--format=custom` with advisory circular-FK warnings suppressed +- All preset providers (Next.js, Vite, Rsbuild, Docusaurus v1/v2, NestJS, Angular, Astro, Dockerfile, Nixpacks) are now registered in `PresetProviderRegistry::new()`; Dockerfile and Nixpacks are registered first to take detection precedence - Proxy now converts HTML responses to Markdown on the fly when clients send `Accept: text/markdown`, compatible with Cloudflare's Markdown for Agents standard; responses include `Content-Type: text/markdown`, `Vary: Accept`, and `X-Markdown-Tokens` headers; SSE, WebSocket, and responses over 2 MB pass through unchanged - MCP (Model Context Protocol) server with 210 tools across 30 domain modules (`mcp/`) - OpenAPI SDK auto-generated via `@hey-api/openapi-ts` for MCP server @@ -23,11 +25,18 @@ and this project 
adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Comprehensive development and release documentation ### Changed +- `ServiceRegistry` and `PluginStateRegistry` now use `RwLock` instead of `Mutex`, allowing concurrent reads during request handling +- `BackupError::NotFound` and `BackupError::Internal` converted to structured variants with named fields (`resource`, `detail`, `message`) for richer, grep-able error messages; removed `DatabaseConnectionError` and `Operation` variants +- `From for Problem` updated to exhaustive match (no catch-all `_ =>`) with correct HTTP status codes per variant +- CORS middleware helper replaced with a doc comment pointing to `tower_http::cors::CorsLayer` - Updated `clippy::ptr_arg` warnings to use `&Path` instead of `&PathBuf` - Fixed `clippy::only_used_in_recursion` warning in workflow executor - Rewrote CLAUDE.md with comprehensive error handling, resilience, and testing guidance - Refined service parameter strategies in `temps-providers` +### Removed +- Deleted legacy `web/src/pages/CreateService.tsx` and `CreateServiceRefactored.tsx` (superseded by current service creation flow) + ### Fixed - Build failures when web UI is skipped in debug mode diff --git a/Cargo.toml b/Cargo.toml index aa7e52f9..dad4e841 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -51,6 +51,9 @@ members = [ "crates/temps-vulnerability-scanner", "crates/temps-kv", "crates/temps-blob", + "crates/temps-environments", + "crates/temps-screenshots", + "crates/temps-embeddings", ] [workspace.package] diff --git a/crates/temps-audit/src/services/audit_service.rs b/crates/temps-audit/src/services/audit_service.rs index 6ead41d0..9ae45ec9 100644 --- a/crates/temps-audit/src/services/audit_service.rs +++ b/crates/temps-audit/src/services/audit_service.rs @@ -1,4 +1,3 @@ -use anyhow::Context; use chrono::Utc; use sea_orm::{prelude::*, ColumnTrait, EntityTrait, QueryFilter, QueryOrder, QuerySelect, Set}; use serde::Serialize; @@ -128,7 +127,7 @@ impl AuditService { .offset(offset as u64) .all(self.db.as_ref()) .await - .context("Failed to load filtered audit logs")?; + .map_err(|e| anyhow::anyhow!("Failed to load filtered audit logs: {}", e))?; // Fetch related user and IP geolocation data for each log let mut audit_details = Vec::new(); @@ -161,7 +160,7 @@ impl AuditService { let log = temps_entities::audit_logs::Entity::find_by_id(log_id) .one(self.db.as_ref()) .await - .context("Failed to get audit log by ID")?; + .map_err(|e| anyhow::anyhow!("Failed to get audit log by ID {}: {}", log_id, e))?; if let Some(log) = log { // Fetch related user diff --git a/crates/temps-backup/src/handlers/backup_handler.rs b/crates/temps-backup/src/handlers/backup_handler.rs index 0b26168b..6921b5a5 100644 --- a/crates/temps-backup/src/handlers/backup_handler.rs +++ b/crates/temps-backup/src/handlers/backup_handler.rs @@ -25,55 +25,31 @@ use utoipa::{OpenApi, ToSchema}; impl From for Problem { fn from(error: BackupError) -> Self { match error { - BackupError::DatabaseConnectionError(msg) => { - problemdetails::new(StatusCode::INTERNAL_SERVER_ERROR) - .with_title("Database connection Error") - .with_detail(msg) - } - - BackupError::Database(e) => problemdetails::new(StatusCode::INTERNAL_SERVER_ERROR) - .with_title("Database Error") - .with_detail(e.to_string()), - - BackupError::S3(e) => problemdetails::new(StatusCode::INTERNAL_SERVER_ERROR) - .with_title("S3 Storage Error") - .with_detail(e.to_string()), - - BackupError::NotFound(msg) => problemdetails::new(StatusCode::NOT_FOUND) + 
BackupError::NotFound { .. } => problemdetails::new(StatusCode::NOT_FOUND) .with_title("Resource Not Found") - .with_detail(msg), + .with_detail(error.to_string()), - BackupError::Validation(msg) => problemdetails::new(StatusCode::BAD_REQUEST) + BackupError::Validation(ref msg) => problemdetails::new(StatusCode::BAD_REQUEST) .with_title("Validation Error") - .with_detail(msg), - - BackupError::Configuration(msg) => { - problemdetails::new(StatusCode::INTERNAL_SERVER_ERROR) - .with_title("Configuration Error") - .with_detail(msg) - } + .with_detail(msg.clone()), - BackupError::ExternalService(msg) => { + BackupError::Schedule(ref msg) => problemdetails::new(StatusCode::BAD_REQUEST) + .with_title("Schedule Error") + .with_detail(msg.clone()), + + BackupError::Database(_) + | BackupError::S3(_) + | BackupError::Configuration(_) + | BackupError::ExternalService(_) + | BackupError::Internal { .. } + | BackupError::NotificationError(_) + | BackupError::Unsupported(_) + | BackupError::Io(_) + | BackupError::Serialization(_) => { problemdetails::new(StatusCode::INTERNAL_SERVER_ERROR) - .with_title("External Service Error") - .with_detail(msg) + .with_title("Internal Server Error") + .with_detail(error.to_string()) } - - BackupError::Schedule(msg) => problemdetails::new(StatusCode::BAD_REQUEST) - .with_title("Schedule Error") - .with_detail(msg), - - BackupError::Operation(msg) => problemdetails::new(StatusCode::INTERNAL_SERVER_ERROR) - .with_title("Operation Failed") - .with_detail(msg), - - BackupError::Internal(msg) => problemdetails::new(StatusCode::INTERNAL_SERVER_ERROR) - .with_title("Internal Server Error") - .with_detail(msg), - - _ => problemdetails::new(StatusCode::INTERNAL_SERVER_ERROR) - .with_title("Internal Server Error") - .with_detail("An unexpected error occurred"), } } } diff --git a/crates/temps-backup/src/services/backup.rs b/crates/temps-backup/src/services/backup.rs index 8923c599..27d9ea86 100644 --- a/crates/temps-backup/src/services/backup.rs +++ b/crates/temps-backup/src/services/backup.rs @@ -1,5 +1,5 @@ use crate::handlers::backup_handler::{CreateBackupScheduleRequest, CreateS3SourceRequest}; -use anyhow::{Context, Result}; +use anyhow::Result; use aws_sdk_s3::error::ProvideErrorMetadata; use aws_sdk_s3::{Client as S3Client, Config}; use chrono::{DateTime, Duration, Timelike, Utc}; @@ -29,9 +29,6 @@ use tokio_stream::StreamExt; #[derive(Error, Debug)] pub enum BackupError { - #[error("Database connection error")] - DatabaseConnectionError(String), - #[error("Database error: {0}")] Database(sea_orm::DbErr), @@ -44,8 +41,8 @@ pub enum BackupError { #[error("IO error: {0}")] Io(#[from] std::io::Error), - #[error("Resource not found: {0}")] - NotFound(String), + #[error("{resource} not found: {detail}")] + NotFound { resource: String, detail: String }, #[error("Invalid configuration: {0}")] Configuration(String), @@ -59,11 +56,8 @@ pub enum BackupError { #[error("Serialization error: {0}")] Serialization(#[from] serde_json::Error), - #[error("Backup operation failed: {0}")] - Operation(String), - - #[error("Internal error: {0}")] - Internal(String), + #[error("Internal error: {message}")] + Internal { message: String }, #[error("Unsupported: {0}")] Unsupported(String), @@ -72,13 +66,6 @@ pub enum BackupError { NotificationError(String), } -// Implementation to convert anyhow errors to BackupError -impl From for BackupError { - fn from(err: anyhow::Error) -> Self { - BackupError::Internal(err.to_string()) - } -} - impl From> for BackupError { @@ -115,12 +102,24 @@ impl } } 
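For reference, the new error shape in isolation: a minimal, self-contained sketch (thiserror derive copied from the enum in this patch; the `find_backup` call site is hypothetical) showing how the named fields render through `Display`:

```rust
use thiserror::Error;

#[derive(Error, Debug)]
enum BackupError {
    #[error("{resource} not found: {detail}")]
    NotFound { resource: String, detail: String },
    #[error("Internal error: {message}")]
    Internal { message: String },
}

// Hypothetical call site: naming the resource keeps messages grep-able by
// resource type ("Backup", "S3Source", "BackupSchedule", ...).
fn find_backup(id: i32) -> Result<(), BackupError> {
    Err(BackupError::NotFound {
        resource: "Backup".to_string(),
        detail: format!("no backup with id {}", id),
    })
}

fn main() {
    let err = find_backup(42).unwrap_err();
    assert_eq!(err.to_string(), "Backup not found: no backup with id 42");
}
```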
+// Conversion from anyhow::Error is used by service methods whose helper functions +// return anyhow::Result. This is a transitional impl; the goal is to convert all +// helper functions to return BackupError directly. +impl From for BackupError { + fn from(err: anyhow::Error) -> Self { + BackupError::Internal { + message: format!("{:#}", err), + } + } +} + impl From for BackupError { fn from(err: sea_orm::DbErr) -> Self { match err { - sea_orm::DbErr::RecordNotFound(_) => { - BackupError::NotFound("Resource not found".to_string()) - } + sea_orm::DbErr::RecordNotFound(msg) => BackupError::NotFound { + resource: "Backup resource".to_string(), + detail: msg, + }, _ => BackupError::Database(err), } } @@ -212,7 +211,10 @@ impl BackupService { let s3_source = temps_entities::s3_sources::Entity::find_by_id(s3_source_id) .one(self.db.as_ref()) .await? - .ok_or_else(|| BackupError::NotFound("S3 source not found".to_string()))?; + .ok_or_else(|| BackupError::NotFound { + resource: "S3Source".to_string(), + detail: "S3 source not found".to_string(), + })?; // Create a temporary file for the backup let mut temp_file = NamedTempFile::new().map_err(BackupError::Io)?; @@ -1101,7 +1103,8 @@ impl BackupService { let mut buffer = Vec::with_capacity(chunk_size); while let Some(chunk) = stream.next().await { - let chunk = chunk.context("Failed to read chunk from file")?; + let chunk = + chunk.map_err(|e| anyhow::anyhow!("Failed to read chunk from file: {}", e))?; buffer.extend_from_slice(&chunk); if buffer.len() >= chunk_size { @@ -1304,13 +1307,19 @@ impl BackupService { .filter(temps_entities::backups::Column::BackupId.eq(backup_id)) .one(self.db.as_ref()) .await? - .ok_or_else(|| BackupError::NotFound("Backup not found".to_string()))?; + .ok_or_else(|| BackupError::NotFound { + resource: "Backup".to_string(), + detail: "Backup not found".to_string(), + })?; // Get S3 source let s3_source = temps_entities::s3_sources::Entity::find_by_id(backup.s3_source_id) .one(self.db.as_ref()) .await? - .ok_or_else(|| BackupError::NotFound("S3 source not found".to_string()))?; + .ok_or_else(|| BackupError::NotFound { + resource: "S3Source".to_string(), + detail: "S3 source not found".to_string(), + })?; let backend = self.db.get_database_backend(); match backend { @@ -1486,8 +1495,9 @@ impl BackupService { let database_url = &self.config_service.get_database_url(); // Parse database URL to extract connection parameters - let url = url::Url::parse(database_url) - .map_err(|e| BackupError::Internal(format!("Invalid DATABASE_URL format: {}", e)))?; + let url = url::Url::parse(database_url).map_err(|e| BackupError::Internal { + message: format!("Invalid DATABASE_URL format: {}", e), + })?; let host = url.host_str().unwrap_or("localhost"); let port = url.port().unwrap_or(5432); @@ -1517,19 +1527,18 @@ impl BackupService { } info!("Running pg_restore command"); - let output = cmd.output().await.map_err(|e| { - BackupError::Internal(format!( + let output = cmd.output().await.map_err(|e| BackupError::Internal { + message: format!( "Failed to execute pg_restore: {}. 
Make sure pg_restore is installed and in PATH", e - )) + ), })?; if !output.status.success() { let stderr = String::from_utf8_lossy(&output.stderr); - return Err(BackupError::Internal(format!( - "pg_restore failed: {}", - stderr - ))); + return Err(BackupError::Internal { + message: format!("pg_restore failed: {}", stderr), + }); } info!("PostgreSQL backup restored successfully"); @@ -1555,12 +1564,18 @@ impl BackupService { .filter(temps_entities::backups::Column::BackupId.eq(backup_id)) .one(self.db.as_ref()) .await? - .ok_or_else(|| BackupError::NotFound("Backup not found".to_string()))?; + .ok_or_else(|| BackupError::NotFound { + resource: "Backup".to_string(), + detail: "Backup not found".to_string(), + })?; let s3_source = temps_entities::s3_sources::Entity::find_by_id(backup.s3_source_id) .one(self.db.as_ref()) .await? - .ok_or_else(|| BackupError::NotFound("S3 source not found".to_string()))?; + .ok_or_else(|| BackupError::NotFound { + resource: "S3Source".to_string(), + detail: "S3 source not found".to_string(), + })?; // Create S3 client let s3_client = self.create_s3_client(&s3_source).await?; @@ -1636,12 +1651,16 @@ impl BackupService { let encrypted_access_key = self .encryption_service .encrypt_string(&request.access_key_id) - .map_err(|e| BackupError::Internal(format!("Failed to encrypt access key: {}", e)))?; + .map_err(|e| BackupError::Internal { + message: format!("Failed to encrypt access key: {}", e), + })?; let encrypted_secret_key = self .encryption_service .encrypt_string(&request.secret_key) - .map_err(|e| BackupError::Internal(format!("Failed to encrypt secret key: {}", e)))?; + .map_err(|e| BackupError::Internal { + message: format!("Failed to encrypt secret key: {}", e), + })?; let new_source = temps_entities::s3_sources::ActiveModel { id: sea_orm::NotSet, @@ -1671,7 +1690,10 @@ impl BackupService { let source = temps_entities::s3_sources::Entity::find_by_id(id) .one(self.db.as_ref()) .await? - .ok_or_else(|| BackupError::NotFound("S3 source not found".to_string()))?; + .ok_or_else(|| BackupError::NotFound { + resource: "S3Source".to_string(), + detail: "S3 source not found".to_string(), + })?; Ok(source) } @@ -1711,7 +1733,10 @@ impl BackupService { temps_entities::s3_sources::Entity::find_by_id(request.s3_source_id) .one(self.db.as_ref()) .await? - .ok_or_else(|| BackupError::NotFound("S3 source not found".to_string()))?; + .ok_or_else(|| BackupError::NotFound { + resource: "S3Source".to_string(), + detail: "S3 source not found".to_string(), + })?; // Validate the schedule expression self.validate_backup_schedule(&request.schedule_expression)?; @@ -1752,7 +1777,10 @@ impl BackupService { let schedule = temps_entities::backup_schedules::Entity::find_by_id(id) .one(self.db.as_ref()) .await? - .ok_or_else(|| BackupError::NotFound("Backup schedule not found".to_string()))?; + .ok_or_else(|| BackupError::NotFound { + resource: "BackupSchedule".to_string(), + detail: "Backup schedule not found".to_string(), + })?; Ok(schedule) } @@ -1810,7 +1838,10 @@ impl BackupService { temps_entities::s3_sources::Entity::find_by_id(s3_source_id) .one(self.db.as_ref()) .await? - .ok_or_else(|| BackupError::NotFound("S3 source not found".to_string()))?; + .ok_or_else(|| BackupError::NotFound { + resource: "S3Source".to_string(), + detail: "S3 source not found".to_string(), + })?; // Create the backup let backup = self @@ -1840,7 +1871,10 @@ impl BackupService { let current = temps_entities::s3_sources::Entity::find_by_id(id) .one(self.db.as_ref()) .await? 
- .ok_or_else(|| BackupError::NotFound("S3 source not found".to_string()))?; + .ok_or_else(|| BackupError::NotFound { + resource: "S3Source".to_string(), + detail: "S3 source not found".to_string(), + })?; let mut active = current.into_active_model(); @@ -1858,8 +1892,8 @@ impl BackupService { let encrypted_access_key = self .encryption_service .encrypt_string(&access_key_id) - .map_err(|e| { - BackupError::Internal(format!("Failed to encrypt access key: {}", e)) + .map_err(|e| BackupError::Internal { + message: format!("Failed to encrypt access key: {}", e), })?; active.access_key_id = Set(encrypted_access_key); } @@ -1868,8 +1902,8 @@ impl BackupService { let encrypted_secret_key = self .encryption_service .encrypt_string(&secret_key) - .map_err(|e| { - BackupError::Internal(format!("Failed to encrypt secret key: {}", e)) + .map_err(|e| BackupError::Internal { + message: format!("Failed to encrypt secret key: {}", e), })?; active.secret_key = Set(encrypted_secret_key); } @@ -2020,7 +2054,10 @@ impl BackupService { let s3_source = temps_entities::s3_sources::Entity::find_by_id(s3_source_id) .one(self.db.as_ref()) .await? - .ok_or_else(|| BackupError::NotFound("S3 source not found".to_string()))?; + .ok_or_else(|| BackupError::NotFound { + resource: "S3Source".to_string(), + detail: "S3 source not found".to_string(), + })?; // Create S3 client let s3_client = self.create_s3_client(&s3_source).await?; @@ -2070,8 +2107,9 @@ impl BackupService { temps_entities::external_services::Entity::find_by_id(service_id) .one(self.db.as_ref()) .await? - .ok_or_else(|| { - BackupError::NotFound(format!("External service with ID {} not found", service_id)) + .ok_or_else(|| BackupError::NotFound { + resource: "ExternalService".to_string(), + detail: format!("External service with ID {} not found", service_id), }) } @@ -2089,7 +2127,10 @@ impl BackupService { let s3_source = temps_entities::s3_sources::Entity::find_by_id(s3_source_id) .one(self.db.as_ref()) .await? - .ok_or_else(|| BackupError::NotFound("S3 source not found".to_string()))?; + .ok_or_else(|| BackupError::NotFound { + resource: "S3Source".to_string(), + detail: "S3 source not found".to_string(), + })?; // Create S3 client let s3_client = self @@ -2176,8 +2217,9 @@ impl BackupService { .filter(temps_entities::external_service_backups::Column::BackupId.eq(backup.id)) .one(self.db.as_ref()) .await? - .ok_or_else(|| { - BackupError::NotFound("External service backup record not found".to_string()) + .ok_or_else(|| BackupError::NotFound { + resource: "ExternalServiceBackup".to_string(), + detail: "External service backup record not found".to_string(), })?; info!( @@ -2419,7 +2461,10 @@ impl BackupService { let schedule_model = temps_entities::backup_schedules::Entity::find_by_id(schedule_id) .one(self.db.as_ref()) .await? - .ok_or_else(|| BackupError::NotFound("Backup schedule not found".to_string()))?; + .ok_or_else(|| BackupError::NotFound { + resource: "BackupSchedule".to_string(), + detail: "Backup schedule not found".to_string(), + })?; let mut schedule_update: temps_entities::backup_schedules::ActiveModel = schedule_model.into_active_model(); @@ -2441,7 +2486,10 @@ impl BackupService { let schedule_model = temps_entities::backup_schedules::Entity::find_by_id(id) .one(self.db.as_ref()) .await? 
- .ok_or_else(|| BackupError::NotFound("Backup schedule not found".to_string()))?; + .ok_or_else(|| BackupError::NotFound { + resource: "BackupSchedule".to_string(), + detail: "Backup schedule not found".to_string(), + })?; let mut schedule_update: temps_entities::backup_schedules::ActiveModel = schedule_model.into_active_model(); @@ -2461,7 +2509,10 @@ impl BackupService { let schedule = temps_entities::backup_schedules::Entity::find_by_id(id) .one(self.db.as_ref()) .await? - .ok_or_else(|| BackupError::NotFound("Backup schedule not found".to_string()))?; + .ok_or_else(|| BackupError::NotFound { + resource: "BackupSchedule".to_string(), + detail: "Backup schedule not found".to_string(), + })?; // Calculate next run time based on the schedule expression let cron_schedule = Schedule::from_str(&schedule.schedule_expression) @@ -2843,7 +2894,7 @@ mod tests { let result = backup_service.get_s3_source(999).await; assert!(result.is_err()); match result { - Err(BackupError::NotFound(_)) => {} + Err(BackupError::NotFound { .. }) => {} _ => panic!("Expected NotFound error"), } } @@ -2872,14 +2923,17 @@ mod tests { let result = backup_service.get_backup_schedule(999).await; assert!(result.is_err()); match result { - Err(BackupError::NotFound(_)) => {} + Err(BackupError::NotFound { .. }) => {} _ => panic!("Expected NotFound error"), } } #[tokio::test] - #[ignore] // Requires Docker (MinIO and PostgreSQL containers) async fn test_backup_to_minio_integration() { + if bollard::Docker::connect_with_local_defaults().is_err() { + println!("Docker not available, skipping test"); + return; + } use temps_database::test_utils::TestDatabase; use testcontainers::{runners::AsyncRunner, GenericImage, ImageExt}; @@ -3086,8 +3140,11 @@ mod tests { } #[tokio::test] - #[ignore] // Requires Docker (PostgreSQL container) async fn test_restore_postgres_from_url() { + if bollard::Docker::connect_with_local_defaults().is_err() { + println!("Docker not available, skipping test"); + return; + } use temps_database::test_utils::TestDatabase; use testcontainers::{runners::AsyncRunner, GenericImage, ImageExt}; diff --git a/crates/temps-core/src/plugin.rs b/crates/temps-core/src/plugin.rs index 8aaa43ec..7d24d811 100644 --- a/crates/temps-core/src/plugin.rs +++ b/crates/temps-core/src/plugin.rs @@ -10,7 +10,7 @@ use std::any::{Any, TypeId}; use std::collections::HashMap; use std::future::Future; use std::pin::Pin; -use std::sync::{Arc, Mutex}; +use std::sync::{Arc, RwLock}; use anyhow::Result; use axum::extract::Request; @@ -423,7 +423,7 @@ impl PluginRoutes { /// Type-safe service registry for dependency injection pub struct ServiceRegistry { - services: Mutex>>, + services: RwLock>>, } impl Default for ServiceRegistry { @@ -436,7 +436,7 @@ impl ServiceRegistry { /// Create a new service registry pub fn new() -> Self { Self { - services: Mutex::new(HashMap::new()), + services: RwLock::new(HashMap::new()), } } @@ -444,7 +444,7 @@ impl ServiceRegistry { pub fn register(&self, service: Arc) { debug!("Registering service: {}", std::any::type_name::()); self.services - .lock() + .write() .unwrap() .insert(TypeId::of::(), Box::new(service)); } @@ -452,7 +452,7 @@ impl ServiceRegistry { /// Get a service if it's registered pub fn get(&self) -> Option> { self.services - .lock() + .read() .unwrap() .get(&TypeId::of::()) .and_then(|any| any.downcast_ref::>()) @@ -473,7 +473,7 @@ impl ServiceRegistry { /// Registry for plugin-specific state (used for routing) pub struct PluginStateRegistry { - states: Mutex>>, + states: RwLock>>, } impl 
Default for PluginStateRegistry { @@ -485,7 +485,7 @@ impl Default for PluginStateRegistry { impl PluginStateRegistry { pub fn new() -> Self { Self { - states: Mutex::new(HashMap::new()), + states: RwLock::new(HashMap::new()), } } @@ -497,7 +497,7 @@ impl PluginStateRegistry { ) { debug!("Registering plugin state for: {}", plugin_name); self.states - .lock() + .write() .unwrap() .insert(plugin_name.to_string(), Box::new(state)); } @@ -508,7 +508,7 @@ impl PluginStateRegistry { plugin_name: &str, ) -> Option> { self.states - .lock() + .read() .unwrap() .get(plugin_name) .and_then(|any| any.downcast_ref::>()) @@ -1021,50 +1021,46 @@ pub mod middleware_helpers { } } - /// Create CORS middleware - pub fn cors_middleware( - _plugin_name: &str, - allowed_origins: Vec, - ) -> impl Fn( - Request, - Next, - ) -> Pin> + Send>> - + Send - + Sync { - move |req: Request, next: Next| { - let allowed_origins = allowed_origins.clone(); - Box::pin(async move { - let origin = req - .headers() - .get("origin") - .and_then(|h| h.to_str().ok()) - .map(|s| s.to_string()); - - let mut response = next.run(req).await; - - // Add CORS headers - if let Some(origin) = origin { - if allowed_origins.contains(&origin) - || allowed_origins.contains(&"*".to_string()) - { - response.headers_mut().insert( - "access-control-allow-origin", - axum::http::HeaderValue::from_str(&origin).unwrap(), - ); - } - } - - response.headers_mut().insert( - "access-control-allow-methods", - axum::http::HeaderValue::from_static("GET, POST, PUT, DELETE, OPTIONS"), - ); - response.headers_mut().insert( - "access-control-allow-headers", - axum::http::HeaderValue::from_static("Content-Type, Authorization"), - ); - - Ok(response) - }) + /// Create a CORS layer using `tower_http::cors::CorsLayer`. + /// + /// # Example + /// ```rust,no_run + /// use tower_http::cors::{CorsLayer, Any}; + /// use axum::http::Method; + /// + /// let cors = CorsLayer::new() + /// .allow_methods([Method::GET, Method::POST, Method::PUT, Method::DELETE, Method::OPTIONS]) + /// .allow_headers(Any) + /// .allow_origin(Any); // or use .allow_origin("https://example.com".parse::().unwrap()) + /// ``` + /// + /// Apply the layer directly to your Axum `Router`: + /// ```rust,no_run + /// router.layer(cors) + /// ``` + pub fn cors_layer(allowed_origins: Vec) -> tower_http::cors::CorsLayer { + use axum::http::{HeaderValue, Method}; + use tower_http::cors::CorsLayer; + + let mut layer = CorsLayer::new().allow_methods([ + Method::GET, + Method::POST, + Method::PUT, + Method::PATCH, + Method::DELETE, + Method::OPTIONS, + ]); + + if allowed_origins.iter().any(|o| o == "*") { + layer = layer.allow_origin(tower_http::cors::Any); + } else { + let origins: Vec = allowed_origins + .iter() + .filter_map(|o| o.parse::().ok()) + .collect(); + layer = layer.allow_origin(origins); } + + layer } } diff --git a/crates/temps-deployments/src/handlers/deployments.rs b/crates/temps-deployments/src/handlers/deployments.rs index 0d296f31..d1cb0dea 100644 --- a/crates/temps-deployments/src/handlers/deployments.rs +++ b/crates/temps-deployments/src/handlers/deployments.rs @@ -1821,8 +1821,14 @@ mod tests { } #[tokio::test] - #[ignore] // FIXME: Flaky test - real-time log streaming timing issues. Needs refactoring as integration test async fn test_websocket_handler_end_to_end_with_server() { + // FIXME: Flaky test - real-time log streaming timing issues. + // Needs refactoring as a proper integration test. + // Set TEMPS_FLAKY_TESTS=1 to enable. 
+ if std::env::var("TEMPS_FLAKY_TESTS").is_err() { + println!("Flaky test skipped; set TEMPS_FLAKY_TESTS=1 to enable"); + return; + } use axum::extract::Request; use axum::middleware; use sea_orm::{ActiveModelTrait, Set}; @@ -2036,8 +2042,11 @@ mod tests { } #[tokio::test] - #[ignore] // Requires actual Docker container async fn test_container_logs_by_id_websocket() { + if bollard::Docker::connect_with_local_defaults().is_err() { + println!("Docker not available, skipping test"); + return; + } use axum::extract::Request; use axum::middleware; use sea_orm::{ActiveModelTrait, Set}; @@ -2253,8 +2262,11 @@ mod tests { } #[tokio::test] - #[ignore] // Requires actual Docker container async fn test_filtered_container_logs_websocket() { + if bollard::Docker::connect_with_local_defaults().is_err() { + println!("Docker not available, skipping test"); + return; + } use axum::extract::Request; use axum::middleware; use sea_orm::{ActiveModelTrait, Set}; diff --git a/crates/temps-deployments/tests/public_repo_deployment_test.rs b/crates/temps-deployments/tests/public_repo_deployment_test.rs index 3f322429..e3bd8815 100644 --- a/crates/temps-deployments/tests/public_repo_deployment_test.rs +++ b/crates/temps-deployments/tests/public_repo_deployment_test.rs @@ -227,9 +227,17 @@ mod public_repo_tests { ]; #[tokio::test] - #[ignore] // Heavy integration test: clones public repos, builds Docker images, deploys containers. - // Run explicitly with: cargo test -p temps-deployments test_deploy_public_repositories -- --ignored async fn test_deploy_public_repositories() { + // Heavy integration test: clones public repos, builds Docker images, deploys containers. + // Enable with: TEMPS_INTEGRATION_TESTS=1 cargo test -p temps-deployments test_deploy_public_repositories + if std::env::var("TEMPS_INTEGRATION_TESTS").is_err() { + println!("Integration tests disabled; set TEMPS_INTEGRATION_TESTS=1 to enable"); + return; + } + if bollard::Docker::connect_with_local_defaults().is_err() { + println!("Docker not available, skipping test"); + return; + } // Enable verbose Docker build output // SAFETY: This is only called from a single-threaded test; no concurrent access. 
unsafe { diff --git a/crates/temps-email/src/dns.rs b/crates/temps-email/src/dns.rs index 46e73b10..c5de2ed3 100644 --- a/crates/temps-email/src/dns.rs +++ b/crates/temps-email/src/dns.rs @@ -205,8 +205,11 @@ mod tests { } #[tokio::test] - #[ignore] // Requires network access async fn test_verify_known_txt_record() { + if std::env::var("TEMPS_NETWORK_TESTS").is_err() { + println!("Network tests disabled; set TEMPS_NETWORK_TESTS=1 to enable"); + return; + } let verifier = DnsVerifier::new(); // Test with a known TXT record (Google's SPF) let status = verifier @@ -216,8 +219,11 @@ mod tests { } #[tokio::test] - #[ignore] // Requires network access async fn test_verify_known_mx_record() { + if std::env::var("TEMPS_NETWORK_TESTS").is_err() { + println!("Network tests disabled; set TEMPS_NETWORK_TESTS=1 to enable"); + return; + } let verifier = DnsVerifier::new(); // Test with a known MX record let status = verifier diff --git a/crates/temps-geo/src/ip_address_service.rs b/crates/temps-geo/src/ip_address_service.rs index a7a6d952..13e29e3d 100644 --- a/crates/temps-geo/src/ip_address_service.rs +++ b/crates/temps-geo/src/ip_address_service.rs @@ -1,6 +1,4 @@ use crate::geoip_service::GeoIpService; -use anyhow::Context; - use chrono::Utc; use sea_orm::{prelude::*, QueryFilter, QueryOrder, QuerySelect, Set}; use std::sync::Arc; @@ -58,24 +56,23 @@ impl IpAddressService { return Ok(existing_ip.into()); } - let geo_data = match self - .geoip_service - .geolocate( - ip_address_str - .parse::() - .context("Invalid IP address")?, - ) - .await - { - Ok(data) => Some(data), - Err(e) => { - error!( - "Failed to get geolocation data for IP {}: {}", - ip_address_str, e - ); - None - } - }; + let geo_data = + match self + .geoip_service + .geolocate(ip_address_str.parse::().map_err(|e| { + anyhow::anyhow!("Invalid IP address '{}': {}", ip_address_str, e) + })?) 
+ .await + { + Ok(data) => Some(data), + Err(e) => { + error!( + "Failed to get geolocation data for IP {}: {}", + ip_address_str, e + ); + None + } + }; let new_ip = ip_geolocations::ActiveModel { ip_address: Set(ip_address_str.to_string()), @@ -96,10 +93,13 @@ impl IpAddressService { ..Default::default() }; - let result = new_ip - .insert(self.db.as_ref()) - .await - .context("Failed to create IP address record")?; + let result = new_ip.insert(self.db.as_ref()).await.map_err(|e| { + anyhow::anyhow!( + "Failed to create IP address record for '{}': {}", + ip_address_str, + e + ) + })?; info!("Created new IP address record for {}", ip_address_str); Ok(result.into()) @@ -117,7 +117,13 @@ impl IpAddressService { ip_record .ip_address .parse::() - .context("Invalid IP address in database")?, + .map_err(|e| { + anyhow::anyhow!( + "Invalid IP address in database for record {}: {}", + ip_id, + e + ) + })?, ) .await?; @@ -135,7 +141,7 @@ impl IpAddressService { let updated = active_model .update(self.db.as_ref()) .await - .context("Failed to update IP address record")?; + .map_err(|e| anyhow::anyhow!("Failed to update IP address record {}: {}", ip_id, e))?; Ok(updated.into()) } @@ -155,7 +161,7 @@ impl IpAddressService { .limit(limit) .all(self.db.as_ref()) .await - .context("Failed to load IP addresses")?; + .map_err(|e| anyhow::anyhow!("Failed to load IP addresses: {}", e))?; Ok(results.into_iter().map(|ip| ip.into()).collect()) } diff --git a/crates/temps-presets/src/preset_provider.rs b/crates/temps-presets/src/preset_provider.rs index 5651ce8c..790a2c4b 100644 --- a/crates/temps-presets/src/preset_provider.rs +++ b/crates/temps-presets/src/preset_provider.rs @@ -6,6 +6,11 @@ //! This system bridges the existing Preset enum (stored in database) with a modern provider architecture. use crate::providers::app::App; +use crate::providers::{ + AngularPresetProvider, AstroPresetProvider, DockerfilePresetProvider, DocusaurusPresetProvider, + DocusaurusV1PresetProvider, NestJsPresetProvider, NextJsPresetProvider, NixpacksPresetProvider, + RsbuildPresetProvider, VitePresetProvider, +}; use anyhow::Result; use std::collections::HashMap; use temps_entities::preset::Preset; @@ -92,16 +97,27 @@ pub struct PresetProviderRegistry { impl PresetProviderRegistry { /// Create a new registry with all providers pub fn new() -> Self { - - - // Register all providers - // registry.register(Box::new(providers::NextJsProvider)); - // TODO: Register other providers as they are implemented - - Self { + let mut registry = Self { providers: Vec::new(), by_slug: HashMap::new(), - } + }; + + // Register all providers. Dockerfile and Nixpacks are registered first + // as they take highest precedence during detection. 
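+        // Ties are broken by registration order: `detect` only replaces the
+        // current best match on a strictly greater confidence.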
+ registry.register(Box::new(DockerfilePresetProvider)); + registry.register(Box::new(NixpacksPresetProvider)); + + // Node.js / TypeScript frameworks + registry.register(Box::new(NextJsPresetProvider)); + registry.register(Box::new(VitePresetProvider)); + registry.register(Box::new(RsbuildPresetProvider)); + registry.register(Box::new(DocusaurusPresetProvider)); + registry.register(Box::new(DocusaurusV1PresetProvider)); + registry.register(Box::new(NestJsPresetProvider)); + registry.register(Box::new(AngularPresetProvider)); + registry.register(Box::new(AstroPresetProvider)); + + registry } /// Register a provider @@ -127,7 +143,10 @@ impl PresetProviderRegistry { }; // Update best match if this has higher confidence - if best_match.as_ref().is_none_or(|m| confidence > m.confidence) { + if best_match + .as_ref() + .is_none_or(|m| confidence > m.confidence) + { best_match = Some(detection); } } @@ -237,7 +256,10 @@ mod tests { registry.register(Box::new(MockProvider)); let mut files = HashMap::new(); - files.insert("next.config.js".to_string(), "module.exports = {}".to_string()); + files.insert( + "next.config.js".to_string(), + "module.exports = {}".to_string(), + ); let app = App::from_tree(PathBuf::from("/test"), files); let detection = registry.detect(&app); diff --git a/crates/temps-presets/src/providers/app.rs b/crates/temps-presets/src/providers/app.rs index 2c3766f0..13f69f8e 100644 --- a/crates/temps-presets/src/providers/app.rs +++ b/crates/temps-presets/src/providers/app.rs @@ -58,10 +58,9 @@ impl App { #[cfg(test)] pub fn new_with_errors(path: &str) -> Result { let current_dir = std::env::current_dir()?; - let source = current_dir - .join(path) - .canonicalize() - .context("Failed to read app source directory")?; + let source = current_dir.join(path).canonicalize().map_err(|e| { + anyhow::anyhow!("Failed to read app source directory '{}': {}", path, e) + })?; let mut files = HashMap::new(); let mut paths = Vec::new(); @@ -77,7 +76,12 @@ impl App { } /// Walk directory tree and collect files - fn walk_dir(dir: &Path, base: &Path, files: &mut HashMap, paths: &mut Vec) -> Result<()> { + fn walk_dir( + dir: &Path, + base: &Path, + files: &mut HashMap, + paths: &mut Vec, + ) -> Result<()> { use std::fs; for entry in fs::read_dir(dir)? 
{ @@ -108,7 +112,9 @@ impl App { /// Check if a directory exists pub fn includes_directory(&self, name: &str) -> bool { let dir_prefix = format!("{}/", name); - self.paths.iter().any(|p| p.starts_with(&dir_prefix) || p == name) + self.paths + .iter() + .any(|p| p.starts_with(&dir_prefix) || p == name) } /// Find files matching a glob-like pattern @@ -117,7 +123,8 @@ impl App { let regex_pattern = glob_to_regex(pattern); let re = regex::Regex::new(®ex_pattern)?; - let matches: Vec = self.files + let matches: Vec = self + .files .keys() .filter(|path| re.is_match(path)) .cloned() @@ -132,7 +139,8 @@ impl App { let re = regex::Regex::new(®ex_pattern)?; // Get all unique directory paths - let mut dirs: Vec = self.paths + let mut dirs: Vec = self + .paths .iter() .filter_map(|p| { let path = Path::new(p); @@ -145,10 +153,7 @@ impl App { dirs.sort(); dirs.dedup(); - let matches: Vec = dirs - .into_iter() - .filter(|dir| re.is_match(dir)) - .collect(); + let matches: Vec = dirs.into_iter().filter(|dir| re.is_match(dir)).collect(); Ok(matches) } @@ -226,8 +231,8 @@ impl App { T: DeserializeOwned, { let contents = self.read_file(name)?; - let toml_file = toml::from_str(&contents) - .with_context(|| format!("Error reading {} as TOML", name))?; + let toml_file = + toml::from_str(&contents).with_context(|| format!("Error reading {} as TOML", name))?; Ok(toml_file) } @@ -270,9 +275,7 @@ impl App { /// Helper: Check if package.json has a dependency pub fn has_dependency(&self, dep: &str) -> bool { if let Ok(pkg) = self.read_json::("package.json") { - pkg.get("dependencies") - .and_then(|d| d.get(dep)) - .is_some() + pkg.get("dependencies").and_then(|d| d.get(dep)).is_some() } else { false } @@ -297,9 +300,7 @@ impl App { /// Helper: Check if package.json has a script pub fn has_script(&self, script: &str) -> bool { if let Ok(pkg) = self.read_json::("package.json") { - pkg.get("scripts") - .and_then(|s| s.get(script)) - .is_some() + pkg.get("scripts").and_then(|s| s.get(script)).is_some() } else { false } @@ -374,7 +375,10 @@ mod tests { fn test_from_tree() { let mut files = HashMap::new(); files.insert("package.json".to_string(), r#"{"name":"test"}"#.to_string()); - files.insert("src/index.ts".to_string(), "console.log('hello')".to_string()); + files.insert( + "src/index.ts".to_string(), + "console.log('hello')".to_string(), + ); let app = App::from_tree(PathBuf::from("/test"), files); @@ -439,7 +443,7 @@ mod tests { let mut files = HashMap::new(); files.insert( "package.json".to_string(), - r#"{"name":"test","version":"1.0.0"}"#.to_string() + r#"{"name":"test","version":"1.0.0"}"#.to_string(), ); let app = App::from_tree(PathBuf::from("/test"), files); @@ -458,7 +462,7 @@ mod tests { let mut files = HashMap::new(); files.insert( "package.json".to_string(), - r#"{"name":"test","version":"1.0.0"}"#.to_string() + r#"{"name":"test","version":"1.0.0"}"#.to_string(), ); let app = App::from_tree(PathBuf::from("/test"), files); @@ -481,7 +485,8 @@ mod tests { /* Block comment */ "target": "es2015" } - }"#.to_string() + }"# + .to_string(), ); let app = App::from_tree(PathBuf::from("/test"), files); @@ -509,11 +514,12 @@ mod tests { files.insert( "src/App.tsx".to_string(), r#"import React from 'react'; - function App() { return
<div>Hello</div>
; }"#.to_string() + function App() { return
<div>Hello</div>
; }"# + .to_string(), ); files.insert( "src/index.ts".to_string(), - "console.log('hello')".to_string() + "console.log('hello')".to_string(), ); let app = App::from_tree(PathBuf::from("/test"), files); diff --git a/crates/temps-providers/src/externalsvc/postgres.rs b/crates/temps-providers/src/externalsvc/postgres.rs index 2bddcd55..6edcdbb3 100644 --- a/crates/temps-providers/src/externalsvc/postgres.rs +++ b/crates/temps-providers/src/externalsvc/postgres.rs @@ -1025,6 +1025,15 @@ impl ExternalService for PostgresService { } /// Backup PostgreSQL data to S3 + /// + /// Uses a disposable sidecar container running a version-matched `pg_dump` binary that + /// connects to the target PostgreSQL container over the shared Docker network. This avoids + /// competing for memory with the running PostgreSQL server — the root cause of OOM kills + /// (exit code 137) that occur when `pg_dumpall` is exec'd inside the service container. + /// + /// The sidecar approach also supports TimescaleDB databases: `pg_dump --format=custom` emits + /// circular-FK warnings for TimescaleDB catalog tables (`hypertable`, `chunk`, + /// `continuous_agg`) but these are advisory only and do not affect the dump. async fn backup_to_s3( &self, s3_client: &aws_sdk_s3::Client, @@ -1036,9 +1045,14 @@ impl ExternalService for PostgresService { external_service: &temps_entities::external_services::Model, service_config: ServiceConfig, ) -> anyhow::Result { + use bollard::exec::{CreateExecOptions, StartExecResults}; + use bollard::models::ContainerCreateBody as Config; + use bollard::query_parameters::RemoveContainerOptions; use chrono::Utc; + use flate2::write::GzEncoder; + use flate2::Compression; + use futures::stream::StreamExt as FuturesStreamExt; use sea_orm::*; - use std::io::Write; use tempfile::NamedTempFile; info!("Starting PostgreSQL backup to S3"); @@ -1066,81 +1080,245 @@ impl ExternalService for PostgresService { .insert(pool) .await?; - // Get container name - let container_name = self.get_container_name(); + // The DB container name is also its DNS hostname on the shared Docker network. + let db_container_name = self.get_container_name(); + + // Use the exact same image as the running service container for the sidecar. This + // guarantees that the pg_dump binary version matches the server AND that any required + // extensions (e.g. TimescaleDB) are present. A plain postgres image would lack the + // TimescaleDB extension and fail to dump extension-dependent objects. 
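For reference, the sidecar wiring in isolation: a hedged sketch assembled from the same bollard calls this patch uses below. The image, network, and container names here are placeholders, not values from this codebase:

```rust
use std::collections::HashMap;

use bollard::models::{ContainerCreateBody, EndpointSettings, NetworkingConfig};
use bollard::query_parameters::{CreateContainerOptionsBuilder, StartContainerOptionsBuilder};
use bollard::Docker;

async fn start_pg_dump_sidecar(docker: &Docker) -> Result<(), bollard::errors::Error> {
    let config = ContainerCreateBody {
        // Same image as the running DB service, so the pg_dump binary matches
        // the server version and extensions (e.g. TimescaleDB) are present.
        image: Some("postgres:16".to_string()), // placeholder image
        // The container only sleeps; pg_dump runs via exec so stdout can be
        // streamed without buffering.
        cmd: Some(vec!["sleep".to_string(), "300".to_string()]),
        // Joining the shared network lets the sidecar resolve the DB
        // container by its container name.
        networking_config: Some(NetworkingConfig {
            endpoints_config: Some(HashMap::from([(
                "temps-network".to_string(), // placeholder network name
                EndpointSettings::default(),
            )])),
        }),
        ..Default::default()
    };

    docker
        .create_container(
            Some(
                CreateContainerOptionsBuilder::new()
                    .name("temps-pg-backup-sidecar")
                    .build(),
            ),
            config,
        )
        .await?;
    docker
        .start_container(
            "temps-pg-backup-sidecar",
            Some(StartContainerOptionsBuilder::new().build()),
        )
        .await?;
    Ok(())
}
```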
+ let sidecar_image = postgres_config.docker_image.clone(); - // Create a temporary file for the backup - let mut temp_file = tempfile::NamedTempFile::new()?; + // Pull the sidecar image (no-op if already present) + info!("Pulling sidecar image {} for pg_dump", sidecar_image); + let (image_name, image_tag) = sidecar_image + .split_once(':') + .map(|(n, t)| (n.to_string(), t.to_string())) + .unwrap_or_else(|| (sidecar_image.clone(), "latest".to_string())); - // Execute pg_dumpall inside the container with actual credentials from config - // URL-decode password (it's stored URL-encoded in database for connection strings) + self.docker + .create_image( + Some(bollard::query_parameters::CreateImageOptions { + from_image: Some(image_name), + tag: Some(image_tag), + ..Default::default() + }), + None, + None, + ) + .try_collect::>() + .await + .map_err(|e| { + anyhow::anyhow!( + "Failed to pull pg_dump sidecar image {}: {}", + sidecar_image, + e + ) + })?; + + // Spin up a short-lived sidecar container attached to the same Docker network so it + // can reach the DB container by hostname. The container just sleeps; pg_dump is run + // via exec so we can stream stdout directly without buffering in memory. + let sidecar_name = format!("temps-pg-backup-{}", uuid::Uuid::new_v4()); let password_env = format!("PGPASSWORD={}", postgres_config.password); + + let sidecar_config = Config { + image: Some(sidecar_image.clone()), + cmd: Some(vec!["sleep".to_string(), "300".to_string()]), + env: Some(vec![password_env.clone()]), + ..Default::default() + }; + + let networking_config = Some(bollard::models::NetworkingConfig { + endpoints_config: Some(std::collections::HashMap::from([( + temps_core::NETWORK_NAME.to_string(), + bollard::models::EndpointSettings { + ..Default::default() + }, + )])), + }); + + let sidecar_config = Config { + networking_config, + ..sidecar_config + }; + + self.docker + .create_container( + Some( + bollard::query_parameters::CreateContainerOptionsBuilder::new() + .name(&sidecar_name) + .build(), + ), + sidecar_config, + ) + .await + .map_err(|e| anyhow::anyhow!("Failed to create pg_dump sidecar container: {}", e))?; + + self.docker + .start_container( + &sidecar_name, + Some(bollard::query_parameters::StartContainerOptionsBuilder::new().build()), + ) + .await + .map_err(|e| anyhow::anyhow!("Failed to start pg_dump sidecar container: {}", e))?; + + // Helper to remove the sidecar on any error path + let remove_sidecar = |docker: std::sync::Arc, name: String| async move { + let _ = docker + .remove_container( + &name, + Some(RemoveContainerOptions { + force: true, + ..Default::default() + }), + ) + .await; + }; + + // pg_dump connects to the DB container over the Docker network using its container name + // as the hostname. --format=custom produces a binary dump suitable for pg_restore. + // --compress=0 disables pg_dump's internal compression; we apply gzip ourselves while + // streaming to avoid buffering the entire dump in memory (prevents OOM / exit code 137). 
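The streaming half on its own: a minimal sketch (flate2 plus tempfile, with synthetic chunks standing in for the exec output stream) of why the encoder is fed incrementally and finished before the file size is read:

```rust
use std::io::Write;

use flate2::{write::GzEncoder, Compression};
use tempfile::NamedTempFile;

fn main() -> std::io::Result<()> {
    let mut temp_file = NamedTempFile::new()?;
    let mut encoder = GzEncoder::new(&mut temp_file, Compression::default());

    // Stand-in for the Docker exec stdout stream: each chunk is compressed as
    // it arrives, so peak memory is one chunk rather than the whole dump.
    for chunk in [
        b"chunk-1 ".as_slice(),
        b"chunk-2 ".as_slice(),
        b"chunk-3".as_slice(),
    ] {
        encoder.write_all(chunk)?;
    }

    // finish() writes the gzip trailer; the on-disk size is only meaningful
    // after this point.
    encoder.finish()?;
    println!("compressed: {} bytes", temp_file.as_file().metadata()?.len());
    Ok(())
}
```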
+ let port_str = POSTGRES_INTERNAL_PORT.to_string(); + let pg_dump_cmd = vec![ + "pg_dump", + "--format=custom", + "--compress=0", + "--no-password", + "--host", + &db_container_name, + "--port", + &port_str, + "--username", + &postgres_config.username, + "--dbname", + &postgres_config.database, + ]; + + info!( + "Running pg_dump sidecar for service '{}' (host={})", + self.name, db_container_name + ); + let exec = self .docker .create_exec( - &container_name, - bollard::exec::CreateExecOptions { - cmd: Some(vec![ - "pg_dumpall", - "-U", - &postgres_config.username, - "-w", - "--clean", - "--if-exists", - ]), - env: Some(vec![password_env.as_str()]), + &sidecar_name, + CreateExecOptions { + cmd: Some(pg_dump_cmd), attach_stdout: Some(true), attach_stderr: Some(true), + env: Some(vec![password_env.as_str()]), ..Default::default() }, ) - .await?; + .await + .map_err(|e| anyhow::anyhow!("Failed to create pg_dump exec: {}", e))?; - let output: bollard::exec::StartExecResults = - self.docker.start_exec(&exec.id, None).await?; - if let bollard::exec::StartExecResults::Attached { output, .. } = output { - let mut stream = output.boxed(); - while let Some(result) = stream.next().await { - match result { - Ok(log_output) => match log_output { - bollard::container::LogOutput::StdOut { message } - | bollard::container::LogOutput::StdErr { message } => { - temp_file.write_all(&message)?; + // Stream pg_dump stdout directly into a gzip encoder writing to a temp file. + // This avoids buffering the entire dump in memory, preventing OOM kills. + let mut temp_file = NamedTempFile::new()?; + let mut encoder = GzEncoder::new(&mut temp_file, Compression::default()); + let mut stderr_data = Vec::new(); + + if let StartExecResults::Attached { mut output, .. } = + self.docker.start_exec(&exec.id, None).await? 
+ { + while let Some(chunk) = FuturesStreamExt::next(&mut output).await { + match chunk { + Ok(bollard::container::LogOutput::StdOut { message }) => { + if let Err(e) = std::io::Write::write_all(&mut encoder, &message) { + remove_sidecar(self.docker.clone(), sidecar_name.clone()).await; + let mut backup_update: external_service_backups::ActiveModel = + backup_record.clone().into(); + backup_update.state = Set("failed".to_string()); + backup_update.error_message = Set(Some(e.to_string())); + backup_update.finished_at = Set(Some(Utc::now())); + backup_update.update(pool).await?; + return Err(anyhow::anyhow!("Failed to write pg_dump output: {}", e)); } - _ => (), - }, + } + Ok(bollard::container::LogOutput::StdErr { message }) => { + stderr_data.extend_from_slice(&message); + } + Ok(bollard::container::LogOutput::Console { message }) => { + if let Err(e) = std::io::Write::write_all(&mut encoder, &message) { + remove_sidecar(self.docker.clone(), sidecar_name.clone()).await; + let mut backup_update: external_service_backups::ActiveModel = + backup_record.clone().into(); + backup_update.state = Set("failed".to_string()); + backup_update.error_message = Set(Some(e.to_string())); + backup_update.finished_at = Set(Some(Utc::now())); + backup_update.update(pool).await?; + return Err(anyhow::anyhow!("Failed to write pg_dump output: {}", e)); + } + } Err(e) => { - error!("Error streaming backup data: {}", e); - // Update backup record with error + remove_sidecar(self.docker.clone(), sidecar_name.clone()).await; let mut backup_update: external_service_backups::ActiveModel = backup_record.clone().into(); backup_update.state = Set("failed".to_string()); backup_update.error_message = Set(Some(e.to_string())); backup_update.finished_at = Set(Some(Utc::now())); backup_update.update(pool).await?; - return Err(anyhow::anyhow!("Failed to stream backup data: {}", e)); + return Err(anyhow::anyhow!("Error reading pg_dump output: {}", e)); } + _ => {} } } + } else { + remove_sidecar(self.docker.clone(), sidecar_name.clone()).await; + let mut backup_update: external_service_backups::ActiveModel = + backup_record.clone().into(); + backup_update.state = Set("failed".to_string()); + backup_update.error_message = Set(Some("Unexpected exec result type".to_string())); + backup_update.finished_at = Set(Some(Utc::now())); + backup_update.update(pool).await?; + return Err(anyhow::anyhow!("Unexpected exec result type")); + } + + // Check pg_dump exit code. Warnings from TimescaleDB (circular FK constraints on + // hypertable / chunk / continuous_agg) are printed to stderr but do not cause a + // non-zero exit code, so they are logged but do not fail the backup. + let exec_inspect = self.docker.inspect_exec(&exec.id).await?; + if let Some(exit_code) = exec_inspect.exit_code { + if exit_code != 0 { + let stderr = String::from_utf8_lossy(&stderr_data); + remove_sidecar(self.docker.clone(), sidecar_name.clone()).await; + let error_msg = format!("pg_dump failed with exit code {}: {}", exit_code, stderr); + let mut backup_update: external_service_backups::ActiveModel = + backup_record.clone().into(); + backup_update.state = Set("failed".to_string()); + backup_update.error_message = Set(Some(error_msg.clone())); + backup_update.finished_at = Set(Some(Utc::now())); + backup_update.update(pool).await?; + return Err(anyhow::anyhow!("{}", error_msg)); + } + } + + // Log any stderr warnings (e.g. 
TimescaleDB circular FK hints) without failing + if !stderr_data.is_empty() { + let stderr = String::from_utf8_lossy(&stderr_data); + tracing::debug!("pg_dump stderr for service '{}': {}", self.name, stderr); } + // Finalize the gzip stream before reading the file size + encoder.finish()?; + + // Remove the sidecar now that pg_dump has finished + remove_sidecar(self.docker.clone(), sidecar_name).await; + // Generate backup path in S3 let timestamp = Utc::now().format("%Y%m%d_%H%M%S"); let backup_key = format!( - "{}/postgres_backup_{}.sql", + "{}/postgres_backup_{}.pgdump.gz", subpath.trim_matches('/'), timestamp ); - // Compress the backup - let mut compressed_file = NamedTempFile::new()?; - let mut encoder = - flate2::write::GzEncoder::new(&mut compressed_file, flate2::Compression::default()); - std::io::copy(&mut std::fs::File::open(temp_file.path())?, &mut encoder)?; - encoder.finish()?; - // Get file size after compression - let size_bytes = compressed_file.as_file().metadata()?.len() as i32; + let size_bytes = temp_file.as_file().metadata()?.len() as i32; // Validate backup size - a zero-size backup indicates failure if size_bytes == 0 { @@ -1160,7 +1338,7 @@ impl ExternalService for PostgresService { .put_object() .bucket(&s3_source.bucket_name) .key(&backup_key) - .body(aws_sdk_s3::primitives::ByteStream::from_path(compressed_file.path()).await?) + .body(aws_sdk_s3::primitives::ByteStream::from_path(temp_file.path()).await?) .content_type("application/x-gzip") .send() .await @@ -1920,7 +2098,6 @@ mod tests { #[cfg(feature = "docker-tests")] #[tokio::test] - #[ignore] // Requires Docker async fn test_port_change_after_creation() { let docker = Arc::new(Docker::connect_with_local_defaults().unwrap()); let service = PostgresService::new("test-port-change".to_string(), docker); diff --git a/crates/temps-providers/src/externalsvc/redis.rs b/crates/temps-providers/src/externalsvc/redis.rs index 6bda80e8..a7dab3a2 100644 --- a/crates/temps-providers/src/externalsvc/redis.rs +++ b/crates/temps-providers/src/externalsvc/redis.rs @@ -1421,7 +1421,6 @@ mod tests { #[cfg(feature = "docker-tests")] #[tokio::test] - #[ignore] // Requires Docker async fn test_port_change_after_creation() { let docker = Arc::new(Docker::connect_with_local_defaults().unwrap()); let service = RedisService::new("test-port-change".to_string(), docker); diff --git a/crates/temps-providers/src/services.rs b/crates/temps-providers/src/services.rs index 6c5ecc13..a6c17218 100644 --- a/crates/temps-providers/src/services.rs +++ b/crates/temps-providers/src/services.rs @@ -2298,7 +2298,6 @@ mod tests { #[cfg(feature = "docker-tests")] #[tokio::test] - #[ignore] // TODO: Implement service stop/start functionality async fn test_stop_and_start_service() { let (manager, _test_db) = setup_test_manager().await; let random_unused_port = get_unused_port(); @@ -2339,7 +2338,6 @@ mod tests { #[cfg(feature = "docker-tests")] #[tokio::test] - #[ignore] // TODO: Implement service deletion functionality async fn test_delete_service() { let (manager, _test_db) = setup_test_manager().await; @@ -2375,7 +2373,6 @@ mod tests { #[cfg(feature = "docker-tests")] #[tokio::test] - #[ignore] // TODO: Implement service parameter update functionality async fn test_update_service_parameters() { let (manager, _test_db) = setup_test_manager().await; @@ -2435,7 +2432,6 @@ mod tests { #[cfg(feature = "docker-tests")] #[tokio::test] - #[ignore] // TODO: Implement get service by name functionality async fn test_get_service_by_name() { let (manager, 
_test_db) = setup_test_manager().await;
@@ -2467,7 +2463,6 @@ mod tests {
     #[cfg(feature = "docker-tests")]
     #[tokio::test]
-    #[ignore] // TODO: Implement get service by slug functionality
     async fn test_get_service_by_slug() {
         let (manager, _test_db) = setup_test_manager().await;
 
@@ -2544,7 +2539,6 @@ mod tests {
     #[cfg(feature = "docker-tests")]
     #[tokio::test]
-    #[ignore] // TODO: Implement get_service_environment_variables functionality
     async fn test_service_environment_variables() {
         let (manager, _test_db) = setup_test_manager().await;
         let random_unused_port = get_unused_port();
@@ -2678,7 +2672,6 @@ mod tests {
     #[cfg(feature = "docker-tests")]
     #[tokio::test]
-    #[ignore] // FIXME: Parameter validation not implemented - code auto-generates missing parameters (port, password)
     async fn test_validate_parameters_fails_with_missing_required() {
         let (manager, _test_db) = setup_test_manager().await;
 
@@ -2910,7 +2903,6 @@ mod tests {
     #[cfg(feature = "docker-tests")]
     #[tokio::test]
-    #[ignore] // TODO: Implement masked environment variables functionality
     async fn test_masked_environment_variables() {
         let (manager, _test_db) = setup_test_manager().await;
         // Find a random unused port on the system
diff --git a/crates/temps-proxy/src/handler/ip_access_control.rs b/crates/temps-proxy/src/handler/ip_access_control.rs
index fcd6a63e..3c074323 100644
--- a/crates/temps-proxy/src/handler/ip_access_control.rs
+++ b/crates/temps-proxy/src/handler/ip_access_control.rs
@@ -6,13 +6,42 @@ use axum::{
 };
 use serde::Deserialize;
 use std::sync::Arc;
+use temps_auth::permission_guard;
+use temps_auth::RequireAuth;
+use temps_core::problemdetails::{self, Problem, ProblemDetails};
 use utoipa::{IntoParams, ToSchema};
 
 use crate::service::ip_access_control_service::{
-    CreateIpAccessControlRequest, IpAccessControlResponse, IpAccessControlService,
-    UpdateIpAccessControlRequest,
+    CreateIpAccessControlRequest, IpAccessControlError, IpAccessControlResponse,
+    IpAccessControlService, UpdateIpAccessControlRequest,
 };
 
+impl From<IpAccessControlError> for Problem {
+    fn from(error: IpAccessControlError) -> Self {
+        match error {
+            IpAccessControlError::NotFound(_) => problemdetails::new(StatusCode::NOT_FOUND)
+                .with_title("IP Access Control Rule Not Found")
+                .with_detail(error.to_string()),
+
+            IpAccessControlError::InvalidIpAddress(_) => {
+                problemdetails::new(StatusCode::BAD_REQUEST)
+                    .with_title("Invalid IP Address")
+                    .with_detail(error.to_string())
+            }
+
+            IpAccessControlError::DuplicateIp(_) => problemdetails::new(StatusCode::CONFLICT)
+                .with_title("Duplicate IP Address")
+                .with_detail(error.to_string()),
+
+            IpAccessControlError::Database(_) | IpAccessControlError::Internal(_) => {
+                problemdetails::new(StatusCode::INTERNAL_SERVER_ERROR)
+                    .with_title("Internal Server Error")
+                    .with_detail(error.to_string())
+            }
+        }
+    }
+}
+
 /// Query parameters for listing IP access control rules
 #[derive(Debug, Deserialize, IntoParams, ToSchema)]
 pub struct IpAccessControlQuery {
@@ -27,18 +56,21 @@ pub struct IpAccessControlQuery {
     params(IpAccessControlQuery),
     responses(
         (status = 200, description = "List of IP access control rules", body = Vec<IpAccessControlResponse>),
-        (status = 500, description = "Internal server error")
+        (status = 401, description = "Unauthorized", body = ProblemDetails),
+        (status = 403, description = "Insufficient permissions", body = ProblemDetails),
+        (status = 500, description = "Internal server error", body = ProblemDetails)
     ),
+    security(("bearer_auth" = [])),
     tag = "IP Access Control"
 )]
 pub async fn list_ip_access_control(
+    RequireAuth(auth): RequireAuth,
     State(service): State<Arc<IpAccessControlService>>,
     Query(query): Query<IpAccessControlQuery>,
-) -> Result<impl IntoResponse, (StatusCode, String)> {
-    let rules = service
-        .list(query.action)
-        .await
-        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e.to_string()))?;
+) -> Result<impl IntoResponse, Problem> {
+    permission_guard!(auth, LoadBalancerRead);
+
+    let rules = service.list(query.action).await.map_err(Problem::from)?;
 
     let responses: Vec<IpAccessControlResponse> = rules
         .into_iter()
@@ -57,22 +89,22 @@ pub async fn list_ip_access_control(
     ),
     responses(
         (status = 200, description = "IP access control rule details", body = IpAccessControlResponse),
-        (status = 404, description = "IP access control rule not found"),
-        (status = 500, description = "Internal server error")
+        (status = 401, description = "Unauthorized", body = ProblemDetails),
+        (status = 403, description = "Insufficient permissions", body = ProblemDetails),
+        (status = 404, description = "IP access control rule not found", body = ProblemDetails),
+        (status = 500, description = "Internal server error", body = ProblemDetails)
     ),
+    security(("bearer_auth" = [])),
     tag = "IP Access Control"
 )]
 pub async fn get_ip_access_control(
+    RequireAuth(auth): RequireAuth,
     State(service): State<Arc<IpAccessControlService>>,
     Path(id): Path<i32>,
-) -> Result<impl IntoResponse, (StatusCode, String)> {
-    let rule = service.get_by_id(id).await.map_err(|e| match e {
-        crate::service::ip_access_control_service::IpAccessControlError::NotFound(_) => (
-            StatusCode::NOT_FOUND,
-            "IP access control rule not found".to_string(),
-        ),
-        _ => (StatusCode::INTERNAL_SERVER_ERROR, e.to_string()),
-    })?;
+) -> Result<impl IntoResponse, Problem> {
+    permission_guard!(auth, LoadBalancerRead);
+
+    let rule = service.get_by_id(id).await.map_err(Problem::from)?;
 
     Ok(Json(IpAccessControlResponse::from(rule)))
 }
@@ -84,31 +116,28 @@ pub async fn get_ip_access_control(
     request_body = CreateIpAccessControlRequest,
     responses(
         (status = 201, description = "IP access control rule created", body = IpAccessControlResponse),
-        (status = 400, description = "Invalid request"),
-        (status = 409, description = "Duplicate IP address"),
-        (status = 500, description = "Internal server error")
+        (status = 400, description = "Invalid request", body = ProblemDetails),
+        (status = 401, description = "Unauthorized", body = ProblemDetails),
+        (status = 403, description = "Insufficient permissions", body = ProblemDetails),
+        (status = 409, description = "Duplicate IP address", body = ProblemDetails),
+        (status = 500, description = "Internal server error", body = ProblemDetails)
     ),
+    security(("bearer_auth" = [])),
     tag = "IP Access Control"
 )]
 pub async fn create_ip_access_control(
+    RequireAuth(auth): RequireAuth,
     State(service): State<Arc<IpAccessControlService>>,
     Json(request): Json<CreateIpAccessControlRequest>,
-) -> Result<impl IntoResponse, (StatusCode, String)> {
-    // TODO: Get user_id from authentication context
-    let created_by = None;
+) -> Result<impl IntoResponse, Problem> {
+    permission_guard!(auth, LoadBalancerWrite);
+
+    let created_by = Some(auth.user_id());
 
     let rule = service
         .create(request, created_by)
         .await
-        .map_err(|e| match e {
-            crate::service::ip_access_control_service::IpAccessControlError::InvalidIpAddress(
-                _,
-            ) => (StatusCode::BAD_REQUEST, e.to_string()),
-            crate::service::ip_access_control_service::IpAccessControlError::DuplicateIp(_) => {
-                (StatusCode::CONFLICT, e.to_string())
-            }
-            _ => (StatusCode::INTERNAL_SERVER_ERROR, e.to_string()),
-        })?;
+        .map_err(Problem::from)?;
 
     Ok((
         StatusCode::CREATED,
@@ -126,27 +155,24 @@ pub async fn create_ip_access_control(
     request_body = UpdateIpAccessControlRequest,
     responses(
         (status = 200, description = "IP access control rule updated", body = IpAccessControlResponse),
-        (status = 400, description = "Invalid request"),
-        (status = 404, description = "IP access control rule not found"),
-        (status = 500, description = "Internal server error")
+        (status = 400, description = "Invalid request", body = ProblemDetails),
+        (status = 401, description = "Unauthorized", body = ProblemDetails),
+        (status = 403, description = "Insufficient permissions", body = ProblemDetails),
+        (status = 404, description = "IP access control rule not found", body = ProblemDetails),
+        (status = 500, description = "Internal server error", body = ProblemDetails)
     ),
+    security(("bearer_auth" = [])),
     tag = "IP Access Control"
 )]
 pub async fn update_ip_access_control(
+    RequireAuth(auth): RequireAuth,
     State(service): State<Arc<IpAccessControlService>>,
     Path(id): Path<i32>,
     Json(request): Json<UpdateIpAccessControlRequest>,
-) -> Result<impl IntoResponse, (StatusCode, String)> {
-    let rule = service.update(id, request).await.map_err(|e| match e {
-        crate::service::ip_access_control_service::IpAccessControlError::NotFound(_) => (
-            StatusCode::NOT_FOUND,
-            "IP access control rule not found".to_string(),
-        ),
-        crate::service::ip_access_control_service::IpAccessControlError::InvalidIpAddress(_) => {
-            (StatusCode::BAD_REQUEST, e.to_string())
-        }
-        _ => (StatusCode::INTERNAL_SERVER_ERROR, e.to_string()),
-    })?;
+) -> Result<impl IntoResponse, Problem> {
+    permission_guard!(auth, LoadBalancerWrite);
+
+    let rule = service.update(id, request).await.map_err(Problem::from)?;
 
     Ok(Json(IpAccessControlResponse::from(rule)))
 }
@@ -160,22 +186,22 @@ pub async fn update_ip_access_control(
     ),
     responses(
         (status = 204, description = "IP access control rule deleted"),
-        (status = 404, description = "IP access control rule not found"),
-        (status = 500, description = "Internal server error")
+        (status = 401, description = "Unauthorized", body = ProblemDetails),
+        (status = 403, description = "Insufficient permissions", body = ProblemDetails),
+        (status = 404, description = "IP access control rule not found", body = ProblemDetails),
+        (status = 500, description = "Internal server error", body = ProblemDetails)
     ),
+    security(("bearer_auth" = [])),
     tag = "IP Access Control"
 )]
 pub async fn delete_ip_access_control(
+    RequireAuth(auth): RequireAuth,
     State(service): State<Arc<IpAccessControlService>>,
     Path(id): Path<i32>,
-) -> Result<impl IntoResponse, (StatusCode, String)> {
-    service.delete(id).await.map_err(|e| match e {
-        crate::service::ip_access_control_service::IpAccessControlError::NotFound(_) => (
-            StatusCode::NOT_FOUND,
-            "IP access control rule not found".to_string(),
-        ),
-        _ => (StatusCode::INTERNAL_SERVER_ERROR, e.to_string()),
-    })?;
+) -> Result<impl IntoResponse, Problem> {
+    permission_guard!(auth, LoadBalancerWrite);
+
+    service.delete(id).await.map_err(Problem::from)?;
 
     Ok(StatusCode::NO_CONTENT)
 }
@@ -189,18 +215,21 @@ pub async fn delete_ip_access_control(
     ),
     responses(
         (status = 200, description = "IP block status"),
-        (status = 500, description = "Internal server error")
+        (status = 401, description = "Unauthorized", body = ProblemDetails),
+        (status = 403, description = "Insufficient permissions", body = ProblemDetails),
+        (status = 500, description = "Internal server error", body = ProblemDetails)
     ),
+    security(("bearer_auth" = [])),
     tag = "IP Access Control"
 )]
 pub async fn check_ip_blocked(
+    RequireAuth(auth): RequireAuth,
     State(service): State<Arc<IpAccessControlService>>,
     Path(ip): Path<String>,
-) -> Result<impl IntoResponse, (StatusCode, String)> {
-    let is_blocked = service
-        .is_blocked(&ip)
-        .await
-        .map_err(|e| (StatusCode::INTERNAL_SERVER_ERROR, e.to_string()))?;
+) -> Result<impl IntoResponse, Problem> {
+    permission_guard!(auth, LoadBalancerRead);
+
+    let is_blocked = service.is_blocked(&ip).await.map_err(Problem::from)?;
 
     Ok(Json(serde_json::json!({
         "ip": ip,
diff --git a/crates/temps-vulnerability-scanner/tests/integration_test.rs
b/crates/temps-vulnerability-scanner/tests/integration_test.rs index 692f0268..c69855fc 100644 --- a/crates/temps-vulnerability-scanner/tests/integration_test.rs +++ b/crates/temps-vulnerability-scanner/tests/integration_test.rs @@ -10,8 +10,11 @@ fn get_fixture_path(fixture_name: &str) -> PathBuf { } #[tokio::test] -#[ignore] // Requires Docker to be running async fn test_trivy_scanner_creation() { + if bollard::Docker::connect_with_local_defaults().is_err() { + println!("Docker not available, skipping test"); + return; + } let scanner = TrivyScanner::new(); assert!(scanner.is_ok()); @@ -20,8 +23,11 @@ async fn test_trivy_scanner_creation() { } #[tokio::test] -#[ignore] // Requires Docker to be running async fn test_trivy_scanner_version() { + if bollard::Docker::connect_with_local_defaults().is_err() { + println!("Docker not available, skipping test"); + return; + } let scanner = TrivyScanner::new().unwrap(); let version = scanner.version().await; @@ -34,7 +40,6 @@ async fn test_trivy_scanner_version() { } #[tokio::test] -#[ignore] // Requires Docker to be running async fn test_scan_vulnerable_node_project() { // Check if Docker is available if !bollard::Docker::connect_with_local_defaults() @@ -97,7 +102,6 @@ async fn test_scan_vulnerable_node_project() { } #[tokio::test] -#[ignore] // Requires Docker to be running async fn test_scan_vulnerable_python_project() { // Check if Docker is available if !bollard::Docker::connect_with_local_defaults() @@ -158,7 +162,6 @@ async fn test_scan_vulnerable_python_project() { } #[tokio::test] -#[ignore] // Requires Docker to be running async fn test_scan_empty_directory() { // Check if Docker is available if !bollard::Docker::connect_with_local_defaults() @@ -190,7 +193,6 @@ async fn test_scan_empty_directory() { } #[tokio::test] -#[ignore] // Requires Docker to be running async fn test_scanner_availability() { // Check if Docker is available if !bollard::Docker::connect_with_local_defaults() diff --git a/web/package.json b/web/package.json index f5c32ede..d2b2570f 100644 --- a/web/package.json +++ b/web/package.json @@ -72,7 +72,6 @@ "react-dom": "^19.1.1", "react-hook-form": "^7.66.1", "react-icons": "^5.5.0", - "react-query": "^3.39.3", "react-resizable-panels": "^3.0.6", "react-router-dom": "^7.9.1", "react-simple-maps": "^3.0.0", diff --git a/web/src/pages/CreateService.tsx b/web/src/pages/CreateService.tsx deleted file mode 100644 index 4a1cb48d..00000000 --- a/web/src/pages/CreateService.tsx +++ /dev/null @@ -1,555 +0,0 @@ -import { - createServiceMutation, - getProviderMetadataOptions, - getServiceTypeParametersOptions, -} from '@/api/client/@tanstack/react-query.gen' -import { CreateServiceResponse, ServiceTypeRoute } from '@/api/client/types.gen' -import { Button } from '@/components/ui/button' -import { - Form, - FormControl, - FormDescription, - FormField, - FormItem, - FormLabel, - FormMessage, -} from '@/components/ui/form' -import { Input } from '@/components/ui/input' -import { - Select, - SelectContent, - SelectItem, - SelectTrigger, - SelectValue, -} from '@/components/ui/select' -import { useBreadcrumbs } from '@/contexts/BreadcrumbContext' -import { zodResolver } from '@hookform/resolvers/zod' -import { useMutation, useQuery } from '@tanstack/react-query' -import { ArrowLeft, Loader2 } from 'lucide-react' -import { customAlphabet } from 'nanoid' -import { useEffect, useMemo } from 'react' -import { useForm } from 'react-hook-form' -import { Link, useNavigate, useSearchParams } from 'react-router-dom' -import { toast } from 
'sonner' -import * as z from 'zod' - -// Create a custom nanoid with lowercase alphanumeric characters -const generateId = customAlphabet('0123456789abcdefghijklmnopqrstuvwxyz', 4) - -export function CreateService() { - const navigate = useNavigate() - const [searchParams] = useSearchParams() - const serviceType = searchParams.get('type') as ServiceTypeRoute | null - const { setBreadcrumbs } = useBreadcrumbs() - - useEffect(() => { - setBreadcrumbs([ - { label: 'Storage', href: '/storage' }, - { label: 'Create Service', href: '/storage/create' }, - ]) - }, [setBreadcrumbs]) - - // Fetch provider metadata for display - const { data: providerMetadata } = useQuery({ - ...getProviderMetadataOptions({ - path: { - service_type: serviceType || '', - }, - }), - enabled: !!serviceType, - }) - - const defaultName = useMemo( - () => (serviceType ? `${serviceType}-${generateId()}` : ''), - [serviceType] - ) - - // Fetch parameters for the selected service type - const { data: parametersResponse, isLoading: isLoadingParameters } = useQuery( - { - ...getServiceTypeParametersOptions({ - path: { - service_type: serviceType || '', - }, - }), - enabled: !!serviceType, - } - ) - - // Convert JSON schema to parameters array if needed - const parameters = useMemo(() => { - if (!parametersResponse) return undefined - - // If it's already an array, return it - if (Array.isArray(parametersResponse)) return parametersResponse - - // If it's a JSON schema with 'properties', convert to parameter array - if ( - typeof parametersResponse === 'object' && - parametersResponse !== null && - 'properties' in parametersResponse - ) { - const schema = parametersResponse as { - properties: Record - required?: string[] - } - - return Object.entries(schema.properties).map(([key, prop]) => ({ - name: key, - description: prop.description || '', - default_value: - prop.default !== undefined && prop.default !== null - ? String(prop.default) - : '', - required: schema.required?.includes(key) || false, - encrypted: - key.toLowerCase().includes('password') || - key.toLowerCase().includes('secret'), - validation_pattern: prop.pattern || undefined, - type: - prop.type === 'integer' || - prop.format === 'uint32' || - prop.format === 'int32' - ? 
'number' - : 'string', - choices: prop.enum || undefined, - })) - } - - return undefined - }, [parametersResponse]) - - // Dynamically create the form schema based on parameters - const formSchema = useMemo(() => { - // Build dynamic parameter schema based on loaded parameters - const paramSchema: Record< - string, - z.ZodString | z.ZodOptional - > = {} - - if (parameters && Array.isArray(parameters)) { - parameters.forEach((param) => { - if (param && typeof param === 'object' && 'name' in param) { - const paramName = param.name as string - const isRequired = (param as { required?: boolean }).required || false - const validationPattern = (param as { validation_pattern?: string }) - .validation_pattern - - // Start with base string validation - let fieldSchema = z.string() - - // Add pattern validation if provided - if (validationPattern) { - fieldSchema = fieldSchema.regex( - new RegExp(validationPattern), - `Invalid format for ${paramName}` - ) - } - - // Make required or optional - if (isRequired) { - paramSchema[paramName] = fieldSchema.min( - 1, - `${paramName} is required` - ) - } else { - paramSchema[paramName] = fieldSchema.optional() - } - } - }) - } - - return z.object({ - name: z - .string() - .min(1, 'Service name is required') - .regex( - /^[a-z0-9-]+$/, - 'Name must contain only lowercase letters, numbers, and hyphens' - ), - service_type: z.string(), - parameters: z.object(paramSchema), - }) - }, [parameters]) - - type FormValues = z.infer - - const form = useForm({ - resolver: zodResolver(formSchema), - mode: 'onChange', // Validate on change for immediate feedback - reValidateMode: 'onChange', // Revalidate on every change - defaultValues: { - name: defaultName, - service_type: serviceType || '', - parameters: {}, - }, - }) - - // Set default values for parameters when they are loaded - useEffect(() => { - if (parameters) { - const defaultParameters = parameters.reduce>( - (acc, param) => { - // Convert "null" string or empty to empty string - acc[param.name] = - param.default_value && param.default_value !== 'null' - ? param.default_value - : '' - return acc - }, - {} - ) - form.setValue('parameters', defaultParameters) - } - }, [parameters, form]) - - const createServiceMut = useMutation({ - ...createServiceMutation(), - meta: { - errorTitle: 'Failed to create service', - }, - onSuccess: (data: CreateServiceResponse) => { - toast.success('Service created successfully') - navigate(`/storage/${data.id}`) - }, - }) - - const onSubmit = async (values: FormValues) => { - // Convert numeric parameters from strings to numbers - const processedParameters: Record = {} - - if (parameters && Array.isArray(parameters)) { - for (const param of parameters) { - const value = values.parameters[param.name] - - // For password/encrypted fields, always send empty string even if empty - if (param.encrypted) { - processedParameters[param.name] = value || '' - } else if (value !== undefined && value !== '' && value !== null) { - // Convert to number if the parameter type is 'number' - if (param.type === 'number') { - processedParameters[param.name] = Number(value) - } else { - processedParameters[param.name] = value - } - } - } - } else { - // Fallback if parameters is not an array - Object.assign(processedParameters, values.parameters) - } - - await createServiceMut.mutateAsync({ - body: { - service_type: values.service_type as ServiceTypeRoute, - name: values.name, - parameters: processedParameters, - }, - }) - } - - if (!serviceType) { - return ( -
-
-
-

Create Service

-

- Please select a service type from the URL parameter. -

-
- - - -
-
- ) - } - - if (isLoadingParameters) { - return ( -
-
-
-
-
- {[...Array(5)].map((_, i) => ( -
-
-
-
- ))} -
-
-
-
- ) - } - - return ( -
-
- {/* Header with provider info */} -
- - - - - {providerMetadata && ( -
-
- {`${providerMetadata.display_name} -
-
-

- Create {providerMetadata.display_name} Service -

-

- {providerMetadata.description} -

-
-
- )} -
- - {/* Form */} -
- - {/* Service Name */} - ( - - Service Name - - - - - A unique name to identify this service - - - - )} - /> - - {/* Dynamic Parameters */} - {parameters?.map((param, index: number) => { - // Check if this parameter should be grouped with the next one - const isHost = param.name === 'host' - const isUsername = param.name === 'username' - const nextParam = parameters[index + 1] - const shouldGroup = - (isHost && nextParam?.name === 'port') || - (isUsername && nextParam?.name === 'password') - - // Skip rendering if this is 'port' or 'password' (they'll be rendered with their pair) - if (param.name === 'port' || param.name === 'password') { - return null - } - - if (shouldGroup && nextParam) { - // Render paired fields (host/port or username/password) - return ( -
- ( - - - {param.name.charAt(0).toUpperCase() + - param.name.slice(1)} - {param.required && ( - * - )} - - - - - {param.description && ( - - {param.description} - - )} - - - )} - /> - ( - - - {nextParam.name.charAt(0).toUpperCase() + - nextParam.name.slice(1)} - {nextParam.required && ( - * - )} - - - - - {nextParam.description && ( - - {nextParam.description} - - )} - - - )} - /> -
- ) - } - - // Render single field - return ( - ( - - - {param.name.charAt(0).toUpperCase() + - param.name.slice(1)} - {param.required && ( - * - )} - - - {param.choices && param.choices.length > 0 ? ( - // Render Select for fields with choices - - ) : ( - // Render Input for fields without choices - - )} - - {param.description && ( - {param.description} - )} - - - )} - /> - ) - })} - - {/* Action Buttons */} -
- - -
- - -
-
- ) -} diff --git a/web/src/pages/CreateServiceRefactored.tsx b/web/src/pages/CreateServiceRefactored.tsx deleted file mode 100644 index 248c5e16..00000000 --- a/web/src/pages/CreateServiceRefactored.tsx +++ /dev/null @@ -1,199 +0,0 @@ -import { - createServiceMutation, - getProviderMetadataOptions, - getServiceTypeParametersOptions, -} from '@/api/client/@tanstack/react-query.gen' -import { ServiceTypeRoute } from '@/api/client/types.gen' -import { Button } from '@/components/ui/button' -import { DynamicForm } from '@/components/forms/DynamicForm' -import { Input } from '@/components/ui/input' -import { Label } from '@/components/ui/label' -import { useBreadcrumbs } from '@/contexts/BreadcrumbContext' -import { useMutation, useQuery } from '@tanstack/react-query' -import { customAlphabet } from 'nanoid' -import { ArrowLeft } from 'lucide-react' -import { useEffect, useMemo, useState } from 'react' -import { Link, useNavigate, useSearchParams } from 'react-router-dom' -import { toast } from 'sonner' - -// Create a custom nanoid with lowercase alphanumeric characters -const generateId = customAlphabet('0123456789abcdefghijklmnopqrstuvwxyz', 4) - -export function CreateService() { - const navigate = useNavigate() - const [searchParams] = useSearchParams() - const serviceType = searchParams.get('type') as ServiceTypeRoute | null - const { setBreadcrumbs } = useBreadcrumbs() - - const defaultName = useMemo( - () => (serviceType ? `${serviceType}-${generateId()}` : ''), - [serviceType] - ) - - const [serviceName, setServiceName] = useState(defaultName) - - useEffect(() => { - setBreadcrumbs([ - { label: 'Storage', href: '/storage' }, - { label: 'Create Service', href: '/storage/create' }, - ]) - }, [setBreadcrumbs]) - - // Fetch provider metadata for display - const { data: providerMetadata } = useQuery({ - ...getProviderMetadataOptions({ - path: { - service_type: serviceType || '', - }, - }), - enabled: !!serviceType, - }) - - // Fetch parameters for the selected service type - const { data: parameters, isLoading: isLoadingParameters } = useQuery({ - ...getServiceTypeParametersOptions({ - path: { - service_type: serviceType || '', - }, - }), - enabled: !!serviceType, - }) - - const createServiceMut = useMutation({ - ...createServiceMutation(), - meta: { - errorTitle: 'Failed to create service', - }, - onSuccess: (data) => { - toast.success('Service created successfully') - navigate(`/storage/${data.id}`) - }, - }) - - const handleSubmit = async (parameterValues: Record) => { - if (!serviceName.trim()) { - toast.error('Service name is required') - return - } - - await createServiceMut.mutateAsync({ - body: { - service_type: serviceType as ServiceTypeRoute, - name: serviceName, - parameters: parameterValues, - }, - }) - } - - if (!serviceType) { - return ( -
-
-
-

Create Service

-

- Please select a service type from the URL parameter. -

-
- - - -
-
- ) - } - - if (isLoadingParameters) { - return ( -
-
-
-
-
- {[...Array(5)].map((_, i) => ( -
-
-
-
- ))} -
-
-
-
- ) - } - - if (!parameters) { - return null - } - - return ( -
-
- {/* Header with provider info */} -
- - - - - {providerMetadata && ( -
-
- {`${providerMetadata.display_name} -
-
-

- Create {providerMetadata.display_name} Service -

-

- {providerMetadata.description} -

-
-
- )} -
- - {/* Service Name Field */} -
- - setServiceName(e.target.value)} - placeholder={`my-${serviceType}`} - /> -

- A unique name to identify this service -

-
- - {/* Dynamic Form for Parameters */} - navigate('/storage')} - submitText="Create Service" - isSubmitting={createServiceMut.isPending} - /> -
-
- ) -} From 4b5616d0753eb4761a023f41692a070d2d7a1098 Mon Sep 17 00:00:00 2001 From: David Viejo Date: Fri, 20 Feb 2026 19:22:26 +0100 Subject: [PATCH 2/9] fix(core): lifecycle management, bounded caches, and memory safety across 11 crates - Backup: pg_dump sidecar pattern to prevent OOM on large TimescaleDB databases - Environments: branch uniqueness validation per project - CLI: source map upload/list/delete commands for error tracking - Performance: SQL percentile_cont() instead of in-memory sorting - Session replay: LIMIT + bulk UPDATE to bound memory on large replays - Source maps: select_only() to avoid fetching blob content in listings - Error tracking: LazyLock regexes to avoid recompilation per request - Deployer: temp file tar streaming + tail-limited build logs - Notifications: DigestScheduler Arc cycle fix with proper start/shutdown - Routes: RouteTableListener + ProjectChangeListener stored JoinHandle + Drop - Funnels: MetricsCache bounded to max_capacity with LRU-style eviction - Monitoring: OutageDetectionService prunes stale monitor states - Webhooks: WebhookEventListener Drop impl aborts spawned task - Analytics events: OpenAPI spec improvements and handler refactoring - Deployments: workflow execution and job processor improvements Includes 28 new tests covering lifecycle, bounded cache, and cleanup behavior. --- Cargo.lock | 2 + apps/temps-cli/src/commands/errors/index.ts | 337 +++++++- crates/temps-analytics-events/Cargo.toml | 2 + .../src/handlers/events_handler.rs | 750 +++++++++++++++++- crates/temps-analytics-events/src/plugin.rs | 2 + .../src/types/requests.rs | 36 + .../src/services/service.rs | 244 +++++- .../src/services/service.rs | 567 ++++++++----- .../src/services/service.rs | 20 +- crates/temps-backup/src/services/backup.rs | 115 ++- crates/temps-deployer/src/docker.rs | 45 +- .../src/handlers/remote_deployments.rs | 9 +- .../src/jobs/deploy_image.rs | 66 +- .../src/jobs/mark_deployment_complete.rs | 5 +- .../src/services/job_processor.rs | 24 + .../src/services/services.rs | 82 +- .../services/workflow_execution_service.rs | 3 +- .../src/services/environment_service.rs | 458 ++++++++++- .../src/services/error_ingestion_service.rs | 72 +- .../src/services/source_map_service.rs | 42 +- crates/temps-monitoring/src/outage.rs | 184 ++++- .../src/digest/scheduler.rs | 187 ++++- crates/temps-notifications/src/plugin.rs | 5 +- .../temps-projects/src/handlers/handlers.rs | 3 +- crates/temps-projects/src/services/project.rs | 3 +- .../src/project_change_listener.rs | 131 ++- crates/temps-routes/src/route_table.rs | 91 ++- .../src/services/health_check_service.rs | 3 +- crates/temps-webhooks/src/listener.rs | 88 +- mcp/README.md | 15 +- 30 files changed, 3199 insertions(+), 392 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c20459bb..01c44fea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10445,9 +10445,11 @@ dependencies = [ "testcontainers", "thiserror 2.0.17", "tokio", + "tower 0.5.2", "tracing", "url", "utoipa", + "uuid", "woothee", ] diff --git a/apps/temps-cli/src/commands/errors/index.ts b/apps/temps-cli/src/commands/errors/index.ts index c0db6aaa..3edb5103 100644 --- a/apps/temps-cli/src/commands/errors/index.ts +++ b/apps/temps-cli/src/commands/errors/index.ts @@ -1,6 +1,6 @@ import type { Command } from 'commander' -import { requireAuth } from '../../config/store.js' -import { setupClient, client, getErrorMessage } from '../../lib/api-client.js' +import { requireAuth, config, credentials } from '../../config/store.js' +import { setupClient, client, 
getErrorMessage, normalizeApiUrl } from '../../lib/api-client.js'
 import {
   listErrorGroups,
   getErrorGroup,
@@ -12,9 +12,12 @@ import {
   getErrorDashboardStats,
 } from '../../api/sdk.gen.js'
 import type { ErrorGroupResponse, ErrorEventResponse, ErrorTimeSeriesDataResponse } from '../../api/types.gen.js'
-import { withSpinner } from '../../ui/spinner.js'
+import { withSpinner, startSpinner, succeedSpinner, failSpinner } from '../../ui/spinner.js'
 import { printTable, statusBadge, type TableColumn } from '../../ui/table.js'
 import { newline, header, icons, json, colors, success, info, warning, keyValue, formatRelativeTime } from '../../ui/output.js'
+import { readFile } from 'node:fs/promises'
+import { existsSync } from 'node:fs'
+import { resolve, basename } from 'node:path'
 
 interface ListOptions {
   projectId: string
@@ -78,6 +81,61 @@ interface DashboardOptions {
 
 const ERROR_STATUSES = ['unresolved', 'resolved', 'ignored']
 
+// Source map interfaces
+interface SourceMapUploadOptions {
+  projectId: string
+  release: string
+  file: string
+  filePath?: string
+  dist?: string
+}
+
+interface SourceMapListOptions {
+  projectId: string
+  release: string
+  json?: boolean
+}
+
+interface SourceMapReleasesOptions {
+  projectId: string
+  json?: boolean
+}
+
+interface SourceMapDeleteReleaseOptions {
+  projectId: string
+  release: string
+}
+
+interface SourceMapDeleteOneOptions {
+  projectId: string
+  sourceMapId: string
+}
+
+// Source map API response shapes
+interface SourceMapResponse {
+  id: number
+  project_id: number
+  release: string
+  file_path: string
+  dist?: string
+  size_bytes: number
+  checksum?: string
+  created_at: string
+}
+
+interface SourceMapListResponse {
+  source_maps: SourceMapResponse[]
+  total: number
+}
+
+interface ReleaseListResponse {
+  releases: string[]
+}
+
+interface DeleteResponse {
+  deleted: number
+}
+
 export function registerErrorsCommands(program: Command): void {
   const errors = program
     .command('errors')
@@ -160,6 +218,52 @@ export function registerErrorsCommands(program: Command): void {
     .option('--compare', 'Compare to previous period')
     .option('--json', 'Output in JSON format')
     .action(getErrorDashboardAction)
+
+  // Source maps subcommand group
+  const sourcemaps = errors
+    .command('sourcemaps')
+    .alias('sm')
+    .description('Manage source maps for error symbolication')
+
+  sourcemaps
+    .command('upload')
+    .description('Upload a source map file for a release')
+    .requiredOption('--project-id <id>', 'Project ID')
+    .requiredOption('--release <version>', 'Release version (e.g. commit SHA)')
+    .requiredOption('--file <path>', 'Path to the .map file')
+    .option('--file-path <path>', 'URL path in stack traces (e.g. ~/assets/main.js)')
+    .option('--dist <dist>', 'Distribution identifier')
+    .action(uploadSourceMap)
+
+  sourcemaps
+    .command('list')
+    .alias('ls')
+    .description('List source maps for a release')
+    .requiredOption('--project-id <id>', 'Project ID')
+    .requiredOption('--release <version>', 'Release version')
+    .option('--json', 'Output in JSON format')
+    .action(listSourceMaps)
+
+  sourcemaps
+    .command('releases')
+    .description('List all releases that have source maps')
+    .requiredOption('--project-id <id>', 'Project ID')
+    .option('--json', 'Output in JSON format')
+    .action(listSourceMapReleases)
+
+  sourcemaps
+    .command('delete')
+    .description('Delete all source maps for a release')
+    .requiredOption('--project-id <id>', 'Project ID')
+    .requiredOption('--release <version>', 'Release version')
+    .action(deleteReleaseSourceMaps)
+
+  sourcemaps
+    .command('delete-one')
+    .description('Delete a specific source map by ID')
+    .requiredOption('--project-id <id>', 'Project ID')
+    .requiredOption('--source-map-id <id>', 'Source map ID')
+    .action(deleteSourceMap)
 }
 
 async function listErrorGroupsAction(options: ListOptions): Promise<void> {
@@ -521,6 +625,233 @@ async function getErrorTimelineAction(options: TimelineOptions): Promise<void> {
   newline()
 }
 
+async function uploadSourceMap(options: SourceMapUploadOptions): Promise<void> {
+  await requireAuth()
+  await setupClient()
+
+  const projectId = parseInt(options.projectId, 10)
+  if (isNaN(projectId)) {
+    warning('Invalid project ID')
+    return
+  }
+
+  const resolvedPath = resolve(options.file)
+  if (!existsSync(resolvedPath)) {
+    warning(`File does not exist: ${resolvedPath}`)
+    return
+  }
+
+  const filename = basename(resolvedPath)
+  startSpinner(`Uploading source map: ${filename}...`)
+
+  const apiUrl = normalizeApiUrl(config.get('apiUrl'))
+  const apiKey = await credentials.getApiKey()
+  const uploadUrl = `${apiUrl}/projects/${projectId}/releases/${encodeURIComponent(options.release)}/source-maps`
+
+  try {
+    const fileData = await readFile(resolvedPath)
+    const uint8Data = new Uint8Array(fileData.buffer, fileData.byteOffset, fileData.byteLength)
+
+    const formData = new FormData()
+    formData.append('file', new Blob([uint8Data], { type: 'application/json' }), filename)
+    if (options.filePath) {
+      formData.append('file_path', options.filePath)
+    }
+    if (options.dist) {
+      formData.append('dist', options.dist)
+    }
+
+    const response = await fetch(uploadUrl, {
+      method: 'POST',
+      headers: {
+        Authorization: `Bearer ${apiKey}`,
+      },
+      body: formData,
+    })
+
+    if (!response.ok) {
+      const errorText = await response.text()
+      failSpinner(`Upload failed: ${response.status}`)
+      warning(errorText)
+      return
+    }
+
+    const result = (await response.json()) as SourceMapResponse
+    succeedSpinner(`Source map uploaded (id=${result.id})`)
+
+    newline()
+    keyValue('ID', result.id.toString())
+    keyValue('Release', result.release)
+    keyValue('File Path', result.file_path)
+    keyValue('Size', formatBytes(result.size_bytes))
+    if (result.dist) keyValue('Dist', result.dist)
+    newline()
+  } catch (err) {
+    failSpinner('Upload failed')
+    throw err
+  }
+}
+
+async function listSourceMaps(options: SourceMapListOptions): Promise<void> {
+  await requireAuth()
+  await setupClient()
+
+  const projectId = parseInt(options.projectId, 10)
+  if (isNaN(projectId)) {
+    warning('Invalid project ID')
+    return
+  }
+
+  const apiUrl = normalizeApiUrl(config.get('apiUrl'))
+  const apiKey = await credentials.getApiKey()
+  const url = `${apiUrl}/projects/${projectId}/releases/${encodeURIComponent(options.release)}/source-maps`
+
+  const result = await withSpinner('Fetching source maps...', async () => {
+    const response = await fetch(url, {
+      headers: { Authorization: `Bearer ${apiKey}` },
+    })
+    if (!response.ok) {
+      const errorText = await response.text()
+      throw new Error(`Request failed (${response.status}): ${errorText}`)
+    }
+    return response.json() as Promise<SourceMapListResponse>
+  })
+
+  if (options.json) {
+    json(result)
+    return
+  }
+
+  newline()
+  header(`${icons.info} Source Maps for Release "${options.release}" (${result.total})`)
+
+  if (result.source_maps.length === 0) {
+    info('No source maps found for this release')
+    newline()
+    return
+  }
+
+  const columns: TableColumn<SourceMapResponse>[] = [
+    { header: 'ID', key: 'id', width: 6 },
+    { header: 'File Path', key: 'file_path' },
+    { header: 'Dist', accessor: (m) => m.dist ?? '-' },
+    { header: 'Size', accessor: (m) => formatBytes(m.size_bytes) },
+    { header: 'Uploaded', accessor: (m) => formatRelativeTime(m.created_at) },
+  ]
+
+  printTable(result.source_maps, columns, { style: 'minimal' })
+  newline()
+}
+
+async function listSourceMapReleases(options: SourceMapReleasesOptions): Promise<void> {
+  await requireAuth()
+  await setupClient()
+
+  const projectId = parseInt(options.projectId, 10)
+  if (isNaN(projectId)) {
+    warning('Invalid project ID')
+    return
+  }
+
+  const apiUrl = normalizeApiUrl(config.get('apiUrl'))
+  const apiKey = await credentials.getApiKey()
+  const url = `${apiUrl}/projects/${projectId}/source-map-releases`
+
+  const result = await withSpinner('Fetching releases...', async () => {
+    const response = await fetch(url, {
+      headers: { Authorization: `Bearer ${apiKey}` },
+    })
+    if (!response.ok) {
+      const errorText = await response.text()
+      throw new Error(`Request failed (${response.status}): ${errorText}`)
+    }
+    return response.json() as Promise<ReleaseListResponse>
+  })
+
+  if (options.json) {
+    json(result)
+    return
+  }
+
+  newline()
+  header(`${icons.info} Releases with Source Maps for Project ${projectId}`)
+
+  if (result.releases.length === 0) {
+    info('No releases with source maps found')
+    newline()
+    return
+  }
+
+  for (const release of result.releases) {
+    info(`  ${colors.bold(release)}`)
+  }
+  newline()
+}
+
+async function deleteReleaseSourceMaps(options: SourceMapDeleteReleaseOptions): Promise<void> {
+  await requireAuth()
+  await setupClient()
+
+  const projectId = parseInt(options.projectId, 10)
+  if (isNaN(projectId)) {
+    warning('Invalid project ID')
+    return
+  }
+
+  const apiUrl = normalizeApiUrl(config.get('apiUrl'))
+  const apiKey = await credentials.getApiKey()
+  const url = `${apiUrl}/projects/${projectId}/releases/${encodeURIComponent(options.release)}/source-maps`
+
+  const result = await withSpinner(`Deleting source maps for release "${options.release}"...`, async () => {
+    const response = await fetch(url, {
+      method: 'DELETE',
+      headers: { Authorization: `Bearer ${apiKey}` },
+    })
+    if (!response.ok) {
+      const errorText = await response.text()
+      throw new Error(`Request failed (${response.status}): ${errorText}`)
+    }
+    return response.json() as Promise<DeleteResponse>
+  })
+
+  success(`Deleted ${result.deleted} source map(s) for release "${options.release}"`)
+}
+
+async function deleteSourceMap(options: SourceMapDeleteOneOptions): Promise<void> {
+  await requireAuth()
+  await setupClient()
+
+  const projectId = parseInt(options.projectId, 10)
+  const sourceMapId = parseInt(options.sourceMapId, 10)
+  if (isNaN(projectId) || isNaN(sourceMapId)) {
+    warning('Invalid project ID or source map ID')
+    return
+  }
+
+  const apiUrl = normalizeApiUrl(config.get('apiUrl'))
+  const apiKey = await credentials.getApiKey()
+  const url = `${apiUrl}/projects/${projectId}/source-maps/${sourceMapId}`
+
+  await withSpinner(`Deleting source map #${sourceMapId}...`, async () => {
+    const response = await fetch(url, {
+      method: 'DELETE',
+      headers: { Authorization: `Bearer ${apiKey}` },
+    })
+    if (!response.ok && response.status !== 204) {
+      const errorText = await response.text()
+      throw new Error(`Request failed (${response.status}): ${errorText}`)
+    }
+  })
+
+  success(`Source map #${sourceMapId} deleted`)
+}
+
+function formatBytes(bytes: number): string {
+  if (bytes < 1024) return `${bytes} B`
+  if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`
+  return `${(bytes / (1024 * 1024)).toFixed(2)} MB`
+}
+
 async function getErrorDashboardAction(options: DashboardOptions): Promise<void> {
   await requireAuth()
   await setupClient()
diff --git a/crates/temps-analytics-events/Cargo.toml b/crates/temps-analytics-events/Cargo.toml
index d6778518..8da13eff 100644
--- a/crates/temps-analytics-events/Cargo.toml
+++ b/crates/temps-analytics-events/Cargo.toml
@@ -50,3 +50,5 @@ woothee = { workspace = true }
 tokio = { workspace = true }
 testcontainers = { workspace = true }
 temps-migrations = { path = "../temps-migrations" }
+tower = { version = "0.5", features = ["util"] }
+uuid = { version = "1", features = ["v4"] }
diff --git a/crates/temps-analytics-events/src/handlers/events_handler.rs b/crates/temps-analytics-events/src/handlers/events_handler.rs
index e2851e43..7ab2c6a1 100644
--- a/crates/temps-analytics-events/src/handlers/events_handler.rs
+++ b/crates/temps-analytics-events/src/handlers/events_handler.rs
@@ -1,9 +1,9 @@
 use crate::services::AnalyticsEventsService;
 use crate::types::{
     ActiveVisitorsQuery, ActiveVisitorsResponse, AggregatedBucketsResponse, AggregationLevel,
-    AnalyticsSessionEventsResponse, EventCount, EventMetricsPayload, EventTimeline,
-    EventTimelineQuery, EventTypeBreakdown, EventTypeBreakdownQuery, EventsCountQuery,
-    HasEventsQuery, HasEventsResponse, HourlyVisitsQuery, PropertyBreakdownQuery,
+    AnalyticsSessionEventsResponse, ConsoleEventPayload, EventCount, EventMetricsPayload,
+    EventTimeline, EventTimelineQuery, EventTypeBreakdown, EventTypeBreakdownQuery,
+    EventsCountQuery, HasEventsQuery, HasEventsResponse, HourlyVisitsQuery, PropertyBreakdownQuery,
     PropertyBreakdownResponse, PropertyColumn, PropertyTimelineQuery, PropertyTimelineResponse,
     SessionEventsQuery, UniqueCountsQuery, UniqueCountsResponse,
 };
@@ -26,6 +26,7 @@ pub struct AppState {
     pub events_service: Arc<AnalyticsEventsService>,
     pub route_table: Arc<temps_proxy::CachedPeerTable>,
     pub ip_address_service: Arc<temps_geo::IpAddressService>,
+    pub cookie_crypto: Arc<temps_core::CookieCrypto>,
 }
 
 /// Get event counts with filtering
@@ -719,6 +720,122 @@ pub async fn record_event_metrics(
     }
 }
 
+/// Record an analytics event via the console API with explicit project ID.
+///
+/// The app backend forwards the user's encrypted Temps cookies; visitor and
+/// session identity are resolved server-side by decrypting them in this
+/// handler. No geolocation or user-agent enrichment is performed, keeping
+/// this a lightweight server-side ingestion path.
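[Editor's note: for orientation, a sketch of what a server-side caller of this endpoint might look like. The `base_url`, `api_key`, and cookie variables are placeholder inputs, and `reqwest` (with its `json` feature) is just one possible HTTP client; none of this is part of the patch.]

    // Hypothetical caller of POST /projects/{project_id}/events/ingest.
    let payload = serde_json::json!({
        "event_name": "purchase",
        "event_data": { "plan": "pro" },
        "environment_id": 1,
        "deployment_id": 42,
        "visitor_id": encrypted_visitor_cookie, // raw `_temps_visitor_id` cookie value
        "session_id": encrypted_session_cookie, // raw `_temps_sid` cookie value
        "request_path": "/checkout"
    });
    let response = reqwest::Client::new()
        .post(format!("{base_url}/projects/{project_id}/events/ingest"))
        .bearer_auth(api_key)
        .json(&payload)
        .send()
        .await?;
    assert!(response.status().is_success());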
+#[utoipa::path(
+    tag = "Events",
+    post,
+    path = "/projects/{project_id}/events/ingest",
+    params(
+        ("project_id" = i32, Path, description = "Project ID"),
+    ),
+    request_body = ConsoleEventPayload,
+    responses(
+        (status = 200, description = "Event recorded successfully"),
+        (status = 400, description = "Bad request"),
+        (status = 401, description = "Unauthorized"),
+        (status = 403, description = "Insufficient permissions"),
+        (status = 500, description = "Internal server error")
+    ),
+    security(("bearer_auth" = []))
+)]
+pub async fn record_console_event(
+    RequireAuth(auth): RequireAuth,
+    State(state): State<Arc<AppState>>,
+    Path(project_id): Path<i32>,
+    Json(payload): Json<ConsoleEventPayload>,
+) -> Result<StatusCode, Problem> {
+    use tracing::{info, warn};
+
+    permission_guard!(auth, AnalyticsWrite);
+
+    info!(
+        "Recording console event: {} for project {} path: {}",
+        payload.event_name, project_id, payload.request_path
+    );
+
+    // Decrypt the encrypted cookie values if provided.
+    // Generate fallback UUIDs when cookies are absent or decryption fails,
+    // because the events hypertable enforces NOT NULL on session_id.
+    let visitor_id = payload.visitor_id.as_deref().and_then(|encrypted| {
+        match state.cookie_crypto.decrypt(encrypted) {
+            Ok(decrypted) => Some(decrypted),
+            Err(e) => {
+                warn!(
+                    "Failed to decrypt visitor_id cookie for project {}: {}",
+                    project_id, e
+                );
+                None
+            }
+        }
+    });
+
+    let session_id = payload
+        .session_id
+        .as_deref()
+        .and_then(|encrypted| match state.cookie_crypto.decrypt(encrypted) {
+            Ok(decrypted) => Some(decrypted),
+            Err(e) => {
+                warn!(
+                    "Failed to decrypt session_id cookie for project {}: {}",
+                    project_id, e
+                );
+                None
+            }
+        })
+        .or_else(|| Some(temps_core::uuid::Uuid::new_v4().to_string()));
+
+    state
+        .events_service
+        .record_event(
+            project_id,
+            Some(payload.environment_id),
+            Some(payload.deployment_id),
+            session_id,
+            visitor_id,
+            &payload.event_name,
+            payload.event_data,
+            &payload.request_path,
+            &payload.request_query,
+            None, // screen_width
+            None, // screen_height
+            None, // viewport_width
+            None, // viewport_height
+            None, // language
+            None, // page_title
+            None, // ip_geolocation_id
+            None, // user_agent
+            None, // referrer
+            None, // ttfb
+            None, // lcp
+            None, // fid
+            None, // fcp
+            None, // cls
+            None, // inp
+        )
+        .await
+        .map_err(|e| {
+            error!("Failed to record console event: {:?}", e);
+            ErrorBuilder::new(StatusCode::INTERNAL_SERVER_ERROR)
+                .title("Failed to record event")
+                .detail(format!(
+                    "Error recording event for project {}: {}",
+                    project_id, e
+                ))
+                .build()
+        })?;
+
+    info!(
+        "Console event recorded: {} for project {} env={}",
+        payload.event_name, project_id, payload.environment_id
+    );
+
+    Ok(StatusCode::OK)
+}
+
 /// Get aggregated metrics by time bucket
 #[utoipa::path(
     get,
@@ -887,6 +1004,10 @@ pub fn configure_routes() -> Router<Arc<AppState>> {
         "/projects/{project_id}/has-events",
         get(has_analytics_events),
     )
+    .route(
+        "/projects/{project_id}/events/ingest",
+        post(record_console_event),
+    )
     .route("/sessions/{session_id}/events", get(get_session_events))
     .route("/_temps/event", post(record_event_metrics))
 }
@@ -904,6 +1025,7 @@ pub fn configure_routes() -> Router<Arc<AppState>> {
         get_active_visitors,
         get_hourly_visits,
         record_event_metrics,
+        record_console_event,
         get_session_events,
         has_analytics_events,
         get_dashboard_projects_analytics,
@@ -931,6 +1053,7 @@ pub fn configure_routes() -> Router<Arc<AppState>> {
         ActiveVisitorsQuery,
         HourlyVisitsQuery,
         EventMetricsPayload,
+        ConsoleEventPayload,
         AnalyticsSessionEventsResponse,
         SessionEventsQuery,
         HasEventsResponse,
@@ -946,3
+1069,624 @@ pub fn configure_routes() -> Router> { ) )] pub struct EventsApiDoc; + +#[cfg(test)] +mod tests { + use super::*; + use axum::body::Body; + use axum::http::Request; + use axum::middleware; + use sea_orm::{ActiveModelTrait, ActiveValue::Set, ColumnTrait, EntityTrait, QueryFilter}; + use temps_database::test_utils::TestDatabase; + use temps_entities::projects; + use tower::ServiceExt; + + fn create_test_auth_context() -> temps_auth::AuthContext { + let user = temps_entities::users::Model { + id: 1, + name: "Test User".to_string(), + email: "test@example.com".to_string(), + password_hash: Some("hashed".to_string()), + email_verified: true, + email_verification_token: None, + email_verification_expires: None, + password_reset_token: None, + password_reset_expires: None, + deleted_at: None, + mfa_secret: None, + mfa_enabled: false, + mfa_recovery_codes: None, + created_at: chrono::Utc::now(), + updated_at: chrono::Utc::now(), + }; + temps_auth::AuthContext::new_session(user, temps_auth::Role::Admin) + } + + async fn setup_test_app( + db: Arc, + ) -> (axum::Router, Arc, Arc) { + let events_service = Arc::new(crate::services::AnalyticsEventsService::new(db.clone())); + let route_table = Arc::new(temps_proxy::CachedPeerTable::new(db.clone())); + let geoip_service = Arc::new(temps_geo::GeoIpService::Mock( + temps_geo::MockGeoIpService::new(), + )); + let ip_address_service = + Arc::new(temps_geo::IpAddressService::new(db.clone(), geoip_service)); + let cookie_crypto = + Arc::new(temps_core::CookieCrypto::new("test_key_32_bytes_long_for_tests").unwrap()); + + let app_state = Arc::new(AppState { + events_service, + route_table, + ip_address_service, + cookie_crypto: cookie_crypto.clone(), + }); + + let auth_middleware = middleware::from_fn( + |mut req: Request, next: axum::middleware::Next| async move { + let auth_context = create_test_auth_context(); + req.extensions_mut().insert(auth_context); + next.run(req).await + }, + ); + + let app = configure_routes() + .layer(auth_middleware) + .with_state(app_state.clone()); + + (app, app_state, cookie_crypto) + } + + async fn insert_test_environment( + db: &sea_orm::DatabaseConnection, + project_id: i32, + ) -> temps_entities::environments::Model { + use temps_entities::{environments, upstream_config::UpstreamList}; + environments::ActiveModel { + project_id: Set(project_id), + name: Set("production".to_string()), + branch: Set(Some("main".to_string())), + slug: Set("production".to_string()), + subdomain: Set("prod".to_string()), + host: Set(String::new()), + upstreams: Set(UpstreamList::new()), + is_preview: Set(false), + current_deployment_id: Set(None), + deleted_at: Set(None), + deployment_config: Set(None), + last_deployment: Set(None), + created_at: Set(chrono::Utc::now()), + updated_at: Set(chrono::Utc::now()), + ..Default::default() + } + .insert(db) + .await + .expect("Failed to insert test environment") + } + + async fn insert_test_deployment( + db: &sea_orm::DatabaseConnection, + project_id: i32, + environment_id: i32, + ) -> temps_entities::deployments::Model { + use temps_entities::deployments; + deployments::ActiveModel { + project_id: Set(project_id), + environment_id: Set(environment_id), + slug: Set(format!("test-deploy-{}", uuid::Uuid::new_v4())), + state: Set("ready".to_string()), + metadata: Set(Some( + temps_entities::deployments::DeploymentMetadata::default(), + )), + deploying_at: Set(None), + ready_at: Set(Some(chrono::Utc::now())), + started_at: Set(Some(chrono::Utc::now())), + finished_at: 
Set(Some(chrono::Utc::now())), + context_vars: Set(None), + branch_ref: Set(Some("main".to_string())), + tag_ref: Set(None), + commit_sha: Set(None), + commit_message: Set(None), + commit_author: Set(None), + commit_json: Set(None), + cancelled_reason: Set(None), + static_dir_location: Set(None), + screenshot_location: Set(None), + image_name: Set(None), + deployment_config: Set(None), + created_at: Set(chrono::Utc::now()), + updated_at: Set(chrono::Utc::now()), + ..Default::default() + } + .insert(db) + .await + .expect("Failed to insert test deployment") + } + + async fn insert_test_project(db: &sea_orm::DatabaseConnection) -> projects::Model { + projects::ActiveModel { + name: Set("test-project".to_string()), + repo_name: Set("test-repo".to_string()), + repo_owner: Set("test-owner".to_string()), + directory: Set("/".to_string()), + main_branch: Set("main".to_string()), + preset: Set(temps_entities::preset::Preset::NextJs), + preset_config: Set(None), + deployment_config: Set(None), + slug: Set("test-project".to_string()), + is_deleted: Set(false), + deleted_at: Set(None), + last_deployment: Set(None), + is_public_repo: Set(false), + git_url: Set(None), + git_provider_connection_id: Set(None), + attack_mode: Set(false), + enable_preview_environments: Set(false), + source_type: Set(temps_entities::source_type::SourceType::Git), + created_at: Set(chrono::Utc::now()), + updated_at: Set(chrono::Utc::now()), + ..Default::default() + } + .insert(db) + .await + .expect("Failed to insert test project") + } + + #[tokio::test] + async fn test_console_event_ingest_success() { + let mut test_db: TestDatabase = match TestDatabase::with_migrations().await { + Ok(db) => db, + Err(e) => { + println!("Database not available, skipping test: {}", e); + return; + } + }; + let db = test_db.connection_arc(); + let project = insert_test_project(db.as_ref()).await; + let environment = insert_test_environment(db.as_ref(), project.id).await; + let deployment = insert_test_deployment(db.as_ref(), project.id, environment.id).await; + let (app, _state, _crypto) = setup_test_app(db.clone()).await; + + let payload = serde_json::json!({ + "event_name": "purchase", + "event_data": { "plan": "pro", "amount": 49.99 }, + "environment_id": environment.id, + "deployment_id": deployment.id, + "request_path": "/checkout", + "request_query": "" + }); + + let response = app + .oneshot( + Request::builder() + .method("POST") + .uri(format!("/projects/{}/events/ingest", project.id)) + .header("content-type", "application/json") + .body(Body::from(serde_json::to_string(&payload).unwrap())) + .unwrap(), + ) + .await + .unwrap(); + + let status = response.status(); + let body = axum::body::to_bytes(response.into_body(), usize::MAX) + .await + .unwrap(); + assert_eq!( + status, + StatusCode::OK, + "Expected 200, got {}. 
Body: {}", + status, + String::from_utf8_lossy(&body) + ); + + // Verify the event was stored in the database + let events: Vec = temps_entities::events::Entity::find() + .filter(temps_entities::events::Column::ProjectId.eq(project.id)) + .all(db.as_ref()) + .await + .expect("Failed to query events"); + + assert_eq!(events.len(), 1); + assert_eq!(events[0].event_name.as_deref(), Some("purchase")); + assert_eq!(events[0].pathname, "/checkout"); + assert_eq!(events[0].project_id, project.id); + assert_eq!(events[0].environment_id, Some(environment.id)); + assert!(events[0].visitor_id.is_none()); + + test_db.cleanup().await; + } + + #[tokio::test] + async fn test_console_event_with_encrypted_visitor_id() { + let mut test_db: TestDatabase = match TestDatabase::with_migrations().await { + Ok(db) => db, + Err(e) => { + println!("Database not available, skipping test: {}", e); + return; + } + }; + let db = test_db.connection_arc(); + let project = insert_test_project(db.as_ref()).await; + use temps_entities::visitor; + let environment = insert_test_environment(db.as_ref(), project.id).await; + let deployment = insert_test_deployment(db.as_ref(), project.id, environment.id).await; + let (app, _state, cookie_crypto) = setup_test_app(db.clone()).await; + + // Create a visitor in the DB first + let visitor_uuid = uuid::Uuid::new_v4().to_string(); + let _visitor = visitor::ActiveModel { + visitor_id: Set(visitor_uuid.clone()), + project_id: Set(project.id), + environment_id: Set(environment.id), + first_seen: Set(chrono::Utc::now()), + last_seen: Set(chrono::Utc::now()), + has_activity: Set(false), + is_crawler: Set(false), + ..Default::default() + } + .insert(db.as_ref()) + .await + .expect("Failed to insert test visitor"); + + // Encrypt the visitor_id like the browser cookie would have + let encrypted_visitor_id = cookie_crypto.encrypt(&visitor_uuid).unwrap(); + let session_uuid = uuid::Uuid::new_v4().to_string(); + let encrypted_session_id = cookie_crypto.encrypt(&session_uuid).unwrap(); + + let payload = serde_json::json!({ + "event_name": "add_to_cart", + "event_data": { "item": "widget" }, + "visitor_id": encrypted_visitor_id, + "session_id": encrypted_session_id, + "environment_id": environment.id, + "deployment_id": deployment.id, + "request_path": "/products/widget" + }); + + let response = app + .oneshot( + Request::builder() + .method("POST") + .uri(format!("/projects/{}/events/ingest", project.id)) + .header("content-type", "application/json") + .body(Body::from(serde_json::to_string(&payload).unwrap())) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK); + + // Verify the event was stored with the correct visitor + let events: Vec = temps_entities::events::Entity::find() + .filter(temps_entities::events::Column::ProjectId.eq(project.id)) + .all(db.as_ref()) + .await + .expect("Failed to query events"); + + assert_eq!(events.len(), 1); + assert_eq!(events[0].event_name.as_deref(), Some("add_to_cart")); + assert_eq!(events[0].pathname, "/products/widget"); + // Visitor should be resolved from the encrypted cookie + assert!(events[0].visitor_id.is_some()); + // Session should be decrypted and stored + assert_eq!(events[0].session_id.as_deref(), Some(session_uuid.as_str())); + + // Verify visitor's has_activity was updated + let updated_visitor: temps_entities::visitor::Model = visitor::Entity::find() + .filter(visitor::Column::VisitorId.eq(&visitor_uuid)) + .one(db.as_ref()) + .await + .expect("Failed to query visitor") + .expect("Visitor not found"); + 
assert!(updated_visitor.has_activity); + + test_db.cleanup().await; + } + + #[tokio::test] + async fn test_console_event_with_invalid_encrypted_cookies_still_succeeds() { + let mut test_db: TestDatabase = match TestDatabase::with_migrations().await { + Ok(db) => db, + Err(e) => { + println!("Database not available, skipping test: {}", e); + return; + } + }; + let db = test_db.connection_arc(); + let project = insert_test_project(db.as_ref()).await; + let environment = insert_test_environment(db.as_ref(), project.id).await; + let deployment = insert_test_deployment(db.as_ref(), project.id, environment.id).await; + let (app, _state, _crypto) = setup_test_app(db.clone()).await; + + // Send garbage encrypted values — should warn but not fail + let payload = serde_json::json!({ + "event_name": "page_view", + "environment_id": environment.id, + "deployment_id": deployment.id, + "visitor_id": "not_a_valid_encrypted_value", + "session_id": "also_garbage" + }); + + let response = app + .oneshot( + Request::builder() + .method("POST") + .uri(format!("/projects/{}/events/ingest", project.id)) + .header("content-type", "application/json") + .body(Body::from(serde_json::to_string(&payload).unwrap())) + .unwrap(), + ) + .await + .unwrap(); + + // Should succeed — invalid cookies are treated as absent + assert_eq!(response.status(), StatusCode::OK); + + let events: Vec = temps_entities::events::Entity::find() + .filter(temps_entities::events::Column::ProjectId.eq(project.id)) + .all(db.as_ref()) + .await + .expect("Failed to query events"); + + assert_eq!(events.len(), 1); + assert_eq!(events[0].event_name.as_deref(), Some("page_view")); + assert!(events[0].visitor_id.is_none()); + // session_id gets a generated UUID fallback when cookie decryption fails + assert!(events[0].session_id.is_some()); + + test_db.cleanup().await; + } + + #[tokio::test] + async fn test_console_event_with_environment_id() { + let mut test_db: TestDatabase = match TestDatabase::with_migrations().await { + Ok(db) => db, + Err(e) => { + println!("Database not available, skipping test: {}", e); + return; + } + }; + let db = test_db.connection_arc(); + let project = insert_test_project(db.as_ref()).await; + let environment = insert_test_environment(db.as_ref(), project.id).await; + let deployment = insert_test_deployment(db.as_ref(), project.id, environment.id).await; + let (app, _state, _crypto) = setup_test_app(db.clone()).await; + + let payload = serde_json::json!({ + "event_name": "deploy_complete", + "event_data": { "version": "1.2.3" }, + "environment_id": environment.id, + "deployment_id": deployment.id, + "request_path": "/deploy" + }); + + let response = app + .oneshot( + Request::builder() + .method("POST") + .uri(format!("/projects/{}/events/ingest", project.id)) + .header("content-type", "application/json") + .body(Body::from(serde_json::to_string(&payload).unwrap())) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK); + + let events: Vec = temps_entities::events::Entity::find() + .filter(temps_entities::events::Column::ProjectId.eq(project.id)) + .all(db.as_ref()) + .await + .expect("Failed to query events"); + + assert_eq!(events.len(), 1); + assert_eq!(events[0].environment_id, Some(environment.id)); + assert_eq!(events[0].event_name.as_deref(), Some("deploy_complete")); + + test_db.cleanup().await; + } + + #[tokio::test] + async fn test_console_event_without_auth_returns_401() { + let mut test_db: TestDatabase = match TestDatabase::with_migrations().await { + Ok(db) => db, + 
Err(e) => { + println!("Database not available, skipping test: {}", e); + return; + } + }; + let db = test_db.connection_arc(); + let project = insert_test_project(db.as_ref()).await; + + let events_service = Arc::new(crate::services::AnalyticsEventsService::new(db.clone())); + let route_table = Arc::new(temps_proxy::CachedPeerTable::new(db.clone())); + let geoip_service = Arc::new(temps_geo::GeoIpService::Mock( + temps_geo::MockGeoIpService::new(), + )); + let ip_address_service = + Arc::new(temps_geo::IpAddressService::new(db.clone(), geoip_service)); + let cookie_crypto = + Arc::new(temps_core::CookieCrypto::new("test_key_32_bytes_long_for_tests").unwrap()); + let app_state = Arc::new(AppState { + events_service, + route_table, + ip_address_service, + cookie_crypto, + }); + + // No auth middleware — should return 401 + let app = configure_routes().with_state(app_state); + + let payload = serde_json::json!({ + "event_name": "should_fail" + }); + + let response = app + .oneshot( + Request::builder() + .method("POST") + .uri(format!("/projects/{}/events/ingest", project.id)) + .header("content-type", "application/json") + .body(Body::from(serde_json::to_string(&payload).unwrap())) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::UNAUTHORIZED); + + test_db.cleanup().await; + } + + #[tokio::test] + async fn test_console_event_minimal_payload() { + let mut test_db: TestDatabase = match TestDatabase::with_migrations().await { + Ok(db) => db, + Err(e) => { + println!("Database not available, skipping test: {}", e); + return; + } + }; + let db = test_db.connection_arc(); + let project = insert_test_project(db.as_ref()).await; + let environment = insert_test_environment(db.as_ref(), project.id).await; + let deployment = insert_test_deployment(db.as_ref(), project.id, environment.id).await; + let (app, _state, _crypto) = setup_test_app(db.clone()).await; + + // Minimum payload — event_name + environment_id + deployment_id + let payload = serde_json::json!({ + "event_name": "heartbeat", + "environment_id": environment.id, + "deployment_id": deployment.id + }); + + let response = app + .oneshot( + Request::builder() + .method("POST") + .uri(format!("/projects/{}/events/ingest", project.id)) + .header("content-type", "application/json") + .body(Body::from(serde_json::to_string(&payload).unwrap())) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK); + + let events: Vec = temps_entities::events::Entity::find() + .filter(temps_entities::events::Column::ProjectId.eq(project.id)) + .all(db.as_ref()) + .await + .expect("Failed to query events"); + + assert_eq!(events.len(), 1); + assert_eq!(events[0].event_name.as_deref(), Some("heartbeat")); + // Defaults + assert_eq!(events[0].pathname, "/"); + assert!(events[0].screen_width.is_none()); + assert!(events[0].user_agent.is_none()); + assert!(events[0].ip_geolocation_id.is_none()); + + test_db.cleanup().await; + } + + #[tokio::test] + async fn test_console_events_appear_in_query_results() { + let mut test_db: TestDatabase = match TestDatabase::with_migrations().await { + Ok(db) => db, + Err(e) => { + println!("Database not available, skipping test: {}", e); + return; + } + }; + let db = test_db.connection_arc(); + let project = insert_test_project(db.as_ref()).await; + let environment = insert_test_environment(db.as_ref(), project.id).await; + let deployment = insert_test_deployment(db.as_ref(), project.id, environment.id).await; + let (_app, state, _crypto) = 
+
+        // Ingest 3 events
+        for event_name in &["signup", "purchase", "purchase"] {
+            let payload = serde_json::json!({
+                "event_name": event_name,
+                "event_data": {},
+                "environment_id": environment.id,
+                "deployment_id": deployment.id,
+                "request_path": "/api/track"
+            });
+
+            let app_clone = configure_routes()
+                .layer(middleware::from_fn(
+                    |mut req: Request, next: axum::middleware::Next| async move {
+                        req.extensions_mut().insert(create_test_auth_context());
+                        next.run(req).await
+                    },
+                ))
+                .with_state(state.clone());
+
+            let response = app_clone
+                .oneshot(
+                    Request::builder()
+                        .method("POST")
+                        .uri(format!("/projects/{}/events/ingest", project.id))
+                        .header("content-type", "application/json")
+                        .body(Body::from(serde_json::to_string(&payload).unwrap()))
+                        .unwrap(),
+                )
+                .await
+                .unwrap();
+
+            assert_eq!(response.status(), StatusCode::OK);
+        }
+
+        // Verify via has-events endpoint
+        let app_query = configure_routes()
+            .layer(middleware::from_fn(
+                |mut req: Request, next: axum::middleware::Next| async move {
+                    req.extensions_mut().insert(create_test_auth_context());
+                    next.run(req).await
+                },
+            ))
+            .with_state(state.clone());
+
+        let response = app_query
+            .oneshot(
+                Request::builder()
+                    .method("GET")
+                    .uri(format!("/projects/{}/has-events", project.id))
+                    .body(Body::empty())
+                    .unwrap(),
+            )
+            .await
+            .unwrap();
+
+        assert_eq!(response.status(), StatusCode::OK);
+        let body = axum::body::to_bytes(response.into_body(), usize::MAX)
+            .await
+            .unwrap();
+        let has_events: serde_json::Value = serde_json::from_slice(&body).unwrap();
+        assert_eq!(has_events["has_events"], true);
+
+        // Verify all 3 events are in the database
+        let events: Vec<temps_entities::events::Model> = temps_entities::events::Entity::find()
+            .filter(temps_entities::events::Column::ProjectId.eq(project.id))
+            .all(db.as_ref())
+            .await
+            .expect("Failed to query events");
+
+        assert_eq!(events.len(), 3);
+
+        let event_names: Vec<&str> = events
+            .iter()
+            .filter_map(|e| e.event_name.as_deref())
+            .collect();
+        assert_eq!(event_names.iter().filter(|n| **n == "signup").count(), 1);
+        assert_eq!(event_names.iter().filter(|n| **n == "purchase").count(), 2);
+
+        test_db.cleanup().await;
+    }
+}
diff --git a/crates/temps-analytics-events/src/plugin.rs b/crates/temps-analytics-events/src/plugin.rs
index a024a4e7..faf14567 100644
--- a/crates/temps-analytics-events/src/plugin.rs
+++ b/crates/temps-analytics-events/src/plugin.rs
@@ -39,12 +39,14 @@ impl TempsPlugin for EventsPlugin {
         let events_service = context.get_service::<crate::services::AnalyticsEventsService>()?;
         let route_table = context.get_service::<temps_proxy::CachedPeerTable>()?;
         let ip_address_service = context.get_service::<temps_geo::IpAddressService>()?;
+        let cookie_crypto = context.require_service::<temps_core::CookieCrypto>();
 
         let routes =
             crate::handlers::configure_routes().with_state(Arc::new(crate::handlers::AppState {
                 events_service,
                 route_table,
                 ip_address_service,
+                cookie_crypto,
             }));
 
         Some(PluginRoutes { router: routes })
diff --git a/crates/temps-analytics-events/src/types/requests.rs b/crates/temps-analytics-events/src/types/requests.rs
index 13a40b55..3a05e43d 100644
--- a/crates/temps-analytics-events/src/types/requests.rs
+++ b/crates/temps-analytics-events/src/types/requests.rs
@@ -200,6 +200,42 @@ pub struct EventMetricsPayload {
     pub inp: Option<f32>,
 }
 
+/// Payload for server-side event ingestion via the console API.
+///
+/// The app backend reads the encrypted `_temps_visitor_id` and `_temps_sid`
+/// cookie values from the user's request and forwards them here.
+/// Temps decrypts them server-side to resolve visitor/session identity.
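+///
+/// An illustrative request body (values are placeholders, not real encrypted tokens):
+///
+/// ```json
+/// {
+///   "event_name": "purchase",
+///   "event_data": { "plan": "pro" },
+///   "visitor_id": "<encrypted _temps_visitor_id cookie value>",
+///   "session_id": "<encrypted _temps_sid cookie value>",
+///   "environment_id": 1,
+///   "deployment_id": 7,
+///   "request_path": "/checkout"
+/// }
+/// ```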
+#[derive(Debug, Deserialize, ToSchema)]
+pub struct ConsoleEventPayload {
+    /// Event name (e.g. "purchase", "signup", custom event names)
+    pub event_name: String,
+    /// Arbitrary JSON event data
+    #[serde(default = "default_event_data")]
+    pub event_data: serde_json::Value,
+    /// Encrypted `_temps_visitor_id` cookie value from the user's browser
+    pub visitor_id: Option<String>,
+    /// Encrypted `_temps_sid` cookie value from the user's browser
+    pub session_id: Option<String>,
+    /// Environment ID to attribute the event to
+    pub environment_id: i32,
+    /// Deployment ID to attribute the event to
+    pub deployment_id: i32,
+    /// Page path context (defaults to "/")
+    #[serde(default = "default_request_path")]
+    pub request_path: String,
+    /// Query string context
+    #[serde(default)]
+    pub request_query: String,
+}
+
+fn default_event_data() -> serde_json::Value {
+    serde_json::Value::Object(serde_json::Map::new())
+}
+
+fn default_request_path() -> String {
+    "/".to_string()
+}
+
 /// Query parameters for property breakdown (group by column)
 #[derive(Debug, Deserialize, ToSchema)]
 pub struct PropertyBreakdownQuery {
diff --git a/crates/temps-analytics-funnels/src/services/service.rs b/crates/temps-analytics-funnels/src/services/service.rs
index 19e211c1..485944af 100644
--- a/crates/temps-analytics-funnels/src/services/service.rs
+++ b/crates/temps-analytics-funnels/src/services/service.rs
@@ -55,7 +55,13 @@ impl FunnelMetricsCacheKey {
     }
 }
 
-/// Generic TTL-based cache
+/// Maximum number of entries in the metrics cache.
+/// This prevents unbounded memory growth from many unique funnel/filter combinations.
+const DEFAULT_MAX_CACHE_ENTRIES: usize = 1000;
+
+/// Generic TTL-based cache with bounded capacity.
+/// When the cache exceeds `max_capacity`, expired entries are evicted first,
+/// then oldest entries are removed to make room.
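+///
+/// A minimal usage sketch (String keys and values assumed purely for illustration):
+///
+/// ```ignore
+/// let cache: MetricsCache<String, String> = MetricsCache::new(5); // 5-minute TTL
+/// cache.set("k".to_string(), "v".to_string()).await;
+/// assert_eq!(cache.get(&"k".to_string()).await, Some("v".to_string()));
+/// ```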
 pub struct MetricsCache<K, V>
 where
     K: Eq + Hash + Clone,
 {
     cache: Arc<RwLock<HashMap<K, CacheEntry<V>>>>,
     default_ttl_minutes: i64,
+    max_capacity: usize,
 }
 
 impl<K, V> MetricsCache<K, V>
 where
@@ -74,6 +81,7 @@ where
         Self {
             cache: Arc::new(RwLock::new(HashMap::new())),
             default_ttl_minutes,
+            max_capacity: DEFAULT_MAX_CACHE_ENTRIES,
         }
     }
 
@@ -89,6 +97,27 @@ where
     pub async fn set(&self, key: K, value: V) {
         let mut cache = self.cache.write().await;
+
+        // Evict expired entries if at capacity
+        if cache.len() >= self.max_capacity {
+            cache.retain(|_, entry| !entry.is_expired());
+        }
+
+        // If still at capacity after expiry cleanup, remove oldest entries
+        if cache.len() >= self.max_capacity {
+            let to_remove = cache.len() - self.max_capacity + 1;
+            // Find the entries with the earliest expiration times
+            let mut entries_by_expiry: Vec<(K, chrono::DateTime<Utc>)> = cache
+                .iter()
+                .map(|(k, entry)| (k.clone(), entry.expires_at))
+                .collect();
+            entries_by_expiry.sort_by_key(|(_, expires_at)| *expires_at);
+
+            for (k, _) in entries_by_expiry.into_iter().take(to_remove) {
+                cache.remove(&k);
+            }
+        }
+
         cache.insert(key, CacheEntry::new(value, self.default_ttl_minutes));
     }
 
@@ -2648,4 +2677,217 @@ mod tests {
         // Overall conversion
         assert!((metrics.overall_conversion_rate - 66.67).abs() < 0.01);
     }
+
+    // ========================================================================
+    // MetricsCache unit tests (no DB required)
+    // ========================================================================
+
+    #[tokio::test]
+    async fn test_cache_basic_get_set() {
+        let cache: MetricsCache<String, String> = MetricsCache::new(5);
+
+        // Miss on empty cache
+        assert!(cache.get(&"key1".to_string()).await.is_none());
+
+        // Set and hit
+        cache.set("key1".to_string(), "value1".to_string()).await;
+        assert_eq!(
+            cache.get(&"key1".to_string()).await,
+            Some("value1".to_string())
+        );
+    }
+
+    #[tokio::test]
+    async fn test_cache_expired_entries_not_returned() {
+        // TTL of 0 minutes — entries expire immediately
+        let cache: MetricsCache<String, String> = MetricsCache::new(0);
+
+        cache.set("key1".to_string(), "value1".to_string()).await;
+
+        // Should miss because TTL is 0 (already expired)
+        assert!(
+            cache.get(&"key1".to_string()).await.is_none(),
+            "Expired entry should not be returned"
+        );
+    }
+
+    #[tokio::test]
+    async fn test_cache_invalidate() {
+        let cache: MetricsCache<String, String> = MetricsCache::new(60);
+
+        cache.set("key1".to_string(), "value1".to_string()).await;
+        assert!(cache.get(&"key1".to_string()).await.is_some());
+
+        cache.invalidate(&"key1".to_string()).await;
+        assert!(cache.get(&"key1".to_string()).await.is_none());
+    }
+
+    #[tokio::test]
+    async fn test_cache_cleanup_expired() {
+        // TTL of 0 minutes
+        let cache: MetricsCache<String, String> = MetricsCache::new(0);
+
+        cache.set("key1".to_string(), "value1".to_string()).await;
+        cache.set("key2".to_string(), "value2".to_string()).await;
+
+        // Both entries are expired (TTL=0), cleanup should remove them
+        cache.cleanup_expired().await;
+
+        let inner = cache.cache.read().await;
+        assert_eq!(inner.len(), 0, "All expired entries should be cleaned up");
+    }
+
+    #[tokio::test]
+    async fn test_cache_bounded_capacity_evicts_expired_first() {
+        // Create cache with very small max_capacity for testing
+        let cache: MetricsCache<i32, String> = MetricsCache {
+            cache: Arc::new(RwLock::new(HashMap::new())),
+            default_ttl_minutes: 60,
+            max_capacity: 3,
+        };
+
+        // Fill to capacity
+        cache.set(1, "a".to_string()).await;
+        cache.set(2, "b".to_string()).await;
+        cache.set(3, "c".to_string()).await;
+
+        {
+            let inner = cache.cache.read().await;
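+            // Nothing has expired yet (60-minute TTL), and set() only evicts when
+            // a new insert finds the cache at or above max_capacity.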
+            assert_eq!(inner.len(), 3, "Cache should be at capacity");
+        }
+
+        // Manually expire entry 1
+        {
+            let mut inner = cache.cache.write().await;
+            if let Some(entry) = inner.get_mut(&1) {
+                entry.expires_at = Utc::now() - Duration::minutes(1);
+            }
+        }
+
+        // Adding a new entry should evict the expired entry first
+        cache.set(4, "d".to_string()).await;
+
+        {
+            let inner = cache.cache.read().await;
+            assert_eq!(inner.len(), 3, "Should still be at capacity (3)");
+            assert!(
+                !inner.contains_key(&1),
+                "Expired entry 1 should have been evicted"
+            );
+            assert!(inner.contains_key(&4), "New entry 4 should be present");
+        }
+    }
+
+    #[tokio::test]
+    async fn test_cache_bounded_capacity_evicts_oldest_when_no_expired() {
+        let cache: MetricsCache<i32, String> = MetricsCache {
+            cache: Arc::new(RwLock::new(HashMap::new())),
+            default_ttl_minutes: 60,
+            max_capacity: 3,
+        };
+
+        // Fill to capacity with entries that expire at different times
+        cache.set(1, "a".to_string()).await;
+        cache.set(2, "b".to_string()).await;
+        cache.set(3, "c".to_string()).await;
+
+        // Give the entries staggered expirations by manipulating the cache
+        // directly; entry 1 expires soonest (oldest expiration).
+        {
+            let mut inner = cache.cache.write().await;
+            if let Some(entry) = inner.get_mut(&1) {
+                entry.expires_at = Utc::now() + Duration::minutes(1); // soonest
+            }
+            if let Some(entry) = inner.get_mut(&2) {
+                entry.expires_at = Utc::now() + Duration::minutes(30);
+            }
+            if let Some(entry) = inner.get_mut(&3) {
+                entry.expires_at = Utc::now() + Duration::minutes(60);
+            }
+        }
+
+        // No expired entries, so adding a 4th should evict the one with earliest expiration (key=1)
+        cache.set(4, "d".to_string()).await;
+
+        {
+            let inner = cache.cache.read().await;
+            assert_eq!(inner.len(), 3, "Should still be at capacity");
+            assert!(
+                !inner.contains_key(&1),
+                "Oldest-by-expiration entry (key=1) should be evicted"
+            );
+            assert!(inner.contains_key(&2), "Entry 2 should remain");
+            assert!(inner.contains_key(&3), "Entry 3 should remain");
+            assert!(inner.contains_key(&4), "New entry 4 should be present");
+        }
+    }
+
+    #[tokio::test]
+    async fn test_cache_under_capacity_does_not_evict() {
+        let cache: MetricsCache<i32, String> = MetricsCache {
+            cache: Arc::new(RwLock::new(HashMap::new())),
+            default_ttl_minutes: 60,
+            max_capacity: 10,
+        };
+
+        cache.set(1, "a".to_string()).await;
+        cache.set(2, "b".to_string()).await;
+        cache.set(3, "c".to_string()).await;
+
+        {
+            let inner = cache.cache.read().await;
+            assert_eq!(inner.len(), 3, "All 3 entries should be present");
+        }
+    }
+
+    #[tokio::test]
+    async fn test_cache_invalidate_by_funnel() {
+        let cache: MetricsCache<FunnelMetricsCacheKey, String> = MetricsCache::new(60);
+
+        let key1 = FunnelMetricsCacheKey {
+            funnel_id: 1,
+            environment_id: None,
+            country_code: None,
+            start_date: None,
+            end_date: None,
+        };
+        let key2 = FunnelMetricsCacheKey {
+            funnel_id: 1,
+            environment_id: Some(5),
+            country_code: None,
+            start_date: None,
+            end_date: None,
+        };
+        let key3 = FunnelMetricsCacheKey {
+            funnel_id: 2,
+            environment_id: None,
+            country_code: None,
+            start_date: None,
+            end_date: None,
+        };
+
+        cache.set(key1, "metrics1".to_string()).await;
+        cache.set(key2, "metrics2".to_string()).await;
+        cache.set(key3, "metrics3".to_string()).await;
+
+        // Invalidate funnel 1 — should remove key1 and key2, keep key3
+        cache.invalidate_by_funnel(1).await;
+
+        {
+            let inner = cache.cache.read().await;
+            assert_eq!(inner.len(), 1, "Only funnel 2 entry should remain");
+            assert!(
+                inner.contains_key(&FunnelMetricsCacheKey {
+                    funnel_id: 2,
+                    environment_id: None,
+                    country_code: None,
+                    start_date: None,
+                    end_date: None,
+                }),
+                "Funnel 2 entry should still be present"
+            );
+        }
+    }
 }
diff --git a/crates/temps-analytics-performance/src/services/service.rs b/crates/temps-analytics-performance/src/services/service.rs
index ee68297e..7de067e2 100644
--- a/crates/temps-analytics-performance/src/services/service.rs
+++ b/crates/temps-analytics-performance/src/services/service.rs
@@ -173,16 +173,6 @@ pub struct GroupedPageMetricsResponse {
     pub grouped_by: String,
 }
 
-// Internal struct for SQL query results
-#[derive(Debug, Default)]
-struct MetricPercentiles {
-    avg: Option<f32>,
-    p75: Option<f32>,
-    p90: Option<f32>,
-    p95: Option<f32>,
-    p99: Option<f32>,
-}
-
 pub struct PerformanceService {
     db: Arc<DatabaseConnection>,
 }
@@ -203,94 +193,216 @@ impl PerformanceService {
         }
     }
 
-    /// Apply device type filter to a SeaORM query, translating frontend values
-    /// to woothee categories stored in the database.
-    fn apply_device_filter(
-        query: Select<performance_metrics::Entity>,
-        device_type: &str,
-    ) -> Select<performance_metrics::Entity> {
-        let categories = Self::woothee_device_types(device_type);
-        query.filter(performance_metrics::Column::DeviceType.is_in(categories))
-    }
-
-    pub async fn get_metrics(
-        &self,
+    /// Build the WHERE clause and params for percentile/time-series SQL queries.
+    fn build_percentile_where(
+        project_id: i32,
         start_date: UtcDateTime,
         end_date: UtcDateTime,
-        project_id: i32,
         environment_id: Option<i32>,
         deployment_id: Option<i32>,
         device_type: Option<String>,
-    ) -> Result<PerformanceMetricsResponse, DbErr> {
-        // Get all metrics for the project and date range
-        let mut query = performance_metrics::Entity::find()
-            .filter(performance_metrics::Column::ProjectId.eq(project_id))
-            .filter(performance_metrics::Column::RecordedAt.between(start_date, end_date));
+    ) -> (String, Vec<sea_orm::Value>) {
+        let mut conditions = vec![
+            "project_id = $1".to_string(),
+            "recorded_at >= $2".to_string(),
+            "recorded_at <= $3".to_string(),
+        ];
+        let mut params: Vec<sea_orm::Value> =
+            vec![project_id.into(), start_date.into(), end_date.into()];
+        let mut idx = 3;
 
         if let Some(env_id) = environment_id {
-            query = query.filter(performance_metrics::Column::EnvironmentId.eq(env_id));
+            idx += 1;
+            conditions.push(format!("environment_id = ${}", idx));
+            params.push(env_id.into());
         }
 
         if let Some(dep_id) = deployment_id {
-            query = query.filter(performance_metrics::Column::DeploymentId.eq(dep_id));
+            idx += 1;
+            conditions.push(format!("deployment_id = ${}", idx));
+            params.push(dep_id.into());
         }
 
         if let Some(ref device) = device_type {
-            query = Self::apply_device_filter(query, device);
+            let categories = Self::woothee_device_types(device);
+            let placeholders: Vec<String> = categories
+                .iter()
+                .map(|c| {
+                    idx += 1;
+                    params.push(c.clone().into());
+                    format!("${}", idx)
+                })
+                .collect();
+            conditions.push(format!("device_type IN ({})", placeholders.join(", ")));
         }
 
-        let metrics = query.all(self.db.as_ref()).await?;
+        (conditions.join(" AND "), params)
+    }
 
-        // Calculate simple averages and percentiles
-        let ttfb_values: Vec<f32> = metrics.iter().filter_map(|m| m.ttfb).collect();
-        let lcp_values: Vec<f32> = metrics.iter().filter_map(|m| m.lcp).collect();
-        let fid_values: Vec<f32> = metrics.iter().filter_map(|m| m.fid).collect();
-        let fcp_values: Vec<f32> = metrics.iter().filter_map(|m| m.fcp).collect();
-        let cls_values: Vec<f32> = metrics.iter().filter_map(|m| m.cls).collect();
-        let inp_values: Vec<f32> = metrics.iter().filter_map(|m| m.inp).collect();
+    pub async fn get_metrics(
+        &self,
+        start_date: UtcDateTime,
+        end_date: UtcDateTime,
+        project_id: i32,
+        environment_id: Option<i32>,
+        deployment_id: Option<i32>,
+        device_type: Option<String>,
+    ) -> Result<PerformanceMetricsResponse, DbErr> {
+        let (where_clause, params) = Self::build_percentile_where(
+            project_id,
+            start_date,
+            end_date,
+            environment_id,
+            deployment_id,
+            device_type,
+        );
+
+        // Use SQL percentile_cont to compute all stats in a single query — no data loaded into Rust
+        let sql = format!(
+            r#"
+            SELECT
+                AVG(ttfb)::float4 as ttfb_avg,
+                AVG(lcp)::float4 as lcp_avg,
+                AVG(fid)::float4 as fid_avg,
+                AVG(fcp)::float4 as fcp_avg,
+                AVG(cls)::float4 as cls_avg,
+                AVG(inp)::float4 as inp_avg,
+                (percentile_cont(0.75) WITHIN GROUP (ORDER BY ttfb))::float4 as ttfb_p75,
+                (percentile_cont(0.75) WITHIN GROUP (ORDER BY lcp))::float4 as lcp_p75,
+                (percentile_cont(0.75) WITHIN GROUP (ORDER BY fid))::float4 as fid_p75,
+                (percentile_cont(0.75) WITHIN GROUP (ORDER BY fcp))::float4 as fcp_p75,
+                (percentile_cont(0.75) WITHIN GROUP (ORDER BY cls))::float4 as cls_p75,
+                (percentile_cont(0.75) WITHIN GROUP (ORDER BY inp))::float4 as inp_p75,
+                (percentile_cont(0.90) WITHIN GROUP (ORDER BY ttfb))::float4 as ttfb_p90,
+                (percentile_cont(0.90) WITHIN GROUP (ORDER BY lcp))::float4 as lcp_p90,
+                (percentile_cont(0.90) WITHIN GROUP (ORDER BY fid))::float4 as fid_p90,
+                (percentile_cont(0.90) WITHIN GROUP (ORDER BY fcp))::float4 as fcp_p90,
+                (percentile_cont(0.90) WITHIN GROUP (ORDER BY cls))::float4 as cls_p90,
+                (percentile_cont(0.90) WITHIN GROUP (ORDER BY inp))::float4 as inp_p90,
+                (percentile_cont(0.95) WITHIN GROUP (ORDER BY ttfb))::float4 as ttfb_p95,
+                (percentile_cont(0.95) WITHIN GROUP (ORDER BY lcp))::float4 as lcp_p95,
+                (percentile_cont(0.95) WITHIN GROUP (ORDER BY fid))::float4 as fid_p95,
+                (percentile_cont(0.95) WITHIN GROUP (ORDER BY fcp))::float4 as fcp_p95,
+                (percentile_cont(0.95) WITHIN GROUP (ORDER BY cls))::float4 as cls_p95,
+                (percentile_cont(0.95) WITHIN GROUP (ORDER BY inp))::float4 as inp_p95,
+                (percentile_cont(0.99) WITHIN GROUP (ORDER BY ttfb))::float4 as ttfb_p99,
+                (percentile_cont(0.99) WITHIN GROUP (ORDER BY lcp))::float4 as lcp_p99,
+                (percentile_cont(0.99) WITHIN GROUP (ORDER BY fid))::float4 as fid_p99,
+                (percentile_cont(0.99) WITHIN GROUP (ORDER BY fcp))::float4 as fcp_p99,
+                (percentile_cont(0.99) WITHIN GROUP (ORDER BY cls))::float4 as cls_p99,
+                (percentile_cont(0.99) WITHIN GROUP (ORDER BY inp))::float4 as inp_p99
+            FROM performance_metrics
+            WHERE {}
+            "#,
+            where_clause
+        );
+
+        #[derive(FromQueryResult)]
+        struct PercentileRow {
+            ttfb_avg: Option<f32>,
+            lcp_avg: Option<f32>,
+            fid_avg: Option<f32>,
+            fcp_avg: Option<f32>,
+            cls_avg: Option<f32>,
+            inp_avg: Option<f32>,
+            ttfb_p75: Option<f32>,
+            lcp_p75: Option<f32>,
+            fid_p75: Option<f32>,
+            fcp_p75: Option<f32>,
+            cls_p75: Option<f32>,
+            inp_p75: Option<f32>,
+            ttfb_p90: Option<f32>,
+            lcp_p90: Option<f32>,
+            fid_p90: Option<f32>,
+            fcp_p90: Option<f32>,
+            cls_p90: Option<f32>,
+            inp_p90: Option<f32>,
+            ttfb_p95: Option<f32>,
+            lcp_p95: Option<f32>,
+            fid_p95: Option<f32>,
+            fcp_p95: Option<f32>,
+            cls_p95: Option<f32>,
+            inp_p95: Option<f32>,
+            ttfb_p99: Option<f32>,
+            lcp_p99: Option<f32>,
+            fid_p99: Option<f32>,
+            fcp_p99: Option<f32>,
+            cls_p99: Option<f32>,
+            inp_p99: Option<f32>,
+        }
 
-        let ttfb = Self::calculate_stats(&ttfb_values);
-        let lcp = Self::calculate_stats(&lcp_values);
-        let fid = Self::calculate_stats(&fid_values);
-        let fcp = Self::calculate_stats(&fcp_values);
-        let cls = Self::calculate_stats(&cls_values);
-        let inp = Self::calculate_stats(&inp_values);
+        let row = PercentileRow::find_by_statement(Statement::from_sql_and_values(
+            DatabaseBackend::Postgres,
+            &sql,
+            params,
+        ))
+        .one(self.db.as_ref())
+        .await?;
+
+        let row = row.unwrap_or(PercentileRow {
+            ttfb_avg: None,
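+            // Defensive default: an ungrouped aggregate returns exactly one row
+            // (all NULLs when no rows match the filter), so this fallback only
+            // covers the unlikely case of the driver yielding no row at all.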
+            lcp_avg: None,
+            fid_avg: None,
+            fcp_avg: None,
+            cls_avg: None,
+            inp_avg: None,
+            ttfb_p75: None,
+            lcp_p75: None,
+            fid_p75: None,
+            fcp_p75: None,
+            cls_p75: None,
+            inp_p75: None,
+            ttfb_p90: None,
+            lcp_p90: None,
+            fid_p90: None,
+            fcp_p90: None,
+            cls_p90: None,
+            inp_p90: None,
+            ttfb_p95: None,
+            lcp_p95: None,
+            fid_p95: None,
+            fcp_p95: None,
+            cls_p95: None,
+            inp_p95: None,
+            ttfb_p99: None,
+            lcp_p99: None,
+            fid_p99: None,
+            fcp_p99: None,
+            cls_p99: None,
+            inp_p99: None,
+        });
 
         Ok(PerformanceMetricsResponse {
-            ttfb: ttfb.avg,
-            lcp: lcp.avg,
-            fid: fid.avg,
-            fcp: fcp.avg,
-            cls: cls.avg,
-            inp: inp.avg,
-
-            ttfb_p75: ttfb.p75,
-            lcp_p75: lcp.p75,
-            fid_p75: fid.p75,
-            fcp_p75: fcp.p75,
-            cls_p75: cls.p75,
-            inp_p75: inp.p75,
-
-            ttfb_p90: ttfb.p90,
-            lcp_p90: lcp.p90,
-            fid_p90: fid.p90,
-            fcp_p90: fcp.p90,
-            cls_p90: cls.p90,
-            inp_p90: inp.p90,
-
-            ttfb_p95: ttfb.p95,
-            lcp_p95: lcp.p95,
-            fid_p95: fid.p95,
-            fcp_p95: fcp.p95,
-            cls_p95: cls.p95,
-            inp_p95: inp.p95,
-
-            ttfb_p99: ttfb.p99,
-            lcp_p99: lcp.p99,
-            fid_p99: fid.p99,
-            fcp_p99: fcp.p99,
-            cls_p99: cls.p99,
-            inp_p99: inp.p99,
+            ttfb: row.ttfb_avg,
+            lcp: row.lcp_avg,
+            fid: row.fid_avg,
+            fcp: row.fcp_avg,
+            cls: row.cls_avg,
+            inp: row.inp_avg,
+            ttfb_p75: row.ttfb_p75,
+            lcp_p75: row.lcp_p75,
+            fid_p75: row.fid_p75,
+            fcp_p75: row.fcp_p75,
+            cls_p75: row.cls_p75,
+            inp_p75: row.inp_p75,
+            ttfb_p90: row.ttfb_p90,
+            lcp_p90: row.lcp_p90,
+            fid_p90: row.fid_p90,
+            fcp_p90: row.fcp_p90,
+            cls_p90: row.cls_p90,
+            inp_p90: row.inp_p90,
+            ttfb_p95: row.ttfb_p95,
+            lcp_p95: row.lcp_p95,
+            fid_p95: row.fid_p95,
+            fcp_p95: row.fcp_p95,
+            cls_p95: row.cls_p95,
+            inp_p95: row.inp_p95,
+            ttfb_p99: row.ttfb_p99,
+            lcp_p99: row.lcp_p99,
+            fid_p99: row.fid_p99,
+            fcp_p99: row.fcp_p99,
+            cls_p99: row.cls_p99,
+            inp_p99: row.inp_p99,
         })
     }
 
@@ -303,105 +415,198 @@ impl PerformanceService {
         deployment_id: Option<i32>,
         device_type: Option<String>,
     ) -> Result<MetricsOverTimeResponse, DbErr> {
-        // Get all metrics for percentile calculations
-        let mut percentile_query = performance_metrics::Entity::find()
-            .filter(performance_metrics::Column::ProjectId.eq(project_id))
-            .filter(performance_metrics::Column::RecordedAt.between(start_date, end_date));
-
-        if let Some(env_id) = environment_id {
-            percentile_query =
-                percentile_query.filter(performance_metrics::Column::EnvironmentId.eq(env_id));
-        }
-
-        if let Some(dep_id) = deployment_id {
-            percentile_query =
-                percentile_query.filter(performance_metrics::Column::DeploymentId.eq(dep_id));
-        }
-
-        if let Some(ref device) = device_type {
-            percentile_query = Self::apply_device_filter(percentile_query, device);
-        }
-
-        let all_metrics = percentile_query.all(self.db.as_ref()).await?;
-
-        // Calculate percentiles
-        let ttfb_values: Vec<f32> = all_metrics.iter().filter_map(|m| m.ttfb).collect();
-        let lcp_values: Vec<f32> = all_metrics.iter().filter_map(|m| m.lcp).collect();
-        let fid_values: Vec<f32> = all_metrics.iter().filter_map(|m| m.fid).collect();
-        let fcp_values: Vec<f32> = all_metrics.iter().filter_map(|m| m.fcp).collect();
-        let cls_values: Vec<f32> = all_metrics.iter().filter_map(|m| m.cls).collect();
-        let inp_values: Vec<f32> = all_metrics.iter().filter_map(|m| m.inp).collect();
-
-        let ttfb = Self::calculate_stats(&ttfb_values);
-        let lcp = Self::calculate_stats(&lcp_values);
-        let fid = Self::calculate_stats(&fid_values);
-        let fcp = Self::calculate_stats(&fcp_values);
-        let cls = Self::calculate_stats(&cls_values);
-        let inp = Self::calculate_stats(&inp_values);
-
-        // Get time series data
-        let mut time_query = 
performance_metrics::Entity::find() - .filter(performance_metrics::Column::ProjectId.eq(project_id)) - .filter(performance_metrics::Column::RecordedAt.between(start_date, end_date)) - .order_by_asc(performance_metrics::Column::RecordedAt); + let (where_clause, params) = Self::build_percentile_where( + project_id, + start_date, + end_date, + environment_id, + deployment_id, + device_type, + ); - if let Some(env_id) = environment_id { - time_query = time_query.filter(performance_metrics::Column::EnvironmentId.eq(env_id)); - } + // Single SQL query: percentiles + time-bucketed averages in one round-trip. + // The percentiles CTE computes aggregates without loading rows into Rust. + // The time_series CTE uses time_bucket (TimescaleDB) for hourly averages. + let sql = format!( + r#" + WITH percentiles AS ( + SELECT + AVG(ttfb)::float4 as ttfb_avg, + AVG(lcp)::float4 as lcp_avg, + AVG(fid)::float4 as fid_avg, + AVG(fcp)::float4 as fcp_avg, + AVG(cls)::float4 as cls_avg, + AVG(inp)::float4 as inp_avg, + (percentile_cont(0.75) WITHIN GROUP (ORDER BY ttfb))::float4 as ttfb_p75, + (percentile_cont(0.75) WITHIN GROUP (ORDER BY lcp))::float4 as lcp_p75, + (percentile_cont(0.75) WITHIN GROUP (ORDER BY fid))::float4 as fid_p75, + (percentile_cont(0.75) WITHIN GROUP (ORDER BY fcp))::float4 as fcp_p75, + (percentile_cont(0.75) WITHIN GROUP (ORDER BY cls))::float4 as cls_p75, + (percentile_cont(0.75) WITHIN GROUP (ORDER BY inp))::float4 as inp_p75, + (percentile_cont(0.90) WITHIN GROUP (ORDER BY ttfb))::float4 as ttfb_p90, + (percentile_cont(0.90) WITHIN GROUP (ORDER BY lcp))::float4 as lcp_p90, + (percentile_cont(0.90) WITHIN GROUP (ORDER BY fid))::float4 as fid_p90, + (percentile_cont(0.90) WITHIN GROUP (ORDER BY fcp))::float4 as fcp_p90, + (percentile_cont(0.90) WITHIN GROUP (ORDER BY cls))::float4 as cls_p90, + (percentile_cont(0.90) WITHIN GROUP (ORDER BY inp))::float4 as inp_p90, + (percentile_cont(0.95) WITHIN GROUP (ORDER BY ttfb))::float4 as ttfb_p95, + (percentile_cont(0.95) WITHIN GROUP (ORDER BY lcp))::float4 as lcp_p95, + (percentile_cont(0.95) WITHIN GROUP (ORDER BY fid))::float4 as fid_p95, + (percentile_cont(0.95) WITHIN GROUP (ORDER BY fcp))::float4 as fcp_p95, + (percentile_cont(0.95) WITHIN GROUP (ORDER BY cls))::float4 as cls_p95, + (percentile_cont(0.95) WITHIN GROUP (ORDER BY inp))::float4 as inp_p95, + (percentile_cont(0.99) WITHIN GROUP (ORDER BY ttfb))::float4 as ttfb_p99, + (percentile_cont(0.99) WITHIN GROUP (ORDER BY lcp))::float4 as lcp_p99, + (percentile_cont(0.99) WITHIN GROUP (ORDER BY fid))::float4 as fid_p99, + (percentile_cont(0.99) WITHIN GROUP (ORDER BY fcp))::float4 as fcp_p99, + (percentile_cont(0.99) WITHIN GROUP (ORDER BY cls))::float4 as cls_p99, + (percentile_cont(0.99) WITHIN GROUP (ORDER BY inp))::float4 as inp_p99 + FROM performance_metrics + WHERE {where_clause} + ), + time_series AS ( + SELECT + time_bucket('1 hour', recorded_at) as bucket, + AVG(ttfb)::float4 as ttfb, + AVG(lcp)::float4 as lcp, + AVG(fid)::float4 as fid, + AVG(fcp)::float4 as fcp, + AVG(cls)::float4 as cls, + AVG(inp)::float4 as inp + FROM performance_metrics + WHERE {where_clause} + GROUP BY bucket + ORDER BY bucket ASC + ) + SELECT + ts.bucket as "timestamp", + ts.ttfb, ts.lcp, ts.fid, ts.fcp, ts.cls, ts.inp, + p.ttfb_p75, p.lcp_p75, p.fid_p75, p.fcp_p75, p.cls_p75, p.inp_p75, + p.ttfb_p90, p.lcp_p90, p.fid_p90, p.fcp_p90, p.cls_p90, p.inp_p90, + p.ttfb_p95, p.lcp_p95, p.fid_p95, p.fcp_p95, p.cls_p95, p.inp_p95, + p.ttfb_p99, p.lcp_p99, p.fid_p99, p.fcp_p99, p.cls_p99, p.inp_p99 + FROM 
time_series ts
+            CROSS JOIN percentiles p
+            "#,
+            where_clause = where_clause
+        );
 
-        if let Some(dep_id) = deployment_id {
-            time_query = time_query.filter(performance_metrics::Column::DeploymentId.eq(dep_id));
-        }
+        // Both CTEs repeat the same WHERE clause with identical parameter positions;
+        // PostgreSQL lets a $N placeholder be referenced any number of times, so the
+        // params vector is passed only once.
 
-        if let Some(ref device) = device_type {
-            time_query = Self::apply_device_filter(time_query, device);
+        #[derive(FromQueryResult)]
+        struct OverTimeRow {
+            timestamp: UtcDateTime,
+            ttfb: Option<f32>,
+            lcp: Option<f32>,
+            fid: Option<f32>,
+            fcp: Option<f32>,
+            cls: Option<f32>,
+            inp: Option<f32>,
+            ttfb_p75: Option<f32>,
+            lcp_p75: Option<f32>,
+            fid_p75: Option<f32>,
+            fcp_p75: Option<f32>,
+            cls_p75: Option<f32>,
+            inp_p75: Option<f32>,
+            ttfb_p90: Option<f32>,
+            lcp_p90: Option<f32>,
+            fid_p90: Option<f32>,
+            fcp_p90: Option<f32>,
+            cls_p90: Option<f32>,
+            inp_p90: Option<f32>,
+            ttfb_p95: Option<f32>,
+            lcp_p95: Option<f32>,
+            fid_p95: Option<f32>,
+            fcp_p95: Option<f32>,
+            cls_p95: Option<f32>,
+            inp_p95: Option<f32>,
+            ttfb_p99: Option<f32>,
+            lcp_p99: Option<f32>,
+            fid_p99: Option<f32>,
+            fcp_p99: Option<f32>,
+            cls_p99: Option<f32>,
+            inp_p99: Option<f32>,
         }
 
-        let metrics = time_query.all(self.db.as_ref()).await?;
+        let rows = OverTimeRow::find_by_statement(Statement::from_sql_and_values(
+            DatabaseBackend::Postgres,
+            &sql,
+            params,
+        ))
+        .all(self.db.as_ref())
+        .await?;
 
         let mut result = MetricsOverTimeResponse {
-            timestamps: Vec::new(),
-            ttfb: Vec::new(),
-            lcp: Vec::new(),
-            fid: Vec::new(),
-            fcp: Vec::new(),
-            cls: Vec::new(),
-            inp: Vec::new(),
-            // Single values for percentiles
-            ttfb_p75: ttfb.p75,
-            lcp_p75: lcp.p75,
-            fid_p75: fid.p75,
-            fcp_p75: fcp.p75,
-            cls_p75: cls.p75,
-            inp_p75: inp.p75,
-            ttfb_p90: ttfb.p90,
-            lcp_p90: lcp.p90,
-            fid_p90: fid.p90,
-            fcp_p90: fcp.p90,
-            cls_p90: cls.p90,
-            inp_p90: inp.p90,
-            ttfb_p95: ttfb.p95,
-            lcp_p95: lcp.p95,
-            fid_p95: fid.p95,
-            fcp_p95: fcp.p95,
-            cls_p95: cls.p95,
-            inp_p95: inp.p95,
-            ttfb_p99: ttfb.p99,
-            lcp_p99: lcp.p99,
-            fid_p99: fid.p99,
-            fcp_p99: fcp.p99,
-            cls_p99: cls.p99,
-            inp_p99: inp.p99,
+            timestamps: Vec::with_capacity(rows.len()),
+            ttfb: Vec::with_capacity(rows.len()),
+            lcp: Vec::with_capacity(rows.len()),
+            fid: Vec::with_capacity(rows.len()),
+            fcp: Vec::with_capacity(rows.len()),
+            cls: Vec::with_capacity(rows.len()),
+            inp: Vec::with_capacity(rows.len()),
+            // Percentiles are the same for every row; take from first (or None if empty)
+            ttfb_p75: None,
+            lcp_p75: None,
+            fid_p75: None,
+            fcp_p75: None,
+            cls_p75: None,
+            inp_p75: None,
+            ttfb_p90: None,
+            lcp_p90: None,
+            fid_p90: None,
+            fcp_p90: None,
+            cls_p90: None,
+            inp_p90: None,
+            ttfb_p95: None,
+            lcp_p95: None,
+            fid_p95: None,
+            fcp_p95: None,
+            cls_p95: None,
+            inp_p95: None,
+            ttfb_p99: None,
+            lcp_p99: None,
+            fid_p99: None,
+            fcp_p99: None,
+            cls_p99: None,
+            inp_p99: None,
         };
 
-        for metric in metrics {
-            result.timestamps.push(metric.recorded_at.to_rfc3339());
-            result.ttfb.push(metric.ttfb);
-            result.lcp.push(metric.lcp);
-            result.fid.push(metric.fid);
-            result.fcp.push(metric.fcp);
-            result.cls.push(metric.cls);
-            result.inp.push(metric.inp);
+        if let Some(first) = rows.first() {
+            result.ttfb_p75 = first.ttfb_p75;
+            result.lcp_p75 = first.lcp_p75;
+            result.fid_p75 = first.fid_p75;
+            result.fcp_p75 = first.fcp_p75;
+            result.cls_p75 = first.cls_p75;
+            result.inp_p75 = first.inp_p75;
+            result.ttfb_p90 = first.ttfb_p90;
+            result.lcp_p90 = first.lcp_p90;
+            result.fid_p90 = first.fid_p90;
+            result.fcp_p90 = first.fcp_p90;
+            result.cls_p90 = first.cls_p90;
+            result.inp_p90 = first.inp_p90;
+            result.ttfb_p95 = 
first.ttfb_p95; + result.lcp_p95 = first.lcp_p95; + result.fid_p95 = first.fid_p95; + result.fcp_p95 = first.fcp_p95; + result.cls_p95 = first.cls_p95; + result.inp_p95 = first.inp_p95; + result.ttfb_p99 = first.ttfb_p99; + result.lcp_p99 = first.lcp_p99; + result.fid_p99 = first.fid_p99; + result.fcp_p99 = first.fcp_p99; + result.cls_p99 = first.cls_p99; + result.inp_p99 = first.inp_p99; + } + + for row in &rows { + result.timestamps.push(row.timestamp.to_rfc3339()); + result.ttfb.push(row.ttfb); + result.lcp.push(row.lcp); + result.fid.push(row.fid); + result.fcp.push(row.fcp); + result.cls.push(row.cls); + result.inp.push(row.inp); } Ok(result) @@ -539,30 +744,6 @@ impl PerformanceService { }) } - fn calculate_stats(values: &[f32]) -> MetricPercentiles { - if values.is_empty() { - return MetricPercentiles::default(); - } - - let mut sorted = values.to_vec(); - sorted.sort_by(|a, b| a.partial_cmp(b).unwrap()); - - let len = sorted.len(); - let avg = sorted.iter().sum::() / len as f32; - let p75 = sorted.get((len * 75) / 100).copied(); - let p90 = sorted.get((len * 90) / 100).copied(); - let p95 = sorted.get((len * 95) / 100).copied(); - let p99 = sorted.get((len * 99) / 100).copied(); - - MetricPercentiles { - avg: Some(avg), - p75, - p90, - p95, - p99, - } - } - /// Record performance metrics from client pub async fn record_performance_metrics( &self, diff --git a/crates/temps-analytics-session-replay/src/services/service.rs b/crates/temps-analytics-session-replay/src/services/service.rs index b152d743..cd9c5538 100644 --- a/crates/temps-analytics-session-replay/src/services/service.rs +++ b/crates/temps-analytics-session-replay/src/services/service.rs @@ -1101,10 +1101,14 @@ impl SessionReplayService { .await? .ok_or_else(|| SessionReplayError::SessionNotFound(session_id.to_string()))?; - // Get events using the integer session ID + // Get events using the integer session ID. + // Limit to 50 000 events to avoid loading hundreds of MB for very long sessions. + const MAX_REPLAY_EVENTS: u64 = 50_000; + let events = session_replay_events::Entity::find() .filter(session_replay_events::Column::SessionId.eq(row.id)) .order_by_asc(session_replay_events::Column::Timestamp) + .limit(MAX_REPLAY_EVENTS) .all(self.db.as_ref()) .await?; @@ -1385,19 +1389,15 @@ impl SessionReplayService { session.is_active = Set(false); session.update(self.db.as_ref()).await?; - // Soft delete all events for this session using the integer ID - let events = session_replay_events::Entity::find() + // Soft delete all events for this session in a single UPDATE instead of + // loading every event into memory and updating one by one (N+1). 
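+        // Roughly the single statement this issues:
+        //   UPDATE session_replay_events SET is_active = false
+        //   WHERE session_id = $1 AND is_active = true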
+ session_replay_events::Entity::update_many() + .col_expr(session_replay_events::Column::IsActive, Expr::value(false)) .filter(session_replay_events::Column::SessionId.eq(session_int_id)) .filter(session_replay_events::Column::IsActive.eq(true)) - .all(self.db.as_ref()) + .exec(self.db.as_ref()) .await?; - for event in events { - let mut event: session_replay_events::ActiveModel = event.into(); - event.is_active = Set(false); - event.update(self.db.as_ref()).await?; - } - Ok(()) } } diff --git a/crates/temps-backup/src/services/backup.rs b/crates/temps-backup/src/services/backup.rs index 27d9ea86..08f7d648 100644 --- a/crates/temps-backup/src/services/backup.rs +++ b/crates/temps-backup/src/services/backup.rs @@ -429,9 +429,11 @@ impl BackupService { Ok(major_version.to_string()) } - /// Returns the Docker image tag for the given PostgreSQL major version + /// Returns the Docker image tag for the pg_dump sidecar container. + /// Temps requires TimescaleDB as its database, so the sidecar always uses the + /// timescaledb-ha image to ensure pg_dump has the extension available. fn get_postgres_image_tag(&self, major_version: &str) -> String { - format!("postgres:{}", major_version) + format!("timescale/timescaledb-ha:pg{}", major_version) } /// Pulls the specified PostgreSQL Docker image @@ -3128,14 +3130,64 @@ mod tests { object_result.err() ); + // Download the backup and verify it is a valid gzip-compressed pg_dump custom format. + // + // This is the key assertion for the TimescaleDB fix: if the sidecar image were plain + // postgres (missing the timescaledb extension), pg_dump would either fail with a non-zero + // exit code (caught earlier) or produce a corrupt/truncated dump. A valid dump must: + // 1. Start with gzip magic bytes 0x1f 0x8b + // 2. Decompress to a pg_dump custom-format file starting with "PGDMP" + // + // This rules out zero-byte files, plain-text error output, and partial dumps that + // happen to be non-zero in size. 
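+        // Magic-byte references: RFC 1952 fixes the first two bytes of every gzip
+        // stream at 0x1f 0x8b, and pg_dump's custom format opens with "PGDMP".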
+ let backup_bytes = s3_client + .get_object() + .bucket(bucket_name) + .key(&backup_result.s3_location) + .send() + .await + .expect("Failed to download backup file from S3") + .body + .collect() + .await + .expect("Failed to read backup body") + .into_bytes(); + + assert!( + backup_bytes.len() >= 2, + "Backup file too small to contain gzip magic bytes" + ); + assert_eq!( + &backup_bytes[..2], + &[0x1f, 0x8b], + "Backup file does not start with gzip magic bytes — not a valid gzip file" + ); + + let mut decoder = flate2::read::GzDecoder::new(&backup_bytes[..]); + let mut decompressed = Vec::new(); + std::io::Read::read_to_end(&mut decoder, &mut decompressed) + .expect("Failed to decompress backup — gzip stream is corrupt"); + + assert!( + decompressed.len() >= 5, + "Decompressed backup too small to contain pg_dump magic" + ); + assert_eq!( + &decompressed[..5], + b"PGDMP", + "Decompressed backup does not start with PGDMP — not a valid pg_dump custom format file" + ); + println!("\nāœ“ Integration test passed:"); - println!(" - Database container started"); + println!(" - Database container started (timescale/timescaledb-ha)"); println!(" - MinIO container started"); println!(" - Backup created with ID: {}", backup_result.id); println!( - " - Backup size: {} bytes", + " - Backup size: {} bytes (compressed)", backup_result.size_bytes.unwrap_or(0) ); + println!(" - Decompressed size: {} bytes", decompressed.len()); + println!(" - Backup format: valid gzip-compressed pg_dump custom format (PGDMP)"); println!(" - Objects in bucket: {}", object_count); } @@ -3661,4 +3713,59 @@ mod tests { _ => panic!("Expected validation error for empty name"), } } + + // ------------------------------------------------------------------------- + // TimescaleDB sidecar image selection + // ------------------------------------------------------------------------- + + fn make_backup_service() -> BackupService { + let db = Arc::new(MockDatabase::new(DatabaseBackend::Postgres).into_connection()); + BackupService::new( + db.clone(), + create_mock_external_service_manager(db), + create_mock_notification_service(), + create_mock_config_service(), + Arc::new(EncryptionService::new("test_encryption_key_1234567890ab").unwrap()), + ) + } + + /// The main Temps database always runs on TimescaleDB, so the pg_dump sidecar + /// must always use the timescaledb-ha image — never plain postgres. + #[test] + fn test_pg_dump_sidecar_always_uses_timescaledb_image() { + let svc = make_backup_service(); + + for major in ["15", "16", "17", "18"] { + let image = svc.get_postgres_image_tag(major); + assert!( + image.starts_with("timescale/timescaledb-ha:pg"), + "Expected timescaledb-ha image for version {major}, got: {image}" + ); + assert!( + image.ends_with(major), + "Image tag should end with the major version {major}, got: {image}" + ); + } + } + + /// The TimescaleDB version string format is "PostgreSQL 17.x on ..." — identical to + /// plain Postgres. Verify that parse_postgres_version correctly extracts the major + /// version from a real TimescaleDB SELECT version() output. 
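+    ///
+    /// For example, "PostgreSQL 17.4 on aarch64-unknown-linux-gnu, ..." should
+    /// reduce to "17", which maps to the image tag "timescale/timescaledb-ha:pg17".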
+    #[test]
+    fn test_parse_postgres_version_from_timescaledb_version_string() {
+        let svc = make_backup_service();
+
+        let timescaledb_version_string =
+            "PostgreSQL 17.4 on aarch64-unknown-linux-gnu, compiled by gcc (GCC) 13.2.0, 64-bit";
+
+        let major = svc
+            .parse_postgres_version(timescaledb_version_string)
+            .expect("Should parse TimescaleDB version string");
+
+        assert_eq!(major, "17");
+
+        // Confirm the full image tag is correct end-to-end
+        let image = svc.get_postgres_image_tag(&major);
+        assert_eq!(image, "timescale/timescaledb-ha:pg17");
+    }
 }
diff --git a/crates/temps-deployer/src/docker.rs b/crates/temps-deployer/src/docker.rs
index 7e5e0671..3718caaa 100644
--- a/crates/temps-deployer/src/docker.rs
+++ b/crates/temps-deployer/src/docker.rs
@@ -76,18 +76,34 @@ impl DockerRuntime {
         use bytes::Bytes;
         use http_body_util::Full;
 
-        let mut tar_buffer = Vec::new();
-        {
-            let mut tar_builder = tar::Builder::new(&mut tar_buffer);
+        // Write the tar archive to a temporary file to avoid holding the entire
+        // build context in memory. The temp file is cleaned up when `_tmp` drops.
+        let tmp = tempfile::NamedTempFile::new().map_err(BuilderError::IoError)?;
+        let tmp_path = tmp.path().to_path_buf();
+
+        // Tar creation is synchronous and CPU-bound — run it on a blocking thread.
+        let ctx = context_path.clone();
+        let out_path = tmp_path.clone();
+        tokio::task::spawn_blocking(move || {
+            let file = std::fs::File::create(&out_path).map_err(BuilderError::IoError)?;
+            let mut tar_builder = tar::Builder::new(file);
             tar_builder
-                .append_dir_all(".", context_path)
+                .append_dir_all(".", ctx)
                 .map_err(BuilderError::IoError)?;
             tar_builder.finish().map_err(BuilderError::IoError)?;
-        }
+            Ok::<(), BuilderError>(())
+        })
+        .await
+        .map_err(|e| BuilderError::Other(format!("Tar task panicked: {}", e)))??;
+
+        // Read the completed tar file back into memory for Bollard. The archive is
+        // still allocated once, but only after the tar builder has finished and been
+        // dropped, so the build context is never held in memory twice.
+        let tar_data = tokio::fs::read(&tmp_path)
+            .await
+            .map_err(BuilderError::IoError)?;
 
-        // Create body from tar buffer as expected by Bollard
-        // Return Full which will be converted to Either::Left automatically
-        Ok(Full::new(Bytes::from(tar_buffer)))
+        Ok(Full::new(Bytes::from(tar_data)))
     }
 
     fn get_resource_limits() -> (usize, u64) {
@@ -925,9 +941,17 @@ impl ContainerDeployer for DockerRuntime {
 
     async fn stop_container(&self, container_id: &str) -> Result<(), DeployerError> {
         self.docker
-            .stop_container(container_id, None::<StopContainerOptions>)
+            .stop_container(
+                container_id,
+                Some(StopContainerOptions {
+                    t: Some(10),
+                    signal: None,
+                }),
+            )
             .await
-            .map_err(|e| DeployerError::Other(format!("Failed to stop container: {}", e)))?;
+            .map_err(|e| {
+                DeployerError::Other(format!("Failed to stop container {}: {}", container_id, e))
+            })?;
 
         Ok(())
     }
@@ -1160,6 +1184,7 @@ impl ContainerDeployer for DockerRuntime {
             Some(LogsOptions {
                 stdout: true,
                 stderr: true,
+                tail: "10000".to_string(),
                 ..Default::default()
             }),
         )
diff --git a/crates/temps-deployments/src/handlers/remote_deployments.rs b/crates/temps-deployments/src/handlers/remote_deployments.rs
index 4117d2f8..698f16bc 100644
--- a/crates/temps-deployments/src/handlers/remote_deployments.rs
+++ b/crates/temps-deployments/src/handlers/remote_deployments.rs
@@ -326,8 +326,9 @@ pub async fn deploy_from_image(
         )));
     }
 
-    // 2. Verify environment exists and belongs to project
+    // 2. 
Verify environment exists, belongs to project, and is not deleted let environment = environments::Entity::find_by_id(environment_id) + .filter(environments::Column::DeletedAt.is_null()) .one(state.db.as_ref()) .await .map_err(|e| { @@ -576,8 +577,9 @@ pub async fn deploy_from_static( ))); } - // 2. Verify environment exists and belongs to project + // 2. Verify environment exists, belongs to project, and is not deleted let environment = environments::Entity::find_by_id(environment_id) + .filter(environments::Column::DeletedAt.is_null()) .one(state.db.as_ref()) .await .map_err(|e| { @@ -858,8 +860,9 @@ pub async fn deploy_from_image_upload( ))); } - // 2. Verify environment exists and belongs to project + // 2. Verify environment exists, belongs to project, and is not deleted let environment = environments::Entity::find_by_id(environment_id) + .filter(environments::Column::DeletedAt.is_null()) .one(state.db.as_ref()) .await .map_err(|e| { diff --git a/crates/temps-deployments/src/jobs/deploy_image.rs b/crates/temps-deployments/src/jobs/deploy_image.rs index 4828a3bd..3b59d13d 100644 --- a/crates/temps-deployments/src/jobs/deploy_image.rs +++ b/crates/temps-deployments/src/jobs/deploy_image.rs @@ -8,12 +8,15 @@ use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::path::PathBuf; use std::sync::{Arc, Mutex}; -use temps_core::{JobResult, WorkflowContext, WorkflowError, WorkflowTask}; +use temps_core::{ + JobResult, WorkflowCancellationProvider, WorkflowContext, WorkflowError, WorkflowTask, +}; use temps_deployer::{ ContainerDeployer, ContainerLogConfig, ContainerStatus as DeployerContainerStatus, DeployRequest, PortMapping, Protocol, ResourceLimits, RestartPolicy, }; use temps_logs::{LogLevel, LogService}; +use tokio::time::{sleep, Duration}; /// Typed output from BuildImageJob #[derive(Debug, Clone, Serialize, Deserialize)] @@ -1136,6 +1139,67 @@ impl WorkflowTask for DeployImageJob { Ok(JobResult::success(context)) } + async fn execute_with_cancellation( + &self, + context: WorkflowContext, + cancellation_provider: &dyn WorkflowCancellationProvider, + ) -> Result { + let workflow_run_id = context.workflow_run_id.clone(); + + // Check if already cancelled before starting + if cancellation_provider.is_cancelled(&workflow_run_id).await? { + self.log( + &context, + "Deploy cancelled before starting - deployment was cancelled by user".to_string(), + ) + .await + .ok(); + return Err(WorkflowError::BuildCancelled); + } + + // Create cancellation check future that polls every 2 seconds + let cancellation_check = async { + loop { + sleep(Duration::from_secs(2)).await; + + match cancellation_provider.is_cancelled(&workflow_run_id).await { + Ok(true) => { + // Cancellation detected + return; + } + Ok(false) => { + // Continue checking + } + Err(_) => { + // Error checking cancellation - stop polling + break; + } + } + } + }; + + // Race between deploy execution and cancellation detection + let deploy_future = self.execute(context.clone()); + + tokio::select! 
{ + result = deploy_future => { + // Deploy completed (success or failure) + result + } + _ = cancellation_check => { + // Cancellation detected during deploy + self.log( + &context, + "Deploy cancelled by user - stopping container deployment".to_string(), + ) + .await + .ok(); + + Err(WorkflowError::BuildCancelled) + } + } + } + async fn validate_prerequisites(&self, context: &WorkflowContext) -> Result<(), WorkflowError> { // If external image is provided, skip build job validation if self.external_image_tag.is_some() { diff --git a/crates/temps-deployments/src/jobs/mark_deployment_complete.rs b/crates/temps-deployments/src/jobs/mark_deployment_complete.rs index 23f756bd..95b7037a 100644 --- a/crates/temps-deployments/src/jobs/mark_deployment_complete.rs +++ b/crates/temps-deployments/src/jobs/mark_deployment_complete.rs @@ -297,8 +297,9 @@ impl MarkDeploymentCompleteJob { )) .await?; - // Update environment's current_deployment_id + // Update environment's current_deployment_id (only if environment is not soft-deleted) let environment = environments::Entity::find_by_id(environment_id) + .filter(environments::Column::DeletedAt.is_null()) .one(self.db.as_ref()) .await .map_err(|e| { @@ -306,7 +307,7 @@ impl MarkDeploymentCompleteJob { })? .ok_or_else(|| { WorkflowError::JobExecutionFailed(format!( - "Environment {} not found", + "Environment {} not found or was deleted", environment_id )) })?; diff --git a/crates/temps-deployments/src/services/job_processor.rs b/crates/temps-deployments/src/services/job_processor.rs index fd621573..c04579db 100644 --- a/crates/temps-deployments/src/services/job_processor.rs +++ b/crates/temps-deployments/src/services/job_processor.rs @@ -338,6 +338,30 @@ async fn find_or_create_environment_for_branch( return Ok(existing_preview); } + // Check if a soft-deleted preview environment exists for this branch — restore it + if let Some(deleted_preview) = environments::Entity::find() + .filter(environments::Column::ProjectId.eq(project.id)) + .filter(environments::Column::Branch.eq(branch_name)) + .filter(environments::Column::DeletedAt.is_not_null()) + .one(db.as_ref()) + .await + .map_err(|e| format!("Database error finding deleted preview environment: {}", e))? 
+ { + info!( + "Restoring soft-deleted preview environment {} for branch '{}'", + deleted_preview.id, branch_name + ); + let mut active_env: environments::ActiveModel = deleted_preview.into(); + active_env.deleted_at = Set(None); + active_env.updated_at = Set(chrono::Utc::now()); + active_env.current_deployment_id = Set(None); + let restored = active_env + .update(db.as_ref()) + .await + .map_err(|e| format!("Failed to restore preview environment: {}", e))?; + return Ok(restored); + } + // Create new preview environment for this branch return create_preview_environment(db, project, branch_name, &slugified_branch).await; } diff --git a/crates/temps-deployments/src/services/services.rs b/crates/temps-deployments/src/services/services.rs index 58463bcd..3445a586 100644 --- a/crates/temps-deployments/src/services/services.rs +++ b/crates/temps-deployments/src/services/services.rs @@ -1183,15 +1183,27 @@ impl DeploymentService { .await?; for container in containers { - self.deployer - .stop_container(&container.container_id) - .await - .map_err(|e| { - DeploymentError::Other(format!( - "Failed to stop container {}: {}", + // Stop container with timeout - don't fail the whole teardown if one container fails + match tokio::time::timeout( + std::time::Duration::from_secs(30), + self.deployer.stop_container(&container.container_id), + ) + .await + { + Ok(Ok(())) => {} + Ok(Err(e)) => { + warn!( + "Failed to stop container {} during teardown: {} (continuing)", container.container_id, e - )) - })?; + ); + } + Err(_) => { + warn!( + "Timed out stopping container {} after 30s during teardown (continuing)", + container.container_id + ); + } + } // Mark container as deleted let mut active_container: deployment_containers::ActiveModel = container.into(); @@ -1775,24 +1787,48 @@ impl DeploymentService { container.container_id, environment_id ); - // Stop the container - if let Err(e) = self.deployer.stop_container(&container.container_id).await { - warn!( - "Failed to stop container {}: {} (continuing anyway)", - container.container_id, e - ); + // Stop the container with a 30-second timeout to prevent hanging + match tokio::time::timeout( + std::time::Duration::from_secs(30), + self.deployer.stop_container(&container.container_id), + ) + .await + { + Ok(Ok(())) => {} + Ok(Err(e)) => { + warn!( + "Failed to stop container {}: {} (continuing anyway)", + container.container_id, e + ); + } + Err(_) => { + warn!( + "Timed out stopping container {} after 30s (continuing anyway)", + container.container_id + ); + } } - // Remove the container - if let Err(e) = self - .deployer - .remove_container(&container.container_id) - .await + // Remove the container with a 15-second timeout + match tokio::time::timeout( + std::time::Duration::from_secs(15), + self.deployer.remove_container(&container.container_id), + ) + .await { - warn!( - "Failed to remove container {}: {} (continuing anyway)", - container.container_id, e - ); + Ok(Ok(())) => {} + Ok(Err(e)) => { + warn!( + "Failed to remove container {}: {} (continuing anyway)", + container.container_id, e + ); + } + Err(_) => { + warn!( + "Timed out removing container {} after 15s (continuing anyway)", + container.container_id + ); + } } // Update container status to stopped diff --git a/crates/temps-deployments/src/services/workflow_execution_service.rs b/crates/temps-deployments/src/services/workflow_execution_service.rs index f04e31a7..eb2778ba 100644 --- a/crates/temps-deployments/src/services/workflow_execution_service.rs +++ 
b/crates/temps-deployments/src/services/workflow_execution_service.rs
@@ -1297,8 +1297,9 @@ impl WorkflowExecutionService {
         // Get the deployment to find its environment_id
         let deployment = self.get_deployment(deployment_id).await?;
 
-        // Update the environment
+        // Update the environment (only if not soft-deleted)
         let environment = environments::Entity::find_by_id(deployment.environment_id)
+            .filter(environments::Column::DeletedAt.is_null())
            .one(self.db.as_ref())
            .await?
            .ok_or_else(|| {
diff --git a/crates/temps-environments/src/services/environment_service.rs b/crates/temps-environments/src/services/environment_service.rs
index 52240904..d5b5ed1a 100644
--- a/crates/temps-environments/src/services/environment_service.rs
+++ b/crates/temps-environments/src/services/environment_service.rs
@@ -6,7 +6,7 @@ use serde::Serialize;
 use slug::slugify;
 use std::sync::Arc;
 use temps_core::problemdetails::Problem;
-use temps_core::{EnvironmentCreatedJob, Job, JobQueue};
+use temps_core::{EnvironmentCreatedJob, EnvironmentDeletedJob, Job, JobQueue};
 use temps_entities::{environment_domains, environments, projects};
 use thiserror::Error;
 use tracing::{info, warn};
@@ -25,6 +25,15 @@ pub enum EnvironmentError {
     #[error("Invalid input: {0}")]
     InvalidInput(String),
 
+    #[error(
+        "Branch '{branch}' is already used by environment '{env_name}' in project {project_id}"
+    )]
+    BranchAlreadyInUse {
+        branch: String,
+        env_name: String,
+        project_id: i32,
+    },
+
     #[error("Other error: {0}")]
     Other(String),
 }
@@ -59,6 +68,10 @@ impl From<EnvironmentError> for Problem {
                 .detail(reason)
                 .build()
         }
+        EnvironmentError::BranchAlreadyInUse { .. } => temps_core::error_builder::bad_request()
+            .title("Branch Already In Use")
+            .detail(error.to_string())
+            .build(),
         EnvironmentError::Other(msg) => temps_core::error_builder::internal_server_error()
             .detail(msg)
             .build(),
@@ -132,20 +145,40 @@ impl EnvironmentService {
         memory_limit: Option,
         branch: String,
     ) -> anyhow::Result<environments::Model> {
-        // Start a transaction
-        let txn = self.db.begin().await?;
-
         // Get the project slug
         let project = projects::Entity::find_by_id(project_id)
-            .one(&txn)
+            .one(self.db.as_ref())
             .await?
             .ok_or_else(|| anyhow::anyhow!("Project not found"))?;
 
+        // Check if a soft-deleted environment with this branch exists — restore it
+        if let Some(deleted_env) = environments::Entity::find()
+            .filter(environments::Column::ProjectId.eq(project_id))
+            .filter(environments::Column::Branch.eq(&branch))
+            .filter(environments::Column::DeletedAt.is_not_null())
+            .one(self.db.as_ref())
+            .await?
+        {
+            info!(
+                "Restoring soft-deleted environment {} for branch '{}' in project {}",
+                deleted_env.id, branch, project_id
+            );
+            let mut active_env: environments::ActiveModel = deleted_env.into();
+            active_env.deleted_at = Set(None);
+            active_env.updated_at = Set(chrono::Utc::now());
+            active_env.current_deployment_id = Set(None);
+            let restored = active_env.update(self.db.as_ref()).await?;
+            return Ok(restored);
+        }
+
         let env_slug = slugify(&name);
 
         // Create main_url using project_slug-env_slug format
         let main_url = format!("{}-{}", project.slug, env_slug);
 
+        // Start a transaction for insert + domain creation
+        let txn = self.db.begin().await?;
+
         // Create the new environment
         let new_environment = environments::ActiveModel {
             project_id: Set(project_id),
@@ -218,6 +251,7 @@
     ) -> Result<Vec<environments::Model>, EnvironmentError> {
         let envs = environments::Entity::find()
             .filter(environments::Column::ProjectId.eq(project_id_p))
+            .filter(environments::Column::DeletedAt.is_null())
             .order_by_asc(environments::Column::CreatedAt)
             .all(self.db.as_ref())
             .await?;
@@ -243,10 +277,10 @@
         project_id_p: i32,
         env_id: i32,
     ) -> Result<environments::Model, EnvironmentError> {
-        // If not found by ID or if parsing failed, try by slug
         let environment = environments::Entity::find()
             .filter(environments::Column::ProjectId.eq(project_id_p))
             .filter(environments::Column::Id.eq(env_id))
+            .filter(environments::Column::DeletedAt.is_null())
             .one(self.db.as_ref())
             .await?;
@@ -262,6 +296,7 @@
     ) -> Result<environments::Model, EnvironmentError> {
         let default_environment = environments::Entity::find()
             .filter(environments::Column::ProjectId.eq(project_id_p))
+            .filter(environments::Column::DeletedAt.is_null())
             .order_by_asc(environments::Column::CreatedAt)
             .one(self.db.as_ref())
             .await?;
@@ -269,7 +304,7 @@
         match default_environment {
             Some(env) => Ok(env),
             None => Err(EnvironmentError::NotFound(format!(
-                "Environment {} not found",
+                "No environment found for project {}",
                 project_id_p
             ))),
         }
@@ -295,10 +330,11 @@
             });
         }
 
-        // For non-main branches, continue with preview environment logic
+        // For non-main branches, find active environment for this branch
        let existing_env = environments::Entity::find()
            .filter(environments::Column::ProjectId.eq(project_id))
            .filter(environments::Column::Branch.eq(branch))
+            .filter(environments::Column::DeletedAt.is_null())
            .one(self.db.as_ref())
            .await
            .map_err(|e| EnvironmentError::Other(e.to_string()))?;
@@ -311,6 +347,29 @@
             return Ok(env);
         }
 
+        // Check for a soft-deleted environment with this branch and restore it
+        if let Some(deleted_env) = environments::Entity::find()
+            .filter(environments::Column::ProjectId.eq(project_id))
+            .filter(environments::Column::Branch.eq(branch))
+            .filter(environments::Column::DeletedAt.is_not_null())
+            .one(self.db.as_ref())
+            .await
+            .map_err(|e| EnvironmentError::Other(e.to_string()))?
+ { + info!( + "Restoring soft-deleted environment {} for branch '{}'", + deleted_env.id, branch + ); + let mut active_env: environments::ActiveModel = deleted_env.into(); + active_env.deleted_at = Set(None); + active_env.updated_at = Set(chrono::Utc::now()); + let restored = active_env + .update(self.db.as_ref()) + .await + .map_err(|e| EnvironmentError::Other(e.to_string()))?; + return Ok(restored); + } + let env_name = "preview"; info!("Creating new preview environment for branch: {}", branch); self.create_environment( @@ -344,10 +403,11 @@ impl EnvironmentService { EnvironmentError::NotFound(format!("Project {} not found", project_id)) })?; - // Check if environment with same name already exists + // Check if an active environment with same name already exists let existing_env = environments::Entity::find() .filter(environments::Column::ProjectId.eq(project_id)) .filter(environments::Column::Name.eq(&name)) + .filter(environments::Column::DeletedAt.is_null()) .one(self.db.as_ref()) .await .map_err(|e| EnvironmentError::Other(e.to_string()))?; @@ -358,6 +418,58 @@ impl EnvironmentService { )); } + // Check if another active environment in the same project already tracks this branch + // (applies to both restore and fresh creation paths) + let branch_conflict = environments::Entity::find() + .filter(environments::Column::ProjectId.eq(project_id)) + .filter(environments::Column::Branch.eq(&branch)) + .filter(environments::Column::DeletedAt.is_null()) + .one(self.db.as_ref()) + .await + .map_err(|e| EnvironmentError::DatabaseError { + reason: format!("Failed to check branch uniqueness: {}", e), + })?; + + if let Some(conflict) = branch_conflict { + return Err(EnvironmentError::BranchAlreadyInUse { + branch, + env_name: conflict.name, + project_id, + }); + } + + // Check if a soft-deleted environment with this name exists — restore it + if let Some(deleted_env) = environments::Entity::find() + .filter(environments::Column::ProjectId.eq(project_id)) + .filter(environments::Column::Name.eq(&name)) + .filter(environments::Column::DeletedAt.is_not_null()) + .one(self.db.as_ref()) + .await + .map_err(|e| EnvironmentError::Other(e.to_string()))? + { + info!( + "Restoring soft-deleted environment {} ('{}') in project {}", + deleted_env.id, name, project_id + ); + let mut active_env: environments::ActiveModel = deleted_env.into(); + active_env.deleted_at = Set(None); + active_env.branch = Set(Some(branch)); + active_env.updated_at = Set(chrono::Utc::now()); + active_env.current_deployment_id = Set(None); + if let Some(r) = replicas { + active_env.deployment_config = + Set(Some(temps_entities::deployment_config::DeploymentConfig { + replicas: r, + ..Default::default() + })); + } + let restored = active_env + .update(self.db.as_ref()) + .await + .map_err(|e| EnvironmentError::Other(e.to_string()))?; + return Ok(restored); + } + // Generate the environment identifier let env_slug = slugify(&name); @@ -469,6 +581,29 @@ impl EnvironmentService { EnvironmentError::InvalidInput(format!("Invalid deployment config: {}", e)) })?; + // If the branch is being changed, verify no other environment in the same project + // already tracks it. We exclude the environment being updated from the check. 
+ if let Some(ref new_branch) = settings.branch { + let branch_conflict = environments::Entity::find() + .filter(environments::Column::ProjectId.eq(project_id_param)) + .filter(environments::Column::Branch.eq(new_branch.as_str())) + .filter(environments::Column::Id.ne(env_id)) + .filter(environments::Column::DeletedAt.is_null()) + .one(self.db.as_ref()) + .await + .map_err(|e| EnvironmentError::DatabaseError { + reason: format!("Failed to check branch uniqueness: {}", e), + })?; + + if let Some(conflict) = branch_conflict { + return Err(EnvironmentError::BranchAlreadyInUse { + branch: new_branch.clone(), + env_name: conflict.name, + project_id: project_id_param, + }); + } + } + active_model.deployment_config = Set(Some(deployment_config)); active_model.branch = Set(settings.branch); active_model.updated_at = Set(chrono::Utc::now()); @@ -576,22 +711,25 @@ impl EnvironmentService { ))) } - /// Delete an environment permanently + /// Soft-delete an environment by setting its `deleted_at` timestamp. + /// + /// The environment row is preserved for historical data (deployments, analytics) + /// and can be restored if a new environment with the same name/branch is created. /// /// Prevents deletion of: /// - Production environments (name = "Production" case-insensitive) /// /// Note: Active deployments should be cancelled before calling this method - /// Warning: This permanently deletes the environment and related data pub async fn delete_environment( &self, project_id: i32, env_id: i32, ) -> Result<(), EnvironmentError> { - // Get the environment + // Get the environment (only non-deleted ones) let environment = environments::Entity::find() .filter(environments::Column::ProjectId.eq(project_id)) .filter(environments::Column::Id.eq(env_id)) + .filter(environments::Column::DeletedAt.is_null()) .one(self.db.as_ref()) .await? 
.ok_or_else(|| { @@ -605,14 +743,31 @@ impl EnvironmentService { )); } - // Delete the environment permanently - environments::Entity::delete_by_id(env_id) - .exec(self.db.as_ref()) - .await?; + // Emit EnvironmentDeleted job so subscribers can clean up + if let Some(queue_service) = &self.queue_service { + let env_deleted_job = Job::EnvironmentDeleted(EnvironmentDeletedJob { + environment_id: env_id, + environment_name: environment.name.clone(), + project_id, + }); + + if let Err(e) = queue_service.send(env_deleted_job).await { + warn!( + "Failed to emit EnvironmentDeleted job for environment {}: {}", + env_id, e + ); + } + } + + // Soft-delete: set deleted_at and clear current_deployment_id + let mut active_env: environments::ActiveModel = environment.into(); + active_env.deleted_at = Set(Some(chrono::Utc::now())); + active_env.current_deployment_id = Set(None); + active_env.update(self.db.as_ref()).await?; info!( - "Permanently deleted environment {} (slug: {}) in project {}", - env_id, environment.slug, project_id + "Soft-deleted environment {} in project {}", + env_id, project_id ); Ok(()) @@ -622,6 +777,22 @@ impl EnvironmentService { #[cfg(test)] mod tests { use super::*; + use sea_orm::{DatabaseBackend, MockDatabase, MockExecResult}; + + fn make_service(db: sea_orm::DatabaseConnection) -> EnvironmentService { + let server_config = temps_config::ServerConfig::new( + "127.0.0.1:3000".to_string(), + "postgres://localhost/test".to_string(), + None, + None, + ) + .unwrap(); + let config_service = Arc::new(temps_config::ConfigService::new( + Arc::new(server_config), + Arc::new(MockDatabase::new(DatabaseBackend::Postgres).into_connection()), + )); + EnvironmentService::new(Arc::new(db), config_service) + } #[test] fn test_environment_error_display() { @@ -635,6 +806,30 @@ mod tests { assert_eq!(error.to_string(), "Other error: some error"); } + #[test] + fn test_branch_already_in_use_error_display() { + let error = EnvironmentError::BranchAlreadyInUse { + branch: "main".to_string(), + env_name: "production".to_string(), + project_id: 42, + }; + assert_eq!( + error.to_string(), + "Branch 'main' is already used by environment 'production' in project 42" + ); + } + + #[test] + fn test_branch_already_in_use_maps_to_bad_request() { + let error = EnvironmentError::BranchAlreadyInUse { + branch: "main".to_string(), + env_name: "production".to_string(), + project_id: 1, + }; + let problem = Problem::from(error); + assert_eq!(problem.status_code, axum::http::StatusCode::BAD_REQUEST); + } + #[test] fn test_domain_environment_struct() { let domain_env = DomainEnvironment { @@ -658,4 +853,231 @@ mod tests { _ => panic!("Expected NotFound error"), } } + + /// create_new_environment rejects a branch already tracked by another environment + /// in the same project. 
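+    /// Uses MockDatabase, so the mocked query results below must be appended
+    /// in the exact order the service issues its queries.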
+ #[tokio::test] + async fn test_create_environment_rejects_duplicate_branch() { + let existing_env = environments::Model { + id: 1, + name: "production".to_string(), + slug: "production".to_string(), + subdomain: "my-project-production".to_string(), + branch: Some("main".to_string()), + project_id: 10, + host: "".to_string(), + upstreams: temps_entities::upstream_config::UpstreamList::new(), + created_at: chrono::Utc::now().into(), + updated_at: chrono::Utc::now().into(), + last_deployment: None, + current_deployment_id: None, + deleted_at: None, + deployment_config: None, + is_preview: false, + }; + + let project = temps_entities::projects::Model { + id: 10, + name: "My Project".to_string(), + slug: "my-project".to_string(), + repo_name: "repo".to_string(), + repo_owner: "owner".to_string(), + directory: ".".to_string(), + main_branch: "main".to_string(), + preset: temps_entities::preset::Preset::NextJs, + preset_config: None, + deployment_config: None, + created_at: chrono::Utc::now().into(), + updated_at: chrono::Utc::now().into(), + is_deleted: false, + deleted_at: None, + last_deployment: None, + is_public_repo: true, + git_url: None, + git_provider_connection_id: None, + attack_mode: false, + enable_preview_environments: false, + source_type: temps_entities::source_type::SourceType::Git, + }; + + // Query sequence: + // 1. find project by id → returns project + // 2. find env by project_id+name → returns empty (no name conflict) + // 3. find env by project_id+branch → returns existing_env (branch conflict) + let db = MockDatabase::new(DatabaseBackend::Postgres) + .append_query_results(vec![vec![project]]) + .append_query_results(vec![Vec::::new()]) + .append_query_results(vec![vec![existing_env]]) + .into_connection(); + + let svc = make_service(db); + let result = svc + .create_new_environment(10, "staging".to_string(), "main".to_string(), None) + .await; + + assert!(result.is_err()); + assert!( + matches!( + result.unwrap_err(), + EnvironmentError::BranchAlreadyInUse { + branch, + env_name, + project_id: 10, + } if branch == "main" && env_name == "production" + ), + "Expected BranchAlreadyInUse error" + ); + } + + /// update_environment_settings rejects a branch already tracked by a different + /// environment in the same project. + #[tokio::test] + async fn test_update_settings_rejects_duplicate_branch() { + let current_env = environments::Model { + id: 2, + name: "staging".to_string(), + slug: "staging".to_string(), + subdomain: "my-project-staging".to_string(), + branch: Some("develop".to_string()), + project_id: 10, + host: "".to_string(), + upstreams: temps_entities::upstream_config::UpstreamList::new(), + created_at: chrono::Utc::now().into(), + updated_at: chrono::Utc::now().into(), + last_deployment: None, + current_deployment_id: None, + deleted_at: None, + deployment_config: None, + is_preview: false, + }; + + let conflicting_env = environments::Model { + id: 1, + name: "production".to_string(), + slug: "production".to_string(), + subdomain: "my-project-production".to_string(), + branch: Some("main".to_string()), + project_id: 10, + host: "".to_string(), + upstreams: temps_entities::upstream_config::UpstreamList::new(), + created_at: chrono::Utc::now().into(), + updated_at: chrono::Utc::now().into(), + last_deployment: None, + current_deployment_id: None, + deleted_at: None, + deployment_config: None, + is_preview: false, + }; + + // Query sequence: + // 1. get_environment (find by project_id + env_id) → returns current_env + // 2. 
find env by project_id + branch (excluding env_id=2) → returns conflicting_env + let db = MockDatabase::new(DatabaseBackend::Postgres) + .append_query_results(vec![vec![current_env]]) + .append_query_results(vec![vec![conflicting_env]]) + .into_connection(); + + let svc = make_service(db); + let result = svc + .update_environment_settings( + 10, + 2, + crate::handlers::UpdateEnvironmentSettingsRequest { + branch: Some("main".to_string()), + cpu_request: None, + cpu_limit: None, + memory_request: None, + memory_limit: None, + replicas: None, + exposed_port: None, + automatic_deploy: None, + performance_metrics_enabled: None, + session_recording_enabled: None, + security: None, + }, + ) + .await; + + assert!(result.is_err()); + assert!( + matches!( + result.unwrap_err(), + EnvironmentError::BranchAlreadyInUse { + branch, + env_name, + project_id: 10, + } if branch == "main" && env_name == "production" + ), + "Expected BranchAlreadyInUse error" + ); + } + + /// update_environment_settings allows updating other settings while keeping + /// the same branch (self-reference must not trigger the conflict check). + #[tokio::test] + async fn test_update_settings_allows_same_branch_on_same_env() { + let current_env = environments::Model { + id: 1, + name: "production".to_string(), + slug: "production".to_string(), + subdomain: "my-project-production".to_string(), + branch: Some("main".to_string()), + project_id: 10, + host: "".to_string(), + upstreams: temps_entities::upstream_config::UpstreamList::new(), + created_at: chrono::Utc::now().into(), + updated_at: chrono::Utc::now().into(), + last_deployment: None, + current_deployment_id: None, + deleted_at: None, + deployment_config: None, + is_preview: false, + }; + + // Query sequence: + // 1. get_environment → returns current_env + // 2. branch conflict check (id != 1) → returns empty (no other env uses "main") + // 3. 
update → returns updated env
+        let updated_env = environments::Model {
+            branch: Some("main".to_string()),
+            ..current_env.clone()
+        };
+
+        let db = MockDatabase::new(DatabaseBackend::Postgres)
+            .append_query_results(vec![vec![current_env]])
+            .append_query_results(vec![Vec::<environments::Model>::new()])
+            .append_query_results(vec![vec![updated_env]])
+            .append_exec_results(vec![MockExecResult {
+                last_insert_id: 0,
+                rows_affected: 1,
+            }])
+            .into_connection();
+
+        let svc = make_service(db);
+        let result = svc
+            .update_environment_settings(
+                10,
+                1,
+                crate::handlers::UpdateEnvironmentSettingsRequest {
+                    branch: Some("main".to_string()),
+                    cpu_request: None,
+                    cpu_limit: None,
+                    memory_request: None,
+                    memory_limit: None,
+                    replicas: None,
+                    exposed_port: None,
+                    automatic_deploy: None,
+                    performance_metrics_enabled: None,
+                    session_recording_enabled: None,
+                    security: None,
+                },
+            )
+            .await;
+
+        assert!(
+            result.is_ok(),
+            "Should allow keeping the same branch: {:?}",
+            result.err()
+        );
+    }
 }
diff --git a/crates/temps-error-tracking/src/services/error_ingestion_service.rs b/crates/temps-error-tracking/src/services/error_ingestion_service.rs
index a3b957dd..0a9da264 100644
--- a/crates/temps-error-tracking/src/services/error_ingestion_service.rs
+++ b/crates/temps-error-tracking/src/services/error_ingestion_service.rs
@@ -129,54 +129,52 @@ impl ErrorIngestionService {
     /// Replaces dynamic values (IDs, UUIDs, numbers, paths, URLs) with placeholders
     fn normalize_error_message(&self, message: &str) -> String {
         use regex::Regex;
+        use std::sync::LazyLock;
+
+        static UUID_RE: LazyLock<Regex> = LazyLock::new(|| {
+            Regex::new(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}").unwrap()
+        });
+        static HEX_RE: LazyLock<Regex> =
+            LazyLock::new(|| Regex::new(r"\b(0x)?[0-9a-f]{8,}\b").unwrap());
+        static URL_RE: LazyLock<Regex> =
+            LazyLock::new(|| Regex::new(r"https?://[\w./\-?=&%]+").unwrap());
+        static EMAIL_RE: LazyLock<Regex> =
+            LazyLock::new(|| Regex::new(r"\b[\w._%+-]+@[\w.-]+\.[a-z]{2,}\b").unwrap());
+        static UNIX_PATH_RE: LazyLock<Regex> =
+            LazyLock::new(|| Regex::new(r"/[\w/.]+\.[\w]+").unwrap());
+        static WIN_PATH_RE: LazyLock<Regex> =
+            LazyLock::new(|| Regex::new(r"[a-z]:\\[\w\\]+\.[\w]+").unwrap());
+        static IP_RE: LazyLock<Regex> =
+            LazyLock::new(|| Regex::new(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b").unwrap());
+        static TABLE_REF_RE: LazyLock<Regex> =
+            LazyLock::new(|| Regex::new(r"\b(\w+)_\d+\b").unwrap());
+        static NUMBER_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\b\d{4,}\b").unwrap());
+        static QUOTED_RE: LazyLock<Regex> =
+            LazyLock::new(|| Regex::new(r#"["']([^"']{10,})["']"#).unwrap());
 
         let mut normalized = message.to_lowercase();
 
         // Replace UUIDs (e.g., 550e8400-e29b-41d4-a716-446655440000) - FIRST
-        let uuid_regex =
-            Regex::new(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}").unwrap();
-        normalized = uuid_regex.replace_all(&normalized, "<uuid>").to_string();
-
+        normalized = UUID_RE.replace_all(&normalized, "<uuid>").to_string();
         // Replace hex IDs (e.g., 0x1a2b3c4d, deadbeef) - SECOND
-        let hex_regex = Regex::new(r"\b(0x)?[0-9a-f]{8,}\b").unwrap();
-        normalized = hex_regex.replace_all(&normalized, "<hex>").to_string();
-
-        // Replace URLs (http/https) - BEFORE paths (paths might match URL components)
-        let url_regex = Regex::new(r"https?://[\w./\-?=&%]+").unwrap();
-        normalized = url_regex.replace_all(&normalized, "<url>").to_string();
-
+        normalized = HEX_RE.replace_all(&normalized, "<hex>").to_string();
+        // Replace URLs (http/https) - BEFORE paths
+        normalized = URL_RE.replace_all(&normalized, "<url>").to_string();
+        // Replace email
addresses - BEFORE paths
-        let email_regex = Regex::new(r"\b[\w._%+-]+@[\w.-]+\.[a-z]{2,}\b").unwrap();
-        normalized = email_regex.replace_all(&normalized, "<email>").to_string();
-
+        normalized = EMAIL_RE.replace_all(&normalized, "<email>").to_string();
         // Replace file paths (Unix and Windows style) - AFTER URLs/emails
-        let unix_path_regex = Regex::new(r"/[\w/.]+\.[\w]+").unwrap();
-        normalized = unix_path_regex
-            .replace_all(&normalized, "<path>")
-            .to_string();
-
-        let windows_path_regex = Regex::new(r"[a-z]:\\[\w\\]+\.[\w]+").unwrap();
-        normalized = windows_path_regex
-            .replace_all(&normalized, "<path>")
-            .to_string();
-
+        normalized = UNIX_PATH_RE.replace_all(&normalized, "<path>").to_string();
+        normalized = WIN_PATH_RE.replace_all(&normalized, "<path>").to_string();
         // Replace IP addresses (v4) - BEFORE table refs and numbers
-        let ip_regex = Regex::new(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b").unwrap();
-        normalized = ip_regex.replace_all(&normalized, "<ip>").to_string();
-
-        // Replace database table references (table_123, users_456) - BEFORE general numbers
-        let table_ref_regex = Regex::new(r"\b(\w+)_\d+\b").unwrap();
-        normalized = table_ref_regex
+        normalized = IP_RE.replace_all(&normalized, "<ip>").to_string();
+        // Replace database table references (table_123, users_456)
+        normalized = TABLE_REF_RE
             .replace_all(&normalized, "${1}_<id>")
             .to_string();
-
-        // Replace numeric IDs and timestamps (standalone numbers of 4+ digits) - LAST number operation
-        let number_regex = Regex::new(r"\b\d{4,}\b").unwrap();
-        normalized = number_regex.replace_all(&normalized, "<num>").to_string();
-
+        // Replace numeric IDs and timestamps (standalone numbers of 4+ digits)
+        normalized = NUMBER_RE.replace_all(&normalized, "<num>").to_string();
         // Replace quoted strings (often dynamic user input) - FINAL
-        let quoted_string_regex = Regex::new(r#"["']([^"']{10,})["']"#).unwrap();
-        normalized = quoted_string_regex
+        normalized = QUOTED_RE
             .replace_all(&normalized, r#""<string>""#)
             .to_string();
 
diff --git a/crates/temps-error-tracking/src/services/source_map_service.rs b/crates/temps-error-tracking/src/services/source_map_service.rs
index e7978d4a..a514c25c 100644
--- a/crates/temps-error-tracking/src/services/source_map_service.rs
+++ b/crates/temps-error-tracking/src/services/source_map_service.rs
@@ -1,6 +1,7 @@
 use chrono::Utc;
 use sea_orm::{
-    ActiveModelTrait, ColumnTrait, DatabaseConnection, EntityTrait, QueryFilter, QueryOrder, Set,
+    ActiveModelTrait, ColumnTrait, DatabaseConnection, EntityTrait, FromQueryResult, QueryFilter,
+    QueryOrder, QuerySelect, Set,
 };
 use sha2::{Digest, Sha256};
 use std::sync::Arc;
@@ -136,20 +137,55 @@ impl SourceMapService {
         Ok(SourceMapInfo::from(model))
     }
 
-    /// List all source maps for a project release
+    /// List all source maps for a project release.
+    /// Only fetches metadata columns — excludes the large `source_map_data` blob.
pub async fn list_for_release(
         &self,
         project_id: i32,
         release: &str,
     ) -> Result<Vec<SourceMapInfo>, SourceMapError> {
+        #[derive(FromQueryResult)]
+        struct SourceMapMetadata {
+            id: i32,
+            project_id: i32,
+            release: String,
+            file_path: String,
+            dist: Option<String>,
+            size_bytes: i64,
+            checksum: Option<String>,
+            created_at: chrono::DateTime<Utc>,
+        }
+
         let maps = source_maps::Entity::find()
             .filter(source_maps::Column::ProjectId.eq(project_id))
             .filter(source_maps::Column::Release.eq(release))
             .order_by_asc(source_maps::Column::FilePath)
+            .select_only()
+            .column(source_maps::Column::Id)
+            .column(source_maps::Column::ProjectId)
+            .column(source_maps::Column::Release)
+            .column(source_maps::Column::FilePath)
+            .column(source_maps::Column::Dist)
+            .column(source_maps::Column::SizeBytes)
+            .column(source_maps::Column::Checksum)
+            .column(source_maps::Column::CreatedAt)
+            .into_model::<SourceMapMetadata>()
             .all(self.db.as_ref())
             .await?;
 
-        Ok(maps.into_iter().map(SourceMapInfo::from).collect())
+        Ok(maps
+            .into_iter()
+            .map(|m| SourceMapInfo {
+                id: m.id,
+                project_id: m.project_id,
+                release: m.release,
+                file_path: m.file_path,
+                dist: m.dist,
+                size_bytes: m.size_bytes,
+                checksum: m.checksum,
+                created_at: m.created_at,
+            })
+            .collect())
     }
 
     /// List all releases that have source maps for a project
diff --git a/crates/temps-monitoring/src/outage.rs b/crates/temps-monitoring/src/outage.rs
index f63d7ec4..07215ce4 100644
--- a/crates/temps-monitoring/src/outage.rs
+++ b/crates/temps-monitoring/src/outage.rs
@@ -421,14 +421,31 @@ impl OutageDetectionService {
         Ok(())
     }
 
-    /// Scan all active monitors and detect outages from recent checks
-    /// This is useful for catching up after service restart
+    /// Scan all active monitors and detect outages from recent checks.
+    /// Also prunes cached state for monitors that are no longer active in the database,
+    /// preventing unbounded growth of the in-memory state map.
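+    /// Returns the outage events detected during the scan.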
pub async fn scan_monitors(&self) -> Result, OutageError> {
         let monitors = status_monitors::Entity::find()
             .filter(status_monitors::Column::IsActive.eq(true))
             .all(self.db.as_ref())
             .await?;
 
+        // Prune cached state for monitors that are no longer active
+        let active_ids: std::collections::HashSet<i32> = monitors.iter().map(|m| m.id).collect();
+        {
+            let mut states = self.monitor_states.write().await;
+            let before = states.len();
+            states.retain(|id, _| active_ids.contains(id));
+            let pruned = before - states.len();
+            if pruned > 0 {
+                debug!(
+                    "Pruned {} stale monitor state entries ({} active)",
+                    pruned,
+                    states.len()
+                );
+            }
+        }
+
         let mut events = Vec::new();
 
         for monitor in monitors {
@@ -655,4 +672,167 @@ mod tests {
             NotificationPriority::Critical
         ));
     }
+
+    #[tokio::test]
+    async fn test_clear_monitor_state() {
+        use async_trait::async_trait;
+        use temps_core::notifications::{EmailMessage, NotificationData, NotificationError};
+
+        struct NoopNotificationService;
+
+        #[async_trait]
+        impl NotificationService for NoopNotificationService {
+            async fn send_notification(
+                &self,
+                _notification: NotificationData,
+            ) -> Result<(), NotificationError> {
+                Ok(())
+            }
+            async fn send_email(&self, _message: EmailMessage) -> Result<(), NotificationError> {
+                Ok(())
+            }
+            async fn is_configured(&self) -> Result<bool, NotificationError> {
+                Ok(false)
+            }
+        }
+
+        let db = Arc::new(sea_orm::Database::connect("sqlite::memory:").await.unwrap());
+        let service = OutageDetectionService::new(db, Arc::new(NoopNotificationService));
+
+        // Manually insert some monitor states
+        {
+            let mut states = service.monitor_states.write().await;
+            states.insert(
+                1,
+                MonitorState {
+                    status: MonitorStatus::Operational,
+                    active_incident_id: None,
+                    consecutive_failures: 0,
+                },
+            );
+            states.insert(
+                2,
+                MonitorState {
+                    status: MonitorStatus::Down,
+                    active_incident_id: Some(42),
+                    consecutive_failures: 5,
+                },
+            );
+            states.insert(
+                3,
+                MonitorState {
+                    status: MonitorStatus::Degraded,
+                    active_incident_id: None,
+                    consecutive_failures: 1,
+                },
+            );
+        }
+
+        // Verify initial state
+        {
+            let states = service.monitor_states.read().await;
+            assert_eq!(states.len(), 3);
+        }
+
+        // Clear monitor 2
+        service.clear_monitor_state(2).await;
+
+        {
+            let states = service.monitor_states.read().await;
+            assert_eq!(states.len(), 2);
+            assert!(!states.contains_key(&2), "Monitor 2 should be removed");
+            assert!(states.contains_key(&1), "Monitor 1 should remain");
+            assert!(states.contains_key(&3), "Monitor 3 should remain");
+        }
+
+        // Clear non-existent monitor — should not panic
+        service.clear_monitor_state(999).await;
+
+        {
+            let states = service.monitor_states.read().await;
+            assert_eq!(states.len(), 2, "No change for non-existent monitor");
+        }
+    }
+
+    #[tokio::test]
+    async fn test_get_monitor_statuses() {
+        use async_trait::async_trait;
+        use temps_core::notifications::{EmailMessage, NotificationData, NotificationError};
+
+        struct NoopNotificationService;
+
+        #[async_trait]
+        impl NotificationService for NoopNotificationService {
+            async fn send_notification(
+                &self,
+                _notification: NotificationData,
+            ) -> Result<(), NotificationError> {
+                Ok(())
+            }
+            async fn send_email(&self, _message: EmailMessage) -> Result<(), NotificationError> {
+                Ok(())
+            }
+            async fn is_configured(&self) -> Result<bool, NotificationError> {
+                Ok(false)
+            }
+        }
+
+        let db = Arc::new(sea_orm::Database::connect("sqlite::memory:").await.unwrap());
+        let service = OutageDetectionService::new(db, Arc::new(NoopNotificationService));
+
+        // Empty initially
+        let statuses =
service.get_monitor_statuses().await; + assert!(statuses.is_empty()); + + // Insert states + { + let mut states = service.monitor_states.write().await; + states.insert( + 10, + MonitorState { + status: MonitorStatus::Operational, + active_incident_id: None, + consecutive_failures: 0, + }, + ); + states.insert( + 20, + MonitorState { + status: MonitorStatus::Down, + active_incident_id: Some(1), + consecutive_failures: 3, + }, + ); + } + + let statuses = service.get_monitor_statuses().await; + assert_eq!(statuses.len(), 2); + assert_eq!(statuses[&10], MonitorStatus::Operational); + assert_eq!(statuses[&20], MonitorStatus::Down); + } + + #[test] + fn test_monitor_state_struct() { + let state = MonitorState { + status: MonitorStatus::Down, + active_incident_id: Some(42), + consecutive_failures: 3, + }; + + assert!(state.status.is_outage()); + assert_eq!(state.active_incident_id, Some(42)); + assert_eq!(state.consecutive_failures, 3); + } + + #[test] + fn test_outage_error_display() { + let db_err = OutageError::Database("connection refused".to_string()); + assert_eq!(db_err.to_string(), "Database error: connection refused"); + + let not_found = OutageError::MonitorNotFound(42); + assert_eq!(not_found.to_string(), "Monitor not found: 42"); + + let notif_err = OutageError::Notification("timeout".to_string()); + assert_eq!(notif_err.to_string(), "Notification error: timeout"); + } } diff --git a/crates/temps-notifications/src/digest/scheduler.rs b/crates/temps-notifications/src/digest/scheduler.rs index 3c511f25..897927a6 100644 --- a/crates/temps-notifications/src/digest/scheduler.rs +++ b/crates/temps-notifications/src/digest/scheduler.rs @@ -5,6 +5,7 @@ use chrono::{Datelike, Timelike, Utc, Weekday}; use std::sync::Arc; +use tokio::task::JoinHandle; use tokio::time::{sleep, Duration as TokioDuration}; use tracing::{debug, error, info, warn}; @@ -15,34 +16,58 @@ use crate::services::NotificationPreferencesService; pub struct DigestScheduler { digest_service: Arc, preferences_service: Arc, + task_handle: Option>, } impl DigestScheduler { - /// Create a new digest scheduler and start the background task + /// Create a new digest scheduler (does not start the background task). + /// Call `start()` to begin scheduling. pub fn new( digest_service: Arc, preferences_service: Arc, - ) -> Arc { - let scheduler = Arc::new(Self { + ) -> Self { + Self { digest_service, preferences_service, - }); + task_handle: None, + } + } - // Spawn background task - let scheduler_clone = scheduler.clone(); - tokio::spawn(async move { - scheduler_clone.run_scheduler().await; + /// Start the background scheduler task. + /// The scheduler runs until `shutdown()` is called or the DigestScheduler is dropped. + pub fn start(&mut self) { + if self.task_handle.is_some() { + info!("Weekly digest scheduler already running"); + return; + } + + let digest_service = self.digest_service.clone(); + let preferences_service = self.preferences_service.clone(); + + let handle = tokio::spawn(async move { + Self::run_scheduler(digest_service, preferences_service).await; }); + self.task_handle = Some(handle); info!("Weekly digest scheduler started"); - scheduler + } + + /// Shutdown the background scheduler task gracefully. 
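+    /// Aborts the spawned task, so an in-flight digest send may be cancelled mid-await.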
+ pub fn shutdown(&mut self) { + if let Some(handle) = self.task_handle.take() { + handle.abort(); + info!("Weekly digest scheduler stopped"); + } } /// Main scheduler loop - calculates exact sleep duration until next scheduled time - async fn run_scheduler(&self) { + async fn run_scheduler( + digest_service: Arc, + preferences_service: Arc, + ) { loop { // Get current preferences - let preferences = match self.preferences_service.get_preferences().await { + let preferences = match preferences_service.get_preferences().await { Ok(prefs) => prefs, Err(e) => { error!("Failed to get preferences: {}", e); @@ -60,7 +85,7 @@ impl DigestScheduler { } // Calculate duration until next scheduled time - let sleep_duration = self.calculate_next_run_duration(&preferences); + let sleep_duration = Self::calculate_next_run_duration_static(&preferences); info!( "Next weekly digest scheduled in {} hours ({} minutes)", @@ -72,7 +97,10 @@ impl DigestScheduler { sleep(sleep_duration).await; // Send the digest - match self.send_digest(&preferences).await { + match digest_service + .generate_and_send_weekly_digest(preferences.digest_sections.clone()) + .await + { Ok(_) => { info!("Successfully sent weekly digest"); } @@ -84,8 +112,7 @@ impl DigestScheduler { } /// Calculate duration until next scheduled run - fn calculate_next_run_duration( - &self, + fn calculate_next_run_duration_static( preferences: &crate::services::NotificationPreferences, ) -> TokioDuration { let now = Utc::now(); @@ -144,20 +171,6 @@ impl DigestScheduler { TokioDuration::from_secs(seconds as u64) } - /// Send the weekly digest - async fn send_digest( - &self, - preferences: &crate::services::NotificationPreferences, - ) -> anyhow::Result<()> { - info!("Sending weekly digest"); - - self.digest_service - .generate_and_send_weekly_digest(preferences.digest_sections.clone()) - .await?; - - Ok(()) - } - /// Parse weekday string to Weekday enum fn parse_weekday(day: &str) -> Weekday { match day.to_lowercase().as_str() { @@ -195,9 +208,45 @@ impl DigestScheduler { } } +impl Drop for DigestScheduler { + fn drop(&mut self) { + self.shutdown(); + } +} + #[cfg(test)] mod tests { use super::*; + use temps_core::EncryptionService; + use temps_database::test_utils::TestDatabase; + + /// Helper to create a DigestScheduler with real DB-backed services. 
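+    /// The TestDatabase handle is returned as well so the backing database
+    /// outlives the scheduler under test.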
+ async fn create_test_scheduler() -> (DigestScheduler, TestDatabase) { + let test_db = TestDatabase::with_migrations() + .await + .expect("Failed to create test database"); + + let encryption_key = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef"; + let encryption_service = Arc::new( + EncryptionService::new(encryption_key).expect("Failed to create encryption service"), + ); + + let notification_service = Arc::new(crate::services::NotificationService::new( + test_db.connection_arc(), + encryption_service, + )); + + let digest_service = Arc::new(super::super::DigestService::new( + test_db.connection_arc(), + notification_service, + )); + let preferences_service = Arc::new(crate::services::NotificationPreferencesService::new( + test_db.connection_arc(), + )); + + let scheduler = DigestScheduler::new(digest_service, preferences_service); + (scheduler, test_db) + } #[test] fn test_parse_weekday() { @@ -230,4 +279,84 @@ mod tests { assert_eq!(DigestScheduler::parse_minute("invalid"), 0); // Default assert_eq!(DigestScheduler::parse_minute("09"), 0); // No colon, default to 0 } + + #[tokio::test] + async fn test_scheduler_new_does_not_start_task() { + let (scheduler, _db) = create_test_scheduler().await; + + // new() should NOT have a running task + assert!(scheduler.task_handle.is_none()); + } + + #[tokio::test] + async fn test_scheduler_start_creates_task() { + let (mut scheduler, _db) = create_test_scheduler().await; + + scheduler.start(); + assert!(scheduler.task_handle.is_some()); + + // Cleanup + scheduler.shutdown(); + } + + #[tokio::test] + async fn test_scheduler_shutdown_clears_task() { + let (mut scheduler, _db) = create_test_scheduler().await; + + scheduler.start(); + assert!(scheduler.task_handle.is_some()); + + scheduler.shutdown(); + assert!(scheduler.task_handle.is_none()); + } + + #[tokio::test] + async fn test_scheduler_double_start_is_noop() { + let (mut scheduler, _db) = create_test_scheduler().await; + + scheduler.start(); + let handle_ptr = scheduler + .task_handle + .as_ref() + .map(|h| format!("{:?}", h)) + .unwrap(); + + // Second start should not replace the handle + scheduler.start(); + let handle_ptr_2 = scheduler + .task_handle + .as_ref() + .map(|h| format!("{:?}", h)) + .unwrap(); + + assert_eq!(handle_ptr, handle_ptr_2, "Second start should be a no-op"); + + scheduler.shutdown(); + } + + #[tokio::test] + async fn test_scheduler_shutdown_without_start_is_safe() { + let (mut scheduler, _db) = create_test_scheduler().await; + + // shutdown on an unstarted scheduler should not panic + scheduler.shutdown(); + assert!(scheduler.task_handle.is_none()); + } + + #[tokio::test] + async fn test_scheduler_drop_aborts_task() { + let (mut scheduler, _db) = create_test_scheduler().await; + + scheduler.start(); + let handle = scheduler.task_handle.as_ref().unwrap().abort_handle(); + + // Drop the scheduler — should abort the task via Drop impl + drop(scheduler); + + // Give tokio a tick to process the abort + tokio::task::yield_now().await; + + // The task should have been aborted + assert!(handle.is_finished()); + } } diff --git a/crates/temps-notifications/src/plugin.rs b/crates/temps-notifications/src/plugin.rs index 68b4065d..15812a2b 100644 --- a/crates/temps-notifications/src/plugin.rs +++ b/crates/temps-notifications/src/plugin.rs @@ -88,11 +88,12 @@ impl TempsPlugin for NotificationsPlugin { context.register_service(notification_state); // Start the weekly digest scheduler - let scheduler = DigestScheduler::new( + let mut scheduler = 
DigestScheduler::new(
             digest_service.clone(),
             notification_preferences_service.clone(),
         );
-        context.register_service(scheduler);
+        scheduler.start();
+        context.register_service(Arc::new(scheduler));
 
         debug!("Notifications plugin services registered successfully");
         Ok(())
diff --git a/crates/temps-projects/src/handlers/handlers.rs b/crates/temps-projects/src/handlers/handlers.rs
index 8d321aad..eea0c80b 100644
--- a/crates/temps-projects/src/handlers/handlers.rs
+++ b/crates/temps-projects/src/handlers/handlers.rs
@@ -823,9 +823,10 @@ pub async fn trigger_project_pipeline(
             .build());
     };
 
-    // Get the environment for audit logging
+    // Get the environment for audit logging (only active environments)
     let environment = temps_entities::environments::Entity::find_by_id(environment_id)
         .filter(temps_entities::environments::Column::ProjectId.eq(id))
+        .filter(temps_entities::environments::Column::DeletedAt.is_null())
         .one(state.project_service.db.as_ref())
         .await
         .map_err(|e| {
diff --git a/crates/temps-projects/src/services/project.rs b/crates/temps-projects/src/services/project.rs
index c2c83ff6..0b6cfe56 100644
--- a/crates/temps-projects/src/services/project.rs
+++ b/crates/temps-projects/src/services/project.rs
@@ -1327,9 +1327,10 @@ impl ProjectService {
             .map_err(|e| ProjectError::Other(e.to_string()))?
             .ok_or_else(|| ProjectError::NotFound("Project not found".to_string()))?;
 
-        // Validate environment belongs to this project
+        // Validate environment belongs to this project and is not soft-deleted
         let environment = temps_entities::environments::Entity::find_by_id(environment_id)
             .filter(temps_entities::environments::Column::ProjectId.eq(project_id))
+            .filter(temps_entities::environments::Column::DeletedAt.is_null())
             .one(self.db.as_ref())
             .await
             .map_err(|e| ProjectError::Other(e.to_string()))?
diff --git a/crates/temps-routes/src/project_change_listener.rs b/crates/temps-routes/src/project_change_listener.rs
index 4cc8da48..9a4471b7 100644
--- a/crates/temps-routes/src/project_change_listener.rs
+++ b/crates/temps-routes/src/project_change_listener.rs
@@ -16,6 +16,7 @@ use tracing::{error, info};
 pub struct ProjectChangeListener {
     database_url: String,
     peer_table: Arc<CachedPeerTable>,
+    task_handle: std::sync::Mutex<Option<tokio::task::JoinHandle<()>>>,
 }
 
 impl ProjectChangeListener {
@@ -24,52 +25,74 @@ impl ProjectChangeListener {
         Self {
             database_url,
             peer_table,
+            task_handle: std::sync::Mutex::new(None),
         }
     }
 
-    /// Start listening for project change notifications
-    /// This runs in a background task and listens indefinitely
-    pub async fn start_listening(self) -> Result<()> {
+    /// Start listening for project change notifications in a background task.
+    /// The task runs until `shutdown()` is called or the listener is dropped.
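+    /// On receive errors the spawned task sleeps five seconds and re-subscribes,
+    /// so a transient Postgres outage does not permanently stop route reloads.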
+ pub async fn start_listening(&self) -> Result<()> { use sqlx::postgres::{PgListener, PgPool}; // Create PostgreSQL listener using sqlx let pool = PgPool::connect(&self.database_url).await?; - let mut listener = PgListener::connect_with(&pool).await?; + let mut pg_listener = PgListener::connect_with(&pool).await?; - listener.listen("project_route_change").await?; + pg_listener.listen("project_route_change").await?; info!("Started listening for project_route_change events"); - loop { - match listener.recv().await { - Ok(notification) => { - self.handle_project_change(notification.payload()).await; - } - Err(e) => { - error!("Error receiving project change notification: {}", e); - - // Attempt to reconnect after error - tokio::time::sleep(tokio::time::Duration::from_secs(5)).await; - - match PgListener::connect_with(&pool).await { - Ok(mut new_listener) => { - if let Err(e) = new_listener.listen("project_route_change").await { - error!("Failed to re-subscribe to project_route_change: {}", e); - } else { - listener = new_listener; - info!("Reconnected to project_route_change listener"); + let peer_table = self.peer_table.clone(); + + let handle = tokio::spawn(async move { + loop { + match pg_listener.recv().await { + Ok(notification) => { + Self::handle_project_change_static(&peer_table, notification.payload()) + .await; + } + Err(e) => { + error!("Error receiving project change notification: {}", e); + + // Attempt to reconnect after error + tokio::time::sleep(tokio::time::Duration::from_secs(5)).await; + + match PgListener::connect_with(&pool).await { + Ok(mut new_listener) => { + if let Err(e) = new_listener.listen("project_route_change").await { + error!("Failed to re-subscribe to project_route_change: {}", e); + } else { + pg_listener = new_listener; + info!("Reconnected to project_route_change listener"); + } + } + Err(e) => { + error!("Failed to reconnect project_route_change listener: {}", e); } - } - Err(e) => { - error!("Failed to reconnect project_route_change listener: {}", e); } } } } + }); + + if let Ok(mut guard) = self.task_handle.lock() { + *guard = Some(handle); + } + + Ok(()) + } + + /// Stop the background listener task + pub fn shutdown(&self) { + if let Ok(mut guard) = self.task_handle.lock() { + if let Some(handle) = guard.take() { + handle.abort(); + info!("Project change listener stopped"); + } } } /// Handle a route change notification (project or environment) - async fn handle_project_change(&self, payload: &str) { + async fn handle_project_change_static(peer_table: &CachedPeerTable, payload: &str) { // Try to parse as RouteChangePayload which handles both project and environment changes match serde_json::from_str::(payload) { Ok(change) => { @@ -95,7 +118,7 @@ impl ProjectChangeListener { } // Reload all routes when any change happens - if let Err(e) = self.peer_table.load_routes().await { + if let Err(e) = peer_table.load_routes().await { error!("Failed to reload routes after change: {}", e); } } @@ -109,6 +132,12 @@ impl ProjectChangeListener { } } +impl Drop for ProjectChangeListener { + fn drop(&mut self) { + self.shutdown(); + } +} + /// Unified payload structure for route changes (project or environment) #[derive(Debug, serde::Deserialize)] #[serde(untagged)] @@ -197,4 +226,50 @@ mod tests { _ => panic!("Expected Environment payload"), } } + + // ======================================================================== + // ProjectChangeListener lifecycle tests + // ======================================================================== + + #[test] + fn 
test_project_change_listener_new_has_no_task() { + let db = Arc::new(sea_orm::DatabaseConnection::Disconnected); + let peer_table = Arc::new(CachedPeerTable::new(db)); + let listener = ProjectChangeListener::new( + "postgresql://fake:fake@localhost/fake".to_string(), + peer_table, + ); + + let guard = listener.task_handle.lock().unwrap(); + assert!(guard.is_none(), "New listener should have no task handle"); + } + + #[test] + fn test_project_change_listener_shutdown_without_start_is_safe() { + let db = Arc::new(sea_orm::DatabaseConnection::Disconnected); + let peer_table = Arc::new(CachedPeerTable::new(db)); + let listener = ProjectChangeListener::new( + "postgresql://fake:fake@localhost/fake".to_string(), + peer_table, + ); + + // Calling shutdown before start should not panic + listener.shutdown(); + + let guard = listener.task_handle.lock().unwrap(); + assert!(guard.is_none()); + } + + #[test] + fn test_project_change_listener_drop_without_start_is_safe() { + let db = Arc::new(sea_orm::DatabaseConnection::Disconnected); + let peer_table = Arc::new(CachedPeerTable::new(db)); + let listener = ProjectChangeListener::new( + "postgresql://fake:fake@localhost/fake".to_string(), + peer_table, + ); + + // Dropping without starting should not panic + drop(listener); + } } diff --git a/crates/temps-routes/src/route_table.rs b/crates/temps-routes/src/route_table.rs index f56c0261..b3d61776 100644 --- a/crates/temps-routes/src/route_table.rs +++ b/crates/temps-routes/src/route_table.rs @@ -262,9 +262,10 @@ impl CachedPeerTable { ); for env_domain in env_domains { - // Fetch environment if not cached + // Fetch environment if not cached (skip soft-deleted) if !environments_cache.contains_key(&env_domain.environment_id) { if let Ok(Some(env)) = environments::Entity::find_by_id(env_domain.environment_id) + .filter(environments::Column::DeletedAt.is_null()) .one(self.db.as_ref()) .await { @@ -452,10 +453,11 @@ impl CachedPeerTable { ); for custom_domain in custom_domains { - // Fetch environment if not cached + // Fetch environment if not cached (skip soft-deleted) if !environments_cache.contains_key(&custom_domain.environment_id) { if let Ok(Some(env)) = environments::Entity::find_by_id(custom_domain.environment_id) + .filter(environments::Column::DeletedAt.is_null()) .one(self.db.as_ref()) .await { @@ -569,6 +571,7 @@ impl CachedPeerTable { let all_envs = environments::Entity::find() .filter(environments::Column::Subdomain.is_not_null()) .filter(environments::Column::CurrentDeploymentId.is_not_null()) + .filter(environments::Column::DeletedAt.is_null()) .all(self.db.as_ref()) .await?; @@ -720,9 +723,10 @@ impl CachedPeerTable { // This ensures we have complete coverage of all running deployments debug!("Loading all active deployments for environments..."); - // Get all environments with current_deployment_id + // Get all environments with current_deployment_id (exclude soft-deleted) let all_active_envs = environments::Entity::find() .filter(environments::Column::CurrentDeploymentId.is_not_null()) + .filter(environments::Column::DeletedAt.is_null()) .all(self.db.as_ref()) .await?; @@ -878,6 +882,7 @@ impl CachedPeerTable { pub struct RouteTableListener { peer_table: Arc, database_url: String, + task_handle: std::sync::Mutex>>, } impl RouteTableListener { @@ -885,6 +890,7 @@ impl RouteTableListener { Self { peer_table, database_url, + task_handle: std::sync::Mutex::new(None), } } @@ -909,7 +915,8 @@ impl RouteTableListener { ); // Spawn background task to handle notifications - tokio::spawn(async move 
{ + let peer_table = self.peer_table.clone(); + let handle = tokio::spawn(async move { loop { match listener.recv().await { Ok(notification) => { @@ -918,15 +925,12 @@ impl RouteTableListener { notification.payload() ); - debug!("šŸ”„ Route table synchronizing..."); + debug!("Route table synchronizing..."); - if let Err(e) = self.peer_table.load_routes().await { + if let Err(e) = peer_table.load_routes().await { error!("Failed to reload routes: {}", e); } else { - debug!( - "āœ… Route table synchronized ({} entries)", - self.peer_table.len() - ); + debug!("Route table synchronized ({} entries)", peer_table.len()); } } Err(e) => { @@ -954,8 +958,29 @@ impl RouteTableListener { } }); + // Store the handle so it can be aborted on drop + if let Ok(mut guard) = self.task_handle.lock() { + *guard = Some(handle); + } + Ok(()) } + + /// Stop the background listener task + pub fn shutdown(&self) { + if let Ok(mut guard) = self.task_handle.lock() { + if let Some(handle) = guard.take() { + handle.abort(); + info!("Route table listener stopped"); + } + } + } +} + +impl Drop for RouteTableListener { + fn drop(&mut self) { + self.shutdown(); + } } #[cfg(test)] @@ -1175,4 +1200,50 @@ mod tests { assert_eq!(route.static_dir(), None); assert_eq!(route.get_backend_addr(), "10.0.0.1:9000"); } + + // ======================================================================== + // RouteTableListener lifecycle tests + // ======================================================================== + + #[test] + fn test_route_table_listener_new_has_no_task() { + let db = Arc::new(sea_orm::DatabaseConnection::Disconnected); + let peer_table = Arc::new(CachedPeerTable::new(db)); + let listener = RouteTableListener::new( + peer_table, + "postgresql://fake:fake@localhost/fake".to_string(), + ); + + let guard = listener.task_handle.lock().unwrap(); + assert!(guard.is_none(), "New listener should have no task handle"); + } + + #[test] + fn test_route_table_listener_shutdown_without_start_is_safe() { + let db = Arc::new(sea_orm::DatabaseConnection::Disconnected); + let peer_table = Arc::new(CachedPeerTable::new(db)); + let listener = RouteTableListener::new( + peer_table, + "postgresql://fake:fake@localhost/fake".to_string(), + ); + + // Calling shutdown before start should not panic + listener.shutdown(); + + let guard = listener.task_handle.lock().unwrap(); + assert!(guard.is_none()); + } + + #[test] + fn test_route_table_listener_drop_without_start_is_safe() { + let db = Arc::new(sea_orm::DatabaseConnection::Disconnected); + let peer_table = Arc::new(CachedPeerTable::new(db)); + let listener = RouteTableListener::new( + peer_table, + "postgresql://fake:fake@localhost/fake".to_string(), + ); + + // Dropping without starting should not panic + drop(listener); + } } diff --git a/crates/temps-status-page/src/services/health_check_service.rs b/crates/temps-status-page/src/services/health_check_service.rs index 0bd464dc..eaa99d59 100644 --- a/crates/temps-status-page/src/services/health_check_service.rs +++ b/crates/temps-status-page/src/services/health_check_service.rs @@ -419,8 +419,9 @@ impl HealthCheckService { pub async fn initialize_monitors(&self) -> Result<(), StatusPageError> { debug!("Initializing monitors for all existing environments"); - // Get all environments with their projects + // Get all active (non-deleted) environments with their projects let environments_with_projects = environments::Entity::find() + .filter(environments::Column::DeletedAt.is_null()) .inner_join(projects::Entity) .all(self.db.as_ref()) 
.await?; diff --git a/crates/temps-webhooks/src/listener.rs b/crates/temps-webhooks/src/listener.rs index 762e6f60..2afc6900 100644 --- a/crates/temps-webhooks/src/listener.rs +++ b/crates/temps-webhooks/src/listener.rs @@ -384,13 +384,32 @@ impl WebhookEventListener { } } +impl Drop for WebhookEventListener { + fn drop(&mut self) { + // Abort the background task if it's still running. + // We can't call the async stop() from Drop, but we can abort the handle + // which will cause the spawned task to be cancelled immediately. + // try_write() is synchronous and won't block — it fails if the lock is held. + match self.task_handle.try_write() { + Ok(mut guard) => { + if let Some(handle) = guard.take() { + handle.abort(); + } + } + Err(_) => { + // Lock is held — this is rare during Drop. The task will be cleaned + // up when the Arc is fully dropped and the JoinHandle is dropped. + } + } + } +} + #[cfg(test)] mod tests { use super::*; - #[tokio::test] - async fn test_listener_lifecycle() { - // Create mock services + /// Helper to create a test listener with mock services + async fn create_test_listener() -> WebhookEventListener { let db = Arc::new(sea_orm::Database::connect("sqlite::memory:").await.unwrap()); let encryption_service = Arc::new( temps_core::EncryptionService::new( @@ -403,7 +422,12 @@ mod tests { temps_queue::BroadcastQueueService::create_broadcast_channel(100); let queue = Arc::new(queue_service) as Arc; - let listener = WebhookEventListener::new(webhook_service, db.clone(), queue); + WebhookEventListener::new(webhook_service, db.clone(), queue) + } + + #[tokio::test] + async fn test_listener_lifecycle() { + let listener = create_test_listener().await; // Test initial state assert!(!listener.is_running().await); @@ -416,4 +440,60 @@ mod tests { listener.stop().await; assert!(!listener.is_running().await); } + + #[tokio::test] + async fn test_listener_drop_when_not_started() { + let listener = create_test_listener().await; + + // Dropping an unstarted listener should not panic + drop(listener); + } + + #[tokio::test] + async fn test_listener_drop_aborts_running_task() { + let listener = create_test_listener().await; + + listener.start().await.unwrap(); + assert!(listener.is_running().await); + + // Capture the abort handle before dropping + let handle = { + let guard = listener.task_handle.read().await; + guard.as_ref().unwrap().abort_handle() + }; + + // Drop the listener — Drop impl should abort the background task + drop(listener); + + // Give tokio a tick to process the abort + tokio::task::yield_now().await; + + assert!( + handle.is_finished(), + "Background task should be aborted after Drop" + ); + } + + #[tokio::test] + async fn test_listener_double_start_is_noop() { + let listener = create_test_listener().await; + + listener.start().await.unwrap(); + assert!(listener.is_running().await); + + // Starting again should succeed without error + listener.start().await.unwrap(); + assert!(listener.is_running().await); + + listener.stop().await; + } + + #[tokio::test] + async fn test_listener_stop_when_not_started_is_safe() { + let listener = create_test_listener().await; + + // Stopping an unstarted listener should not panic + listener.stop().await; + assert!(!listener.is_running().await); + } } diff --git a/mcp/README.md b/mcp/README.md index ae045533..8bd2db7f 100644 --- a/mcp/README.md +++ b/mcp/README.md @@ -43,13 +43,24 @@ Add to your Claude Desktop configuration file: { "mcpServers": { "temps": { - "command": "node", - "args": 
["/absolute/path/to/temps/mcp/dist/index.js"] + "command": "npx", + "args": [ + "-y", + "@temps-sdk/mcp", + "--tools", + "deployments,analytics,projects" + ], + "env": { + "TEMPS_API_URL": "", + "TEMPS_API_KEY": "" + } } } } ``` +Replace `` with your Temps instance URL and `` with an API key from your Temps dashboard. + Restart Claude Desktop to activate the server. ## Project Structure From c6dd56f29302c19e6f049ab4829598b43efcb3b6 Mon Sep 17 00:00:00 2001 From: David Viejo Date: Fri, 20 Feb 2026 19:36:35 +0100 Subject: [PATCH 3/9] fix(environments): remove useless .into() conversions on chrono::Utc::now() in tests --- .../src/services/environment_service.rs | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/crates/temps-environments/src/services/environment_service.rs b/crates/temps-environments/src/services/environment_service.rs index d5b5ed1a..d8324eb7 100644 --- a/crates/temps-environments/src/services/environment_service.rs +++ b/crates/temps-environments/src/services/environment_service.rs @@ -867,8 +867,8 @@ mod tests { project_id: 10, host: "".to_string(), upstreams: temps_entities::upstream_config::UpstreamList::new(), - created_at: chrono::Utc::now().into(), - updated_at: chrono::Utc::now().into(), + created_at: chrono::Utc::now(), + updated_at: chrono::Utc::now(), last_deployment: None, current_deployment_id: None, deleted_at: None, @@ -887,8 +887,8 @@ mod tests { preset: temps_entities::preset::Preset::NextJs, preset_config: None, deployment_config: None, - created_at: chrono::Utc::now().into(), - updated_at: chrono::Utc::now().into(), + created_at: chrono::Utc::now(), + updated_at: chrono::Utc::now(), is_deleted: false, deleted_at: None, last_deployment: None, @@ -942,8 +942,8 @@ mod tests { project_id: 10, host: "".to_string(), upstreams: temps_entities::upstream_config::UpstreamList::new(), - created_at: chrono::Utc::now().into(), - updated_at: chrono::Utc::now().into(), + created_at: chrono::Utc::now(), + updated_at: chrono::Utc::now(), last_deployment: None, current_deployment_id: None, deleted_at: None, @@ -960,8 +960,8 @@ mod tests { project_id: 10, host: "".to_string(), upstreams: temps_entities::upstream_config::UpstreamList::new(), - created_at: chrono::Utc::now().into(), - updated_at: chrono::Utc::now().into(), + created_at: chrono::Utc::now(), + updated_at: chrono::Utc::now(), last_deployment: None, current_deployment_id: None, deleted_at: None, @@ -1025,8 +1025,8 @@ mod tests { project_id: 10, host: "".to_string(), upstreams: temps_entities::upstream_config::UpstreamList::new(), - created_at: chrono::Utc::now().into(), - updated_at: chrono::Utc::now().into(), + created_at: chrono::Utc::now(), + updated_at: chrono::Utc::now(), last_deployment: None, current_deployment_id: None, deleted_at: None, From 6cad3cd1f83dfefac3c72f40c2008f16693b82fc Mon Sep 17 00:00:00 2001 From: David Viejo Date: Sat, 21 Feb 2026 11:31:32 +0100 Subject: [PATCH 4/9] fix(backup): switch pg_dump to plain format to fix OOM and add error logging pg_dump with --format=custom buffered per-table data in both the sidecar container and the temps process (via Bollard's HTTP stream), causing 2-3 GB memory peaks and OOM kills (exit code 137) on large TimescaleDB databases. Switch to --format=plain which streams SQL text (COPY statements) row-by-row with constant memory usage. Update both the main database backup path and the external service postgres provider. 
Additional fixes: - Stream S3 uploads via ByteStream::from_path instead of reading entire file into memory in upload_single_part - Add error! logging for backup failures in handlers and service layer that were previously silently converted to HTTP Problem responses - Update restore paths to detect format from S3 location and use psql for new .sql.gz backups or pg_restore for old .pgdump.gz backups, preserving backward compatibility with existing backups --- .../src/handlers/backup_handler.rs | 18 ++- crates/temps-backup/src/services/backup.rs | 114 ++++++++++++---- .../src/externalsvc/postgres.rs | 127 +++++++++++++++--- 3 files changed, 212 insertions(+), 47 deletions(-) diff --git a/crates/temps-backup/src/handlers/backup_handler.rs b/crates/temps-backup/src/handlers/backup_handler.rs index 6921b5a5..f71a6bf1 100644 --- a/crates/temps-backup/src/handlers/backup_handler.rs +++ b/crates/temps-backup/src/handlers/backup_handler.rs @@ -880,7 +880,10 @@ async fn run_backup_for_source( .backup_service .run_backup_for_source(id, &request.backup_type, auth.user_id()) .await - .map_err(Problem::from)?; + .map_err(|e| { + error!("Failed to run backup for S3 source {}: {}", id, e); + Problem::from(e) + })?; let audit = BackupRunAudit { context: AuditContext { @@ -1044,7 +1047,10 @@ async fn run_external_service_backup( .backup_service .get_external_service(id) .await - .map_err(Problem::from)?; + .map_err(|e| { + error!("Failed to get external service {} for backup: {}", id, e); + Problem::from(e) + })?; let backup_type = request.backup_type.as_deref().unwrap_or("full"); @@ -1053,7 +1059,13 @@ async fn run_external_service_backup( .backup_service .backup_external_service(&service, request.s3_source_id, backup_type, auth.user_id()) .await - .map_err(Problem::from)?; + .map_err(|e| { + error!( + "Failed to backup external service {} ({}): {}", + service.name, service.service_type, e + ); + Problem::from(e) + })?; // Create audit log let audit = ExternalServiceBackupRunAudit { diff --git a/crates/temps-backup/src/services/backup.rs b/crates/temps-backup/src/services/backup.rs index 08f7d648..87404039 100644 --- a/crates/temps-backup/src/services/backup.rs +++ b/crates/temps-backup/src/services/backup.rs @@ -220,7 +220,13 @@ impl BackupService { let mut temp_file = NamedTempFile::new().map_err(BackupError::Io)?; // Perform database backup - self.backup_database(&mut temp_file).await?; + self.backup_database(&mut temp_file).await.map_err(|e| { + error!( + "Database backup failed for S3 source {}: {}", + s3_source_id, e + ); + e + })?; // Generate unique backup ID let backup_id = Uuid::new_v4().to_string(); @@ -252,7 +258,14 @@ impl BackupService { // Compress and upload the backup self.upload_backup(&s3_client, &s3_source, &temp_file, &s3_location) - .await?; + .await + .map_err(|e| { + error!( + "Failed to upload backup to S3 source {} at {}: {}", + s3_source_id, s3_location, e + ); + e + })?; // Create backup record let new_backup = temps_entities::backups::ActiveModel { @@ -560,12 +573,14 @@ impl BackupService { .await .map_err(|e| anyhow::anyhow!("Failed to start container: {}", e))?; - // Build pg_dump command with proper lifetimes + // Build pg_dump command with proper lifetimes. + // --format=plain streams SQL text (COPY statements) row-by-row with constant memory. + // Previous --format=custom buffered per-table data in both pg_dump and the temps process + // (via Bollard's HTTP stream), causing 2-3 GB memory peaks on large TimescaleDB databases. 
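+        // (gzip is applied while streaming the sidecar's stdout to the temp
+        // file, so the uncompressed dump is never held in memory.)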
let port_str = port.to_string(); let pg_dump_cmd = vec![ "pg_dump", - "--format=custom", - "--compress=0", + "--format=plain", "--no-password", "--host", host, @@ -1017,13 +1032,16 @@ impl BackupService { temp_file: &NamedTempFile, s3_location: &str, ) -> Result<()> { - let file_content = tokio::fs::read(temp_file.path()).await?; + // Stream from file instead of reading entire contents into memory + let body = aws_sdk_s3::primitives::ByteStream::from_path(temp_file.path()) + .await + .map_err(|e| anyhow::anyhow!("Failed to create byte stream from backup file: {}", e))?; match s3_client .put_object() .bucket(&s3_source.bucket_name) .key(s3_location) - .body(file_content.into()) + .body(body) .content_type("application/x-gzip") .send() .await @@ -1507,39 +1525,69 @@ impl BackupService { let username = url.username(); let password = url.password(); - // Build pg_restore command - let mut cmd = tokio::process::Command::new("pg_restore"); - cmd.arg("--verbose") - .arg("--clean") // Drop existing objects before recreating - .arg("--if-exists") // Don't error if objects don't exist - .arg("--no-password") - .arg("--host") - .arg(host) - .arg("--port") - .arg(port.to_string()) - .arg("--username") - .arg(username) - .arg("--dbname") - .arg(database) - .arg(temp_file.path()); + // Detect backup format from S3 location path: + // - .pgdump.gz / backup.postgresql.gz = custom format (pg_restore) + // - .sql.gz = plain SQL format (psql) + let is_plain_format = backup.s3_location.ends_with(".sql.gz"); + + let mut cmd; + if is_plain_format { + // Plain SQL format: pipe decompressed SQL through psql + cmd = tokio::process::Command::new("psql"); + cmd.arg("--no-password") + .arg("--host") + .arg(host) + .arg("--port") + .arg(port.to_string()) + .arg("--username") + .arg(username) + .arg("--dbname") + .arg(database) + .arg("--file") + .arg(temp_file.path()); + } else { + // Custom format: use pg_restore + cmd = tokio::process::Command::new("pg_restore"); + cmd.arg("--verbose") + .arg("--clean") // Drop existing objects before recreating + .arg("--if-exists") // Don't error if objects don't exist + .arg("--no-password") + .arg("--host") + .arg(host) + .arg("--port") + .arg(port.to_string()) + .arg("--username") + .arg(username) + .arg("--dbname") + .arg(database) + .arg(temp_file.path()); + } // Set password via environment variable if provided if let Some(pwd) = password { cmd.env("PGPASSWORD", pwd); } - info!("Running pg_restore command"); + let restore_tool = if is_plain_format { + "psql" + } else { + "pg_restore" + }; + info!( + "Running {} command for backup {}", + restore_tool, backup.backup_id + ); let output = cmd.output().await.map_err(|e| BackupError::Internal { message: format!( - "Failed to execute pg_restore: {}. Make sure pg_restore is installed and in PATH", - e + "Failed to execute {}: {}. 
Make sure {} is installed and in PATH", restore_tool, e, restore_tool ), })?; if !output.status.success() { let stderr = String::from_utf8_lossy(&output.stderr); return Err(BackupError::Internal { - message: format!("pg_restore failed: {}", stderr), + message: format!("{} failed: {}", restore_tool, stderr), }); } @@ -1853,7 +1901,11 @@ impl BackupService { backup_type, created_by, ) - .await?; + .await + .map_err(|e| { + error!("Backup failed for S3 source {}: {}", s3_source_id, e); + e + })?; info!( "Successfully created backup {} for S3 source {}", @@ -2212,7 +2264,13 @@ impl BackupService { service_config, ) .await - .map_err(|e| BackupError::ExternalService(e.to_string()))?; + .map_err(|e| { + error!( + "External service backup failed for service '{}' (type={}, id={}): {}", + service.name, service.service_type, service.id, e + ); + BackupError::ExternalService(e.to_string()) + })?; info!("Backup created at location: {}", backup_location); // Get the external service backup record let external_backup = temps_entities::external_service_backups::Entity::find() diff --git a/crates/temps-providers/src/externalsvc/postgres.rs b/crates/temps-providers/src/externalsvc/postgres.rs index 6edcdbb3..f24cf5c0 100644 --- a/crates/temps-providers/src/externalsvc/postgres.rs +++ b/crates/temps-providers/src/externalsvc/postgres.rs @@ -14,7 +14,7 @@ use std::time::Duration; use temps_entities::external_service_backups; use tokio::sync::RwLock; use tokio::time::sleep; -use tracing::{error, info}; +use tracing::{debug, error, info}; use urlencoding; use crate::utils::ensure_network_exists; @@ -956,6 +956,83 @@ impl PostgresService { Ok(()) } + /// Restore a custom-format pg_dump backup via pg_restore inside the container. + /// Used for backward compatibility with backups created before the switch to plain format. + async fn restore_custom_backup_file( + &self, + docker: &Docker, + container_name: &str, + backup_data: Vec<u8>, + username: &str, + password: &str, + ) -> Result<()> { + let temp_file = tempfile::NamedTempFile::new()?; + tokio::fs::write(temp_file.path(), &backup_data).await?; + + // Create a tar archive containing the backup file + let mut tar = tar::Builder::new(Vec::new()); + tar.append_path_with_name(temp_file.path(), "backup.pgdump")?; + let tar_data = tar.into_inner()?; + + // Copy the tar archive into the container + docker + .upload_to_container( + container_name, + Some(bollard::query_parameters::UploadToContainerOptions { + path: "/".to_string(), + ..Default::default() + }), + body_full(bytes::Bytes::from(tar_data)), + ) + .await + .map_err(|e| anyhow::anyhow!("Failed to upload backup file to container: {}", e))?; + + // Execute pg_restore inside the container + let password_env = format!("PGPASSWORD={}", password); + let exec = docker + .create_exec( + container_name, + bollard::exec::CreateExecOptions { + cmd: Some(vec![ + "pg_restore", + "--verbose", + "--clean", + "--if-exists", + "--no-password", + "-U", + username, + "-d", + username, // default database is same as username + "/backup.pgdump", + ]), + env: Some(vec![password_env.as_str()]), + attach_stdout: Some(true), + attach_stderr: Some(true), + ..Default::default() + }, + ) + .await + .map_err(|e| anyhow::anyhow!("Failed to create pg_restore exec: {}", e))?; + + let output = docker.start_exec(&exec.id, None).await?; + if let bollard::exec::StartExecResults::Attached { mut output, ..
} = output { + while let Some(Ok(output)) = output.next().await { + match output { + bollard::container::LogOutput::StdOut { message } => { + info!("pg_restore stdout: {}", String::from_utf8_lossy(&message)); + } + bollard::container::LogOutput::StdErr { message } => { + // pg_restore emits progress info on stderr, log at debug level + debug!("pg_restore stderr: {}", String::from_utf8_lossy(&message)); + } + _ => {} + } + } + } + + Ok(()) + } + /// Verify that a Docker image can be pulled without actually downloading the full image /// Attempts to pull the image - fails if it doesn't exist or cannot be accessed async fn verify_image_pullable(&self, image: &str) -> Result<()> { @@ -1177,14 +1254,15 @@ impl ExternalService for PostgresService { }; // pg_dump connects to the DB container over the Docker network using its container name - // as the hostname. --format=custom produces a binary dump suitable for pg_restore. - // --compress=0 disables pg_dump's internal compression; we apply gzip ourselves while - // streaming to avoid buffering the entire dump in memory (prevents OOM / exit code 137). + // as the hostname. --format=plain streams SQL text (COPY statements) row-by-row with + // constant memory usage. We apply gzip compression ourselves while streaming to disk. + // Previous --format=custom buffered per-table data in both pg_dump and the temps process + // (via Bollard's HTTP stream), causing multi-GB memory peaks and OOM kills (exit 137) + // on large databases, especially with TimescaleDB hypertable chunks. let port_str = POSTGRES_INTERNAL_PORT.to_string(); let pg_dump_cmd = vec![ "pg_dump", - "--format=custom", - "--compress=0", + "--format=plain", "--no-password", "--host", &db_container_name, @@ -1312,7 +1390,7 @@ impl ExternalService for PostgresService { // Generate backup path in S3 let timestamp = Utc::now().format("%Y%m%d_%H%M%S"); let backup_key = format!( - "{}/postgres_backup_{}.pgdump.gz", + "{}/postgres_backup_{}.sql.gz", subpath.trim_matches('/'), timestamp ); @@ -1798,15 +1876,32 @@ impl ExternalService for PostgresService { // Get container name let container_name = self.get_container_name(); - // Restore the backup using Docker with actual credentials - self.restore_backup_file( - &self.docker, - &container_name, - decompressed_data, - &postgres_config.username, - &postgres_config.password, - ) - .await?; + // Detect backup format from the S3 location: + // - .pgdump.gz = custom format (pg_restore) + // - .sql.gz = plain SQL format (psql) + let is_plain_format = backup_location.ends_with(".sql.gz"); + + if is_plain_format { + // Plain SQL: restore via psql -f + self.restore_backup_file( + &self.docker, + &container_name, + decompressed_data, + &postgres_config.username, + &postgres_config.password, + ) + .await?; + } else { + // Custom format: restore via pg_restore + self.restore_custom_backup_file( + &self.docker, + &container_name, + decompressed_data, + &postgres_config.username, + &postgres_config.password, + ) + .await?; + } info!("PostgreSQL restore completed successfully"); Ok(()) From 92c0041046349efc0b40208956ea4fc114661f58 Mon Sep 17 00:00:00 2001 From: David Viejo Date: Sun, 22 Feb 2026 14:03:03 +0100 Subject: [PATCH 5/9] refactor(backup): update backup file extensions and improve sidecar memory management - Change backup file extension from .postgresql.gz to .sql.gz to align with plain format pg_dump output. - Update metadata location handling to accommodate both .sql.gz and legacy .backup.postgresql.gz formats. 
- Enhance sidecar container configuration to prevent OOM kills by overriding the entrypoint and adjusting oom_score_adj. - Update tests to validate the new plain format output structure. --- crates/temps-backup/src/services/backup.rs | 41 ++++++++++++------- crates/temps-cli/src/commands/backup.rs | 8 +++- .../src/externalsvc/postgres.rs | 12 +++++- 3 files changed, 44 insertions(+), 17 deletions(-) diff --git a/crates/temps-backup/src/services/backup.rs b/crates/temps-backup/src/services/backup.rs index 87404039..29bc5d1e 100644 --- a/crates/temps-backup/src/services/backup.rs +++ b/crates/temps-backup/src/services/backup.rs @@ -245,9 +245,11 @@ impl BackupService { )); } - // Generate S3 location + // Generate S3 location. Use .sql.gz extension to match the plain-format pg_dump + // output (--format=plain). The restore logic uses this extension to decide whether + // to pipe through psql (plain) or pg_restore (custom). let s3_location = format!( - "{}/backups/{}/{}/backup.postgresql.gz", + "{}/backups/{}/{}/backup.sql.gz", s3_source.bucket_path.trim_matches('/'), Utc::now().format("%Y/%m/%d"), backup_id @@ -536,14 +538,24 @@ impl BackupService { let pgpassword_env = format!("PGPASSWORD={}", decoded_password); let env_vars = vec![pgpassword_env]; - // Create container config with version-matched postgres image (includes pg_dump) + // Create container config with version-matched postgres image (includes pg_dump). + // Override the entrypoint to prevent the timescaledb-ha image from starting a full + // PostgreSQL server instance inside the sidecar. The default entrypoint + // (docker-entrypoint.sh) initializes a PG cluster and allocates shared_buffers, + // consuming 1-2 GB of RAM that competes with the actual database server for memory + // and triggers OOM kills (exit code 137). let config = Config { image: Some(image_tag), - cmd: Some(vec!["sleep".to_string(), "300".to_string()]), // Keep container alive for exec + entrypoint: Some(vec!["/bin/sleep".to_string()]), + cmd: Some(vec!["300".to_string()]), env: Some(env_vars), host_config: Some(bollard::models::HostConfig { network_mode: Some("host".to_string()), // Use host network to access database auto_remove: Some(true), + // Protect the sidecar from the OOM killer. Without this, the kernel may + // kill the sidecar (which has no oom_score_adj) instead of less important + // processes when the system is under memory pressure during large dumps. 
+ oom_score_adj: Some(-500), ..Default::default() }), ..Default::default() @@ -1526,8 +1538,8 @@ impl BackupService { let password = url.password(); // Detect backup format from S3 location path: - // - .pgdump.gz / backup.postgresql.gz = custom format (pg_restore) - // - .sql.gz = plain SQL format (psql) + // - .pgdump.gz / backup.postgresql.gz = custom format (pg_restore) [legacy backups] + // - .sql.gz = plain SQL format (psql) [current format] let is_plain_format = backup.s3_location.ends_with(".sql.gz"); let mut cmd; @@ -2081,7 +2093,9 @@ impl BackupService { "created_at": backup.started_at.to_rfc3339(), "size_bytes": backup.size_bytes, "location": backup.s3_location.clone(), - "metadata_location": backup.s3_location.replace("backup.postgresql.gz", "metadata.json") + "metadata_location": backup.s3_location + .replace("backup.sql.gz", "metadata.json") + .replace("backup.postgresql.gz", "metadata.json") })); } index["last_updated"] = json!(Utc::now().to_rfc3339()); @@ -3226,14 +3240,13 @@ mod tests { std::io::Read::read_to_end(&mut decoder, &mut decompressed) .expect("Failed to decompress backup — gzip stream is corrupt"); + // Backups use --format=plain so the decompressed content is SQL text starting + // with a comment header ("--"), not the binary PGDMP magic bytes. + let content_str = String::from_utf8_lossy(&decompressed); assert!( - decompressed.len() >= 5, - "Decompressed backup too small to contain pg_dump magic" - ); - assert_eq!( - &decompressed[..5], - b"PGDMP", - "Decompressed backup does not start with PGDMP — not a valid pg_dump custom format file" + content_str.starts_with("--"), + "Decompressed backup does not start with SQL comment header — expected plain-format pg_dump output, got: {:?}", + &decompressed[..std::cmp::min(20, decompressed.len())] ); println!("\nāœ“ Integration test passed:"); diff --git a/crates/temps-cli/src/commands/backup.rs b/crates/temps-cli/src/commands/backup.rs index 940365fe..698022e3 100644 --- a/crates/temps-cli/src/commands/backup.rs +++ b/crates/temps-cli/src/commands/backup.rs @@ -382,7 +382,9 @@ impl BackupCommand { format!("Reading backup metadata from: {}", backup.metadata_location).bright_white() ); let metadata_key = backup.metadata_location.trim_start_matches('/').to_string(); - let binding = metadata_key.replace("backup.postgresql.gz", "metadata.json"); + let binding = metadata_key + .replace("backup.sql.gz", "metadata.json") + .replace("backup.postgresql.gz", "metadata.json"); let metadata_key = binding; let metadata_key = metadata_key.as_str(); let metadata_data = rt.block_on(async { @@ -991,7 +993,9 @@ impl BackupCommand { // Download and parse metadata.json println!("{}", "Reading backup metadata...".bright_white()); let metadata_key = backup.metadata_location.trim_start_matches('/').to_string(); - let binding = metadata_key.replace("backup.postgresql.gz", "metadata.json"); + let binding = metadata_key + .replace("backup.sql.gz", "metadata.json") + .replace("backup.postgresql.gz", "metadata.json"); let metadata_key = binding; let metadata_key = metadata_key.as_str(); diff --git a/crates/temps-providers/src/externalsvc/postgres.rs b/crates/temps-providers/src/externalsvc/postgres.rs index f24cf5c0..8e733694 100644 --- a/crates/temps-providers/src/externalsvc/postgres.rs +++ b/crates/temps-providers/src/externalsvc/postgres.rs @@ -1199,10 +1199,20 @@ impl ExternalService for PostgresService { let sidecar_name = format!("temps-pg-backup-{}", uuid::Uuid::new_v4()); let password_env = format!("PGPASSWORD={}", 
postgres_config.password); + // Override the entrypoint to prevent the PostgreSQL image from starting a full + // database server inside the sidecar. The default entrypoint (docker-entrypoint.sh) + // initializes a PG cluster and allocates shared_buffers, consuming 1-2 GB of RAM + // that competes with the actual database server and triggers OOM kills (exit 137). let sidecar_config = Config { image: Some(sidecar_image.clone()), - cmd: Some(vec!["sleep".to_string(), "300".to_string()]), + entrypoint: Some(vec!["/bin/sleep".to_string()]), + cmd: Some(vec!["300".to_string()]), env: Some(vec![password_env.clone()]), + host_config: Some(bollard::models::HostConfig { + // Protect the sidecar from the OOM killer during large dumps. + oom_score_adj: Some(-500), + ..Default::default() + }), ..Default::default() }; From 023a5eb6c3936959209c78adfb25b630d24eb31a Mon Sep 17 00:00:00 2001 From: David Viejo Date: Sun, 22 Feb 2026 14:33:24 +0100 Subject: [PATCH 6/9] refactor(backup): enhance backup process with direct file writing and improved error handling - Implement a host directory for bind mounting, allowing pg_dump to write directly to disk, reducing memory usage during backups. - Update container configuration to prevent OOM kills by adjusting entrypoint and adding bind mounts. - Introduce a helper function to ensure sidecar removal on error paths. - Streamline pg_dump command execution to pipe output directly to a gzip-compressed file on the host filesystem, maintaining low memory usage. --- crates/temps-backup/src/services/backup.rs | 164 ++++++++++----------- 1 file changed, 79 insertions(+), 85 deletions(-) diff --git a/crates/temps-backup/src/services/backup.rs b/crates/temps-backup/src/services/backup.rs index 29bc5d1e..fa00f36a 100644 --- a/crates/temps-backup/src/services/backup.rs +++ b/crates/temps-backup/src/services/backup.rs @@ -538,24 +538,39 @@ impl BackupService { let pgpassword_env = format!("PGPASSWORD={}", decoded_password); let env_vars = vec![pgpassword_env]; + // Create a host directory for the bind mount so the backup file is written + // directly to disk by the sidecar container, bypassing the Temps process entirely. + // Previous approach streamed pg_dump output through Bollard's exec HTTP stream + // into the Temps process, which caused unbounded memory growth (2-6+ GB) because + // hyper/Bollard buffers the chunked HTTP response internally even though we write + // each chunk to disk immediately. + let backup_dir = self.config_service.data_dir().join("backups").join("tmp"); + tokio::fs::create_dir_all(&backup_dir).await.map_err(|e| { + anyhow::anyhow!( + "Failed to create backup temp directory {}: {}", + backup_dir.display(), + e + ) + })?; + let backup_filename = format!("{}.sql.gz", uuid::Uuid::new_v4()); + let host_backup_path = backup_dir.join(&backup_filename); + let container_backup_path = format!("/tmp/{}", backup_filename); + // Create container config with version-matched postgres image (includes pg_dump). // Override the entrypoint to prevent the timescaledb-ha image from starting a full - // PostgreSQL server instance inside the sidecar. The default entrypoint - // (docker-entrypoint.sh) initializes a PG cluster and allocates shared_buffers, - // consuming 1-2 GB of RAM that competes with the actual database server for memory - // and triggers OOM kills (exit code 137). + // PostgreSQL server instance inside the sidecar. 
+ // Bind-mount the host backup directory into the container so pg_dump writes + // directly to the host filesystem without any data flowing through the Temps process. let config = Config { image: Some(image_tag), entrypoint: Some(vec!["/bin/sleep".to_string()]), cmd: Some(vec!["300".to_string()]), env: Some(env_vars), host_config: Some(bollard::models::HostConfig { - network_mode: Some("host".to_string()), // Use host network to access database + network_mode: Some("host".to_string()), auto_remove: Some(true), - // Protect the sidecar from the OOM killer. Without this, the kernel may - // kill the sidecar (which has no oom_score_adj) instead of less important - // processes when the system is under memory pressure during large dumps. oom_score_adj: Some(-500), + binds: Some(vec![format!("{}:/tmp:rw", backup_dir.display())]), ..Default::default() }), ..Default::default() @@ -576,6 +591,19 @@ impl BackupService { .await .map_err(|e| anyhow::anyhow!("Failed to create container: {}", e))?; + // Helper to remove the sidecar on any error path + let remove_sidecar = |docker: bollard::Docker, name: String| async move { + let _ = docker + .remove_container( + &name, + Some(RemoveContainerOptions { + force: true, + ..Default::default() + }), + ) + .await; + }; + // Start container docker .start_container( @@ -583,40 +611,34 @@ impl BackupService { Some(bollard::query_parameters::StartContainerOptionsBuilder::new().build()), ) .await - .map_err(|e| anyhow::anyhow!("Failed to start container: {}", e))?; + .map_err(|e| { + let docker = docker.clone(); + let name = container_name.clone(); + tokio::spawn(async move { remove_sidecar(docker, name).await }); + anyhow::anyhow!("Failed to start container: {}", e) + })?; - // Build pg_dump command with proper lifetimes. - // --format=plain streams SQL text (COPY statements) row-by-row with constant memory. - // Previous --format=custom buffered per-table data in both pg_dump and the temps process - // (via Bollard's HTTP stream), causing 2-3 GB memory peaks on large TimescaleDB databases. + // Run pg_dump | gzip inside the sidecar, writing directly to the bind-mounted + // host filesystem. This keeps the Temps process memory flat regardless of DB size. let port_str = port.to_string(); - let pg_dump_cmd = vec![ - "pg_dump", - "--format=plain", - "--no-password", - "--host", - host, - "--port", - &port_str, - "--username", - username, - "--dbname", - database, - ]; - - info!("Running pg_dump command in Docker container"); - - // Create exec instance - // URL-decode password (it's stored URL-encoded in database for connection strings) + let pg_dump_shell_cmd = format!( + "pg_dump --format=plain --no-password --host={} --port={} --username={} --dbname={} | gzip > {}", + host, port_str, username, database, container_backup_path + ); + + info!("Running pg_dump command in Docker container (bind-mount mode)"); + + // URL-decode password for exec env let decoded_password = urlencoding::decode(password) .map(|s| s.to_string()) .unwrap_or_else(|_| password.to_string()); let pgpassword = format!("PGPASSWORD={}", decoded_password); + let exec = docker .create_exec( &container_name, CreateExecOptions { - cmd: Some(pg_dump_cmd), + cmd: Some(vec!["sh", "-c", &pg_dump_shell_cmd]), attach_stdout: Some(true), attach_stderr: Some(true), env: Some(vec![pgpassword.as_str()]), @@ -626,10 +648,9 @@ impl BackupService { .await .map_err(|e| anyhow::anyhow!("Failed to create exec: {}", e))?; - // Stream pg_dump output directly to gzip-compressed temp file. 
- // Previous implementation buffered the entire dump in memory which - // caused OOM kills (exit code 137) on large databases. - let mut encoder = GzEncoder::new(temp_file, Compression::default()); + // Consume the exec stream to let the command run. Only capture stderr for error + // reporting. stdout is empty because pg_dump output is piped to gzip > file + // inside the container. let mut stderr_data = Vec::new(); if let StartExecResults::Attached { mut output, .. } = @@ -637,42 +658,20 @@ impl BackupService { { while let Some(chunk) = FuturesStreamExt::next(&mut output).await { match chunk { - Ok(bollard::container::LogOutput::StdOut { message }) => { - std::io::Write::write_all(&mut encoder, &message)?; - } Ok(bollard::container::LogOutput::StdErr { message }) => { stderr_data.extend_from_slice(&message); } - Ok(bollard::container::LogOutput::Console { message }) => { - std::io::Write::write_all(&mut encoder, &message)?; - } Err(e) => { - // Clean up container before returning error - let _ = docker - .remove_container( - &container_name, - Some(RemoveContainerOptions { - force: true, - ..Default::default() - }), - ) - .await; + remove_sidecar(docker.clone(), container_name.clone()).await; + let _ = tokio::fs::remove_file(&host_backup_path).await; return Err(anyhow::anyhow!("Error reading pg_dump output: {}", e)); } - _ => {} + _ => {} // stdout/console are empty (piped to file inside container) } } } else { - // Clean up container - let _ = docker - .remove_container( - &container_name, - Some(RemoveContainerOptions { - force: true, - ..Default::default() - }), - ) - .await; + remove_sidecar(docker.clone(), container_name.clone()).await; + let _ = tokio::fs::remove_file(&host_backup_path).await; return Err(anyhow::anyhow!("Unexpected exec result type")); } @@ -681,16 +680,8 @@ impl BackupService { if let Some(exit_code) = exec_inspect.exit_code { if exit_code != 0 { let stderr = String::from_utf8_lossy(&stderr_data); - // Clean up container - let _ = docker - .remove_container( - &container_name, - Some(RemoveContainerOptions { - force: true, - ..Default::default() - }), - ) - .await; + remove_sidecar(docker.clone(), container_name.clone()).await; + let _ = tokio::fs::remove_file(&host_backup_path).await; return Err(anyhow::anyhow!( "pg_dump failed with exit code {}: {}", exit_code, @@ -699,20 +690,23 @@ impl BackupService { } } - // Clean up container - docker - .remove_container( - &container_name, - Some(RemoveContainerOptions { - force: true, - ..Default::default() - }), - ) + // Clean up sidecar container + remove_sidecar(docker.clone(), container_name.clone()).await; + + // Copy the backup file from the bind-mount location to the temp_file that the + // caller uses for S3 upload. This is a local file copy (not through memory). 
+ tokio::fs::copy(&host_backup_path, temp_file.path()) .await .map_err(|e| { anyhow::anyhow!( "Failed to copy backup from {} to temp file: {}", host_backup_path.display(), e ) })?; - // Finalize gzip stream - encoder.finish()?; + // Clean up the bind-mount backup file + let _ = tokio::fs::remove_file(&host_backup_path).await; info!("PostgreSQL backup completed successfully"); Ok(()) From e5b82c2a8dd2d3f0ed0ee4b784279236f62635be Mon Sep 17 00:00:00 2001 From: David Viejo Date: Sun, 22 Feb 2026 14:46:20 +0100 Subject: [PATCH 7/9] fix(backup): extend backup sidecar lifetime so it outlives long pg_dump runs Updated the sidecar sleep duration from 300 seconds to 86400 seconds (24 hours) in both BackupService and PostgresService so the sidecar outlives pg_dump on large databases. Because the sidecar runs with auto_remove, an expiring `sleep 300` tore the container down mid-dump and pg_dump was SIGKILLed, aborting large backups with exit code 137 (the same code an OOM kill produces). --- crates/temps-backup/src/services/backup.rs | 2 +- crates/temps-providers/src/externalsvc/postgres.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/temps-backup/src/services/backup.rs b/crates/temps-backup/src/services/backup.rs index fa00f36a..d89ea969 100644 --- a/crates/temps-backup/src/services/backup.rs +++ b/crates/temps-backup/src/services/backup.rs @@ -564,7 +564,7 @@ impl BackupService { let config = Config { image: Some(image_tag), entrypoint: Some(vec!["/bin/sleep".to_string()]), - cmd: Some(vec!["300".to_string()]), + cmd: Some(vec!["86400".to_string()]), // 24h: must outlive pg_dump on large DBs (42+ GB) env: Some(env_vars), host_config: Some(bollard::models::HostConfig { network_mode: Some("host".to_string()), diff --git a/crates/temps-providers/src/externalsvc/postgres.rs b/crates/temps-providers/src/externalsvc/postgres.rs index 8e733694..d3ce6591 100644 --- a/crates/temps-providers/src/externalsvc/postgres.rs +++ b/crates/temps-providers/src/externalsvc/postgres.rs @@ -1206,7 +1206,7 @@ impl ExternalService for PostgresService { let sidecar_config = Config { image: Some(sidecar_image.clone()), entrypoint: Some(vec!["/bin/sleep".to_string()]), - cmd: Some(vec!["300".to_string()]), + cmd: Some(vec!["86400".to_string()]), // 24h: must outlive pg_dump on large DBs env: Some(vec![password_env.clone()]), host_config: Some(bollard::models::HostConfig { // Protect the sidecar from the OOM killer during large dumps. From a15552d32ba47b57175cfc542ddbb847d8ce2bc6 Mon Sep 17 00:00:00 2001 From: David Viejo Date: Sun, 22 Feb 2026 14:59:23 +0100 Subject: [PATCH 8/9] refactor(backup): update backup container configuration for improved access and clarity - Change the bind mount path from /tmp to /backup; bind-mounting over /tmp broke write access for the image's default postgres user. - Set the container user to root to guarantee write permissions on the bind mount. - Enhance comments for clarity regarding the backup process and container setup.
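Note for reviewers: after patches 5 through 8, the BackupService sidecar (crates/temps-backup) ends up in the shape sketched below. This is a condensed editorial illustration, not code lifted from the diffs: the `ContainerCreateBody` and `HostConfig` fields are the bollard types this series already uses, while the `sidecar_config` helper and its signature are invented for the example.

    use bollard::models::{ContainerCreateBody, HostConfig};
    use std::path::Path;

    // Illustrative only: the sidecar configuration as it stands after patch 8.
    // This helper does not exist in the codebase; the field values mirror the diffs.
    fn sidecar_config(
        image: String,
        pgpassword_env: String,
        backup_dir: &Path,
    ) -> ContainerCreateBody {
        ContainerCreateBody {
            image: Some(image),
            // Bypass docker-entrypoint.sh so no PostgreSQL server (and no
            // shared_buffers allocation) starts inside the sidecar.
            entrypoint: Some(vec!["/bin/sleep".to_string()]),
            // 24h lifetime: must outlive pg_dump on large databases.
            cmd: Some(vec!["86400".to_string()]),
            env: Some(vec![pgpassword_env]),
            // root guarantees write access to the bind mount; timescaledb-ha
            // otherwise runs as the postgres user.
            user: Some("root".to_string()),
            host_config: Some(HostConfig {
                network_mode: Some("host".to_string()),
                auto_remove: Some(true),
                // Keep the kernel OOM killer away from the sidecar during dumps.
                oom_score_adj: Some(-500),
                // pg_dump | gzip writes into /backup; no dump data crosses the
                // Docker exec stream into the Temps process.
                binds: Some(vec![format!("{}:/backup:rw", backup_dir.display())]),
                ..Default::default()
            }),
            ..Default::default()
        }
    }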
--- crates/temps-backup/src/services/backup.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/crates/temps-backup/src/services/backup.rs b/crates/temps-backup/src/services/backup.rs index d89ea969..b72c0b76 100644 --- a/crates/temps-backup/src/services/backup.rs +++ b/crates/temps-backup/src/services/backup.rs @@ -3,7 +3,7 @@ use anyhow::Result; use aws_sdk_s3::error::ProvideErrorMetadata; use aws_sdk_s3::{Client as S3Client, Config}; use chrono::{DateTime, Duration, Timelike, Utc}; -use flate2::Compression; + use sea_orm::{ ActiveModelTrait, ColumnTrait, DatabaseConnection, EntityTrait, IntoActiveModel, QueryFilter, QueryOrder, @@ -21,7 +21,6 @@ use urlencoding; use uuid::Uuid; use cron::Schedule; -use flate2::write::GzEncoder; use temps_core::notifications::{BackupFailureData, NotificationService}; use temps_entities::{backup_schedules::Model as BackupSchedule, s3_sources::Model as S3Source}; use temps_providers::ExternalServiceManager; @@ -554,23 +553,25 @@ impl BackupService { })?; let backup_filename = format!("{}.sql.gz", uuid::Uuid::new_v4()); let host_backup_path = backup_dir.join(&backup_filename); - let container_backup_path = format!("/tmp/{}", backup_filename); + let container_backup_path = format!("/backup/{}", backup_filename); // Create container config with version-matched postgres image (includes pg_dump). // Override the entrypoint to prevent the timescaledb-ha image from starting a full // PostgreSQL server instance inside the sidecar. - // Bind-mount the host backup directory into the container so pg_dump writes - // directly to the host filesystem without any data flowing through the Temps process. + // Bind-mount the host backup directory to /backup inside the container. We use + // /backup instead of /tmp because the timescaledb-ha image runs as the postgres + // user which may not have write access to a bind-mounted /tmp. let config = Config { image: Some(image_tag), entrypoint: Some(vec!["/bin/sleep".to_string()]), cmd: Some(vec!["86400".to_string()]), // 24h: must outlive pg_dump on large DBs (42+ GB) env: Some(env_vars), + user: Some("root".to_string()), // Run as root to ensure write access to bind mount host_config: Some(bollard::models::HostConfig { network_mode: Some("host".to_string()), auto_remove: Some(true), oom_score_adj: Some(-500), - binds: Some(vec![format!("{}:/tmp:rw", backup_dir.display())]), + binds: Some(vec![format!("{}:/backup:rw", backup_dir.display())]), ..Default::default() }), ..Default::default() From 9ea5e808a6fcb970512de37f22f69d1e5ee57bf4 Mon Sep 17 00:00:00 2001 From: David Viejo Date: Sun, 22 Feb 2026 16:02:14 +0100 Subject: [PATCH 9/9] refactor(backup): optimize pg_dump execution to prevent memory issues - Update pg_dump command execution to run fully detached, avoiding memory growth by not streaming stdout/stderr through the Temps process. - Redirect stderr to a file within the container for error logging, improving error handling during backup operations. - Adjust exec options to disable stdout/stderr attachment, enhancing performance and stability during large database backups. 
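Note for reviewers: condensed, the detach-and-poll pattern this patch adopts looks like the sketch below. It is an editorial illustration, not the diff's code: the bollard calls (`create_exec`, `start_exec` with `detach: true`, `inspect_exec`) are the ones the patch uses, while the `run_detached` helper and its poll deadline are invented for the example (the diff itself polls without a timeout).

    use bollard::exec::{CreateExecOptions, StartExecOptions};
    use bollard::Docker;
    use std::time::{Duration, Instant};

    // Illustrative only: run a command in a container with no attached streams,
    // then poll until it finishes. Nothing is buffered in the calling process.
    async fn run_detached(
        docker: &Docker,
        container: &str,
        shell_cmd: &str,
        timeout: Duration,
    ) -> anyhow::Result<i64> {
        let exec = docker
            .create_exec(
                container,
                CreateExecOptions {
                    cmd: Some(vec!["sh", "-c", shell_cmd]),
                    attach_stdout: Some(false),
                    attach_stderr: Some(false),
                    ..Default::default()
                },
            )
            .await?;

        docker
            .start_exec(
                &exec.id,
                Some(StartExecOptions {
                    detach: true,
                    ..Default::default()
                }),
            )
            .await?;

        let deadline = Instant::now() + timeout;
        loop {
            let inspect = docker.inspect_exec(&exec.id).await?;
            // Editorial choice: treat a missing `running` flag as finished
            // rather than looping forever.
            if inspect.running != Some(true) {
                return Ok(inspect.exit_code.unwrap_or(-1));
            }
            if Instant::now() > deadline {
                anyhow::bail!("exec did not finish within {:?}", timeout);
            }
            tokio::time::sleep(Duration::from_secs(2)).await;
        }
    }

One caveat on the piped command itself: `sh -c "pg_dump ... 2>err | gzip > out"` reports gzip's exit status, not pg_dump's, so a failed dump can still exit 0 and leave a truncated archive behind. The stderr file is the reliable failure signal in that case; alternatively the pipeline could run under `bash -o pipefail -c`, since the Debian/Ubuntu-based postgres and timescaledb-ha images ship bash.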
--- crates/temps-backup/src/services/backup.rs | 68 ++++++++++++---------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/crates/temps-backup/src/services/backup.rs b/crates/temps-backup/src/services/backup.rs index b72c0b76..96ccdb39 100644 --- a/crates/temps-backup/src/services/backup.rs +++ b/crates/temps-backup/src/services/backup.rs @@ -493,11 +493,10 @@ impl BackupService { } async fn backup_postgres_database(&self, temp_file: &mut NamedTempFile) -> Result<()> { - use bollard::exec::{CreateExecOptions, StartExecResults}; + use bollard::exec::CreateExecOptions; use bollard::models::ContainerCreateBody as Config; use bollard::query_parameters::RemoveContainerOptions; use bollard::Docker; - use futures::stream::StreamExt as FuturesStreamExt; info!("Creating PostgreSQL database backup using Docker"); @@ -622,10 +621,6 @@ impl BackupService { // Run pg_dump | gzip inside the sidecar, writing directly to the bind-mounted // host filesystem. This keeps the Temps process memory flat regardless of DB size. let port_str = port.to_string(); - let pg_dump_shell_cmd = format!( - "pg_dump --format=plain --no-password --host={} --port={} --username={} --dbname={} | gzip > {}", - host, port_str, username, database, container_backup_path - ); info!("Running pg_dump command in Docker container (bind-mount mode)"); @@ -635,13 +630,24 @@ impl BackupService { .unwrap_or_else(|_| password.to_string()); let pgpassword = format!("PGPASSWORD={}", decoded_password); + // Run pg_dump fully detached — no stdout/stderr streaming through the Temps process. + // Previous approach used attach_stdout which caused Bollard's hyper HTTP client + // to buffer the chunked transfer encoding internally, leading to unbounded memory + // growth (19+ GB) even when we weren't reading stdout data. + // Instead we redirect stderr to a file inside the container and poll for completion. + let stderr_path = format!("/backup/{}.stderr", uuid::Uuid::new_v4()); + let pg_dump_shell_cmd = format!( + "pg_dump --format=plain --no-password --host={} --port={} --username={} --dbname={} 2>{} | gzip > {}", + host, port_str, username, database, stderr_path, container_backup_path + ); + let exec = docker .create_exec( &container_name, CreateExecOptions { cmd: Some(vec!["sh", "-c", &pg_dump_shell_cmd]), - attach_stdout: Some(true), - attach_stderr: Some(true), + attach_stdout: Some(false), + attach_stderr: Some(false), env: Some(vec![pgpassword.as_str()]), ..Default::default() }, @@ -649,33 +655,35 @@ impl BackupService { .await .map_err(|e| anyhow::anyhow!("Failed to create exec: {}", e))?; - // Consume the exec stream to let the command run. Only capture stderr for error - // reporting. stdout is empty because pg_dump output is piped to gzip > file - // inside the container. - let mut stderr_data = Vec::new(); + // Start the exec in detached mode — no HTTP stream through the Temps process + use bollard::exec::StartExecOptions; + docker + .start_exec( + &exec.id, + Some(StartExecOptions { + detach: true, + ..Default::default() + }), + ) + .await?; - if let StartExecResults::Attached { mut output, .. } = - docker.start_exec(&exec.id, None).await? 
- { - while let Some(chunk) = FuturesStreamExt::next(&mut output).await { - match chunk { - Ok(bollard::container::LogOutput::StdErr { message }) => { - stderr_data.extend_from_slice(&message); - } - Err(e) => { - remove_sidecar(docker.clone(), container_name.clone()).await; - let _ = tokio::fs::remove_file(&host_backup_path).await; - return Err(anyhow::anyhow!("Error reading pg_dump output: {}", e)); - } - _ => {} // stdout/console are empty (piped to file inside container) + // Poll for completion instead of streaming + loop { + let inspect = docker.inspect_exec(&exec.id).await?; + if let Some(running) = inspect.running { + if !running { + break; } } - } else { - remove_sidecar(docker.clone(), container_name.clone()).await; - let _ = tokio::fs::remove_file(&host_backup_path).await; - return Err(anyhow::anyhow!("Unexpected exec result type")); + tokio::time::sleep(std::time::Duration::from_secs(2)).await; } + // Read stderr from the file inside the container (via bind mount on host) + let host_stderr_path = + backup_dir.join(std::path::Path::new(&stderr_path).file_name().unwrap()); + let stderr_data = tokio::fs::read(&host_stderr_path).await.unwrap_or_default(); + let _ = tokio::fs::remove_file(&host_stderr_path).await; + // Check if command was successful let exec_inspect = docker.inspect_exec(&exec.id).await?; if let Some(exit_code) = exec_inspect.exit_code {