diff --git a/Cargo.lock b/Cargo.lock index cbbb1dc096..5fa5d6b229 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2091,6 +2091,7 @@ dependencies = [ "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", + "backon", "base64 0.22.1", "bytes", "chrono", diff --git a/crates/forge_repo/Cargo.toml b/crates/forge_repo/Cargo.toml index ed9cc726a9..2c70665b06 100644 --- a/crates/forge_repo/Cargo.toml +++ b/crates/forge_repo/Cargo.toml @@ -13,6 +13,7 @@ forge_fs.workspace = true forge_template.workspace = true forge_json_repair.workspace = true anyhow.workspace = true +backon.workspace = true futures.workspace = true async-trait.workspace = true serde.workspace = true diff --git a/crates/forge_repo/src/database/pool.rs b/crates/forge_repo/src/database/pool.rs index 2ed368e106..3abae19965 100644 --- a/crates/forge_repo/src/database/pool.rs +++ b/crates/forge_repo/src/database/pool.rs @@ -3,6 +3,7 @@ use std::path::PathBuf; use std::time::Duration; use anyhow::Result; +use backon::{BlockingRetryable, ExponentialBuilder}; use diesel::prelude::*; use diesel::r2d2::{ConnectionManager, CustomizeConnection, Pool, PooledConnection}; use diesel::sqlite::SqliteConnection; @@ -20,6 +21,7 @@ pub struct PoolConfig { pub min_idle: Option, pub connection_timeout: Duration, pub idle_timeout: Option, + pub max_retries: usize, pub database_path: PathBuf, } @@ -28,8 +30,9 @@ impl PoolConfig { Self { max_size: 5, min_idle: Some(1), - connection_timeout: Duration::from_secs(30), + connection_timeout: Duration::from_secs(5), idle_timeout: Some(Duration::from_secs(600)), // 10 minutes + max_retries: 5, database_path, } } @@ -37,6 +40,7 @@ impl PoolConfig { pub struct DatabasePool { pool: DbPool, + max_retries: usize, } impl DatabasePool { @@ -61,14 +65,65 @@ impl DatabasePool { .run_pending_migrations(MIGRATIONS) .map_err(|e| anyhow::anyhow!("Failed to run database migrations: {e}"))?; - Ok(Self { pool }) + Ok(Self { pool, max_retries: 5 }) } pub fn get_connection(&self) -> Result { - self.pool.get().map_err(|e| { - warn!(error = %e, "Failed to get connection from pool"); - anyhow::anyhow!("Failed to get connection from pool: {e}") - }) + Self::retry_with_backoff( + self.max_retries, + "Failed to get connection from pool, retrying", + || { + self.pool + .get() + .map_err(|e| anyhow::anyhow!("Failed to get connection from pool: {e}")) + }, + ) + } + + /// Retries a blocking database pool operation with exponential backoff. + fn retry_with_backoff( + max_retries: usize, + message: &'static str, + operation: impl FnMut() -> Result, + ) -> Result { + operation + .retry( + ExponentialBuilder::default() + .with_min_delay(Duration::from_secs(1)) + .with_max_times(max_retries) + .with_jitter(), + ) + .sleep(std::thread::sleep) + .notify(|err, dur| { + warn!( + error = %err, + retry_after_ms = dur.as_millis() as u64, + "{}", + message + ); + }) + .call() + } +} +// Configure SQLite for better concurrency ref: https://docs.diesel.rs/master/diesel/sqlite/struct.SqliteConnection.html#concurrency +#[derive(Debug)] +struct SqliteCustomizer; + +impl CustomizeConnection for SqliteCustomizer { + fn on_acquire(&self, conn: &mut SqliteConnection) -> Result<(), diesel::r2d2::Error> { + diesel::sql_query("PRAGMA busy_timeout = 30000;") + .execute(conn) + .map_err(diesel::r2d2::Error::QueryError)?; + diesel::sql_query("PRAGMA journal_mode = WAL;") + .execute(conn) + .map_err(diesel::r2d2::Error::QueryError)?; + diesel::sql_query("PRAGMA synchronous = NORMAL;") + .execute(conn) + .map_err(diesel::r2d2::Error::QueryError)?; + diesel::sql_query("PRAGMA wal_autocheckpoint = 1000;") + .execute(conn) + .map_err(diesel::r2d2::Error::QueryError)?; + Ok(()) } } @@ -83,36 +138,27 @@ impl TryFrom for DatabasePool { std::fs::create_dir_all(parent)?; } + // Retry pool creation with exponential backoff to handle transient + // failures such as another process holding an exclusive lock on the + // SQLite database file. + DatabasePool::retry_with_backoff( + config.max_retries, + "Failed to create database pool, retrying", + || Self::build_pool(&config), + ) + } +} + +impl DatabasePool { + /// Builds the connection pool and runs migrations. + fn build_pool(config: &PoolConfig) -> Result { let database_url = config.database_path.to_string_lossy().to_string(); let manager = ConnectionManager::::new(&database_url); - // Configure SQLite for better concurrency ref: https://docs.diesel.rs/master/diesel/sqlite/struct.SqliteConnection.html#concurrency - #[derive(Debug)] - struct SqliteCustomizer; - impl CustomizeConnection for SqliteCustomizer { - fn on_acquire(&self, conn: &mut SqliteConnection) -> Result<(), diesel::r2d2::Error> { - diesel::sql_query("PRAGMA busy_timeout = 30000;") - .execute(conn) - .map_err(diesel::r2d2::Error::QueryError)?; - diesel::sql_query("PRAGMA journal_mode = WAL;") - .execute(conn) - .map_err(diesel::r2d2::Error::QueryError)?; - diesel::sql_query("PRAGMA synchronous = NORMAL;") - .execute(conn) - .map_err(diesel::r2d2::Error::QueryError)?; - diesel::sql_query("PRAGMA wal_autocheckpoint = 1000;") - .execute(conn) - .map_err(diesel::r2d2::Error::QueryError)?; - Ok(()) - } - } - - let customizer = SqliteCustomizer; - let mut builder = Pool::builder() .max_size(config.max_size) .connection_timeout(config.connection_timeout) - .connection_customizer(Box::new(customizer)); + .connection_customizer(Box::new(SqliteCustomizer)); if let Some(min_idle) = config.min_idle { builder = builder.min_idle(Some(min_idle)); @@ -138,6 +184,6 @@ impl TryFrom for DatabasePool { })?; debug!(database_path = %config.database_path.display(), "created connection pool"); - Ok(Self { pool }) + Ok(Self { pool, max_retries: config.max_retries }) } }