From a5d6eed65609065af7fc239ce3b4d21b09b12129 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Thu, 22 Jan 2026 16:31:38 -0300 Subject: [PATCH 01/22] First implementation of off chain data saving to gcp --- Cargo.lock | 1 + architectures/centralized/server/src/app.rs | 129 ++++---- .../suites/memnet_coordinator_full_round.rs | 18 - .../suites/memnet_coordinator_rewards.rs | 18 - .../suites/memnet_treasurer_create_update.rs | 18 - .../suites/memnet_treasurer_full_epoch.rs | 20 -- shared/client/src/state/init.rs | 63 ++-- shared/client/src/state/train.rs | 10 +- shared/coordinator/Cargo.toml | 1 + shared/coordinator/src/external_config.rs | 312 ++++++++++++++++++ shared/coordinator/src/lib.rs | 1 + shared/coordinator/src/model.rs | 61 +--- shared/data-provider/src/gcs.rs | 44 +++ shared/data-provider/src/lib.rs | 3 +- shared/watcher/src/tui.rs | 4 +- tools/rust-tools/preview-lr/src/main.rs | 7 +- 16 files changed, 491 insertions(+), 219 deletions(-) create mode 100644 shared/coordinator/src/external_config.rs diff --git a/Cargo.lock b/Cargo.lock index 6fa41bd09..b3fe8059f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6982,6 +6982,7 @@ dependencies = [ "cfg_eval", "psyche-core", "serde", + "serde_json", "serde_with", "ts-rs", ] diff --git a/architectures/centralized/server/src/app.rs b/architectures/centralized/server/src/app.rs index 402146817..8d1fd1400 100644 --- a/architectures/centralized/server/src/app.rs +++ b/architectures/centralized/server/src/app.rs @@ -1,7 +1,7 @@ -use anyhow::{Result, anyhow, bail}; +use anyhow::{Result, bail}; use async_trait::async_trait; use psyche_centralized_shared::{ClientId, ClientToServerMessage, ServerToClientMessage}; -use psyche_coordinator::model::{self, Checkpoint, LLM, LLMTrainingDataLocation, Model}; +use psyche_coordinator::model::{self, Checkpoint, Model}; use psyche_coordinator::{ Client, ClientState, Coordinator, CoordinatorError, HealthChecks, Round, RunState, SOLANA_MAX_NUM_CLIENTS, TickResult, @@ -148,12 +148,18 @@ impl App { } } +fn default_data_server_port() -> u16 { + 9088 +} + #[derive(Serialize, Deserialize, Debug)] pub struct DataServerInfo { pub dir: PathBuf, pub token_size: TokenSize, pub seq_len: usize, pub shuffle_seed: [u8; 32], + #[serde(default = "default_data_server_port")] + pub port: u16, } impl App { @@ -176,72 +182,58 @@ impl App { debug!("potentially launching data server..."); - let training_data_server = match &coordinator.model { - Model::LLM(LLM { - data_location, - checkpoint, - .. - }) => { - if let LLMTrainingDataLocation::Server(url) = data_location { - match checkpoint { - Checkpoint::Hub(hub_repo) => { - let repo_id = String::from(&hub_repo.repo_id); - let revision = hub_repo.revision.map(|bytes| (&bytes).into()); - if revision.is_some() - || !tokio::fs::try_exists(PathBuf::from(repo_id.clone())) - .await - .unwrap_or_default() - { - download_model_repo_async(&repo_id, revision, None, None, None, true) - .await?; - } - } - Checkpoint::Ephemeral => { - bail!("Can't start up a run with an Ephemeral checkpoint.") - } - Checkpoint::Dummy(_) => { - // ok! 
- } - Checkpoint::P2P(_) | Checkpoint::P2PGcs(_) => { - bail!("Can't start up a run with a P2P checkpoint.") - } - Checkpoint::Gcs(gcs_repo) => { - let bucket: String = (&gcs_repo.bucket).into(); - let prefix: Option = - gcs_repo.prefix.map(|p| (&p).into()); - download_model_from_gcs_async(&bucket, prefix.as_deref()).await?; - } - } - - let server_addr: SocketAddr = String::from(url).parse().map_err(|e| { - anyhow!("Failed to parse training data server URL {:?}: {}", url, e) - })?; - let data_server_port = server_addr.port(); - let DataServerInfo { - dir, - seq_len, - shuffle_seed, - token_size - } = data_server_config.ok_or_else(|| anyhow!( - "Coordinator state requires we host training data, but no --data-config passed." - ))?; - - let local_data_provider = LocalDataProvider::new_from_directory( - dir, - token_size, - seq_len, - Shuffle::Seeded(shuffle_seed), - )?; - - let (tx, backend) = ChannelCoordinatorBackend::new(); - let data_server = - DataProviderTcpServer::start(local_data_provider, backend, data_server_port) + let training_data_server = if let Some(DataServerInfo { + dir, + seq_len, + shuffle_seed, + token_size, + port, + }) = data_server_config + { + // Download model if needed based on checkpoint type + let Model::LLM(llm) = &coordinator.model; + match &llm.checkpoint { + Checkpoint::Hub(hub_repo) => { + let repo_id = String::from(&hub_repo.repo_id); + let revision = hub_repo.revision.map(|bytes| (&bytes).into()); + if revision.is_some() + || !tokio::fs::try_exists(PathBuf::from(repo_id.clone())) + .await + .unwrap_or_default() + { + download_model_repo_async(&repo_id, revision, None, None, None, true) .await?; - Some((tx, data_server)) - } else { - None + } + } + Checkpoint::Ephemeral => { + bail!("Can't start up a run with an Ephemeral checkpoint.") + } + Checkpoint::Dummy(_) => { + // ok! + } + Checkpoint::P2P(_) | Checkpoint::P2PGcs(_) => { + bail!("Can't start up a run with a P2P checkpoint.") + } + Checkpoint::Gcs(gcs_repo) => { + let bucket: String = (&gcs_repo.bucket).into(); + let prefix: Option = gcs_repo.prefix.map(|p| (&p).into()); + download_model_from_gcs_async(&bucket, prefix.as_deref()).await?; } } + + let local_data_provider = LocalDataProvider::new_from_directory( + dir, + token_size, + seq_len, + Shuffle::Seeded(shuffle_seed), + )?; + + let (tx, backend) = ChannelCoordinatorBackend::new(); + let data_server = + DataProviderTcpServer::start(local_data_provider, backend, port).await?; + Some((tx, data_server)) + } else { + None }; debug!("data server work done."); @@ -253,8 +245,7 @@ impl App { } else { (None, None) }; - let (cancel, tx_tui_state) = - maybe_start_render_loop(tabs)?; + let (cancel, tx_tui_state) = maybe_start_render_loop(tabs)?; let mut tick_interval = interval(Duration::from_millis(500)); tick_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); //important! 
@@ -293,7 +284,9 @@ impl App { withdraw_on_disconnect, pause, }) - }.instrument(info_span!("App::new")).await + } + .instrument(info_span!("App::new")) + .await } pub async fn run(&mut self) -> Result<()> { diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_round.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_round.rs index fc03202cf..45a235723 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_round.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_full_round.rs @@ -5,13 +5,7 @@ use psyche_coordinator::WitnessProof; use psyche_coordinator::model::Checkpoint; use psyche_coordinator::model::HubRepo; use psyche_coordinator::model::LLM; -use psyche_coordinator::model::LLMArchitecture; -use psyche_coordinator::model::LLMTrainingDataLocation; -use psyche_coordinator::model::LLMTrainingDataType; use psyche_coordinator::model::Model; -use psyche_core::ConstantLR; -use psyche_core::LearningRateSchedule; -use psyche_core::OptimizerDefinition; use psyche_solana_authorizer::logic::AuthorizationGrantorUpdateParams; use psyche_solana_coordinator::ClientId; use psyche_solana_coordinator::CoordinatorAccount; @@ -111,20 +105,8 @@ pub async fn run() { waiting_for_members_extra_time: 3, }), Some(Model::LLM(LLM { - architecture: LLMArchitecture::HfLlama, checkpoint: Checkpoint::Dummy(HubRepo::dummy()), max_seq_len: 4096, - data_type: LLMTrainingDataType::Pretraining, - data_location: LLMTrainingDataLocation::default(), - lr_schedule: LearningRateSchedule::Constant(ConstantLR::default()), - optimizer: OptimizerDefinition::Distro { - clip_grad_norm: None, - compression_decay: 1.0, - compression_topk: 1, - compression_chunk: 1, - quantize_1bit: false, - weight_decay: None, - }, cold_start_warmup_steps: 0, })), None, // no explicit progress diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_rewards.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_rewards.rs index f69bbf8c5..d8779d257 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_rewards.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_rewards.rs @@ -5,13 +5,7 @@ use psyche_coordinator::WAITING_FOR_MEMBERS_EXTRA_SECONDS; use psyche_coordinator::model::Checkpoint; use psyche_coordinator::model::HubRepo; use psyche_coordinator::model::LLM; -use psyche_coordinator::model::LLMArchitecture; -use psyche_coordinator::model::LLMTrainingDataLocation; -use psyche_coordinator::model::LLMTrainingDataType; use psyche_coordinator::model::Model; -use psyche_core::ConstantLR; -use psyche_core::LearningRateSchedule; -use psyche_core::OptimizerDefinition; use psyche_solana_authorizer::logic::AuthorizationGrantorUpdateParams; use psyche_solana_coordinator::ClientId; use psyche_solana_coordinator::CoordinatorAccount; @@ -108,20 +102,8 @@ pub async fn run() { total_steps: 100, }), Some(Model::LLM(LLM { - architecture: LLMArchitecture::HfLlama, checkpoint: Checkpoint::Dummy(HubRepo::dummy()), max_seq_len: 4096, - data_type: LLMTrainingDataType::Pretraining, - data_location: LLMTrainingDataLocation::default(), - lr_schedule: LearningRateSchedule::Constant(ConstantLR::default()), - optimizer: OptimizerDefinition::Distro { - clip_grad_norm: None, - compression_decay: 1.0, - compression_topk: 1, - compression_chunk: 1, - quantize_1bit: false, - weight_decay: None, - }, cold_start_warmup_steps: 
0, })), None, // no explicit progress diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_create_update.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_create_update.rs index e51ced2dd..3d185f7fd 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_create_update.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_create_update.rs @@ -3,13 +3,7 @@ use psyche_coordinator::WAITING_FOR_MEMBERS_EXTRA_SECONDS; use psyche_coordinator::model::Checkpoint; use psyche_coordinator::model::HubRepo; use psyche_coordinator::model::LLM; -use psyche_coordinator::model::LLMArchitecture; -use psyche_coordinator::model::LLMTrainingDataLocation; -use psyche_coordinator::model::LLMTrainingDataType; use psyche_coordinator::model::Model; -use psyche_core::ConstantLR; -use psyche_core::LearningRateSchedule; -use psyche_core::OptimizerDefinition; use psyche_solana_coordinator::CoordinatorAccount; use psyche_solana_tooling::create_memnet_endpoint::create_memnet_endpoint; use psyche_solana_tooling::process_treasurer_instructions::process_treasurer_run_create; @@ -55,20 +49,8 @@ pub async fn run() { as u8, }), model: Some(Model::LLM(LLM { - architecture: LLMArchitecture::HfLlama, checkpoint: Checkpoint::Dummy(HubRepo::dummy()), max_seq_len: 4096, - data_type: LLMTrainingDataType::Pretraining, - data_location: LLMTrainingDataLocation::default(), - lr_schedule: LearningRateSchedule::Constant(ConstantLR::default()), - optimizer: OptimizerDefinition::Distro { - clip_grad_norm: None, - compression_decay: 1.0, - compression_topk: 1, - compression_chunk: 1, - quantize_1bit: false, - weight_decay: None, - }, cold_start_warmup_steps: 0, })), progress: None, diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs index 014772e32..3875de2be 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_treasurer_full_epoch.rs @@ -7,13 +7,7 @@ use psyche_coordinator::WAITING_FOR_MEMBERS_EXTRA_SECONDS; use psyche_coordinator::model::Checkpoint; use psyche_coordinator::model::HubRepo; use psyche_coordinator::model::LLM; -use psyche_coordinator::model::LLMArchitecture; -use psyche_coordinator::model::LLMTrainingDataLocation; -use psyche_coordinator::model::LLMTrainingDataType; use psyche_coordinator::model::Model; -use psyche_core::ConstantLR; -use psyche_core::LearningRateSchedule; -use psyche_core::OptimizerDefinition; use psyche_solana_authorizer::logic::AuthorizationGranteeUpdateParams; use psyche_solana_authorizer::logic::AuthorizationGrantorUpdateParams; use psyche_solana_coordinator::ClientId; @@ -230,22 +224,8 @@ pub async fn run() { waiting_for_members_extra_time: 3, }), model: Some(Model::LLM(LLM { - architecture: LLMArchitecture::HfLlama, checkpoint: Checkpoint::Dummy(HubRepo::dummy()), max_seq_len: 4096, - data_type: LLMTrainingDataType::Pretraining, - data_location: LLMTrainingDataLocation::default(), - lr_schedule: LearningRateSchedule::Constant( - ConstantLR::default(), - ), - optimizer: OptimizerDefinition::Distro { - clip_grad_norm: None, - compression_decay: 1.0, - compression_topk: 1, - compression_chunk: 1, - quantize_1bit: false, - weight_decay: None, - }, cold_start_warmup_steps: 0, })), progress: None, diff --git a/shared/client/src/state/init.rs 
b/shared/client/src/state/init.rs index 7a945f74b..1c58b0707 100644 --- a/shared/client/src/state/init.rs +++ b/shared/client/src/state/init.rs @@ -1,6 +1,7 @@ use crate::{WandBInfo, fetch_data::DataFetcher}; use psyche_coordinator::{ Coordinator, HealthChecks, + external_config::{ExternalModelConfig, get_config_gcs_path}, model::{self, HttpLLMTrainingDataLocation, LLMTrainingDataLocation}, }; use psyche_core::{ @@ -9,7 +10,7 @@ use psyche_core::{ use psyche_data_provider::{ DataProvider, DataProviderTcpClient, DownloadError, DummyDataProvider, PreprocessedDataProvider, Split, WeightedDataProvider, download_dataset_repo_async, - download_model_from_gcs_async, download_model_repo_async, + download_model_from_gcs_async, download_model_repo_async, fetch_json_from_gcs, http::{FileURLs, HttpDataProvider}, }; use psyche_metrics::ClientMetrics; @@ -199,11 +200,26 @@ impl RunInitConfigAndIO { + debug!("Fetching external config from gs://{}/{}", bucket, path); + fetch_json_from_gcs(&bucket, &path).await? + } + None => { + debug!("No GCS checkpoint, using default external config"); + ExternalModelConfig::default() + } + }; + let hub_read_token = init_config.hub_read_token.clone(); let hub_max_concurrent_downloads = init_config.hub_max_concurrent_downloads; let data_future = async { - debug!("Setting up data provider from {:?}", llm.data_location); - let data_provider = match llm.data_location { + debug!( + "Setting up data provider from {:?}", + external_config.data_location + ); + let data_provider = match external_config.data_location { LLMTrainingDataLocation::Server(data_server) => DataProvider::Server( DataProviderTcpClient::connect( (&data_server).into(), @@ -268,7 +284,8 @@ impl RunInitConfigAndIO> = match &llm.architecture + let model_future: JoinHandle> = match &external_config + .architecture { model::LLMArchitecture::HfLlama | model::LLMArchitecture::HfDeepseek @@ -387,7 +404,7 @@ impl RunInitConfigAndIO { AutoConfig::Llama(serde_json::from_str(&model_config)?) 
} @@ -408,7 +425,7 @@ impl RunInitConfigAndIO RunInitConfigAndIO 1 - && llm.architecture == model::LLMArchitecture::HfAuto + && external_config.architecture == model::LLMArchitecture::HfAuto { 1 } else { @@ -489,7 +506,7 @@ impl RunInitConfigAndIO = - match llm.data_type { + match external_config.data_type { model::LLMTrainingDataType::Finetuning => { #[cfg(feature = "parallelism")] { @@ -503,7 +520,9 @@ impl RunInitConfigAndIO None, }; - let raw_loaded_model_type: RawLoadedModelType = match llm.architecture { + let raw_loaded_model_type: RawLoadedModelType = match external_config + .architecture + { model::LLMArchitecture::HfAuto | model::LLMArchitecture::Torchtitan => { #[cfg(feature = "python")] { @@ -513,7 +532,7 @@ impl RunInitConfigAndIO RunInitConfigAndIO RunInitConfigAndIO RunInitConfigAndIO RunInitConfigAndIO, data_parallel: None, }, - llm.lr_schedule, - llm.optimizer, + external_config.lr_schedule, + external_config.optimizer, init_config.micro_batch_size, init_config.optim_stats_every_n_steps, init_config.grad_accum_in_fp32, @@ -822,8 +841,8 @@ impl RunInitConfigAndIO RunInitConfigAndIO RunInitConfigAndIO quantize_1bit, + _ => false, + }; + let training = TrainingStepMetadata { data_fetcher, identity: init_config.identity, @@ -855,6 +879,7 @@ impl RunInitConfigAndIO { pub write_gradients_dir: Option, pub model_task_runner: ModelTaskRunner, + pub quantize_1bit: bool, } #[derive(Debug)] @@ -274,12 +275,7 @@ impl TrainingStepMetadata let cancel_training = cancel_training.clone(); let write_gradients_dir = self.write_gradients_dir.clone(); let tx_distro_result = self.tx_distro_result.clone(); - let quantize = match &state.model { - model::Model::LLM(llm) => match llm.optimizer { - OptimizerDefinition::Distro { quantize_1bit, .. } => quantize_1bit, - _ => false, - }, - }; + let quantize = self.quantize_1bit; let finished = finished.clone(); let TrainingDataForStep { diff --git a/shared/coordinator/Cargo.toml b/shared/coordinator/Cargo.toml index f7cdecc81..2ca5ec09a 100644 --- a/shared/coordinator/Cargo.toml +++ b/shared/coordinator/Cargo.toml @@ -9,6 +9,7 @@ async-trait.workspace = true anchor-lang.workspace = true bytemuck.workspace = true serde_with.workspace = true +serde_json.workspace = true serde.workspace = true cfg_eval = "0.1.2" ts-rs.workspace = true diff --git a/shared/coordinator/src/external_config.rs b/shared/coordinator/src/external_config.rs new file mode 100644 index 000000000..5ab162ebe --- /dev/null +++ b/shared/coordinator/src/external_config.rs @@ -0,0 +1,312 @@ +//! External model configuration stored in GCS. +//! +//! This module provides schemas for model configuration that lives outside +//! the on-chain state. The coordinator only needs minimal fields on-chain: +//! - `checkpoint` (reads and writes for Hub↔P2P transitions) +//! - `max_seq_len` (reads for sequence length) +//! - `cold_start_warmup_steps` (reads for warmup bounds) +//! +//! Everything else is stored in GCS at `gs://{checkpoint_bucket}/config/model_config.json` +//! and fetched by clients at startup. + +use serde::{Deserialize, Serialize}; + +use crate::model::{ + Checkpoint, GcsRepo, LLMArchitecture, LLMTrainingDataLocation, LLMTrainingDataType, +}; +use psyche_core::{LearningRateSchedule, OptimizerDefinition}; + +/// Path within the bucket where config is stored +pub const CONFIG_PREFIX: &str = "config"; +/// Filename for the model config +pub const MODEL_CONFIG_FILENAME: &str = "model_config.json"; + +/// External model configuration schema. 
+/// This is stored in GCS and fetched by clients. +/// +/// Adding new fields here doesn't affect on-chain memory layout. +/// Use `#[serde(default)]` for backward compatibility. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ExternalModelConfig { + /// Schema version for forward compatibility + #[serde(default = "default_version")] + pub version: u32, + + /// Model architecture (HfLlama, HfDeepseek, etc.) + #[serde(default = "default_architecture")] + pub architecture: LLMArchitecture, + + /// Training data type (Pretraining, Finetuning) + #[serde(default = "default_data_type")] + pub data_type: LLMTrainingDataType, + + /// Training data location + #[serde(default)] + pub data_location: LLMTrainingDataLocation, + + /// Learning rate schedule + #[serde(default = "default_lr_schedule")] + pub lr_schedule: LearningRateSchedule, + + /// Optimizer configuration + #[serde(default = "default_optimizer")] + pub optimizer: OptimizerDefinition, + + /// Optional run metadata + #[serde(default, skip_serializing_if = "Option::is_none")] + pub run_metadata: Option, + + /// Optional client requirements + #[serde(default, skip_serializing_if = "Option::is_none")] + pub client_requirements: Option, +} + +fn default_version() -> u32 { + 1 +} + +fn default_architecture() -> LLMArchitecture { + LLMArchitecture::HfLlama +} + +fn default_data_type() -> LLMTrainingDataType { + LLMTrainingDataType::Pretraining +} + +fn default_lr_schedule() -> LearningRateSchedule { + LearningRateSchedule::Constant(psyche_core::ConstantLR::default()) +} + +fn default_optimizer() -> OptimizerDefinition { + OptimizerDefinition::Dummy +} + +impl Default for ExternalModelConfig { + fn default() -> Self { + Self { + version: default_version(), + architecture: LLMArchitecture::HfLlama, + data_type: LLMTrainingDataType::Pretraining, + data_location: LLMTrainingDataLocation::default(), + lr_schedule: default_lr_schedule(), + optimizer: default_optimizer(), + run_metadata: None, + client_requirements: None, + } + } +} + +/// Run metadata - display information about the run +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct RunMetadata { + #[serde(default)] + pub name: String, + + #[serde(default)] + pub description: String, + + #[serde(default)] + pub num_parameters: u64, + + #[serde(default)] + pub vocab_size: u64, + + #[serde(default)] + pub client_version: String, +} + +/// Client requirements for joining the run +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct ClientRequirements { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub min_gpu_memory_gb: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub recommended_gpu: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub recommended_micro_batch: Option, + + #[serde(default, skip_serializing_if = "Option::is_none")] + pub recommended_total_batch: Option, +} + +impl ExternalModelConfig { + /// Serialize to JSON string + pub fn to_json(&self) -> Result { + serde_json::to_string_pretty(self) + } + + /// Deserialize from JSON string + pub fn from_json(json: &str) -> Result { + serde_json::from_str(json) + } + + /// Validate the configuration + pub fn check(&self) -> bool { + // Validate data location + let bad_data_location = match &self.data_location { + LLMTrainingDataLocation::Dummy => false, + LLMTrainingDataLocation::Server(url) => url.is_empty(), + LLMTrainingDataLocation::Local(_) => false, + LLMTrainingDataLocation::Http(http_loc) => { + use 
crate::model::HttpTrainingDataLocation; + match &http_loc.location { + HttpTrainingDataLocation::SingleUrl(url) => url.is_empty(), + HttpTrainingDataLocation::NumberedFiles { + url_template, + num_files, + .. + } => url_template.is_empty() || *num_files == 0, + HttpTrainingDataLocation::Gcp { bucket_name, .. } => bucket_name.is_empty(), + } + } + LLMTrainingDataLocation::WeightedHttp(url) => url.is_empty(), + LLMTrainingDataLocation::Preprocessed(url) => url.is_empty(), + }; + + if bad_data_location { + return false; + } + + // Validate optimizer + match &self.optimizer { + OptimizerDefinition::Dummy => false, + OptimizerDefinition::AdamW { .. } => true, + OptimizerDefinition::Distro { .. } => true, + } + } +} + +/// Helper to derive the config GCS path from a checkpoint. +/// Returns `Some((bucket, path))` for GCS checkpoints, `None` for others. +pub fn get_config_gcs_path(checkpoint: &Checkpoint) -> Option<(String, String)> { + let gcs_repo = match checkpoint { + Checkpoint::Gcs(repo) | Checkpoint::P2PGcs(repo) => repo, + // For Hub/P2P checkpoints, we could potentially use a different mechanism + // or require explicit config URL + _ => return None, + }; + + let bucket = gcs_repo.bucket.to_string(); + let path = format!("{}/{}", CONFIG_PREFIX, MODEL_CONFIG_FILENAME); + + Some((bucket, path)) +} + +/// Construct the full GCS URI for the config file +pub fn get_config_gcs_uri(checkpoint: &Checkpoint) -> Option { + get_config_gcs_path(checkpoint).map(|(bucket, path)| format!("gs://{}/{}", bucket, path)) +} + +/// Helper to create a GcsRepo for the config location from a checkpoint +pub fn get_config_gcs_repo(checkpoint: &Checkpoint) -> Option { + match checkpoint { + Checkpoint::Gcs(repo) | Checkpoint::P2PGcs(repo) => Some(GcsRepo { + bucket: repo.bucket.clone(), + prefix: Some(psyche_core::FixedString::from_str_truncated(CONFIG_PREFIX)), + }), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use psyche_core::FixedString; + + #[test] + fn test_roundtrip() { + let config = ExternalModelConfig { + version: 1, + architecture: LLMArchitecture::HfLlama, + data_type: LLMTrainingDataType::Pretraining, + data_location: LLMTrainingDataLocation::default(), + lr_schedule: default_lr_schedule(), + optimizer: OptimizerDefinition::AdamW { + betas: [0.9, 0.999], + weight_decay: 0.01, + eps: 1e-8, + clip_grad_norm: None, + }, + run_metadata: Some(RunMetadata { + name: "Test Run".to_string(), + description: "A test training run".to_string(), + num_parameters: 20_000_000, + vocab_size: 32_000, + client_version: "v1.0.0".to_string(), + }), + client_requirements: None, + }; + + let json = config.to_json().unwrap(); + let parsed = ExternalModelConfig::from_json(&json).unwrap(); + + assert_eq!(parsed.version, config.version); + assert_eq!(parsed.architecture, config.architecture); + assert_eq!(parsed.run_metadata.unwrap().name, "Test Run"); + } + + #[test] + fn test_backward_compatibility() { + // Old JSON without new fields + let old_json = r#"{ + "version": 1, + "architecture": "HfLlama" + }"#; + + let config = ExternalModelConfig::from_json(old_json).unwrap(); + + // Should use defaults for missing fields + assert_eq!(config.architecture, LLMArchitecture::HfLlama); + assert!(matches!( + config.data_location, + LLMTrainingDataLocation::Dummy + )); + assert!(config.run_metadata.is_none()); + } + + #[test] + fn test_config_gcs_path() { + let checkpoint = Checkpoint::Gcs(GcsRepo { + bucket: FixedString::from_str_truncated("my-bucket"), + prefix: 
Some(FixedString::from_str_truncated("checkpoints")), + }); + + let (bucket, path) = get_config_gcs_path(&checkpoint).unwrap(); + assert_eq!(bucket, "my-bucket"); + assert_eq!(path, "config/model_config.json"); + + let uri = get_config_gcs_uri(&checkpoint).unwrap(); + assert_eq!(uri, "gs://my-bucket/config/model_config.json"); + } + + #[test] + fn test_config_gcs_path_hub_returns_none() { + use crate::model::HubRepo; + + let checkpoint = Checkpoint::Hub(HubRepo { + repo_id: FixedString::from_str_truncated("org/model"), + revision: None, + }); + + assert!(get_config_gcs_path(&checkpoint).is_none()); + } + + #[test] + fn test_adding_new_fields() { + // This test demonstrates that adding new fields doesn't break parsing + // of old configs (as long as they have #[serde(default)]) + let config_with_future_fields = r#"{ + "version": 2, + "architecture": "HfLlama", + "some_future_field": "this field doesn't exist yet", + "another_future_field": { "nested": true } + }"#; + + // Should parse without error, ignoring unknown fields + let config = ExternalModelConfig::from_json(config_with_future_fields).unwrap(); + assert_eq!(config.version, 2); + assert_eq!(config.architecture, LLMArchitecture::HfLlama); + } +} diff --git a/shared/coordinator/src/lib.rs b/shared/coordinator/src/lib.rs index bef26863e..fff25b812 100644 --- a/shared/coordinator/src/lib.rs +++ b/shared/coordinator/src/lib.rs @@ -4,6 +4,7 @@ mod commitment; mod committee_selection; mod coordinator; mod data_selection; +pub mod external_config; pub mod model; pub use commitment::Commitment; diff --git a/shared/coordinator/src/model.rs b/shared/coordinator/src/model.rs index 3176f276e..3fa2f8825 100644 --- a/shared/coordinator/src/model.rs +++ b/shared/coordinator/src/model.rs @@ -5,10 +5,7 @@ use anchor_lang::{ prelude::{borsh, msg}, }; use bytemuck::{Zeroable, ZeroableInOption}; -use psyche_core::{ - ConstantLR, FixedString, FixedVec, LearningRateSchedule, OptimizerDefinition, Shuffle, - TokenSize, -}; +use psyche_core::{FixedString, FixedVec, Shuffle, TokenSize}; use serde::{Deserialize, Serialize}; use ts_rs::TS; @@ -183,31 +180,31 @@ pub enum HttpTrainingDataLocation { }, } +/// On-chain LLM configuration. +/// +/// This struct only contains fields that the coordinator needs to read/write. +/// All other configuration (architecture, data_location, lr_schedule, optimizer, etc.) +/// is stored externally in GCS and fetched by clients. +/// +/// See `external_config::ExternalModelConfig` for the external configuration schema. 
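+///
+/// A minimal construction sketch (illustrative values only, mirroring
+/// `LLM::dummy()` below; not a recommended configuration):
+///
+/// ```ignore
+/// use psyche_coordinator::model::{Checkpoint, HubRepo, LLM};
+///
+/// let llm = LLM {
+///     max_seq_len: 2048,
+///     cold_start_warmup_steps: 0,
+///     checkpoint: Checkpoint::Dummy(HubRepo::dummy()),
+/// };
+/// ```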
#[derive( AnchorSerialize, AnchorDeserialize, Serialize, Deserialize, Clone, Debug, Zeroable, Copy, TS, )] #[repr(C)] pub struct LLM { + /// Maximum sequence length for training pub max_seq_len: u32, + /// Number of warmup steps for cold start pub cold_start_warmup_steps: u32, - pub architecture: LLMArchitecture, + /// Checkpoint location - coordinator reads and writes this for Hub↔P2P transitions pub checkpoint: Checkpoint, - pub data_type: LLMTrainingDataType, - pub data_location: LLMTrainingDataLocation, - pub lr_schedule: LearningRateSchedule, - pub optimizer: OptimizerDefinition, } impl LLM { pub fn dummy() -> Self { Self { - architecture: LLMArchitecture::HfLlama, checkpoint: Checkpoint::Dummy(HubRepo::dummy()), - data_location: LLMTrainingDataLocation::default(), - data_type: LLMTrainingDataType::Pretraining, - lr_schedule: LearningRateSchedule::Constant(ConstantLR::default()), max_seq_len: 2048, - optimizer: OptimizerDefinition::Dummy, cold_start_warmup_steps: 0, } } @@ -305,6 +302,11 @@ impl std::fmt::Display for Checkpoint { } impl Model { + /// Check on-chain model configuration validity. + /// + /// This only validates fields stored on-chain. External configuration + /// (architecture, data_location, optimizer, etc.) is validated by + /// `ExternalModelConfig::check()` on the client side. pub fn check(&self) -> bool { match self { Model::LLM(llm) => { @@ -313,28 +315,6 @@ impl Model { return false; } - let bad_data_location = match llm.data_location { - LLMTrainingDataLocation::Dummy => false, - LLMTrainingDataLocation::Server(url) => url.is_empty(), - LLMTrainingDataLocation::Local(_) => false, - LLMTrainingDataLocation::Http(HttpLLMTrainingDataLocation { - location, .. - }) => match location { - HttpTrainingDataLocation::SingleUrl(url) => url.is_empty(), - HttpTrainingDataLocation::NumberedFiles { - url_template, - num_files, - .. - } => url_template.is_empty() || num_files == 0, - HttpTrainingDataLocation::Gcp { bucket_name, .. } => bucket_name.is_empty(), - }, - LLMTrainingDataLocation::WeightedHttp(url) => url.is_empty(), - LLMTrainingDataLocation::Preprocessed(url) => url.is_empty(), - }; - if bad_data_location { - msg!("model check failed: bad LLM training data location."); - return false; - } let bad_checkpoint = match llm.checkpoint { Checkpoint::Dummy(_hub_repo) => false, Checkpoint::Ephemeral => true, @@ -349,14 +329,7 @@ impl Model { msg!("model check failed: bad checkpoint"); return false; } - if !match llm.optimizer { - OptimizerDefinition::Dummy => false, - OptimizerDefinition::AdamW { .. } => true, - OptimizerDefinition::Distro { .. } => true, - } { - msg!("model check failed: bad optimizer"); - return false; - } + true } } diff --git a/shared/data-provider/src/gcs.rs b/shared/data-provider/src/gcs.rs index 71f29e414..339039241 100644 --- a/shared/data-provider/src/gcs.rs +++ b/shared/data-provider/src/gcs.rs @@ -323,6 +323,50 @@ pub fn download_model_from_gcs_sync( rt.block_on(download_model_from_gcs_async(bucket, prefix)) } +/// Fetch a JSON file from GCS and deserialize it. +/// Used for fetching external model configuration. 
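+///
+/// Usage sketch; the bucket name is illustrative, and the object path is
+/// the one produced by `external_config::get_config_gcs_path`:
+///
+/// ```ignore
+/// let config: ExternalModelConfig =
+///     fetch_json_from_gcs("my-bucket", "config/model_config.json").await?;
+/// ```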
+pub async fn fetch_json_from_gcs<T: serde::de::DeserializeOwned>(
+    bucket: &str,
+    object_path: &str,
+) -> Result<T, DownloadError> {
+    let storage = Storage::builder()
+        .build()
+        .await
+        .map_err(|e| DownloadError::Gcs(e.to_string()))?;
+
+    let bucket_resource_name = format!("projects/_/buckets/{}", bucket);
+
+    debug!("Fetching gs://{}/{}", bucket, object_path);
+
+    let mut read_response = storage
+        .read_object(&bucket_resource_name, object_path)
+        .send()
+        .await
+        .map_err(|e| DownloadError::Gcs(format!("Failed to read {}: {}", object_path, e)))?;
+
+    let mut data = Vec::new();
+    while let Some(chunk_result) = read_response.next().await {
+        let chunk = chunk_result.map_err(|e| DownloadError::Gcs(e.to_string()))?;
+        data.extend_from_slice(&chunk);
+    }
+
+    serde_json::from_slice(&data).map_err(|e| {
+        DownloadError::Gcs(format!(
+            "Failed to parse JSON from gs://{}/{}: {}",
+            bucket, object_path, e
+        ))
+    })
+}
+
+/// Fetch a JSON file from GCS synchronously.
+pub fn fetch_json_from_gcs_sync<T: serde::de::DeserializeOwned>(
+    bucket: &str,
+    object_path: &str,
+) -> Result<T, DownloadError> {
+    let rt = Runtime::new().map_err(DownloadError::Io)?;
+    rt.block_on(fetch_json_from_gcs(bucket, object_path))
+}
+
 pub async fn upload_to_gcs(
     gcs_info: GcsUploadInfo,
     manifest_metadata: GcsManifestMetadata,
diff --git a/shared/data-provider/src/lib.rs b/shared/data-provider/src/lib.rs
index 0044d77d2..b2debc2e5 100644
--- a/shared/data-provider/src/lib.rs
+++ b/shared/data-provider/src/lib.rs
@@ -19,7 +19,8 @@ pub use errors::{DownloadError, UploadError};
 pub use file_extensions::{DATA_FILE_EXTENSIONS, PARQUET_EXTENSION};
 pub use gcs::{
     GcsCheckpointManifest, GcsManifestMetadata, GcsUploadInfo, ManifestFileEntry, ManifestMetadata,
-    download_model_from_gcs_async, download_model_from_gcs_sync, upload_to_gcs,
+    download_model_from_gcs_async, download_model_from_gcs_sync, fetch_json_from_gcs,
+    fetch_json_from_gcs_sync, upload_to_gcs,
 };
 pub use hub::{
     HubUploadInfo, download_dataset_repo_async, download_dataset_repo_sync,
diff --git a/shared/watcher/src/tui.rs b/shared/watcher/src/tui.rs
index 58bac18ba..9be2b57ff 100644
--- a/shared/watcher/src/tui.rs
+++ b/shared/watcher/src/tui.rs
@@ -187,9 +187,7 @@ impl From<&Coordinator> for CoordinatorTuiState {
                 .iter()
                 .map(|c| format!("{:?}", c.id))
                 .collect(),
-            data_source: match &value.model {
-                Model::LLM(l) => format!("{:?}", l.data_type),
-            },
+            data_source: "External Config".to_string(), // data_type moved to external config
             model_checkpoint: match &value.model {
                 Model::LLM(l) => format!("{}", l.checkpoint),
             },
diff --git a/tools/rust-tools/preview-lr/src/main.rs b/tools/rust-tools/preview-lr/src/main.rs
index ffea7ea00..cc557b187 100644
--- a/tools/rust-tools/preview-lr/src/main.rs
+++ b/tools/rust-tools/preview-lr/src/main.rs
@@ -1,6 +1,6 @@
 use clap::Parser;
 use plotters::prelude::*;
-use psyche_coordinator::{CoordinatorConfig, model::Model};
+use psyche_coordinator::{CoordinatorConfig, external_config::ExternalModelConfig, model::Model};
 use serde::Deserialize;
 use std::path::PathBuf;
@@ -28,7 +28,9 @@ enum Commands {
 #[derive(Deserialize)]
 struct Config {
     pub config: CoordinatorConfig,
+    #[allow(dead_code)]
     pub model: Model,
+    pub external_config: ExternalModelConfig,
 }
 
 fn main() -> anyhow::Result<()> {
     let args = Args::parse();
@@ -48,9 +50,8 @@ fn main() -> anyhow::Result<()> {
 
     let config: Config = toml::from_str(&std::fs::read_to_string(&config_path)?)?;
 
-    let Model::LLM(llm) = config.model;
     let steps = config.config.total_steps;
-    let lr = llm.lr_schedule;
+    let lr = config.external_config.lr_schedule;
 
     let root =
        BitMapBackend::new("lr-plot.png", (steps.min(10_000), 1024)).into_drawing_area();
    root.fill(&WHITE)?;

From eb0d7f2a07a64c8f176bd24747e4d173d9d71215 Mon Sep 17 00:00:00 2001
From: IAvecilla
Date: Thu, 22 Jan 2026 17:09:24 -0300
Subject: [PATCH 02/22] Correct config parsing and push the model's extra
 config to the GCP bucket when updating it on Solana

---
 Cargo.lock                                    |  1 +
 shared/data-provider/src/gcs.rs               | 38 ++++++++++++++++++
 shared/data-provider/src/lib.rs               |  2 +-
 tools/rust-tools/run-manager/Cargo.toml       |  1 +
 .../src/commands/run/update_config.rs         | 39 +++++++++++++++++--
 5 files changed, 76 insertions(+), 5 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b3fe8059f..67738dad1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -8106,6 +8106,7 @@ dependencies = [
  "clap-markdown",
  "psyche-coordinator",
  "psyche-core",
+ "psyche-data-provider",
  "psyche-solana-authorizer",
  "psyche-solana-coordinator",
  "psyche-solana-rpc",
diff --git a/shared/data-provider/src/gcs.rs b/shared/data-provider/src/gcs.rs
index 339039241..d96e5ffdf 100644
--- a/shared/data-provider/src/gcs.rs
+++ b/shared/data-provider/src/gcs.rs
@@ -367,6 +367,44 @@ pub fn fetch_json_from_gcs_sync<T: serde::de::DeserializeOwned>(
     rt.block_on(fetch_json_from_gcs(bucket, object_path))
 }
 
+/// Upload a JSON-serializable value to GCS.
+pub async fn upload_json_to_gcs<T: serde::Serialize>(
+    bucket: &str,
+    object_path: &str,
+    value: &T,
+) -> Result<(), UploadError> {
+    let storage = Storage::builder()
+        .build()
+        .await
+        .map_err(|e| UploadError::Gcs(e.to_string()))?;
+
+    let json = serde_json::to_string_pretty(value)?;
+    let data = bytes::Bytes::from(json.into_bytes());
+    let bucket_resource_name = format!("projects/_/buckets/{}", bucket);
+
+    info!("Uploading JSON to gs://{}/{}", bucket, object_path);
+
+    storage
+        .write_object(&bucket_resource_name, object_path, data)
+        .send_unbuffered()
+        .await
+        .map_err(|e| UploadError::Gcs(e.to_string()))?;
+
+    info!("Uploaded JSON to gs://{}/{}", bucket, object_path);
+
+    Ok(())
+}
+
+/// Upload a JSON-serializable value to GCS synchronously.
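+///
+/// Blocking wrapper over `upload_json_to_gcs` for non-async call sites.
+/// Usage sketch with an illustrative bucket name:
+///
+/// ```ignore
+/// upload_json_to_gcs_sync("my-bucket", "config/model_config.json", &config)?;
+/// ```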
+pub fn upload_json_to_gcs_sync<T: serde::Serialize>(
+    bucket: &str,
+    object_path: &str,
+    value: &T,
+) -> Result<(), UploadError> {
+    let rt = Runtime::new().map_err(|e| UploadError::Io(e))?;
+    rt.block_on(upload_json_to_gcs(bucket, object_path, value))
+}
+
 pub async fn upload_to_gcs(
     gcs_info: GcsUploadInfo,
     manifest_metadata: GcsManifestMetadata,
diff --git a/shared/data-provider/src/lib.rs b/shared/data-provider/src/lib.rs
index b2debc2e5..670e6e093 100644
--- a/shared/data-provider/src/lib.rs
+++ b/shared/data-provider/src/lib.rs
@@ -20,7 +20,7 @@ pub use file_extensions::{DATA_FILE_EXTENSIONS, PARQUET_EXTENSION};
 pub use gcs::{
     GcsCheckpointManifest, GcsManifestMetadata, GcsUploadInfo, ManifestFileEntry, ManifestMetadata,
     download_model_from_gcs_async, download_model_from_gcs_sync, fetch_json_from_gcs,
-    fetch_json_from_gcs_sync, upload_to_gcs,
+    fetch_json_from_gcs_sync, upload_json_to_gcs, upload_json_to_gcs_sync, upload_to_gcs,
 };
 pub use hub::{
     HubUploadInfo, download_dataset_repo_async, download_dataset_repo_sync,
diff --git a/tools/rust-tools/run-manager/Cargo.toml b/tools/rust-tools/run-manager/Cargo.toml
index e47a1931f..c6782c9e1 100644
--- a/tools/rust-tools/run-manager/Cargo.toml
+++ b/tools/rust-tools/run-manager/Cargo.toml
@@ -22,6 +22,7 @@ psyche-solana-authorizer.workspace = true
 psyche-solana-treasurer.workspace = true
 psyche-coordinator.workspace = true
 psyche-core.workspace = true
+psyche-data-provider.workspace = true
 anchor-client.workspace = true
 anchor-lang.workspace = true
 anchor-spl.workspace = true
diff --git a/tools/rust-tools/run-manager/src/commands/run/update_config.rs b/tools/rust-tools/run-manager/src/commands/run/update_config.rs
index 641577307..5f4a0092e 100644
--- a/tools/rust-tools/run-manager/src/commands/run/update_config.rs
+++ b/tools/rust-tools/run-manager/src/commands/run/update_config.rs
@@ -5,11 +5,15 @@ use std::path::PathBuf;
 use anyhow::{Context, Result, bail};
 use clap::Args;
 use psyche_coordinator::{
-    CoordinatorConfig, CoordinatorProgress, get_data_index_for_step,
+    CoordinatorConfig, CoordinatorProgress,
+    external_config::{ExternalModelConfig, get_config_gcs_path},
+    get_data_index_for_step,
     model::{Checkpoint, Model},
 };
+use psyche_data_provider::upload_json_to_gcs;
 use psyche_solana_treasurer::logic::RunUpdateParams;
 use serde::{Deserialize, Serialize};
+use tracing::info;
 
 use crate::{SolanaBackend, instructions};
 
@@ -69,12 +73,14 @@ impl Command for CommandUpdateConfig {
             .get_coordinator_account(&coordinator_account)
             .await?;
 
-        let (config, mut model) = match config_path {
+        let (config, mut model, external_config) = match config_path {
             Some(config_path) => {
                 #[derive(Serialize, Deserialize)]
                 struct State {
                     pub config: CoordinatorConfig,
                     pub model: Model,
+                    #[serde(default)]
+                    pub external_config: ExternalModelConfig,
                 }
                 let state: State = toml::from_str(std::str::from_utf8(
                     &std::fs::read(&config_path).with_context(|| {
                )?)
.with_context(|| format!("failed to parse config toml file {config_path:?}"))?; - (Some(state.config), Some(state.model)) + ( + Some(state.config), + Some(state.model), + Some(state.external_config), + ) } - None => (None, None), + None => (None, None, None), }; model = if switch_to_hub { @@ -135,6 +145,27 @@ impl Command for CommandUpdateConfig { coordinator_account_state.state.coordinator.model = model; } + // Upload external config to GCS if provided + if let Some(ref external_config) = external_config { + let Model::LLM(llm) = &coordinator_account_state.state.coordinator.model; + if let Some((bucket, path)) = get_config_gcs_path(&llm.checkpoint) { + info!("Uploading external config to gs://{}/{}", bucket, path); + upload_json_to_gcs(&bucket, &path, external_config) + .await + .with_context(|| { + format!( + "failed to upload external config to gs://{}/{}", + bucket, path + ) + })?; + println!("Uploaded external config to gs://{}/{}", bucket, path); + } else { + println!( + "Warning: external_config provided but checkpoint is not GCS-based, skipping upload" + ); + } + } + let progress = restart_from_step.map(|step| CoordinatorProgress { epoch: coordinator_account_state.state.coordinator.progress.epoch, step, From 8dccc9b2554d99692830580a19b8be0745df9a14 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Thu, 22 Jan 2026 12:53:39 -0800 Subject: [PATCH 03/22] Fix light config to match with new format --- .../solana-coordinator/Cargo.lock | 1 + config/solana-test/light-config.toml | 20 +++--- scripts/train-solana-test.sh | 67 ++++++++++--------- 3 files changed, 48 insertions(+), 40 deletions(-) diff --git a/architectures/decentralized/solana-coordinator/Cargo.lock b/architectures/decentralized/solana-coordinator/Cargo.lock index 22d64fadc..a39998c22 100644 --- a/architectures/decentralized/solana-coordinator/Cargo.lock +++ b/architectures/decentralized/solana-coordinator/Cargo.lock @@ -1610,6 +1610,7 @@ dependencies = [ "cfg_eval", "psyche-core", "serde", + "serde_json", "serde_with", "ts-rs", ] diff --git a/config/solana-test/light-config.toml b/config/solana-test/light-config.toml index eab015342..c0968e61b 100644 --- a/config/solana-test/light-config.toml +++ b/config/solana-test/light-config.toml @@ -15,28 +15,32 @@ total_steps = 25000 waiting_for_members_extra_time = 3 [model.LLM] -architecture = "HfLlama" -data_type = "Pretraining" max_seq_len = 2048 cold_start_warmup_steps = 0 -[model.LLM.checkpoint.Hub] -repo_id = "emozilla/llama2-20m-init" +[model.LLM.checkpoint.Gcs] +bucket = "my_checkpoints" -[model.LLM.data_location.Http] +[external_config] +architecture = "HfLlama" +data_type = "Pretraining" + +[external_config.data_location.Http] token_size_in_bytes = "TwoBytes" shuffle = "DontShuffle" -[model.LLM.data_location.Http.location.Gcp] + +[external_config.data_location.Http.location.Gcp] bucket_name = "nous-pretraining-public-us" filter_directory = "fineweb-edu-tokenized-llama2" -[model.LLM.lr_schedule.Cosine] +[external_config.lr_schedule.Cosine] base_lr = 4.0e-4 warmup_steps = 250 warmup_init_lr = 0.0 total_steps = 25000 final_lr = 4.0e-5 -[model.LLM.optimizer.Distro] + +[external_config.optimizer.Distro] clip_grad_norm = 1.0 compression_decay = 0.999 compression_chunk = 64 diff --git a/scripts/train-solana-test.sh b/scripts/train-solana-test.sh index 55699e591..abbc87f45 100755 --- a/scripts/train-solana-test.sh +++ b/scripts/train-solana-test.sh @@ -1,5 +1,4 @@ #!/usr/bin/env bash - set -eo pipefail # use the agenix provided wallet if you have it @@ -7,16 +6,12 @@ if [[ -n 
"${devnet__keypair__wallet_PATH}" && -f "${devnet__keypair__wallet_PATH WALLET_FILE="${devnet__keypair__wallet_PATH}" elif [[ -z "${WALLET_FILE:-}" ]]; then echo "No wallet file specified, generating ephemeral keypair..." - # Create a named pipe for the keypair data mkdir -p ~/.config/solana/solana-keys WALLET_FILE=$(mktemp ~/.config/solana/solana-keys/solana-wallet-XXXXXXXXX) - # Generate keypair and write to the generated file solana-keygen new --no-bip39-passphrase --force --outfile "${WALLET_FILE}" echo "Using ephemeral keypair (will not persist after script exits)" - # Set up cleanup trap to remove the wallet file when script exits - # This will run on normal exit, SIGINT (Ctrl+C), SIGTERM, or ERR trap "echo 'Cleaning up ephemeral wallet file...'; rm -f '${WALLET_FILE}'" EXIT fi @@ -30,37 +25,45 @@ DP=${DP:-"8"} TP=${TP:-"1"} BATCH_SIZE=${BATCH_SIZE:-"1"} +# Optional checkpoint args +CHECKPOINT_ARGS=() +if [[ "$CHECKPOINT" == "true" ]]; then + echo -e "\n[+] Starting Solana training with checkpointing enabled..." + CHECKPOINT_ARGS+=(--skip-checkpoint-upload) +else + echo -e "\n[+] Starting Solana training without checkpointing..." +fi + # fine if this fails -solana airdrop 10 "$(solana-keygen pubkey ${WALLET_FILE})" --url "${RPC}" || true +solana airdrop 10 "$(solana-keygen pubkey "${WALLET_FILE}")" --url "${RPC}" || true export RUST_LOG="info,psyche=debug" -if [[ "$OTLP_METRICS_URL" == "" ]]; then - cargo run --release --bin psyche-solana-client -- \ - train \ - --wallet-private-key-path ${WALLET_FILE} \ - --rpc ${RPC} \ - --ws-rpc ${WS_RPC} \ - --run-id ${RUN_ID} \ - --data-parallelism ${DP} \ - --tensor-parallelism ${TP} \ - --micro-batch-size ${BATCH_SIZE} \ - --authorizer ${AUTHORIZER} \ - --logs "console" \ - "$@" +COMMON_ARGS=( + train + --wallet-private-key-path "${WALLET_FILE}" + --rpc "${RPC}" + --ws-rpc "${WS_RPC}" + --run-id "${RUN_ID}" + --data-parallelism "${DP}" + --tensor-parallelism "${TP}" + --micro-batch-size "${BATCH_SIZE}" + --authorizer "${AUTHORIZER}" + --logs console + "${CHECKPOINT_ARGS[@]}" + "$@" +) + +if [[ -z "${OTLP_METRICS_URL:-}" ]]; then + HF_TOKEN=${HF_TOKEN} \ + GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS} \ + cargo run --release --bin psyche-solana-client -- \ + "${COMMON_ARGS[@]}" else - cargo run --release --bin psyche-solana-client -- \ - train \ - --wallet-private-key-path ${WALLET_FILE} \ - --rpc ${RPC} \ - --ws-rpc ${WS_RPC} \ - --run-id ${RUN_ID} \ - --data-parallelism ${DP} \ - --tensor-parallelism ${TP} \ - --micro-batch-size ${BATCH_SIZE} \ - --logs "console" \ - --authorizer ${AUTHORIZER} \ + HF_TOKEN=${HF_TOKEN} \ + GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS} \ + cargo run --release --bin psyche-solana-client -- \ + "${COMMON_ARGS[@]}" \ --oltp-metrics-url "http://localhost:4318/v1/metrics" \ - --oltp-logs-url "http://localhost:4318/v1/logs" \ - "$@" + --oltp-logs-url "http://localhost:4318/v1/logs" fi From e6e85fa07929717a4d644ea9e37ec176e9b2656b Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Tue, 27 Jan 2026 17:49:25 -0300 Subject: [PATCH 04/22] Fix tests and add hub support --- .../suites/memnet_coordinator_data_layout.rs | 65 +------------ .../decentralized/testing/src/utils.rs | 2 +- config/solana-test/nano-config.toml | 14 +-- shared/client/src/state/init.rs | 42 +++++++-- shared/coordinator/src/external_config.rs | 94 ++++++++++++++++++- shared/data-provider/src/errors.rs | 6 ++ shared/data-provider/src/gcs.rs | 70 +++++++------- shared/data-provider/src/hub.rs | 33 ++++++- 
shared/data-provider/src/lib.rs | 2 +- 9 files changed, 208 insertions(+), 120 deletions(-) diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_data_layout.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_data_layout.rs index 8257cbbd1..88a663362 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_data_layout.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_data_layout.rs @@ -1,19 +1,10 @@ use psyche_coordinator::Round; use psyche_coordinator::RunState; use psyche_coordinator::model::Checkpoint; -use psyche_coordinator::model::HttpTrainingDataLocation; -use psyche_coordinator::model::LLMArchitecture; -use psyche_coordinator::model::LLMTrainingDataLocation; -use psyche_coordinator::model::LLMTrainingDataType; use psyche_coordinator::model::Model; -use psyche_core::CosineLR; use psyche_core::FixedString; use psyche_core::FixedVec; -use psyche_core::LearningRateSchedule; -use psyche_core::OptimizerDefinition; -use psyche_core::Shuffle; use psyche_core::SmallBoolean; -use psyche_core::TokenSize; use psyche_solana_coordinator::CoordinatorAccount; use psyche_solana_coordinator::coordinator_account_from_bytes; @@ -43,12 +34,13 @@ pub async fn run() { assert_eq!(coordinator.run_state, RunState::Uninitialized); assert_eq!(coordinator.run_state_start_unix_timestamp, 0); assert_eq!(coordinator.pending_pause, SmallBoolean::FALSE); - // Coordinator model + // Coordinator model (only on-chain fields) + // Note: architecture, data_type, data_location, lr_schedule, optimizer are now + // stored off-chain in ExternalModelConfig (GCS) and not part of the on-chain LLM struct match coordinator.model { Model::LLM(llm) => { assert_eq!(llm.max_seq_len, 2048); assert_eq!(llm.cold_start_warmup_steps, 0); - assert_eq!(llm.architecture, LLMArchitecture::HfLlama); match llm.checkpoint { Checkpoint::Hub(hub) => { assert_eq!( @@ -59,57 +51,6 @@ pub async fn run() { }, _ => panic!("Expected Hub checkpoint"), }; - assert_eq!(llm.data_type, LLMTrainingDataType::Pretraining); - match llm.data_location { - LLMTrainingDataLocation::Http(http) => { - match http.location { - HttpTrainingDataLocation::Gcp { - bucket_name, - filter_directory, - } => { - assert_eq!( - bucket_name, - fixed_str("nous-pretraining-public-us") - ); - assert_eq!( - filter_directory, - fixed_str("fineweb-edu-tokenized-llama2") - ); - }, - _ => panic!("Expected Gcp data location"), - }; - assert_eq!(http.token_size_in_bytes, TokenSize::TwoBytes); - assert_eq!(http.shuffle, Shuffle::DontShuffle); - }, - _ => panic!("Expected Http data location"), - }; - match llm.lr_schedule { - LearningRateSchedule::Cosine(learning_rate) => { - assert_eq!( - learning_rate, - CosineLR::new(0.0004, 250, 0.0, 25000, 0.00004) - ); - }, - _ => panic!("Expected Constant LR schedule"), - }; - match llm.optimizer { - OptimizerDefinition::Distro { - clip_grad_norm, - weight_decay, - compression_decay, - compression_topk, - compression_chunk, - quantize_1bit, - } => { - assert_eq!(clip_grad_norm, Some(1.0)); - assert_eq!(weight_decay, None); - assert_eq!(compression_decay, 0.999); - assert_eq!(compression_topk, 2); - assert_eq!(compression_chunk, 64); - assert_eq!(quantize_1bit, false); - }, - _ => panic!("Expected Distro optimizer"), - } }, }; // Coordinator config diff --git a/architectures/decentralized/testing/src/utils.rs b/architectures/decentralized/testing/src/utils.rs index 631b05f52..4b3f92c58 100644 --- 
a/architectures/decentralized/testing/src/utils.rs +++ b/architectures/decentralized/testing/src/utils.rs @@ -183,7 +183,7 @@ impl ConfigBuilder { // This means that every client is a witness self.set_value("config.witness_nodes", 0_u32); - self.set_value("model.LLM.architecture", self.architecture.clone()); + self.set_value("external_config.architecture", self.architecture.clone()); self.set_value("config.global_batch_size_start", self.batch_size); self.set_value("config.global_batch_size_end", self.batch_size); diff --git a/config/solana-test/nano-config.toml b/config/solana-test/nano-config.toml index c275feea3..4ed67eec8 100644 --- a/config/solana-test/nano-config.toml +++ b/config/solana-test/nano-config.toml @@ -15,8 +15,6 @@ total_steps = 25000 waiting_for_members_extra_time = 3 [model.LLM] -architecture = "HfLlama" -data_type = "Pretraining" max_seq_len = 64 cold_start_warmup_steps = 0 @@ -24,21 +22,25 @@ cold_start_warmup_steps = 0 repo_id = "pefontana/Nano-Llama" revision = "cf48eac4944f6e954a3d9c9c30e8c865e64e7d03" -[model.LLM.data_location.Http] +[external_config] +architecture = "HfLlama" +data_type = "Pretraining" + +[external_config.data_location.Http] token_size_in_bytes = "TwoBytes" shuffle = "DontShuffle" -[model.LLM.data_location.Http.location] +[external_config.data_location.Http.location] SingleUrl = "https://huggingface.co/pefontana/Nano-Llama/resolve/main/tiny-ci-dataset/000_tiny-test.ds" -[model.LLM.lr_schedule.Cosine] +[external_config.lr_schedule.Cosine] base_lr = 4.0e-4 warmup_steps = 250 warmup_init_lr = 0.0 total_steps = 25000 final_lr = 4.0e-5 -[model.LLM.optimizer.Distro] +[external_config.optimizer.Distro] clip_grad_norm = 1.0 compression_decay = 0.999 compression_chunk = 64 diff --git a/shared/client/src/state/init.rs b/shared/client/src/state/init.rs index 1c58b0707..58c24d43f 100644 --- a/shared/client/src/state/init.rs +++ b/shared/client/src/state/init.rs @@ -1,7 +1,7 @@ use crate::{WandBInfo, fetch_data::DataFetcher}; use psyche_coordinator::{ Coordinator, HealthChecks, - external_config::{ExternalModelConfig, get_config_gcs_path}, + external_config::{ExternalModelConfig, get_config_gcs_path, get_config_hub_path}, model::{self, HttpLLMTrainingDataLocation, LLMTrainingDataLocation}, }; use psyche_core::{ @@ -11,6 +11,7 @@ use psyche_data_provider::{ DataProvider, DataProviderTcpClient, DownloadError, DummyDataProvider, PreprocessedDataProvider, Split, WeightedDataProvider, download_dataset_repo_async, download_model_from_gcs_async, download_model_repo_async, fetch_json_from_gcs, + fetch_json_from_hub, http::{FileURLs, HttpDataProvider}, }; use psyche_metrics::ClientMetrics; @@ -200,16 +201,37 @@ impl RunInitConfigAndIO { - debug!("Fetching external config from gs://{}/{}", bucket, path); - fetch_json_from_gcs(&bucket, &path).await? - } - None => { - debug!("No GCS checkpoint, using default external config"); - ExternalModelConfig::default() + // Fetch external model config - try GCS first, then Hub, then default + let external_config: ExternalModelConfig = if let Some((bucket, path)) = + get_config_gcs_path(&llm.checkpoint) + { + debug!("Fetching external config from gs://{}/{}", bucket, path); + fetch_json_from_gcs(&bucket, &path).await? 
+ } else if let Some((repo_id, revision, filename)) = get_config_hub_path(&llm.checkpoint) { + debug!( + "Fetching external config from Hub: {}/{}", + repo_id, filename + ); + match fetch_json_from_hub( + &repo_id, + revision, + filename, + init_config.hub_read_token.clone(), + ) + .await + { + Ok(config) => config, + Err(e) => { + debug!( + "Failed to fetch external config from Hub ({}), using default: {}", + repo_id, e + ); + ExternalModelConfig::default() + } } + } else { + debug!("No GCS/Hub checkpoint, using default external config"); + ExternalModelConfig::default() }; let hub_read_token = init_config.hub_read_token.clone(); diff --git a/shared/coordinator/src/external_config.rs b/shared/coordinator/src/external_config.rs index 5ab162ebe..9a0efb236 100644 --- a/shared/coordinator/src/external_config.rs +++ b/shared/coordinator/src/external_config.rs @@ -12,7 +12,7 @@ use serde::{Deserialize, Serialize}; use crate::model::{ - Checkpoint, GcsRepo, LLMArchitecture, LLMTrainingDataLocation, LLMTrainingDataType, + Checkpoint, GcsRepo, LLM, LLMArchitecture, LLMTrainingDataLocation, LLMTrainingDataType, Model, }; use psyche_core::{LearningRateSchedule, OptimizerDefinition}; @@ -21,6 +21,76 @@ pub const CONFIG_PREFIX: &str = "config"; /// Filename for the model config pub const MODEL_CONFIG_FILENAME: &str = "model_config.json"; +// ============================================================================ +// Config-file representations (old format with all fields in [model.LLM]) +// ============================================================================ + +/// Config-file representation of the model with all fields. +/// This allows config files to keep the old format where everything +/// is under `[model.LLM]`. +/// +/// Use `ConfigModel::split()` to separate into on-chain `Model` and `ExternalModelConfig`. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum ConfigModel { + LLM(ConfigLLM), +} + +impl ConfigModel { + /// Split the config model into on-chain Model and ExternalModelConfig. + pub fn split(self) -> (Model, ExternalModelConfig) { + match self { + ConfigModel::LLM(config_llm) => config_llm.split(), + } + } +} + +/// Config-file representation of LLM with all fields (old format). +/// This includes both on-chain fields and external config fields. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConfigLLM { + // On-chain fields + pub max_seq_len: u32, + #[serde(default)] + pub cold_start_warmup_steps: u32, + pub checkpoint: Checkpoint, + + // External config fields (with defaults for backward compatibility) + #[serde(default = "default_architecture")] + pub architecture: LLMArchitecture, + #[serde(default = "default_data_type")] + pub data_type: LLMTrainingDataType, + #[serde(default)] + pub data_location: LLMTrainingDataLocation, + #[serde(default = "default_lr_schedule")] + pub lr_schedule: LearningRateSchedule, + #[serde(default = "default_optimizer")] + pub optimizer: OptimizerDefinition, +} + +impl ConfigLLM { + /// Split into on-chain LLM and ExternalModelConfig. 
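+    ///
+    /// Usage sketch, assuming `config_llm` was deserialized from the old
+    /// `[model.LLM]` TOML format:
+    ///
+    /// ```ignore
+    /// let (model, external_config) = config_llm.split();
+    /// ```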
+ pub fn split(self) -> (Model, ExternalModelConfig) { + let llm = LLM { + max_seq_len: self.max_seq_len, + cold_start_warmup_steps: self.cold_start_warmup_steps, + checkpoint: self.checkpoint, + }; + + let external_config = ExternalModelConfig { + version: default_version(), + architecture: self.architecture, + data_type: self.data_type, + data_location: self.data_location, + lr_schedule: self.lr_schedule, + optimizer: self.optimizer, + run_metadata: None, + client_requirements: None, + }; + + (Model::LLM(llm), external_config) + } +} + /// External model configuration schema. /// This is stored in GCS and fetched by clients. /// @@ -183,8 +253,6 @@ impl ExternalModelConfig { pub fn get_config_gcs_path(checkpoint: &Checkpoint) -> Option<(String, String)> { let gcs_repo = match checkpoint { Checkpoint::Gcs(repo) | Checkpoint::P2PGcs(repo) => repo, - // For Hub/P2P checkpoints, we could potentially use a different mechanism - // or require explicit config URL _ => return None, }; @@ -194,6 +262,26 @@ pub fn get_config_gcs_path(checkpoint: &Checkpoint) -> Option<(String, String)> Some((bucket, path)) } +/// Helper to derive the config Hub path from a checkpoint. +/// Returns `Some((repo_id, revision, filename))` for Hub checkpoints, `None` for others. +pub fn get_config_hub_path( + checkpoint: &Checkpoint, +) -> Option<(String, Option, &'static str)> { + let hub_repo = match checkpoint { + Checkpoint::Hub(repo) | Checkpoint::P2P(repo) | Checkpoint::Dummy(repo) => repo, + _ => return None, + }; + + let repo_id = hub_repo.repo_id.to_string(); + if repo_id.is_empty() { + return None; + } + + let revision = hub_repo.revision.as_ref().map(|r| r.to_string()); + + Some((repo_id, revision, MODEL_CONFIG_FILENAME)) +} + /// Construct the full GCS URI for the config file pub fn get_config_gcs_uri(checkpoint: &Checkpoint) -> Option { get_config_gcs_path(checkpoint).map(|(bucket, path)| format!("gs://{}/{}", bucket, path)) diff --git a/shared/data-provider/src/errors.rs b/shared/data-provider/src/errors.rs index b84bc5f9a..d2afa5ce7 100644 --- a/shared/data-provider/src/errors.rs +++ b/shared/data-provider/src/errors.rs @@ -26,6 +26,9 @@ pub enum UploadError { #[error("GCS operation failed: {0}")] GcsStorage(#[from] google_cloud_storage::http::Error), + #[error("GCS error: {0}")] + Gcs(String), + // Common errors #[error("IO error: {0}")] Io(#[from] std::io::Error), @@ -45,6 +48,9 @@ pub enum DownloadError { #[error("GCS operation failed: {0}")] GcsStorage(#[from] google_cloud_storage::http::Error), + #[error("GCS error: {0}")] + Gcs(String), + #[error("IO error: {0}")] Io(#[from] std::io::Error), diff --git a/shared/data-provider/src/gcs.rs b/shared/data-provider/src/gcs.rs index d96e5ffdf..da5326b3a 100644 --- a/shared/data-provider/src/gcs.rs +++ b/shared/data-provider/src/gcs.rs @@ -329,33 +329,30 @@ pub async fn fetch_json_from_gcs( bucket: &str, object_path: &str, ) -> Result { - let storage = Storage::builder() - .build() - .await - .map_err(|e| DownloadError::Gcs(e.to_string()))?; - - let bucket_resource_name = format!("projects/_/buckets/{}", bucket); - - debug!("Fetching gs://{}/{}", bucket, object_path); + // Use authenticated client if GOOGLE_APPLICATION_CREDENTIALS is set, otherwise anonymous + let config = if std::env::var("GOOGLE_APPLICATION_CREDENTIALS").is_ok() { + info!("Using authenticated GCS client for config fetch"); + ClientConfig::default().with_auth().await? 
+ } else { + info!("Using anonymous GCS client for config fetch"); + ClientConfig::default().anonymous() + }; + let client = Client::new(config); - let mut read_response = storage - .read_object(&bucket_resource_name, object_path) - .send() - .await - .map_err(|e| DownloadError::Gcs(format!("Failed to read {}: {}", object_path, e)))?; + info!("Fetching gs://{}/{}", bucket, object_path); - let mut data = Vec::new(); - while let Some(chunk_result) = read_response.next().await { - let chunk = chunk_result.map_err(|e| DownloadError::Gcs(e.to_string()))?; - data.extend_from_slice(&chunk); - } + let data = client + .download_object( + &GetObjectRequest { + bucket: bucket.to_owned(), + object: object_path.to_owned(), + ..Default::default() + }, + &Range::default(), + ) + .await?; - serde_json::from_slice(&data).map_err(|e| { - DownloadError::Gcs(format!( - "Failed to parse JSON from gs://{}/{}: {}", - bucket, object_path, e - )) - }) + serde_json::from_slice(&data).map_err(DownloadError::Json) } /// Fetch a JSON file from GCS synchronously. @@ -373,22 +370,25 @@ pub async fn upload_json_to_gcs( object_path: &str, value: &T, ) -> Result<(), UploadError> { - let storage = Storage::builder() - .build() - .await - .map_err(|e| UploadError::Gcs(e.to_string()))?; + // Use authenticated client - must have credentials for upload + let config = ClientConfig::default().with_auth().await?; + let client = Client::new(config); let json = serde_json::to_string_pretty(value)?; - let data = bytes::Bytes::from(json.into_bytes()); - let bucket_resource_name = format!("projects/_/buckets/{}", bucket); + let data = json.into_bytes(); info!("Uploading JSON to gs://{}/{}", bucket, object_path); - storage - .write_object(&bucket_resource_name, object_path, data) - .send_unbuffered() - .await - .map_err(|e| UploadError::Gcs(e.to_string()))?; + client + .upload_object( + &UploadObjectRequest { + bucket: bucket.to_owned(), + ..Default::default() + }, + data, + &UploadType::Simple(Media::new(object_path.to_owned())), + ) + .await?; info!("Uploaded JSON to gs://{}/{}", bucket, object_path); diff --git a/shared/data-provider/src/hub.rs b/shared/data-provider/src/hub.rs index 13a575b84..6a504aead 100644 --- a/shared/data-provider/src/hub.rs +++ b/shared/data-provider/src/hub.rs @@ -1,4 +1,4 @@ -use crate::errors::UploadError; +use crate::errors::{DownloadError, UploadError}; use crate::hub::model::HubRepo; use hf_hub::{ Cache, Repo, RepoType, @@ -11,7 +11,7 @@ use psyche_coordinator::model; use psyche_core::FixedString; use std::{path::PathBuf, time::Instant}; use tokio::sync::mpsc; -use tracing::{error, info}; +use tracing::{debug, error, info}; const MODEL_EXTENSIONS: [&str; 3] = [".safetensors", ".json", ".py"]; const DATASET_EXTENSIONS: [&str; 1] = [".parquet"]; @@ -193,6 +193,35 @@ pub fn download_dataset_repo_sync( ) } +/// Fetch a JSON file from HuggingFace and deserialize it. +/// Used for fetching external model configuration from Hub checkpoints. 
+pub async fn fetch_json_from_hub( + repo_id: &str, + revision: Option, + filename: &str, + token: Option, +) -> Result { + let cache = Cache::default(); + let api = hf_hub::api::tokio::ApiBuilder::new() + .with_cache_dir(cache.path().clone()) + .with_token(token.or(cache.token())) + .with_progress(false) + .build()?; + + let repo = match revision { + Some(rev) => Repo::with_revision(repo_id.to_string(), RepoType::Model, rev), + None => Repo::model(repo_id.to_string()), + }; + let api_repo = api.repo(repo); + + debug!("Fetching {} from {}", filename, repo_id); + + let file_path = api_repo.get(filename).await?; + let content = tokio::fs::read_to_string(&file_path).await?; + + serde_json::from_str(&content).map_err(DownloadError::Json) +} + #[derive(Debug, Clone)] pub struct HubUploadInfo { pub hub_repo: String, diff --git a/shared/data-provider/src/lib.rs b/shared/data-provider/src/lib.rs index 670e6e093..a33a724c6 100644 --- a/shared/data-provider/src/lib.rs +++ b/shared/data-provider/src/lib.rs @@ -24,7 +24,7 @@ pub use gcs::{ }; pub use hub::{ HubUploadInfo, download_dataset_repo_async, download_dataset_repo_sync, - download_model_repo_async, download_model_repo_sync, upload_to_hub, + download_model_repo_async, download_model_repo_sync, fetch_json_from_hub, upload_to_hub, }; pub use local::LocalDataProvider; pub use parquet::record::{ListAccessor, MapAccessor, RowAccessor}; From 5df718a7fa0296fc84a1178e0d52b035904a83f1 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Tue, 27 Jan 2026 13:28:59 -0800 Subject: [PATCH 05/22] Update 0.2.0 version of contracts --- architectures/decentralized/solana-authorizer/Cargo.lock | 2 +- architectures/decentralized/solana-coordinator/Cargo.lock | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/architectures/decentralized/solana-authorizer/Cargo.lock b/architectures/decentralized/solana-authorizer/Cargo.lock index c1a36b8a6..7eb386dad 100644 --- a/architectures/decentralized/solana-authorizer/Cargo.lock +++ b/architectures/decentralized/solana-authorizer/Cargo.lock @@ -1389,7 +1389,7 @@ dependencies = [ [[package]] name = "psyche-solana-authorizer" -version = "0.1.0" +version = "0.2.0" dependencies = [ "anchor-lang", "anchor-spl", diff --git a/architectures/decentralized/solana-coordinator/Cargo.lock b/architectures/decentralized/solana-coordinator/Cargo.lock index a39998c22..6a1fe747f 100644 --- a/architectures/decentralized/solana-coordinator/Cargo.lock +++ b/architectures/decentralized/solana-coordinator/Cargo.lock @@ -1602,7 +1602,7 @@ dependencies = [ [[package]] name = "psyche-coordinator" -version = "0.1.0" +version = "0.2.0" dependencies = [ "anchor-lang", "async-trait", @@ -1617,7 +1617,7 @@ dependencies = [ [[package]] name = "psyche-core" -version = "0.1.0" +version = "0.2.0" dependencies = [ "anchor-lang", "anchor-lang-idl", @@ -1636,7 +1636,7 @@ dependencies = [ [[package]] name = "psyche-solana-authorizer" -version = "0.1.0" +version = "0.2.0" dependencies = [ "anchor-lang", "anchor-spl", @@ -1644,7 +1644,7 @@ dependencies = [ [[package]] name = "psyche-solana-coordinator" -version = "0.1.0" +version = "0.2.0" dependencies = [ "anchor-lang", "bytemuck", From 767e1df819231c89275fae9557b0b86b2b970952 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Tue, 27 Jan 2026 13:37:20 -0800 Subject: [PATCH 06/22] Fix clippy and format --- shared/coordinator/src/external_config.rs | 2 +- shared/data-provider/src/gcs.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/shared/coordinator/src/external_config.rs 
b/shared/coordinator/src/external_config.rs index 9a0efb236..d28b25d79 100644 --- a/shared/coordinator/src/external_config.rs +++ b/shared/coordinator/src/external_config.rs @@ -291,7 +291,7 @@ pub fn get_config_gcs_uri(checkpoint: &Checkpoint) -> Option { pub fn get_config_gcs_repo(checkpoint: &Checkpoint) -> Option { match checkpoint { Checkpoint::Gcs(repo) | Checkpoint::P2PGcs(repo) => Some(GcsRepo { - bucket: repo.bucket.clone(), + bucket: repo.bucket, prefix: Some(psyche_core::FixedString::from_str_truncated(CONFIG_PREFIX)), }), _ => None, diff --git a/shared/data-provider/src/gcs.rs b/shared/data-provider/src/gcs.rs index da5326b3a..f5adc6d46 100644 --- a/shared/data-provider/src/gcs.rs +++ b/shared/data-provider/src/gcs.rs @@ -401,7 +401,7 @@ pub fn upload_json_to_gcs_sync( object_path: &str, value: &T, ) -> Result<(), UploadError> { - let rt = Runtime::new().map_err(|e| UploadError::Io(e))?; + let rt = Runtime::new().map_err(UploadError::Io)?; rt.block_on(upload_json_to_gcs(bucket, object_path, value)) } From c783800e7abe310f45e773ccc3e2c4a18898eb7d Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Wed, 28 Jan 2026 00:15:51 -0300 Subject: [PATCH 07/22] Fix website with new off-chain data --- Cargo.lock | 1 + website/wasm/Cargo.toml | 1 + website/wasm/src/lib.rs | 12 ++++++++++++ 3 files changed, 14 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 67738dad1..ef66cf80f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7074,6 +7074,7 @@ dependencies = [ name = "psyche-deserialize-zerocopy-wasm" version = "0.2.0" dependencies = [ + "psyche-coordinator", "psyche-core", "psyche-solana-coordinator", "serde", diff --git a/website/wasm/Cargo.toml b/website/wasm/Cargo.toml index b11b43aa5..b834fe910 100644 --- a/website/wasm/Cargo.toml +++ b/website/wasm/Cargo.toml @@ -8,6 +8,7 @@ crate-type = ["cdylib"] [dependencies] psyche-solana-coordinator = { path = "../../architectures/decentralized/solana-coordinator/programs/solana-coordinator" } +psyche-coordinator.workspace = true serde.workspace = true serde-wasm-bindgen = "0.6.5" wasm-bindgen = "=0.2.104" diff --git a/website/wasm/src/lib.rs b/website/wasm/src/lib.rs index 1beee51dd..1e84298bc 100644 --- a/website/wasm/src/lib.rs +++ b/website/wasm/src/lib.rs @@ -1,3 +1,4 @@ +use psyche_coordinator::model::LLMArchitecture; use psyche_core::LearningRateSchedule; use psyche_solana_coordinator::{ClientId, CoordinatorAccount, coordinator_account_from_bytes}; use serde::ser::Serialize; @@ -38,3 +39,14 @@ pub struct DummyCoordinatorAccount(CoordinatorAccount); #[derive(TS)] #[ts(export)] pub struct DummyClientId(ClientId); + +// Export types that are now in ExternalModelConfig but still needed by the website +#[allow(dead_code)] +#[derive(TS)] +#[ts(export)] +pub struct DummyLLMArchitecture(LLMArchitecture); + +#[allow(dead_code)] +#[derive(TS)] +#[ts(export)] +pub struct DummyLearningRateSchedule(LearningRateSchedule); From bbbfa7424da452f01d9bd33c052922612ceaf454 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Wed, 28 Jan 2026 10:31:45 -0300 Subject: [PATCH 08/22] Fix web with off-chain data --- .../src/dataStores/flatFileCoordinator.ts | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/website/backend/src/dataStores/flatFileCoordinator.ts b/website/backend/src/dataStores/flatFileCoordinator.ts index da7d496ca..25b6bfbbb 100644 --- a/website/backend/src/dataStores/flatFileCoordinator.ts +++ b/website/backend/src/dataStores/flatFileCoordinator.ts @@ -4,7 +4,6 @@ import { Model, PsycheCoordinator, RunMetadata, - 
lr_at_step, } from 'psyche-deserialize-zerocopy-wasm' import { RunSummary, @@ -291,13 +290,8 @@ export class FlatFileCoordinatorDataStore implements CoordinatorDataStore { lastRun.lastUpdated = eventTime lastRun.lastState = newState - const step = newState.coordinator.progress.step - if (step > (lastRun.observedLrByStep.at(-1)?.[0] ?? 0)) { - const lr = lr_at_step(newState.coordinator.model.LLM.lr_schedule, step) - if (isGoodNumber(lr)) { - lastRun.observedLrByStep.push([step, lr]) - } - } + // TODO: LR schedule moved to ExternalModelConfig (off-chain) + // Fetch from GCS/HuggingFace to restore LR tracking if (configChanged) { lastRun.configChanges.push({ @@ -672,7 +666,10 @@ export class FlatFileCoordinatorDataStore implements CoordinatorDataStore { roundWitnessTime: Number(config.round_witness_time), warmupTime: Number(config.warmup_time), - lrSchedule: c.coordinator.model.LLM.lr_schedule, + // TODO: Fetch from ExternalModelConfig (off-chain) + lrSchedule: { + Constant: { base_lr: 0, warmup_init_lr: 0, warmup_steps: 0 }, + }, }, } } @@ -744,7 +741,8 @@ function makeRunSummary( : undefined const summary: RunSummary = { - arch: c.model.LLM.architecture, + // TODO: Fetch from ExternalModelConfig (off-chain) + arch: 'HfLlama', id: c.run_id, index: index, isOnlyRunAtThisIndex, From 8659093bab1fc564efe7d8054608a229472e9374 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Wed, 28 Jan 2026 13:06:14 -0300 Subject: [PATCH 09/22] Fetch coordinator metadata in the website --- website/backend/src/coordinatorChainLoop.ts | 11 -- .../src/dataStores/flatFileCoordinator.ts | 45 ++++- website/backend/src/externalConfig.ts | 174 ++++++++++++++++++ 3 files changed, 213 insertions(+), 17 deletions(-) create mode 100644 website/backend/src/externalConfig.ts diff --git a/website/backend/src/coordinatorChainLoop.ts b/website/backend/src/coordinatorChainLoop.ts index 21aa04dc2..fc17b5687 100644 --- a/website/backend/src/coordinatorChainLoop.ts +++ b/website/backend/src/coordinatorChainLoop.ts @@ -342,17 +342,6 @@ export async function startWatchCoordinatorChainLoop( }) break } - case 'update_client_version': { - const runPdaAddr = i.accounts[1].toString() - const coordinatorAddr = i.accounts[2].toString() - runUpdates.getAndTouchCurrentRun({ - runPdaAddr, - coordinatorAddr, - decoded, - tx, - }) - break - } default: { const _missed_tx: never = decoded throw new Error( diff --git a/website/backend/src/dataStores/flatFileCoordinator.ts b/website/backend/src/dataStores/flatFileCoordinator.ts index 25b6bfbbb..8308f8125 100644 --- a/website/backend/src/dataStores/flatFileCoordinator.ts +++ b/website/backend/src/dataStores/flatFileCoordinator.ts @@ -4,6 +4,8 @@ import { Model, PsycheCoordinator, RunMetadata, + LearningRateSchedule, + LLMArchitecture, } from 'psyche-deserialize-zerocopy-wasm' import { RunSummary, @@ -25,6 +27,7 @@ import { UniqueRunKey, runKey } from '../coordinator.js' import { readVersionedFile, writeVersionedFile } from './versioned.js' import { CURRENT_VERSION } from 'shared/formats/type.js' import { existsSync, renameSync } from 'fs' +import { fetchExternalConfig } from '../externalConfig.js' // any run ID outside this list will not be returned to the frontend in the summary list, const ALLOWLISTED_RUN_IDS = @@ -65,6 +68,11 @@ type WitnessV2 = Omit< prompt_index: number } +interface ExternalModelConfig { + architecture: LLMArchitecture + lr_schedule: LearningRateSchedule +} + interface RunHistoryV2 { runId: string createdAt: ChainTimestamp @@ -95,6 +103,9 @@ interface RunHistoryV2 { 
observedLrByStep: Array<[number, number]> recentTxs: Array + + // Cached external config (fetched from HuggingFace/GCS) + externalConfig?: ExternalModelConfig } interface RunSummaries { @@ -290,8 +301,30 @@ export class FlatFileCoordinatorDataStore implements CoordinatorDataStore { lastRun.lastUpdated = eventTime lastRun.lastState = newState - // TODO: LR schedule moved to ExternalModelConfig (off-chain) - // Fetch from GCS/HuggingFace to restore LR tracking + // Fetch external config in the background (non-blocking) + const checkpoint = newState.coordinator.model.LLM.checkpoint + if (!lastRun.externalConfig) { + fetchExternalConfig(checkpoint) + .then((config) => { + if (config) { + lastRun.externalConfig = { + architecture: config.architecture, + lr_schedule: config.lr_schedule, + } + // Clear caches so next request will show updated data + this.#runCache.delete(runKey(lastRun.runId, index)) + this.#summaryCache = null + // Notify listeners of update + this.eventEmitter.emit('update', runKey(lastRun.runId, index)) + } + }) + .catch((err) => { + console.warn( + `Failed to fetch external config for run ${lastRun.runId}:`, + err + ) + }) + } if (configChanged) { lastRun.configChanges.push({ @@ -666,8 +699,8 @@ export class FlatFileCoordinatorDataStore implements CoordinatorDataStore { roundWitnessTime: Number(config.round_witness_time), warmupTime: Number(config.warmup_time), - // TODO: Fetch from ExternalModelConfig (off-chain) - lrSchedule: { + // Use cached external config or fallback to placeholder + lrSchedule: run.externalConfig?.lr_schedule ?? { Constant: { base_lr: 0, warmup_init_lr: 0, warmup_steps: 0 }, }, }, @@ -741,8 +774,8 @@ function makeRunSummary( : undefined const summary: RunSummary = { - // TODO: Fetch from ExternalModelConfig (off-chain) - arch: 'HfLlama', + // Use cached external config or fallback to HfLlama + arch: run.externalConfig?.architecture ?? 'HfLlama', id: c.run_id, index: index, isOnlyRunAtThisIndex, diff --git a/website/backend/src/externalConfig.ts b/website/backend/src/externalConfig.ts new file mode 100644 index 000000000..f68f5ef30 --- /dev/null +++ b/website/backend/src/externalConfig.ts @@ -0,0 +1,174 @@ +import type { + Checkpoint, + HubRepo, + GcsRepo, + LearningRateSchedule, + LLMArchitecture, +} from 'psyche-deserialize-zerocopy-wasm' + +// External config schema (matches Rust ExternalModelConfig) +interface ExternalModelConfig { + version: number + architecture: LLMArchitecture + data_type: string + data_location: unknown + lr_schedule: LearningRateSchedule + optimizer: unknown + run_metadata?: unknown + client_requirements?: unknown +} + +interface ExternalConfigCacheEntry { + config: ExternalModelConfig + fetchedAt: number +} + +// Cache for external configs to avoid repeated fetches +const configCache = new Map() +const CACHE_TTL_MS = 5 * 60 * 1000 // 5 minutes + +/** + * Extract string from FixedString (null-terminated byte array) + */ +function fixedStringToString(fixedStr: { inner: number[] } | string): string { + if (typeof fixedStr === 'string') { + return fixedStr + } + // Find null terminator and convert to string + const bytes = fixedStr.inner || fixedStr + if (Array.isArray(bytes)) { + const nullIndex = bytes.indexOf(0) + const relevantBytes = nullIndex >= 0 ? 
bytes.slice(0, nullIndex) : bytes + return String.fromCharCode(...relevantBytes) + } + return String(bytes) +} + +/** + * Get the external config URL from a checkpoint + */ +function getExternalConfigUrl(checkpoint: Checkpoint): string | null { + if (typeof checkpoint !== 'object' || checkpoint === null) { + return null + } + + // Handle Hub checkpoint + if ('Hub' in checkpoint && checkpoint.Hub) { + const hub = checkpoint.Hub as HubRepo + const repoId = fixedStringToString(hub.repo_id) + const revision = hub.revision ? fixedStringToString(hub.revision) : 'main' + // HuggingFace raw content URL + return `https://huggingface.co/${repoId}/raw/${revision}/config/model_config.json` + } + + // Handle P2P checkpoint (also uses Hub repo) + if ('P2P' in checkpoint && checkpoint.P2P) { + const p2p = checkpoint.P2P as HubRepo + const repoId = fixedStringToString(p2p.repo_id) + const revision = p2p.revision ? fixedStringToString(p2p.revision) : 'main' + return `https://huggingface.co/${repoId}/raw/${revision}/config/model_config.json` + } + + // Handle Gcs checkpoint + if ('Gcs' in checkpoint && checkpoint.Gcs) { + const gcs = checkpoint.Gcs as GcsRepo + const bucket = fixedStringToString(gcs.bucket) + const prefix = gcs.prefix ? fixedStringToString(gcs.prefix) : '' + const pathPrefix = prefix ? `${prefix}/` : '' + // GCS public URL + return `https://storage.googleapis.com/${bucket}/${pathPrefix}config/model_config.json` + } + + // Handle P2PGcs checkpoint + if ('P2PGcs' in checkpoint && checkpoint.P2PGcs) { + const gcs = checkpoint.P2PGcs as GcsRepo + const bucket = fixedStringToString(gcs.bucket) + const prefix = gcs.prefix ? fixedStringToString(gcs.prefix) : '' + const pathPrefix = prefix ? `${prefix}/` : '' + return `https://storage.googleapis.com/${bucket}/${pathPrefix}config/model_config.json` + } + + // Dummy and Ephemeral checkpoints don't have external config + return null +} + +/** + * Fetch external config from the checkpoint's config URL + */ +export async function fetchExternalConfig( + checkpoint: Checkpoint +): Promise { + const url = getExternalConfigUrl(checkpoint) + if (!url) { + return null + } + + // Check cache + const cached = configCache.get(url) + if (cached && Date.now() - cached.fetchedAt < CACHE_TTL_MS) { + return cached.config + } + + try { + const response = await fetch(url, { + headers: { + Accept: 'application/json', + }, + }) + + if (!response.ok) { + console.warn( + `Failed to fetch external config from ${url}: ${response.status} ${response.statusText}` + ) + return null + } + + const config = (await response.json()) as ExternalModelConfig + + // Validate required fields + if (!config.architecture || !config.lr_schedule) { + console.warn( + `Invalid external config from ${url}: missing architecture or lr_schedule` + ) + return null + } + + // Cache the result + configCache.set(url, { + config, + fetchedAt: Date.now(), + }) + + return config + } catch (error) { + console.warn(`Error fetching external config from ${url}:`, error) + return null + } +} + +/** + * Get architecture from checkpoint (fetches external config if needed) + */ +export async function getArchitectureFromCheckpoint( + checkpoint: Checkpoint +): Promise { + const config = await fetchExternalConfig(checkpoint) + return config?.architecture ?? 
null +} + +/** + * Get LR schedule from checkpoint (fetches external config if needed) + */ +export async function getLRScheduleFromCheckpoint( + checkpoint: Checkpoint +): Promise { + const config = await fetchExternalConfig(checkpoint) + return config?.lr_schedule ?? null +} + +/** + * Clear the config cache (useful for testing or manual refresh) + */ +export function clearExternalConfigCache(): void { + configCache.clear() +} From 9cfad6a4f63ab319567739b05010a7fd11811f38 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Wed, 28 Jan 2026 15:01:50 -0300 Subject: [PATCH 10/22] Bring back update_client_version to ts --- website/backend/src/coordinatorChainLoop.ts | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/website/backend/src/coordinatorChainLoop.ts b/website/backend/src/coordinatorChainLoop.ts index fc17b5687..21aa04dc2 100644 --- a/website/backend/src/coordinatorChainLoop.ts +++ b/website/backend/src/coordinatorChainLoop.ts @@ -342,6 +342,17 @@ export async function startWatchCoordinatorChainLoop( }) break } + case 'update_client_version': { + const runPdaAddr = i.accounts[1].toString() + const coordinatorAddr = i.accounts[2].toString() + runUpdates.getAndTouchCurrentRun({ + runPdaAddr, + coordinatorAddr, + decoded, + tx, + }) + break + } default: { const _missed_tx: never = decoded throw new Error( From da40341f927760c0afd28b8432c08daab055ff7f Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Wed, 28 Jan 2026 16:24:23 -0300 Subject: [PATCH 11/22] Add console logs to debug external config fetch --- .../src/dataStores/flatFileCoordinator.ts | 25 ++++++++++++++++++- website/backend/src/externalConfig.ts | 7 ++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/website/backend/src/dataStores/flatFileCoordinator.ts b/website/backend/src/dataStores/flatFileCoordinator.ts index 01b3201d6..f19c4a4a0 100644 --- a/website/backend/src/dataStores/flatFileCoordinator.ts +++ b/website/backend/src/dataStores/flatFileCoordinator.ts @@ -303,9 +303,25 @@ export class FlatFileCoordinatorDataStore implements CoordinatorDataStore { // Fetch external config in the background (non-blocking) const checkpoint = newState.coordinator.model.LLM.checkpoint + console.log( + '[externalConfig] Checking checkpoint for run', + lastRun.runId, + ':', + checkpoint + ) if (!lastRun.externalConfig) { + console.log( + '[externalConfig] Fetching external config for run', + lastRun.runId + ) fetchExternalConfig(checkpoint) .then((config) => { + console.log( + '[externalConfig] Fetch result for run', + lastRun.runId, + ':', + config + ) if (config) { lastRun.externalConfig = { architecture: config.architecture, @@ -320,10 +336,17 @@ export class FlatFileCoordinatorDataStore implements CoordinatorDataStore { }) .catch((err) => { console.warn( - `Failed to fetch external config for run ${lastRun.runId}:`, + `[externalConfig] Failed to fetch external config for run ${lastRun.runId}:`, err ) }) + } else { + console.log( + '[externalConfig] Already have external config for run', + lastRun.runId, + ':', + lastRun.externalConfig + ) } if (configChanged) { diff --git a/website/backend/src/externalConfig.ts b/website/backend/src/externalConfig.ts index f68f5ef30..ab59d4fb8 100644 --- a/website/backend/src/externalConfig.ts +++ b/website/backend/src/externalConfig.ts @@ -99,7 +99,14 @@ export async function fetchExternalConfig( checkpoint: Checkpoint ): Promise { const url = getExternalConfigUrl(checkpoint) + console.log( + '[externalConfig] fetchExternalConfig called with checkpoint:', + checkpoint, + '-> 
URL:', + url + ) if (!url) { + console.log('[externalConfig] No URL found for checkpoint') return null } From 663301d4e28b5d88b2fcd8d01c27ec4afd6df137 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Wed, 28 Jan 2026 17:15:13 -0300 Subject: [PATCH 12/22] Remove externalconfig from the website --- .../src/dataStores/flatFileCoordinator.ts | 66 ------- website/backend/src/externalConfig.ts | 181 ------------------ .../frontend/src/components/RunSummary.tsx | 2 - .../frontend/src/routes/runs/$run.$index.tsx | 1 - website/shared/index.ts | 10 +- 5 files changed, 1 insertion(+), 259 deletions(-) delete mode 100644 website/backend/src/externalConfig.ts diff --git a/website/backend/src/dataStores/flatFileCoordinator.ts b/website/backend/src/dataStores/flatFileCoordinator.ts index f19c4a4a0..3023b0da1 100644 --- a/website/backend/src/dataStores/flatFileCoordinator.ts +++ b/website/backend/src/dataStores/flatFileCoordinator.ts @@ -4,8 +4,6 @@ import { Model, PsycheCoordinator, RunMetadata, - LearningRateSchedule, - LLMArchitecture, } from 'psyche-deserialize-zerocopy-wasm' import { RunSummary, @@ -27,7 +25,6 @@ import { UniqueRunKey, runKey } from '../coordinator.js' import { readVersionedFile, writeVersionedFile } from './versioned.js' import { CURRENT_VERSION } from 'shared/formats/type.js' import { existsSync, renameSync } from 'fs' -import { fetchExternalConfig } from '../externalConfig.js' // any run ID outside this list will not be returned to the frontend in the summary list, const ALLOWLISTED_RUN_IDS = @@ -68,11 +65,6 @@ type WitnessV2 = Omit< prompt_index: number } -interface ExternalModelConfig { - architecture: LLMArchitecture - lr_schedule: LearningRateSchedule -} - interface RunHistoryV2 { runId: string createdAt: ChainTimestamp @@ -103,9 +95,6 @@ interface RunHistoryV2 { observedLrByStep: Array<[number, number]> recentTxs: Array - - // Cached external config (fetched from HuggingFace/GCS) - externalConfig?: ExternalModelConfig } interface RunSummaries { @@ -301,54 +290,6 @@ export class FlatFileCoordinatorDataStore implements CoordinatorDataStore { lastRun.lastUpdated = eventTime lastRun.lastState = newState - // Fetch external config in the background (non-blocking) - const checkpoint = newState.coordinator.model.LLM.checkpoint - console.log( - '[externalConfig] Checking checkpoint for run', - lastRun.runId, - ':', - checkpoint - ) - if (!lastRun.externalConfig) { - console.log( - '[externalConfig] Fetching external config for run', - lastRun.runId - ) - fetchExternalConfig(checkpoint) - .then((config) => { - console.log( - '[externalConfig] Fetch result for run', - lastRun.runId, - ':', - config - ) - if (config) { - lastRun.externalConfig = { - architecture: config.architecture, - lr_schedule: config.lr_schedule, - } - // Clear caches so next request will show updated data - this.#runCache.delete(runKey(lastRun.runId, index)) - this.#summaryCache = null - // Notify listeners of update - this.eventEmitter.emit('update', runKey(lastRun.runId, index)) - } - }) - .catch((err) => { - console.warn( - `[externalConfig] Failed to fetch external config for run ${lastRun.runId}:`, - err - ) - }) - } else { - console.log( - '[externalConfig] Already have external config for run', - lastRun.runId, - ':', - lastRun.externalConfig - ) - } - if (configChanged) { lastRun.configChanges.push({ timestamp: eventTime, @@ -721,11 +662,6 @@ export class FlatFileCoordinatorDataStore implements CoordinatorDataStore { maxRoundTrainTime: Number(config.max_round_train_time), roundWitnessTime: 
Number(config.round_witness_time), warmupTime: Number(config.warmup_time), - - // Use cached external config or fallback to placeholder - lrSchedule: run.externalConfig?.lr_schedule ?? { - Constant: { base_lr: 0, warmup_init_lr: 0, warmup_steps: 0 }, - }, }, } } @@ -797,8 +733,6 @@ function makeRunSummary( : undefined const summary: RunSummary = { - // Use cached external config or fallback to HfLlama - arch: run.externalConfig?.architecture ?? 'HfLlama', id: c.run_id, index: index, isOnlyRunAtThisIndex, diff --git a/website/backend/src/externalConfig.ts b/website/backend/src/externalConfig.ts deleted file mode 100644 index ab59d4fb8..000000000 --- a/website/backend/src/externalConfig.ts +++ /dev/null @@ -1,181 +0,0 @@ -import type { - Checkpoint, - HubRepo, - GcsRepo, - LearningRateSchedule, - LLMArchitecture, -} from 'psyche-deserialize-zerocopy-wasm' - -// External config schema (matches Rust ExternalModelConfig) -interface ExternalModelConfig { - version: number - architecture: LLMArchitecture - data_type: string - data_location: unknown - lr_schedule: LearningRateSchedule - optimizer: unknown - run_metadata?: unknown - client_requirements?: unknown -} - -interface ExternalConfigCacheEntry { - config: ExternalModelConfig - fetchedAt: number -} - -// Cache for external configs to avoid repeated fetches -const configCache = new Map() -const CACHE_TTL_MS = 5 * 60 * 1000 // 5 minutes - -/** - * Extract string from FixedString (null-terminated byte array) - */ -function fixedStringToString(fixedStr: { inner: number[] } | string): string { - if (typeof fixedStr === 'string') { - return fixedStr - } - // Find null terminator and convert to string - const bytes = fixedStr.inner || fixedStr - if (Array.isArray(bytes)) { - const nullIndex = bytes.indexOf(0) - const relevantBytes = nullIndex >= 0 ? bytes.slice(0, nullIndex) : bytes - return String.fromCharCode(...relevantBytes) - } - return String(bytes) -} - -/** - * Get the external config URL from a checkpoint - */ -function getExternalConfigUrl(checkpoint: Checkpoint): string | null { - if (typeof checkpoint !== 'object' || checkpoint === null) { - return null - } - - // Handle Hub checkpoint - if ('Hub' in checkpoint && checkpoint.Hub) { - const hub = checkpoint.Hub as HubRepo - const repoId = fixedStringToString(hub.repo_id) - const revision = hub.revision ? fixedStringToString(hub.revision) : 'main' - // HuggingFace raw content URL - return `https://huggingface.co/${repoId}/raw/${revision}/config/model_config.json` - } - - // Handle P2P checkpoint (also uses Hub repo) - if ('P2P' in checkpoint && checkpoint.P2P) { - const p2p = checkpoint.P2P as HubRepo - const repoId = fixedStringToString(p2p.repo_id) - const revision = p2p.revision ? fixedStringToString(p2p.revision) : 'main' - return `https://huggingface.co/${repoId}/raw/${revision}/config/model_config.json` - } - - // Handle Gcs checkpoint - if ('Gcs' in checkpoint && checkpoint.Gcs) { - const gcs = checkpoint.Gcs as GcsRepo - const bucket = fixedStringToString(gcs.bucket) - const prefix = gcs.prefix ? fixedStringToString(gcs.prefix) : '' - const pathPrefix = prefix ? `${prefix}/` : '' - // GCS public URL - return `https://storage.googleapis.com/${bucket}/${pathPrefix}config/model_config.json` - } - - // Handle P2PGcs checkpoint - if ('P2PGcs' in checkpoint && checkpoint.P2PGcs) { - const gcs = checkpoint.P2PGcs as GcsRepo - const bucket = fixedStringToString(gcs.bucket) - const prefix = gcs.prefix ? fixedStringToString(gcs.prefix) : '' - const pathPrefix = prefix ? 
`${prefix}/` : '' - return `https://storage.googleapis.com/${bucket}/${pathPrefix}config/model_config.json` - } - - // Dummy and Ephemeral checkpoints don't have external config - return null -} - -/** - * Fetch external config from the checkpoint's config URL - */ -export async function fetchExternalConfig( - checkpoint: Checkpoint -): Promise { - const url = getExternalConfigUrl(checkpoint) - console.log( - '[externalConfig] fetchExternalConfig called with checkpoint:', - checkpoint, - '-> URL:', - url - ) - if (!url) { - console.log('[externalConfig] No URL found for checkpoint') - return null - } - - // Check cache - const cached = configCache.get(url) - if (cached && Date.now() - cached.fetchedAt < CACHE_TTL_MS) { - return cached.config - } - - try { - const response = await fetch(url, { - headers: { - Accept: 'application/json', - }, - }) - - if (!response.ok) { - console.warn( - `Failed to fetch external config from ${url}: ${response.status} ${response.statusText}` - ) - return null - } - - const config = (await response.json()) as ExternalModelConfig - - // Validate required fields - if (!config.architecture || !config.lr_schedule) { - console.warn( - `Invalid external config from ${url}: missing architecture or lr_schedule` - ) - return null - } - - // Cache the result - configCache.set(url, { - config, - fetchedAt: Date.now(), - }) - - return config - } catch (error) { - console.warn(`Error fetching external config from ${url}:`, error) - return null - } -} - -/** - * Get architecture from checkpoint (fetches external config if needed) - */ -export async function getArchitectureFromCheckpoint( - checkpoint: Checkpoint -): Promise { - const config = await fetchExternalConfig(checkpoint) - return config?.architecture ?? null -} - -/** - * Get LR schedule from checkpoint (fetches external config if needed) - */ -export async function getLRScheduleFromCheckpoint( - checkpoint: Checkpoint -): Promise { - const config = await fetchExternalConfig(checkpoint) - return config?.lr_schedule ?? 
null -} - -/** - * Clear the config cache (useful for testing or manual refresh) - */ -export function clearExternalConfigCache(): void { - configCache.clear() -} diff --git a/website/frontend/src/components/RunSummary.tsx b/website/frontend/src/components/RunSummary.tsx index 57a9af6e2..d72e69a44 100644 --- a/website/frontend/src/components/RunSummary.tsx +++ b/website/frontend/src/components/RunSummary.tsx @@ -55,7 +55,6 @@ const InfoChits = styled.div` export const RunSummaryCard = memo(function RunSummaryCard({ info: { id, - arch, description, name, size, @@ -93,7 +92,6 @@ export const RunSummaryCard = memo(function RunSummaryCard({ {size !== 0n && ( {formatNumber(Number(size), 2)} )} - {arch} {type} {formatNumber(Number(totalTokens), 2)} diff --git a/website/frontend/src/routes/runs/$run.$index.tsx b/website/frontend/src/routes/runs/$run.$index.tsx index 943f2abf4..ebb6625cc 100644 --- a/website/frontend/src/routes/runs/$run.$index.tsx +++ b/website/frontend/src/routes/runs/$run.$index.tsx @@ -178,7 +178,6 @@ function RouteComponent() { {formatNumber(Number(info.size), 2)} - {info.arch} {info.type} diff --git a/website/shared/index.ts b/website/shared/index.ts index 61f10b7a0..095104451 100644 --- a/website/shared/index.ts +++ b/website/shared/index.ts @@ -8,12 +8,7 @@ import * as miningPoolTypes from './idl/mining-pool_idlType.js' type PsycheSolanaCoordinator = coordinatorTypes.PsycheSolanaCoordinator type PsycheSolanaMiningPool = miningPoolTypes.PsycheSolanaMiningPool -import type { - HubRepo, - LearningRateSchedule, - LLMArchitecture, - RunState, -} from 'psyche-deserialize-zerocopy-wasm' +import type { HubRepo, RunState } from 'psyche-deserialize-zerocopy-wasm' export type * from 'psyche-deserialize-zerocopy-wasm' @@ -72,7 +67,6 @@ export interface RunSummary { } size: bigint - arch: LLMArchitecture type: ModelType } @@ -130,8 +124,6 @@ export interface RunData { maxRoundTrainTime: number roundWitnessTime: number - - lrSchedule: LearningRateSchedule } } recentTxs: Array From 343ff5fa2de04cdd3572085ac009d5c44b0a5284 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Thu, 29 Jan 2026 11:05:16 -0300 Subject: [PATCH 13/22] Clean the code and refactor some functions --- .../suites/memnet_coordinator_data_layout.rs | 2 - config/solana-test/light-config.toml | 4 +- scripts/train-solana-test.sh | 60 ++-- shared/client/src/state/init.rs | 63 ++-- shared/coordinator/src/external_config.rs | 304 +----------------- shared/coordinator/src/model.rs | 15 - shared/data-provider/src/gcs.rs | 19 -- shared/data-provider/src/hub.rs | 41 ++- shared/data-provider/src/lib.rs | 5 +- shared/watcher/src/tui.rs | 13 +- tools/rust-tools/preview-lr/src/main.rs | 4 +- .../src/commands/run/update_config.rs | 55 +++- website/frontend/src/fakeData.ts | 13 - 13 files changed, 163 insertions(+), 435 deletions(-) diff --git a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_data_layout.rs b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_data_layout.rs index 88a663362..87e1a26d0 100644 --- a/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_data_layout.rs +++ b/architectures/decentralized/solana-tooling/tests/suites/memnet_coordinator_data_layout.rs @@ -35,8 +35,6 @@ pub async fn run() { assert_eq!(coordinator.run_state_start_unix_timestamp, 0); assert_eq!(coordinator.pending_pause, SmallBoolean::FALSE); // Coordinator model (only on-chain fields) - // Note: architecture, data_type, data_location, lr_schedule, optimizer are now - // stored 
off-chain in ExternalModelConfig (GCS) and not part of the on-chain LLM struct match coordinator.model { Model::LLM(llm) => { assert_eq!(llm.max_seq_len, 2048); diff --git a/config/solana-test/light-config.toml b/config/solana-test/light-config.toml index c0968e61b..19c2153f6 100644 --- a/config/solana-test/light-config.toml +++ b/config/solana-test/light-config.toml @@ -18,8 +18,8 @@ waiting_for_members_extra_time = 3 max_seq_len = 2048 cold_start_warmup_steps = 0 -[model.LLM.checkpoint.Gcs] -bucket = "my_checkpoints" +[model.LLM.checkpoint.Hub] +repo_id = "emozilla/llama2-20m-init" [external_config] architecture = "HfLlama" diff --git a/scripts/train-solana-test.sh b/scripts/train-solana-test.sh index abbc87f45..5794cfdc4 100755 --- a/scripts/train-solana-test.sh +++ b/scripts/train-solana-test.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash + set -eo pipefail # use the agenix provided wallet if you have it @@ -6,12 +7,16 @@ if [[ -n "${devnet__keypair__wallet_PATH}" && -f "${devnet__keypair__wallet_PATH WALLET_FILE="${devnet__keypair__wallet_PATH}" elif [[ -z "${WALLET_FILE:-}" ]]; then echo "No wallet file specified, generating ephemeral keypair..." + # Create a named pipe for the keypair data mkdir -p ~/.config/solana/solana-keys WALLET_FILE=$(mktemp ~/.config/solana/solana-keys/solana-wallet-XXXXXXXXX) + # Generate keypair and write to the generated file solana-keygen new --no-bip39-passphrase --force --outfile "${WALLET_FILE}" echo "Using ephemeral keypair (will not persist after script exits)" + # Set up cleanup trap to remove the wallet file when script exits + # This will run on normal exit, SIGINT (Ctrl+C), SIGTERM, or ERR trap "echo 'Cleaning up ephemeral wallet file...'; rm -f '${WALLET_FILE}'" EXIT fi @@ -35,35 +40,38 @@ else fi # fine if this fails -solana airdrop 10 "$(solana-keygen pubkey "${WALLET_FILE}")" --url "${RPC}" || true +solana airdrop 10 "$(solana-keygen pubkey ${WALLET_FILE})" --url "${RPC}" || true export RUST_LOG="info,psyche=debug" -COMMON_ARGS=( - train - --wallet-private-key-path "${WALLET_FILE}" - --rpc "${RPC}" - --ws-rpc "${WS_RPC}" - --run-id "${RUN_ID}" - --data-parallelism "${DP}" - --tensor-parallelism "${TP}" - --micro-batch-size "${BATCH_SIZE}" - --authorizer "${AUTHORIZER}" - --logs console - "${CHECKPOINT_ARGS[@]}" - "$@" -) - -if [[ -z "${OTLP_METRICS_URL:-}" ]]; then - HF_TOKEN=${HF_TOKEN} \ - GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS} \ - cargo run --release --bin psyche-solana-client -- \ - "${COMMON_ARGS[@]}" +if [[ "$OTLP_METRICS_URL" == "" ]]; then + cargo run --release --bin psyche-solana-client -- \ + train \ + --wallet-private-key-path ${WALLET_FILE} \ + --rpc ${RPC} \ + --ws-rpc ${WS_RPC} \ + --run-id ${RUN_ID} \ + --data-parallelism ${DP} \ + --tensor-parallelism ${TP} \ + --micro-batch-size ${BATCH_SIZE} \ + --authorizer ${AUTHORIZER} \ + --logs "console" \ + ${CHECKPOINT_ARGS[@]} \ + "$@" else - HF_TOKEN=${HF_TOKEN} \ - GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS} \ - cargo run --release --bin psyche-solana-client -- \ - "${COMMON_ARGS[@]}" \ + cargo run --release --bin psyche-solana-client -- \ + train \ + --wallet-private-key-path ${WALLET_FILE} \ + --rpc ${RPC} \ + --ws-rpc ${WS_RPC} \ + --run-id ${RUN_ID} \ + --data-parallelism ${DP} \ + --tensor-parallelism ${TP} \ + --micro-batch-size ${BATCH_SIZE} \ + --logs "console" \ + --authorizer ${AUTHORIZER} \ --oltp-metrics-url "http://localhost:4318/v1/metrics" \ - --oltp-logs-url "http://localhost:4318/v1/logs" + --oltp-logs-url 
"http://localhost:4318/v1/logs" \ + ${CHECKPOINT_ARGS[@]} \ + "$@" fi diff --git a/shared/client/src/state/init.rs b/shared/client/src/state/init.rs index 58c24d43f..a8ef7a48e 100644 --- a/shared/client/src/state/init.rs +++ b/shared/client/src/state/init.rs @@ -1,7 +1,7 @@ use crate::{WandBInfo, fetch_data::DataFetcher}; use psyche_coordinator::{ Coordinator, HealthChecks, - external_config::{ExternalModelConfig, get_config_gcs_path, get_config_hub_path}, + external_config::{CONFIG_PREFIX, ExternalModelConfig, MODEL_CONFIG_FILENAME}, model::{self, HttpLLMTrainingDataLocation, LLMTrainingDataLocation}, }; use psyche_core::{ @@ -201,37 +201,40 @@ impl RunInitConfigAndIO config, - Err(e) => { - debug!( - "Failed to fetch external config from Hub ({}), using default: {}", - repo_id, e - ); - ExternalModelConfig::default() + // Fetch run config that is stored off-chain + let external_config: ExternalModelConfig = match llm.checkpoint { + model::Checkpoint::Gcs(gcs_repo) | model::Checkpoint::P2PGcs(gcs_repo) => { + let bucket = gcs_repo.bucket.to_string(); + let path = format!("{}/{}", CONFIG_PREFIX, MODEL_CONFIG_FILENAME); + debug!("Fetching external config from gs://{}/{}", bucket, path); + fetch_json_from_gcs(&bucket, &path).await? + } + model::Checkpoint::Hub(repo) | model::Checkpoint::P2P(repo) => { + let repo_id: String = (&repo.repo_id).into(); + let revision = repo.revision.map(|bytes| (&bytes).into()); + debug!( + "Fetching external config from Hub: {}/{}", + repo_id, MODEL_CONFIG_FILENAME + ); + match fetch_json_from_hub( + &repo_id, + revision, + MODEL_CONFIG_FILENAME, + init_config.hub_read_token.clone(), + ) + .await + { + Ok(config) => config, + Err(e) => { + debug!( + "Failed to fetch external config from Hub ({}), using default: {}", + repo_id, e + ); + ExternalModelConfig::default() + } } } - } else { - debug!("No GCS/Hub checkpoint, using default external config"); - ExternalModelConfig::default() + _ => ExternalModelConfig::default(), }; let hub_read_token = init_config.hub_read_token.clone(); diff --git a/shared/coordinator/src/external_config.rs b/shared/coordinator/src/external_config.rs index d28b25d79..fc6c17af6 100644 --- a/shared/coordinator/src/external_config.rs +++ b/shared/coordinator/src/external_config.rs @@ -1,19 +1,6 @@ -//! External model configuration stored in GCS. -//! -//! This module provides schemas for model configuration that lives outside -//! the on-chain state. The coordinator only needs minimal fields on-chain: -//! - `checkpoint` (reads and writes for Hub↔P2P transitions) -//! - `max_seq_len` (reads for sequence length) -//! - `cold_start_warmup_steps` (reads for warmup bounds) -//! -//! Everything else is stored in GCS at `gs://{checkpoint_bucket}/config/model_config.json` -//! and fetched by clients at startup. 
-
 use serde::{Deserialize, Serialize};

-use crate::model::{
- Checkpoint, GcsRepo, LLM, LLMArchitecture, LLMTrainingDataLocation, LLMTrainingDataType, Model,
-};
+use crate::model::{LLMArchitecture, LLMTrainingDataLocation, LLMTrainingDataType};
 use psyche_core::{LearningRateSchedule, OptimizerDefinition};

 /// Path within the bucket where config is stored
@@ -21,147 +8,37 @@ pub const CONFIG_PREFIX: &str = "config";
 /// Filename for the model config
 pub const MODEL_CONFIG_FILENAME: &str = "model_config.json";

-// ============================================================================
-// Config-file representations (old format with all fields in [model.LLM])
-// ============================================================================
-
 /// External model configuration schema.
-/// This is stored in GCS and fetched by clients.
-///
-/// Adding new fields here doesn't affect on-chain memory layout.
-/// Use `#[serde(default)]` for backward compatibility.
+/// This is stored in off-chain storage and fetched by clients.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct ExternalModelConfig {
- /// Schema version for forward compatibility
- #[serde(default = "default_version")]
 pub version: u32,

- /// Model architecture (HfLlama, HfDeepseek, etc.)
- #[serde(default = "default_architecture")] pub architecture: LLMArchitecture, - /// Training data type (Pretraining, Finetuning) - #[serde(default = "default_data_type")] pub data_type: LLMTrainingDataType, - /// Training data location - #[serde(default)] pub data_location: LLMTrainingDataLocation, - /// Learning rate schedule - #[serde(default = "default_lr_schedule")] pub lr_schedule: LearningRateSchedule, - /// Optimizer configuration - #[serde(default = "default_optimizer")] pub optimizer: OptimizerDefinition, /// Optional run metadata #[serde(default, skip_serializing_if = "Option::is_none")] pub run_metadata: Option, - - /// Optional client requirements - #[serde(default, skip_serializing_if = "Option::is_none")] - pub client_requirements: Option, -} - -fn default_version() -> u32 { - 1 -} - -fn default_architecture() -> LLMArchitecture { - LLMArchitecture::HfLlama -} - -fn default_data_type() -> LLMTrainingDataType { - LLMTrainingDataType::Pretraining -} - -fn default_lr_schedule() -> LearningRateSchedule { - LearningRateSchedule::Constant(psyche_core::ConstantLR::default()) -} - -fn default_optimizer() -> OptimizerDefinition { - OptimizerDefinition::Dummy } impl Default for ExternalModelConfig { fn default() -> Self { Self { - version: default_version(), + version: 1, architecture: LLMArchitecture::HfLlama, data_type: LLMTrainingDataType::Pretraining, data_location: LLMTrainingDataLocation::default(), - lr_schedule: default_lr_schedule(), - optimizer: default_optimizer(), + lr_schedule: LearningRateSchedule::Constant(psyche_core::ConstantLR::default()), + optimizer: OptimizerDefinition::Dummy, run_metadata: None, - client_requirements: None, } } } @@ -185,36 +62,17 @@ pub struct RunMetadata { pub client_version: String, } -/// Client requirements for joining the run -#[derive(Debug, Clone, Serialize, Deserialize, Default)] -pub struct ClientRequirements { - #[serde(default, skip_serializing_if = "Option::is_none")] - pub min_gpu_memory_gb: Option, - - #[serde(default, skip_serializing_if = "Option::is_none")] - pub recommended_gpu: Option, - - #[serde(default, skip_serializing_if = "Option::is_none")] - pub recommended_micro_batch: Option, - - #[serde(default, skip_serializing_if = "Option::is_none")] - pub recommended_total_batch: Option, -} - impl ExternalModelConfig { - /// Serialize to JSON string pub fn to_json(&self) -> Result { serde_json::to_string_pretty(self) } - /// Deserialize from JSON string pub fn from_json(json: &str) -> Result { serde_json::from_str(json) } /// Validate the configuration pub fn check(&self) -> bool { - // Validate data location let bad_data_location = match &self.data_location { LLMTrainingDataLocation::Dummy => false, LLMTrainingDataLocation::Server(url) => url.is_empty(), @@ -239,7 +97,6 @@ impl ExternalModelConfig { return false; } - // Validate optimizer match &self.optimizer { OptimizerDefinition::Dummy => false, OptimizerDefinition::AdamW { .. } => true, @@ -247,154 +104,3 @@ impl ExternalModelConfig { } } } - -/// Helper to derive the config GCS path from a checkpoint. -/// Returns `Some((bucket, path))` for GCS checkpoints, `None` for others. 
-pub fn get_config_gcs_path(checkpoint: &Checkpoint) -> Option<(String, String)> { - let gcs_repo = match checkpoint { - Checkpoint::Gcs(repo) | Checkpoint::P2PGcs(repo) => repo, - _ => return None, - }; - - let bucket = gcs_repo.bucket.to_string(); - let path = format!("{}/{}", CONFIG_PREFIX, MODEL_CONFIG_FILENAME); - - Some((bucket, path)) -} - -/// Helper to derive the config Hub path from a checkpoint. -/// Returns `Some((repo_id, revision, filename))` for Hub checkpoints, `None` for others. -pub fn get_config_hub_path( - checkpoint: &Checkpoint, -) -> Option<(String, Option, &'static str)> { - let hub_repo = match checkpoint { - Checkpoint::Hub(repo) | Checkpoint::P2P(repo) | Checkpoint::Dummy(repo) => repo, - _ => return None, - }; - - let repo_id = hub_repo.repo_id.to_string(); - if repo_id.is_empty() { - return None; - } - - let revision = hub_repo.revision.as_ref().map(|r| r.to_string()); - - Some((repo_id, revision, MODEL_CONFIG_FILENAME)) -} - -/// Construct the full GCS URI for the config file -pub fn get_config_gcs_uri(checkpoint: &Checkpoint) -> Option { - get_config_gcs_path(checkpoint).map(|(bucket, path)| format!("gs://{}/{}", bucket, path)) -} - -/// Helper to create a GcsRepo for the config location from a checkpoint -pub fn get_config_gcs_repo(checkpoint: &Checkpoint) -> Option { - match checkpoint { - Checkpoint::Gcs(repo) | Checkpoint::P2PGcs(repo) => Some(GcsRepo { - bucket: repo.bucket, - prefix: Some(psyche_core::FixedString::from_str_truncated(CONFIG_PREFIX)), - }), - _ => None, - } -} - -#[cfg(test)] -mod tests { - use super::*; - use psyche_core::FixedString; - - #[test] - fn test_roundtrip() { - let config = ExternalModelConfig { - version: 1, - architecture: LLMArchitecture::HfLlama, - data_type: LLMTrainingDataType::Pretraining, - data_location: LLMTrainingDataLocation::default(), - lr_schedule: default_lr_schedule(), - optimizer: OptimizerDefinition::AdamW { - betas: [0.9, 0.999], - weight_decay: 0.01, - eps: 1e-8, - clip_grad_norm: None, - }, - run_metadata: Some(RunMetadata { - name: "Test Run".to_string(), - description: "A test training run".to_string(), - num_parameters: 20_000_000, - vocab_size: 32_000, - client_version: "v1.0.0".to_string(), - }), - client_requirements: None, - }; - - let json = config.to_json().unwrap(); - let parsed = ExternalModelConfig::from_json(&json).unwrap(); - - assert_eq!(parsed.version, config.version); - assert_eq!(parsed.architecture, config.architecture); - assert_eq!(parsed.run_metadata.unwrap().name, "Test Run"); - } - - #[test] - fn test_backward_compatibility() { - // Old JSON without new fields - let old_json = r#"{ - "version": 1, - "architecture": "HfLlama" - }"#; - - let config = ExternalModelConfig::from_json(old_json).unwrap(); - - // Should use defaults for missing fields - assert_eq!(config.architecture, LLMArchitecture::HfLlama); - assert!(matches!( - config.data_location, - LLMTrainingDataLocation::Dummy - )); - assert!(config.run_metadata.is_none()); - } - - #[test] - fn test_config_gcs_path() { - let checkpoint = Checkpoint::Gcs(GcsRepo { - bucket: FixedString::from_str_truncated("my-bucket"), - prefix: Some(FixedString::from_str_truncated("checkpoints")), - }); - - let (bucket, path) = get_config_gcs_path(&checkpoint).unwrap(); - assert_eq!(bucket, "my-bucket"); - assert_eq!(path, "config/model_config.json"); - - let uri = get_config_gcs_uri(&checkpoint).unwrap(); - assert_eq!(uri, "gs://my-bucket/config/model_config.json"); - } - - #[test] - fn test_config_gcs_path_hub_returns_none() { - use 
crate::model::HubRepo; - - let checkpoint = Checkpoint::Hub(HubRepo { - repo_id: FixedString::from_str_truncated("org/model"), - revision: None, - }); - - assert!(get_config_gcs_path(&checkpoint).is_none()); - } - - #[test] - fn test_adding_new_fields() { - // This test demonstrates that adding new fields doesn't break parsing - // of old configs (as long as they have #[serde(default)]) - let config_with_future_fields = r#"{ - "version": 2, - "architecture": "HfLlama", - "some_future_field": "this field doesn't exist yet", - "another_future_field": { "nested": true } - }"#; - - // Should parse without error, ignoring unknown fields - let config = ExternalModelConfig::from_json(config_with_future_fields).unwrap(); - assert_eq!(config.version, 2); - assert_eq!(config.architecture, LLMArchitecture::HfLlama); - } -} diff --git a/shared/coordinator/src/model.rs b/shared/coordinator/src/model.rs index 3fa2f8825..d0b0fe33b 100644 --- a/shared/coordinator/src/model.rs +++ b/shared/coordinator/src/model.rs @@ -180,23 +180,13 @@ pub enum HttpTrainingDataLocation { }, } -/// On-chain LLM configuration. -/// -/// This struct only contains fields that the coordinator needs to read/write. -/// All other configuration (architecture, data_location, lr_schedule, optimizer, etc.) -/// is stored externally in GCS and fetched by clients. -/// -/// See `external_config::ExternalModelConfig` for the external configuration schema. #[derive( AnchorSerialize, AnchorDeserialize, Serialize, Deserialize, Clone, Debug, Zeroable, Copy, TS, )] #[repr(C)] pub struct LLM { - /// Maximum sequence length for training pub max_seq_len: u32, - /// Number of warmup steps for cold start pub cold_start_warmup_steps: u32, - /// Checkpoint location - coordinator reads and writes this for Hub↔P2P transitions pub checkpoint: Checkpoint, } @@ -302,11 +292,6 @@ impl std::fmt::Display for Checkpoint { } impl Model { - /// Check on-chain model configuration validity. - /// - /// This only validates fields stored on-chain. External configuration - /// (architecture, data_location, optimizer, etc.) is validated by - /// `ExternalModelConfig::check()` on the client side. pub fn check(&self) -> bool { match self { Model::LLM(llm) => { diff --git a/shared/data-provider/src/gcs.rs b/shared/data-provider/src/gcs.rs index f5adc6d46..ddd31e0b3 100644 --- a/shared/data-provider/src/gcs.rs +++ b/shared/data-provider/src/gcs.rs @@ -355,15 +355,6 @@ pub async fn fetch_json_from_gcs( serde_json::from_slice(&data).map_err(DownloadError::Json) } -/// Fetch a JSON file from GCS synchronously. -pub fn fetch_json_from_gcs_sync( - bucket: &str, - object_path: &str, -) -> Result { - let rt = Runtime::new().map_err(DownloadError::Io)?; - rt.block_on(fetch_json_from_gcs(bucket, object_path)) -} - /// Upload a JSON-serializable value to GCS. pub async fn upload_json_to_gcs( bucket: &str, @@ -395,16 +386,6 @@ pub async fn upload_json_to_gcs( Ok(()) } -/// Upload a JSON-serializable value to GCS synchronously. 
-pub fn upload_json_to_gcs_sync( - bucket: &str, - object_path: &str, - value: &T, -) -> Result<(), UploadError> { - let rt = Runtime::new().map_err(UploadError::Io)?; - rt.block_on(upload_json_to_gcs(bucket, object_path, value)) -} - pub async fn upload_to_gcs( gcs_info: GcsUploadInfo, manifest_metadata: GcsManifestMetadata, diff --git a/shared/data-provider/src/hub.rs b/shared/data-provider/src/hub.rs index 6a504aead..fd61ef760 100644 --- a/shared/data-provider/src/hub.rs +++ b/shared/data-provider/src/hub.rs @@ -7,6 +7,7 @@ use hf_hub::{ tokio::{ApiError, UploadSource}, }, }; +use psyche_coordinator::external_config::ExternalModelConfig; use psyche_coordinator::model; use psyche_core::FixedString; use std::{path::PathBuf, time::Instant}; @@ -194,7 +195,7 @@ pub fn download_dataset_repo_sync( } /// Fetch a JSON file from HuggingFace and deserialize it. -/// Used for fetching external model configuration from Hub checkpoints. +/// Used for fetching external model configuration from Hub. pub async fn fetch_json_from_hub( repo_id: &str, revision: Option, @@ -222,6 +223,44 @@ pub async fn fetch_json_from_hub( serde_json::from_str(&content).map_err(DownloadError::Json) } +/// Upload a JSON-serializable value to HuggingFace Hub. +pub async fn upload_extra_config_to_hub( + repo_id: &str, + filename: &str, + external_config: &ExternalModelConfig, + token: Option, + commit_message: Option, +) -> Result<(), UploadError> { + let cache = Cache::default(); + let api = hf_hub::api::tokio::ApiBuilder::new() + .with_cache_dir(cache.path().clone()) + .with_token(token.or(cache.token())) + .with_progress(false) + .build()?; + + let repo = Repo::model(repo_id.to_string()); + let api_repo = api.repo(repo); + + let json = serde_json::to_string_pretty(external_config)?; + let data = json.into_bytes(); + + info!("Uploading JSON to {}/{} on HuggingFace", repo_id, filename); + + api_repo + .upload_file( + UploadSource::Bytes(data), + filename, + commit_message.or_else(|| Some(format!("Upload {}", filename))), + None, + false, + ) + .await?; + + info!("Uploaded JSON to {}/{} on HuggingFace", repo_id, filename); + + Ok(()) +} + #[derive(Debug, Clone)] pub struct HubUploadInfo { pub hub_repo: String, diff --git a/shared/data-provider/src/lib.rs b/shared/data-provider/src/lib.rs index a33a724c6..9c1c0b9f0 100644 --- a/shared/data-provider/src/lib.rs +++ b/shared/data-provider/src/lib.rs @@ -20,11 +20,12 @@ pub use file_extensions::{DATA_FILE_EXTENSIONS, PARQUET_EXTENSION}; pub use gcs::{ GcsCheckpointManifest, GcsManifestMetadata, GcsUploadInfo, ManifestFileEntry, ManifestMetadata, download_model_from_gcs_async, download_model_from_gcs_sync, fetch_json_from_gcs, - fetch_json_from_gcs_sync, upload_json_to_gcs, upload_json_to_gcs_sync, upload_to_gcs, + upload_json_to_gcs, upload_to_gcs, }; pub use hub::{ HubUploadInfo, download_dataset_repo_async, download_dataset_repo_sync, - download_model_repo_async, download_model_repo_sync, fetch_json_from_hub, upload_to_hub, + download_model_repo_async, download_model_repo_sync, fetch_json_from_hub, + upload_extra_config_to_hub, upload_to_hub, }; pub use local::LocalDataProvider; pub use parquet::record::{ListAccessor, MapAccessor, RowAccessor}; diff --git a/shared/watcher/src/tui.rs b/shared/watcher/src/tui.rs index 9be2b57ff..b95272cca 100644 --- a/shared/watcher/src/tui.rs +++ b/shared/watcher/src/tui.rs @@ -43,13 +43,10 @@ impl psyche_tui::CustomWidget for CoordinatorTui { let vsplit = Layout::vertical(Constraint::from_fills([1, 1])).split(coord_split[1]); { Paragraph::new( 
- [ - format!("Data Source: {}", state.data_source), - format!("Model Checkpoint: {}", state.model_checkpoint), - ] - .into_iter() - .map(Line::from) - .collect::>(), + [format!("Model Checkpoint: {}", state.model_checkpoint)] + .into_iter() + .map(Line::from) + .collect::>(), ) .block(Block::bordered().title("Config")) .render(vsplit[0], buf); @@ -169,7 +166,6 @@ pub struct CoordinatorTuiState { pub run_state: TuiRunState, pub height: u32, pub clients: Vec, - pub data_source: String, pub model_checkpoint: String, pub exited_clients: usize, pub pending_pause: bool, @@ -187,7 +183,6 @@ impl From<&Coordinator> for CoordinatorTuiState { .iter() .map(|c| format!("{:?}", c.id)) .collect(), - data_source: "External Config".to_string(), // data_type moved to external config model_checkpoint: match &value.model { Model::LLM(l) => format!("{}", l.checkpoint), }, diff --git a/tools/rust-tools/preview-lr/src/main.rs b/tools/rust-tools/preview-lr/src/main.rs index cc557b187..43f015b81 100644 --- a/tools/rust-tools/preview-lr/src/main.rs +++ b/tools/rust-tools/preview-lr/src/main.rs @@ -1,6 +1,6 @@ use clap::Parser; use plotters::prelude::*; -use psyche_coordinator::{CoordinatorConfig, external_config::ExternalModelConfig, model::Model}; +use psyche_coordinator::{CoordinatorConfig, external_config::ExternalModelConfig}; use serde::Deserialize; use std::path::PathBuf; @@ -28,8 +28,6 @@ enum Commands { #[derive(Deserialize)] struct Config { pub config: CoordinatorConfig, - #[allow(dead_code)] - pub model: Model, pub external_config: ExternalModelConfig, } fn main() -> anyhow::Result<()> { diff --git a/tools/rust-tools/run-manager/src/commands/run/update_config.rs b/tools/rust-tools/run-manager/src/commands/run/update_config.rs index 5f4a0092e..ad540d081 100644 --- a/tools/rust-tools/run-manager/src/commands/run/update_config.rs +++ b/tools/rust-tools/run-manager/src/commands/run/update_config.rs @@ -6,7 +6,7 @@ use anyhow::{Context, Result, bail}; use clap::Args; use psyche_coordinator::{ CoordinatorConfig, CoordinatorProgress, - external_config::{ExternalModelConfig, get_config_gcs_path}, + external_config::{CONFIG_PREFIX, ExternalModelConfig, MODEL_CONFIG_FILENAME}, get_data_index_for_step, model::{Checkpoint, Model}, }; @@ -79,7 +79,6 @@ impl Command for CommandUpdateConfig { struct State { pub config: CoordinatorConfig, pub model: Model, - #[serde(default)] pub external_config: ExternalModelConfig, } let state: State = toml::from_str(std::str::from_utf8( @@ -145,24 +144,52 @@ impl Command for CommandUpdateConfig { coordinator_account_state.state.coordinator.model = model; } - // Upload external config to GCS if provided - if let Some(ref external_config) = external_config { + // Upload external config to GCS or hub repo depending of the model checkpoint + if let Some(external_config) = external_config { let Model::LLM(llm) = &coordinator_account_state.state.coordinator.model; - if let Some((bucket, path)) = get_config_gcs_path(&llm.checkpoint) { - info!("Uploading external config to gs://{}/{}", bucket, path); - upload_json_to_gcs(&bucket, &path, external_config) + match llm.checkpoint { + Checkpoint::Gcs(ref gcs_repo) | Checkpoint::P2PGcs(ref gcs_repo) => { + let bucket = gcs_repo.bucket.to_string(); + let path = format!("{}/{}", CONFIG_PREFIX, MODEL_CONFIG_FILENAME); + info!("Uploading external config to gs://{}/{}", bucket, path); + upload_json_to_gcs(&bucket, &path, &external_config) + .await + .with_context(|| { + format!( + "failed to upload external config to gs://{}/{}", + bucket, path + ) + 
})?; + println!("Uploaded external config to gs://{}/{}", bucket, path); + } + Checkpoint::Hub(ref hub_repo) + | Checkpoint::P2P(ref hub_repo) + | Checkpoint::Dummy(ref hub_repo) => { + let repo_id = hub_repo.repo_id.to_string(); + psyche_data_provider::upload_extra_config_to_hub( + &repo_id, + "model_config.json", + &external_config, + None, + None, + ) .await .with_context(|| { format!( - "failed to upload external config to gs://{}/{}", - bucket, path + "failed to upload external config to Hub repo {}/external_config.json", + repo_id ) })?; - println!("Uploaded external config to gs://{}/{}", bucket, path); - } else { - println!( - "Warning: external_config provided but checkpoint is not GCS-based, skipping upload" - ); + println!( + "Uploaded external config to Hub repo {}/external_config.json", + repo_id + ); + } + _ => { + println!( + "Warning: external_config provided but checkpoint is not GCS- or Hub-based, skipping upload" + ); + } } } diff --git a/website/frontend/src/fakeData.ts b/website/frontend/src/fakeData.ts index 87df62f7e..6b92e8827 100644 --- a/website/frontend/src/fakeData.ts +++ b/website/frontend/src/fakeData.ts @@ -60,7 +60,6 @@ export const fakeRunSummaries: RunSummary[] = [ status: { type: 'paused' }, totalTokens: 100000n, size: 1000000000n, - arch: 'HfLlama', type: 'vision', pauseHistory: [], lastUpdate: { @@ -77,7 +76,6 @@ export const fakeRunSummaries: RunSummary[] = [ status: { type: 'active' }, totalTokens: 200000n, size: 2000000000n, - arch: 'HfLlama', type: 'text', pauseHistory: [], lastUpdate: { @@ -100,7 +98,6 @@ export const fakeRunSummaries: RunSummary[] = [ }, // 1 day ago totalTokens: 50000n, size: 500000000n, - arch: 'HfLlama', type: 'text', pauseHistory: [], lastUpdate: { @@ -117,7 +114,6 @@ export const fakeRunSummaries: RunSummary[] = [ status: { type: 'active' }, totalTokens: 100000n, size: 1000000000n, - arch: 'HfLlama', type: 'vision', pauseHistory: [], lastUpdate: { @@ -392,15 +388,6 @@ function makeFakeRunDataSeeded(seed = 1, step = 0, index = 0): RunData { roundWitnessTime: 2_000, minClients, epochTime, - lrSchedule: { - Cosine: { - base_lr: 4.0e-4, - warmup_steps: 500, - warmup_init_lr: 0.0, - total_steps: 25000, - final_lr: 4.0e-5, - }, - }, }, }, } From 15e4a557afc8c6febe0092d2372fb5cc0ff92319 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Thu, 29 Jan 2026 13:34:32 -0300 Subject: [PATCH 14/22] Add default to version field --- config/solana-test/light-config.toml | 1 + config/solana-test/nano-config.toml | 1 + shared/coordinator/src/external_config.rs | 1 + 3 files changed, 3 insertions(+) diff --git a/config/solana-test/light-config.toml b/config/solana-test/light-config.toml index 19c2153f6..a9d5b4d1f 100644 --- a/config/solana-test/light-config.toml +++ b/config/solana-test/light-config.toml @@ -22,6 +22,7 @@ cold_start_warmup_steps = 0 repo_id = "emozilla/llama2-20m-init" [external_config] +version = 1 architecture = "HfLlama" data_type = "Pretraining" diff --git a/config/solana-test/nano-config.toml b/config/solana-test/nano-config.toml index 4ed67eec8..73de52bd0 100644 --- a/config/solana-test/nano-config.toml +++ b/config/solana-test/nano-config.toml @@ -23,6 +23,7 @@ repo_id = "pefontana/Nano-Llama" revision = "cf48eac4944f6e954a3d9c9c30e8c865e64e7d03" [external_config] +version = 1 architecture = "HfLlama" data_type = "Pretraining" diff --git a/shared/coordinator/src/external_config.rs b/shared/coordinator/src/external_config.rs index fc6c17af6..6c76b3eeb 100644 --- a/shared/coordinator/src/external_config.rs +++ 
b/shared/coordinator/src/external_config.rs @@ -12,6 +12,7 @@ pub const MODEL_CONFIG_FILENAME: &str = "model_config.json"; /// This is stored in an off-chain storage and fetched by clients. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ExternalModelConfig { + #[serde(default)] pub version: u32, pub architecture: LLMArchitecture, From b77447cfc239f028df816874c0c60773bf1e158e Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Thu, 29 Jan 2026 09:56:21 -0800 Subject: [PATCH 15/22] Add flag to skip uploading metadata only for testing purposes --- docker/psyche_client_create_run.sh | 3 +- docker/test/run_owner_entrypoint.sh | 3 +- scripts/create-permissionless-run.sh | 3 +- scripts/setup-test-run.sh | 3 +- shared/client/src/state/init.rs | 4 +- .../src/commands/run/update_config.rs | 81 ++++++++++--------- 6 files changed, 53 insertions(+), 44 deletions(-) diff --git a/docker/psyche_client_create_run.sh b/docker/psyche_client_create_run.sh index 9fc6b177a..b0e291372 100755 --- a/docker/psyche_client_create_run.sh +++ b/docker/psyche_client_create_run.sh @@ -65,7 +65,8 @@ cargo run --release --bin run-manager -- \ --rpc ${RPC} \ --ws-rpc ${WS_RPC} \ --run-id ${RUN_ID} \ - --config-path "${CONFIG_PATH}" + --config-path "${CONFIG_PATH}" \ + --skip-upload-external-config echo -e "\n[+] Model config uploaded successfully" diff --git a/docker/test/run_owner_entrypoint.sh b/docker/test/run_owner_entrypoint.sh index 9d6fc9db2..b863985cd 100644 --- a/docker/test/run_owner_entrypoint.sh +++ b/docker/test/run_owner_entrypoint.sh @@ -25,7 +25,8 @@ run-manager update-config \ --rpc "${RPC}" \ --ws-rpc "${WS_RPC}" \ --run-id "${RUN_ID}" \ - --config-path "/usr/local/config.toml" + --config-path "/usr/local/config.toml" \ + --skip-upload-external-config run-manager set-paused \ --wallet-private-key-path ${WALLET_FILE} \ diff --git a/scripts/create-permissionless-run.sh b/scripts/create-permissionless-run.sh index 582974fff..f7e4bf0a7 100755 --- a/scripts/create-permissionless-run.sh +++ b/scripts/create-permissionless-run.sh @@ -81,7 +81,8 @@ cargo run --release --bin run-manager -- \ --run-id ${RUN_ID} \ --config-path ${CONFIG_FILE} \ --num-parameters 1100000000 \ - --vocab-size 32768 + --vocab-size 32768 \ + --skip-upload-external-config echo -e "\n[+] Unpause the training run..." cargo run --release --bin run-manager -- \ diff --git a/scripts/setup-test-run.sh b/scripts/setup-test-run.sh index ae0407b29..d870c45e2 100755 --- a/scripts/setup-test-run.sh +++ b/scripts/setup-test-run.sh @@ -55,7 +55,8 @@ cargo run --release --bin run-manager -- \ --rpc "${RPC}" \ --ws-rpc "${WS_RPC}" \ --run-id "${RUN_ID}" \ - --config-path "config/solana-test/test-config.toml" + --config-path "config/solana-test/test-config.toml" \ + --skip-upload-external-config echo "[+] Unpausing run..." 
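The `#[serde(default)]` added in patch 14 above is what lets clients deserialize older `model_config.json` blobs uploaded before the `version` field existed. A standalone sketch of that behavior; `MiniConfig` here is a stand-in, not the real `ExternalModelConfig`:

```rust
// Sketch of the #[serde(default)] behavior patch 14 relies on.
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct MiniConfig {
    // A missing `version` key becomes u32::default() instead of a hard error.
    #[serde(default)]
    version: u32,
    architecture: String,
}

fn main() -> Result<(), serde_json::Error> {
    // Older blobs uploaded before the field existed have no `version` key.
    let old = r#"{ "architecture": "HfLlama" }"#;
    let cfg: MiniConfig = serde_json::from_str(old)?;
    assert_eq!(cfg.version, 0); // falls back to u32::default()
    println!("arch = {}, version = {}", cfg.architecture, cfg.version);
    Ok(())
}
```

Note that the fallback is `0`, while the checked-in TOML configs pin `version = 1` explicitly.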
cargo run --release --bin run-manager -- \ diff --git a/shared/client/src/state/init.rs b/shared/client/src/state/init.rs index a8ef7a48e..87217b479 100644 --- a/shared/client/src/state/init.rs +++ b/shared/client/src/state/init.rs @@ -226,11 +226,11 @@ impl RunInitConfigAndIO config, Err(e) => { - debug!( + error!( "Failed to fetch external config from Hub ({}), using default: {}", repo_id, e ); - ExternalModelConfig::default() + return Err(InitRunError::GcsModelLoad(e)); } } } diff --git a/tools/rust-tools/run-manager/src/commands/run/update_config.rs b/tools/rust-tools/run-manager/src/commands/run/update_config.rs index ad540d081..5acd600c3 100644 --- a/tools/rust-tools/run-manager/src/commands/run/update_config.rs +++ b/tools/rust-tools/run-manager/src/commands/run/update_config.rs @@ -44,6 +44,8 @@ pub struct CommandUpdateConfig { // end metadata #[clap(long, env)] pub client_version: Option, + #[clap(long, default_value_t = false, hide = true)] + pub skip_upload_external_config: bool, } #[async_trait] @@ -60,6 +62,7 @@ impl Command for CommandUpdateConfig { num_parameters, vocab_size, client_version, + skip_upload_external_config, } = self; let main_authority = backend.get_payer(); @@ -145,50 +148,52 @@ impl Command for CommandUpdateConfig { } // Upload external config to GCS or hub repo depending of the model checkpoint - if let Some(external_config) = external_config { - let Model::LLM(llm) = &coordinator_account_state.state.coordinator.model; - match llm.checkpoint { - Checkpoint::Gcs(ref gcs_repo) | Checkpoint::P2PGcs(ref gcs_repo) => { - let bucket = gcs_repo.bucket.to_string(); - let path = format!("{}/{}", CONFIG_PREFIX, MODEL_CONFIG_FILENAME); - info!("Uploading external config to gs://{}/{}", bucket, path); - upload_json_to_gcs(&bucket, &path, &external_config) + if !skip_upload_external_config { + if let Some(external_config) = external_config { + let Model::LLM(llm) = &coordinator_account_state.state.coordinator.model; + match llm.checkpoint { + Checkpoint::Gcs(ref gcs_repo) | Checkpoint::P2PGcs(ref gcs_repo) => { + let bucket = gcs_repo.bucket.to_string(); + let path = format!("{}/{}", CONFIG_PREFIX, MODEL_CONFIG_FILENAME); + info!("Uploading external config to gs://{}/{}", bucket, path); + upload_json_to_gcs(&bucket, &path, &external_config) + .await + .with_context(|| { + format!( + "failed to upload external config to gs://{}/{}", + bucket, path + ) + })?; + println!("Uploaded external config to gs://{}/{}", bucket, path); + } + Checkpoint::Hub(ref hub_repo) + | Checkpoint::P2P(ref hub_repo) + | Checkpoint::Dummy(ref hub_repo) => { + let repo_id = hub_repo.repo_id.to_string(); + psyche_data_provider::upload_extra_config_to_hub( + &repo_id, + "model_config.json", + &external_config, + None, + None, + ) .await .with_context(|| { format!( - "failed to upload external config to gs://{}/{}", - bucket, path + "failed to upload external config to Hub repo {}/external_config.json", + repo_id ) })?; - println!("Uploaded external config to gs://{}/{}", bucket, path); - } - Checkpoint::Hub(ref hub_repo) - | Checkpoint::P2P(ref hub_repo) - | Checkpoint::Dummy(ref hub_repo) => { - let repo_id = hub_repo.repo_id.to_string(); - psyche_data_provider::upload_extra_config_to_hub( - &repo_id, - "model_config.json", - &external_config, - None, - None, - ) - .await - .with_context(|| { - format!( - "failed to upload external config to Hub repo {}/external_config.json", + println!( + "Uploaded external config to Hub repo {}/external_config.json", repo_id - ) - })?; - println!( - "Uploaded 
external config to Hub repo {}/external_config.json", - repo_id - ); - } - _ => { - println!( - "Warning: external_config provided but checkpoint is not GCS- or Hub-based, skipping upload" - ); + ); + } + _ => { + println!( + "Warning: external_config provided but checkpoint is not GCS- or Hub-based, skipping upload" + ); + } } } } From db15a63bf8dbd672e1f1ab61fbf1622801596b68 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Thu, 29 Jan 2026 15:04:42 -0300 Subject: [PATCH 16/22] Add hub token to upload external config --- shared/client/src/state/init.rs | 8 +++---- .../src/commands/run/update_config.rs | 23 ++++++++++--------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/shared/client/src/state/init.rs b/shared/client/src/state/init.rs index 87217b479..da2231731 100644 --- a/shared/client/src/state/init.rs +++ b/shared/client/src/state/init.rs @@ -212,14 +212,12 @@ impl RunInitConfigAndIO { let repo_id: String = (&repo.repo_id).into(); let revision = repo.revision.map(|bytes| (&bytes).into()); - debug!( - "Fetching external config from Hub: {}/{}", - repo_id, MODEL_CONFIG_FILENAME - ); + let path = format!("{}/{}", CONFIG_PREFIX, MODEL_CONFIG_FILENAME); + debug!("Fetching external config from Hub: {}/{}", repo_id, path); match fetch_json_from_hub( &repo_id, revision, - MODEL_CONFIG_FILENAME, + &path, init_config.hub_read_token.clone(), ) .await diff --git a/tools/rust-tools/run-manager/src/commands/run/update_config.rs b/tools/rust-tools/run-manager/src/commands/run/update_config.rs index 5acd600c3..1ccdc608e 100644 --- a/tools/rust-tools/run-manager/src/commands/run/update_config.rs +++ b/tools/rust-tools/run-manager/src/commands/run/update_config.rs @@ -46,6 +46,10 @@ pub struct CommandUpdateConfig { pub client_version: Option, #[clap(long, default_value_t = false, hide = true)] pub skip_upload_external_config: bool, + + /// HuggingFace token for uploading to Hub repos (can also use HF_TOKEN env var) + #[clap(long, env = "HF_TOKEN")] + pub hub_token: Option, } #[async_trait] @@ -63,6 +67,7 @@ impl Command for CommandUpdateConfig { vocab_size, client_version, skip_upload_external_config, + hub_token, } = self; let main_authority = backend.get_payer(); @@ -166,28 +171,24 @@ impl Command for CommandUpdateConfig { })?; println!("Uploaded external config to gs://{}/{}", bucket, path); } - Checkpoint::Hub(ref hub_repo) - | Checkpoint::P2P(ref hub_repo) - | Checkpoint::Dummy(ref hub_repo) => { + Checkpoint::Hub(ref hub_repo) | Checkpoint::P2P(ref hub_repo) => { let repo_id = hub_repo.repo_id.to_string(); + let path = format!("{}/{}", CONFIG_PREFIX, MODEL_CONFIG_FILENAME); psyche_data_provider::upload_extra_config_to_hub( &repo_id, - "model_config.json", + &path, &external_config, - None, + hub_token.clone(), None, ) .await .with_context(|| { format!( - "failed to upload external config to Hub repo {}/external_config.json", - repo_id + "failed to upload external config to Hub repo {}/{}", + repo_id, path ) })?; - println!( - "Uploaded external config to Hub repo {}/external_config.json", - repo_id - ); + println!("Uploaded external config to Hub repo {}/{}", repo_id, path); } _ => { println!( From 92fe2522d6868159323bf637230fa99b3b7cf682 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Thu, 29 Jan 2026 15:49:13 -0300 Subject: [PATCH 17/22] Be able to use a local config to override external config and avoid fetching for testing --- Cargo.lock | 1 + architectures/centralized/client/src/app.rs | 5 +- .../decentralized/solana-client/src/app.rs | 5 +- 
docker/test/client_test_entrypoint.sh | 2 + scripts/train-solana-test.sh | 2 + shared/client/Cargo.toml | 1 + shared/client/src/cli.rs | 34 ++++++++- shared/client/src/lib.rs | 4 +- shared/client/src/state/init.rs | 73 +++++++++++-------- shared/client/src/state/mod.rs | 2 +- 10 files changed, 93 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ef66cf80f..d81f5dd93 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6968,6 +6968,7 @@ dependencies = [ "tokenizers", "tokio", "tokio-util 0.7.16", + "toml 0.8.23", "tracing", "wandb", ] diff --git a/architectures/centralized/client/src/app.rs b/architectures/centralized/client/src/app.rs index 8f6a9bcca..706e36d78 100644 --- a/architectures/centralized/client/src/app.rs +++ b/architectures/centralized/client/src/app.rs @@ -5,7 +5,8 @@ use psyche_centralized_shared::{ClientId, ClientToServerMessage, ServerToClientM use psyche_client::HubUploadInfo; use psyche_client::UploadInfo; use psyche_client::{ - Client, ClientTUI, ClientTUIState, NC, RunInitConfig, TrainArgs, read_identity_secret_key, + Client, ClientTUI, ClientTUIState, ExternalModelConfig, NC, RunInitConfig, TrainArgs, + read_identity_secret_key, }; use psyche_coordinator::{Coordinator, HealthChecks, model}; use psyche_metrics::ClientMetrics; @@ -110,6 +111,7 @@ pub async fn build_app( let hub_read_token = std::env::var("HF_TOKEN").ok(); let eval_tasks = p.eval_tasks()?; let checkpoint_config = p.checkpoint_config()?; + let external_config_override: Option = p.external_config_override()?; let wandb_info = p.wandb_info(format!( "{}-{}", p.run_id.clone(), @@ -153,6 +155,7 @@ pub async fn build_app( max_concurrent_parameter_requests: p.max_concurrent_parameter_requests, device: p.device, sidecar_port: p.sidecar_port, + external_config_override, }; let app = App { cancel, diff --git a/architectures/decentralized/solana-client/src/app.rs b/architectures/decentralized/solana-client/src/app.rs index 36a529bbb..27fbd86fc 100644 --- a/architectures/decentralized/solana-client/src/app.rs +++ b/architectures/decentralized/solana-client/src/app.rs @@ -11,7 +11,8 @@ use anchor_client::{ }; use anyhow::{Result, anyhow}; use psyche_client::{ - Client, ClientTUI, ClientTUIState, NC, RunInitConfig, TrainArgs, read_identity_secret_key, + Client, ClientTUI, ClientTUIState, ExternalModelConfig, NC, RunInitConfig, TrainArgs, + read_identity_secret_key, }; use psyche_coordinator::{ClientState, Coordinator, CoordinatorError, RunState}; use psyche_core::sha256; @@ -90,6 +91,7 @@ pub async fn build_app( let eval_tasks = p.eval_tasks()?; let hub_read_token = std::env::var("HF_TOKEN").ok(); let checkpoint_config = p.checkpoint_config()?; + let external_config_override: Option = p.external_config_override()?; let solana_pubkey = wallet_keypair.pubkey(); let wandb_info = p.wandb_info(format!("{}-{solana_pubkey}", p.run_id))?; @@ -134,6 +136,7 @@ pub async fn build_app( max_concurrent_parameter_requests: p.max_concurrent_parameter_requests, device: p.device, sidecar_port: p.sidecar_port, + external_config_override, }; let app = App { run_id: p.run_id.clone(), diff --git a/docker/test/client_test_entrypoint.sh b/docker/test/client_test_entrypoint.sh index 64e739841..74f35b611 100644 --- a/docker/test/client_test_entrypoint.sh +++ b/docker/test/client_test_entrypoint.sh @@ -18,6 +18,7 @@ if [ "${PYTHON_ENABLED}" = "true" ]; then --run-id "${RUN_ID}" \ --data-parallelism 8 \ --sidecar-port "${SIDECAR_PORT}" \ + --external-config-toml "/usr/local/config.toml" \ --logs "json" else echo "Starting client 
without Python features" @@ -26,5 +27,6 @@ else --rpc "${RPC}" \ --ws-rpc "${WS_RPC}" \ --run-id "${RUN_ID}" \ + --external-config-toml "/usr/local/config.toml" \ --logs "json" fi diff --git a/scripts/train-solana-test.sh b/scripts/train-solana-test.sh index 5794cfdc4..01d04f551 100755 --- a/scripts/train-solana-test.sh +++ b/scripts/train-solana-test.sh @@ -56,6 +56,7 @@ if [[ "$OTLP_METRICS_URL" == "" ]]; then --micro-batch-size ${BATCH_SIZE} \ --authorizer ${AUTHORIZER} \ --logs "console" \ + --external-config-toml ./config/solana-test/light-config.toml \ ${CHECKPOINT_ARGS[@]} \ "$@" else @@ -72,6 +73,7 @@ else --authorizer ${AUTHORIZER} \ --oltp-metrics-url "http://localhost:4318/v1/metrics" \ --oltp-logs-url "http://localhost:4318/v1/logs" \ + --external-config-toml ./config/solana-test/light-config.toml \ ${CHECKPOINT_ARGS[@]} \ "$@" fi diff --git a/shared/client/Cargo.toml b/shared/client/Cargo.toml index 5f7159d81..1249c71d6 100644 --- a/shared/client/Cargo.toml +++ b/shared/client/Cargo.toml @@ -18,6 +18,7 @@ postcard.workspace = true anyhow.workspace = true serde.workspace = true serde_json.workspace = true +toml.workspace = true tch.workspace = true tokenizers.workspace = true tokio.workspace = true diff --git a/shared/client/src/cli.rs b/shared/client/src/cli.rs index a4ef145f0..085e58984 100644 --- a/shared/client/src/cli.rs +++ b/shared/client/src/cli.rs @@ -1,8 +1,9 @@ use crate::{CheckpointConfig, WandBInfo}; use crate::UploadInfo; -use anyhow::{Result, anyhow, bail}; +use anyhow::{Context, Result, anyhow, bail}; use clap::Args; +use psyche_coordinator::external_config::ExternalModelConfig; use psyche_data_provider::{GcsUploadInfo, HubUploadInfo}; use psyche_eval::tasktype_from_name; use psyche_modeling::Devices; @@ -204,6 +205,11 @@ pub struct TrainArgs { #[clap(long, default_value_t = 3, env)] pub keep_steps: u32, + + /// Path to a TOML config file. If provided, uses this config instead of fetching from the remote repo. + /// Only meant for testing/debugging. 
+    #[clap(long, env, hide = true)]
+    pub external_config_toml: Option<PathBuf>,
 }
 
 impl TrainArgs {
@@ -327,6 +333,32 @@ impl TrainArgs {
             .collect();
         result
     }
+
+    pub fn external_config_override(&self) -> Result<Option<ExternalModelConfig>> {
+        let Some(path) = &self.external_config_toml else {
+            return Ok(None);
+        };
+
+        let content = std::fs::read_to_string(path)
+            .with_context(|| format!("failed to read external config TOML file {:?}", path))?;
+
+        let toml_value: toml::Value = toml::from_str(&content)
+            .with_context(|| format!("failed to parse TOML file {:?}", path))?;
+
+        let external_config_table = toml_value
+            .get("external_config")
+            .ok_or_else(|| anyhow::anyhow!("missing [external_config] section in {:?}", path))?;
+
+        let config: ExternalModelConfig =
+            external_config_table.clone().try_into().with_context(|| {
+                format!(
+                    "failed to deserialize external_config from TOML file {:?}",
+                    path
+                )
+            })?;
+
+        Ok(Some(config))
+    }
 }
 
 pub fn prepare_environment() {
diff --git a/shared/client/src/lib.rs b/shared/client/src/lib.rs
index bdad43e30..ed52c5a2e 100644
--- a/shared/client/src/lib.rs
+++ b/shared/client/src/lib.rs
@@ -9,8 +9,8 @@ pub use cli::{TrainArgs, prepare_environment, print_identity_keys, read_identity
 pub use client::Client;
 pub use protocol::{Broadcast, BroadcastType, Finished, NC, TrainingResult};
 pub use state::{
-    CheckpointConfig, GcsUploadInfo, HubUploadInfo, InitRunError, RoundState, RunInitConfig,
-    RunInitConfigAndIO, UploadInfo,
+    CheckpointConfig, ExternalModelConfig, GcsUploadInfo, HubUploadInfo, InitRunError, RoundState,
+    RunInitConfig, RunInitConfigAndIO, UploadInfo,
 };
 pub use tui::{ClientTUI, ClientTUIState};
 
diff --git a/shared/client/src/state/init.rs b/shared/client/src/state/init.rs
index da2231731..4a9af15c3 100644
--- a/shared/client/src/state/init.rs
+++ b/shared/client/src/state/init.rs
@@ -1,7 +1,8 @@
 use crate::{WandBInfo, fetch_data::DataFetcher};
+pub use psyche_coordinator::external_config::ExternalModelConfig;
 use psyche_coordinator::{
     Coordinator, HealthChecks,
-    external_config::{CONFIG_PREFIX, ExternalModelConfig, MODEL_CONFIG_FILENAME},
+    external_config::{CONFIG_PREFIX, MODEL_CONFIG_FILENAME},
     model::{self, HttpLLMTrainingDataLocation, LLMTrainingDataLocation},
 };
 use psyche_core::{
     Barrier, CancellableBarrier, IntegrationTestLogMarker, NodeIdentity, Shuffle, TokenSize,
@@ -77,6 +78,10 @@ pub struct RunInitConfig {
     pub dummy_training_delay_secs: Option<u64>,
 
     pub sidecar_port: Option<u16>,
+
+    /// If provided, use this external config instead of fetching from GCS/Hub.
+    /// Only meant for testing/debugging.
+    pub external_config_override: Option<ExternalModelConfig>,
 }
 
 #[derive(Debug, Error)]
@@ -201,38 +206,46 @@ impl RunInitConfigAndIO
-        let external_config: ExternalModelConfig = match llm.checkpoint {
-            model::Checkpoint::Gcs(gcs_repo) | model::Checkpoint::P2PGcs(gcs_repo) => {
-                let bucket = gcs_repo.bucket.to_string();
-                let path = format!("{}/{}", CONFIG_PREFIX, MODEL_CONFIG_FILENAME);
-                debug!("Fetching external config from gs://{}/{}", bucket, path);
-                fetch_json_from_gcs(&bucket, &path).await?
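The `external_config_override` helper above pulls just the `[external_config]` table out of the full run config TOML. A minimal sketch of that extraction pattern, using `toml::Value::get` plus `try_into` and a stand-in struct in place of the real `ExternalModelConfig`:

```rust
// Sketch of table extraction from a larger run config TOML.
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct MiniExternal {
    architecture: String,
    data_type: String,
}

fn main() -> anyhow::Result<()> {
    let doc = r#"
        [config]
        total_steps = 25000

        [external_config]
        architecture = "HfLlama"
        data_type = "Pretraining"
    "#;

    // Parse the whole document, then deserialize only the one table we need.
    let value: toml::Value = toml::from_str(doc)?;
    let table = value
        .get("external_config")
        .ok_or_else(|| anyhow::anyhow!("missing [external_config] section"))?;
    let cfg: MiniExternal = table.clone().try_into()?;
    assert_eq!(cfg.architecture, "HfLlama");
    println!("arch = {}, data_type = {}", cfg.architecture, cfg.data_type);
    Ok(())
}
```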
- } - model::Checkpoint::Hub(repo) | model::Checkpoint::P2P(repo) => { - let repo_id: String = (&repo.repo_id).into(); - let revision = repo.revision.map(|bytes| (&bytes).into()); - let path = format!("{}/{}", CONFIG_PREFIX, MODEL_CONFIG_FILENAME); - debug!("Fetching external config from Hub: {}/{}", repo_id, path); - match fetch_json_from_hub( - &repo_id, - revision, - &path, - init_config.hub_read_token.clone(), - ) - .await - { - Ok(config) => config, - Err(e) => { - error!( - "Failed to fetch external config from Hub ({}), using default: {}", - repo_id, e - ); - return Err(InitRunError::GcsModelLoad(e)); + // Use external config override if provided (only meant for testing/debugging), + // otherwise fetch from GCS/Hub + let external_config: ExternalModelConfig = if let Some(config) = + init_config.external_config_override.clone() + { + info!("Using external config override from CLI"); + config + } else { + match llm.checkpoint { + model::Checkpoint::Gcs(gcs_repo) | model::Checkpoint::P2PGcs(gcs_repo) => { + let bucket = gcs_repo.bucket.to_string(); + let path = format!("{}/{}", CONFIG_PREFIX, MODEL_CONFIG_FILENAME); + debug!("Fetching external config from gs://{}/{}", bucket, path); + fetch_json_from_gcs(&bucket, &path).await? + } + model::Checkpoint::Hub(repo) | model::Checkpoint::P2P(repo) => { + let repo_id: String = (&repo.repo_id).into(); + let revision = repo.revision.map(|bytes| (&bytes).into()); + let path = format!("{}/{}", CONFIG_PREFIX, MODEL_CONFIG_FILENAME); + debug!("Fetching external config from Hub: {}/{}", repo_id, path); + match fetch_json_from_hub( + &repo_id, + revision, + &path, + init_config.hub_read_token.clone(), + ) + .await + { + Ok(config) => config, + Err(e) => { + error!( + "Failed to fetch external config from Hub ({}), using default: {}", + repo_id, e + ); + return Err(InitRunError::GcsModelLoad(e)); + } } } + _ => ExternalModelConfig::default(), } - _ => ExternalModelConfig::default(), }; let hub_read_token = init_config.hub_read_token.clone(); diff --git a/shared/client/src/state/mod.rs b/shared/client/src/state/mod.rs index 78e6cd1eb..6932edd6e 100644 --- a/shared/client/src/state/mod.rs +++ b/shared/client/src/state/mod.rs @@ -13,7 +13,7 @@ mod train; mod warmup; mod witness; -pub use init::{InitRunError, RunInitConfig, RunInitConfigAndIO}; +pub use init::{ExternalModelConfig, InitRunError, RunInitConfig, RunInitConfigAndIO}; pub use psyche_data_provider::{GcsUploadInfo, HubUploadInfo}; pub use round_state::RoundState; pub use steps::{ApplyMessageOutcome, RunManager}; From f962b968fa19126e37a2541a3387953de1489d83 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Thu, 29 Jan 2026 16:48:58 -0300 Subject: [PATCH 18/22] Fix config mount on docker container test --- .../decentralized/testing/src/docker_setup.rs | 22 ++++++++++++++++++- docker/test/docker-compose.yml | 2 ++ .../three_clients_test/docker-compose.yml | 6 +++++ nix/docker.nix | 2 -- 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/architectures/decentralized/testing/src/docker_setup.rs b/architectures/decentralized/testing/src/docker_setup.rs index 4f5a80dd9..6d0f09abd 100644 --- a/architectures/decentralized/testing/src/docker_setup.rs +++ b/architectures/decentralized/testing/src/docker_setup.rs @@ -4,7 +4,7 @@ use bollard::{ Config, CreateContainerOptions, KillContainerOptions, ListContainersOptions, RemoveContainerOptions, }, - models::DeviceRequest, + models::{DeviceRequest, Mount, MountTypeEnum}, secret::{ContainerSummary, HostConfig}, }; use 
psyche_core::IntegrationTestLogMarker; @@ -119,6 +119,24 @@ pub async fn spawn_new_client(docker_client: Arc) -> Result) -> Result Date: Fri, 30 Jan 2026 17:49:38 -0300 Subject: [PATCH 19/22] Rename extra config ExternalModelConfig to ModelExtraData --- architectures/centralized/client/src/app.rs | 6 +- .../decentralized/solana-client/src/app.rs | 6 +- .../decentralized/testing/src/utils.rs | 2 +- config/solana-test/light-config.toml | 10 +-- config/solana-test/nano-config.toml | 10 +-- shared/client/src/cli.rs | 22 +++---- shared/client/src/lib.rs | 2 +- shared/client/src/state/init.rs | 62 +++++++++---------- shared/client/src/state/mod.rs | 2 +- shared/coordinator/src/lib.rs | 2 +- ...external_config.rs => model_extra_data.rs} | 9 ++- shared/data-provider/src/hub.rs | 10 +-- shared/data-provider/src/lib.rs | 2 +- tools/rust-tools/preview-lr/src/main.rs | 6 +- .../src/commands/run/update_config.rs | 39 ++++++------ website/wasm/src/lib.rs | 2 +- 16 files changed, 95 insertions(+), 97 deletions(-) rename shared/coordinator/src/{external_config.rs => model_extra_data.rs} (93%) diff --git a/architectures/centralized/client/src/app.rs b/architectures/centralized/client/src/app.rs index 706e36d78..aab6e66b9 100644 --- a/architectures/centralized/client/src/app.rs +++ b/architectures/centralized/client/src/app.rs @@ -5,7 +5,7 @@ use psyche_centralized_shared::{ClientId, ClientToServerMessage, ServerToClientM use psyche_client::HubUploadInfo; use psyche_client::UploadInfo; use psyche_client::{ - Client, ClientTUI, ClientTUIState, ExternalModelConfig, NC, RunInitConfig, TrainArgs, + Client, ClientTUI, ClientTUIState, ModelExtraData, NC, RunInitConfig, TrainArgs, read_identity_secret_key, }; use psyche_coordinator::{Coordinator, HealthChecks, model}; @@ -111,7 +111,7 @@ pub async fn build_app( let hub_read_token = std::env::var("HF_TOKEN").ok(); let eval_tasks = p.eval_tasks()?; let checkpoint_config = p.checkpoint_config()?; - let external_config_override: Option = p.external_config_override()?; + let model_extra_data_override: Option = p.model_extra_data_override()?; let wandb_info = p.wandb_info(format!( "{}-{}", p.run_id.clone(), @@ -155,7 +155,7 @@ pub async fn build_app( max_concurrent_parameter_requests: p.max_concurrent_parameter_requests, device: p.device, sidecar_port: p.sidecar_port, - external_config_override, + model_extra_data_override, }; let app = App { cancel, diff --git a/architectures/decentralized/solana-client/src/app.rs b/architectures/decentralized/solana-client/src/app.rs index 27fbd86fc..dfb242cb0 100644 --- a/architectures/decentralized/solana-client/src/app.rs +++ b/architectures/decentralized/solana-client/src/app.rs @@ -11,7 +11,7 @@ use anchor_client::{ }; use anyhow::{Result, anyhow}; use psyche_client::{ - Client, ClientTUI, ClientTUIState, ExternalModelConfig, NC, RunInitConfig, TrainArgs, + Client, ClientTUI, ClientTUIState, ModelExtraData, NC, RunInitConfig, TrainArgs, read_identity_secret_key, }; use psyche_coordinator::{ClientState, Coordinator, CoordinatorError, RunState}; @@ -91,7 +91,7 @@ pub async fn build_app( let eval_tasks = p.eval_tasks()?; let hub_read_token = std::env::var("HF_TOKEN").ok(); let checkpoint_config = p.checkpoint_config()?; - let external_config_override: Option = p.external_config_override()?; + let model_extra_data_override: Option = p.model_extra_data_override()?; let solana_pubkey = wallet_keypair.pubkey(); let wandb_info = p.wandb_info(format!("{}-{solana_pubkey}", p.run_id))?; @@ -136,7 +136,7 @@ pub async fn build_app( 
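The `Mount` construction in the `spawn_new_client` hunk above did not survive extraction intact. The following is a hedged reconstruction of a bollard bind mount for the test config, not the exact code from patch 18: the host source path and the `read_only` choice are assumptions, while the `/usr/local/config.toml` target matches the test entrypoints in this series:

```rust
// Hedged sketch of a bollard bind mount for the config file.
use bollard::{
    models::{Mount, MountTypeEnum},
    secret::HostConfig,
};

fn config_host_config() -> HostConfig {
    let config_mount = Mount {
        source: Some("/tmp/test-config.toml".to_string()), // host path (assumed)
        target: Some("/usr/local/config.toml".to_string()), // matches the entrypoints
        typ: Some(MountTypeEnum::BIND),
        read_only: Some(true), // assumption; the real patch may mount read-write
        ..Default::default()
    };
    HostConfig {
        mounts: Some(vec![config_mount]),
        ..Default::default()
    }
}
```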
max_concurrent_parameter_requests: p.max_concurrent_parameter_requests, device: p.device, sidecar_port: p.sidecar_port, - external_config_override, + model_extra_data_override, }; let app = App { run_id: p.run_id.clone(), diff --git a/architectures/decentralized/testing/src/utils.rs b/architectures/decentralized/testing/src/utils.rs index 4b3f92c58..71fa07abe 100644 --- a/architectures/decentralized/testing/src/utils.rs +++ b/architectures/decentralized/testing/src/utils.rs @@ -183,7 +183,7 @@ impl ConfigBuilder { // This means that every client is a witness self.set_value("config.witness_nodes", 0_u32); - self.set_value("external_config.architecture", self.architecture.clone()); + self.set_value("model_extra_data.architecture", self.architecture.clone()); self.set_value("config.global_batch_size_start", self.batch_size); self.set_value("config.global_batch_size_end", self.batch_size); diff --git a/config/solana-test/light-config.toml b/config/solana-test/light-config.toml index a9d5b4d1f..6d244fb9f 100644 --- a/config/solana-test/light-config.toml +++ b/config/solana-test/light-config.toml @@ -21,27 +21,27 @@ cold_start_warmup_steps = 0 [model.LLM.checkpoint.Hub] repo_id = "emozilla/llama2-20m-init" -[external_config] +[model_extra_data] version = 1 architecture = "HfLlama" data_type = "Pretraining" -[external_config.data_location.Http] +[model_extra_data.data_location.Http] token_size_in_bytes = "TwoBytes" shuffle = "DontShuffle" -[external_config.data_location.Http.location.Gcp] +[model_extra_data.data_location.Http.location.Gcp] bucket_name = "nous-pretraining-public-us" filter_directory = "fineweb-edu-tokenized-llama2" -[external_config.lr_schedule.Cosine] +[model_extra_data.lr_schedule.Cosine] base_lr = 4.0e-4 warmup_steps = 250 warmup_init_lr = 0.0 total_steps = 25000 final_lr = 4.0e-5 -[external_config.optimizer.Distro] +[model_extra_data.optimizer.Distro] clip_grad_norm = 1.0 compression_decay = 0.999 compression_chunk = 64 diff --git a/config/solana-test/nano-config.toml b/config/solana-test/nano-config.toml index 73de52bd0..7e3636eb9 100644 --- a/config/solana-test/nano-config.toml +++ b/config/solana-test/nano-config.toml @@ -22,26 +22,26 @@ cold_start_warmup_steps = 0 repo_id = "pefontana/Nano-Llama" revision = "cf48eac4944f6e954a3d9c9c30e8c865e64e7d03" -[external_config] +[model_extra_data] version = 1 architecture = "HfLlama" data_type = "Pretraining" -[external_config.data_location.Http] +[model_extra_data.data_location.Http] token_size_in_bytes = "TwoBytes" shuffle = "DontShuffle" -[external_config.data_location.Http.location] +[model_extra_data.data_location.Http.location] SingleUrl = "https://huggingface.co/pefontana/Nano-Llama/resolve/main/tiny-ci-dataset/000_tiny-test.ds" -[external_config.lr_schedule.Cosine] +[model_extra_data.lr_schedule.Cosine] base_lr = 4.0e-4 warmup_steps = 250 warmup_init_lr = 0.0 total_steps = 25000 final_lr = 4.0e-5 -[external_config.optimizer.Distro] +[model_extra_data.optimizer.Distro] clip_grad_norm = 1.0 compression_decay = 0.999 compression_chunk = 64 diff --git a/shared/client/src/cli.rs b/shared/client/src/cli.rs index 085e58984..32b4cddc0 100644 --- a/shared/client/src/cli.rs +++ b/shared/client/src/cli.rs @@ -3,7 +3,7 @@ use crate::{CheckpointConfig, WandBInfo}; use crate::UploadInfo; use anyhow::{Context, Result, anyhow, bail}; use clap::Args; -use psyche_coordinator::external_config::ExternalModelConfig; +use psyche_coordinator::model_extra_data::ModelExtraData; use psyche_data_provider::{GcsUploadInfo, HubUploadInfo}; use 
psyche_eval::tasktype_from_name; use psyche_modeling::Devices; @@ -209,7 +209,7 @@ pub struct TrainArgs { /// Path to a TOML config file. If provided, uses this config instead of fetching from the remote repo. /// Only meant for testing/debugging. #[clap(long, env, hide = true)] - pub external_config_toml: Option, + pub model_extra_data_toml: Option, } impl TrainArgs { @@ -334,25 +334,25 @@ impl TrainArgs { result } - pub fn external_config_override(&self) -> Result> { - let Some(path) = &self.external_config_toml else { + pub fn model_extra_data_override(&self) -> Result> { + let Some(path) = &self.model_extra_data_toml else { return Ok(None); }; let content = std::fs::read_to_string(path) - .with_context(|| format!("failed to read external config TOML file {:?}", path))?; + .with_context(|| format!("failed to read model extra data TOML file {:?}", path))?; let toml_value: toml::Value = toml::from_str(&content) .with_context(|| format!("failed to parse TOML file {:?}", path))?; - let external_config_table = toml_value - .get("external_config") - .ok_or_else(|| anyhow::anyhow!("missing [external_config] section in {:?}", path))?; + let model_extra_data_table = toml_value + .get("model_extra_data") + .ok_or_else(|| anyhow::anyhow!("missing [model_extra_data] section in {:?}", path))?; - let config: ExternalModelConfig = - external_config_table.clone().try_into().with_context(|| { + let config: ModelExtraData = + model_extra_data_table.clone().try_into().with_context(|| { format!( - "failed to deserialize external_config from TOML file {:?}", + "failed to deserialize model_extra_data from TOML file {:?}", path ) })?; diff --git a/shared/client/src/lib.rs b/shared/client/src/lib.rs index ed52c5a2e..998795c35 100644 --- a/shared/client/src/lib.rs +++ b/shared/client/src/lib.rs @@ -9,7 +9,7 @@ pub use cli::{TrainArgs, prepare_environment, print_identity_keys, read_identity pub use client::Client; pub use protocol::{Broadcast, BroadcastType, Finished, NC, TrainingResult}; pub use state::{ - CheckpointConfig, ExternalModelConfig, GcsUploadInfo, HubUploadInfo, InitRunError, RoundState, + CheckpointConfig, GcsUploadInfo, HubUploadInfo, InitRunError, ModelExtraData, RoundState, RunInitConfig, RunInitConfigAndIO, UploadInfo, }; pub use tui::{ClientTUI, ClientTUIState}; diff --git a/shared/client/src/state/init.rs b/shared/client/src/state/init.rs index 4a9af15c3..7953dbb5d 100644 --- a/shared/client/src/state/init.rs +++ b/shared/client/src/state/init.rs @@ -1,9 +1,9 @@ use crate::{WandBInfo, fetch_data::DataFetcher}; -pub use psyche_coordinator::external_config::ExternalModelConfig; +pub use psyche_coordinator::model_extra_data::ModelExtraData; use psyche_coordinator::{ Coordinator, HealthChecks, - external_config::{CONFIG_PREFIX, MODEL_CONFIG_FILENAME}, model::{self, HttpLLMTrainingDataLocation, LLMTrainingDataLocation}, + model_extra_data::{CONFIG_PREFIX, MODEL_CONFIG_FILENAME}, }; use psyche_core::{ Barrier, CancellableBarrier, IntegrationTestLogMarker, NodeIdentity, Shuffle, TokenSize, @@ -79,9 +79,9 @@ pub struct RunInitConfig { pub sidecar_port: Option, - /// If provided, use this external config instead of fetching from GCS/Hub. + /// If provided, use this model extra data instead of fetching from GCS/Hub. /// Only meant for testing/debugging. 
- pub external_config_override: Option, + pub model_extra_data_override: Option, } #[derive(Debug, Error)] @@ -206,26 +206,26 @@ impl RunInitConfigAndIO { let bucket = gcs_repo.bucket.to_string(); let path = format!("{}/{}", CONFIG_PREFIX, MODEL_CONFIG_FILENAME); - debug!("Fetching external config from gs://{}/{}", bucket, path); + debug!("Fetching model extra data from gs://{}/{}", bucket, path); fetch_json_from_gcs(&bucket, &path).await? } model::Checkpoint::Hub(repo) | model::Checkpoint::P2P(repo) => { let repo_id: String = (&repo.repo_id).into(); let revision = repo.revision.map(|bytes| (&bytes).into()); let path = format!("{}/{}", CONFIG_PREFIX, MODEL_CONFIG_FILENAME); - debug!("Fetching external config from Hub: {}/{}", repo_id, path); + debug!("Fetching model extra data from Hub: {}/{}", repo_id, path); match fetch_json_from_hub( &repo_id, revision, @@ -237,14 +237,14 @@ impl RunInitConfigAndIO config, Err(e) => { error!( - "Failed to fetch external config from Hub ({}), using default: {}", + "Failed to fetch model extra data from Hub ({}), using default: {}", repo_id, e ); return Err(InitRunError::GcsModelLoad(e)); } } } - _ => ExternalModelConfig::default(), + _ => ModelExtraData::default(), } }; @@ -253,9 +253,9 @@ impl RunInitConfigAndIO DataProvider::Server( DataProviderTcpClient::connect( (&data_server).into(), @@ -320,7 +320,7 @@ impl RunInitConfigAndIO> = match &external_config + let model_future: JoinHandle> = match &model_extra_data .architecture { model::LLMArchitecture::HfLlama @@ -440,7 +440,7 @@ impl RunInitConfigAndIO { AutoConfig::Llama(serde_json::from_str(&model_config)?) } @@ -461,7 +461,7 @@ impl RunInitConfigAndIO RunInitConfigAndIO 1 - && external_config.architecture == model::LLMArchitecture::HfAuto + && model_extra_data.architecture == model::LLMArchitecture::HfAuto { 1 } else { @@ -542,7 +542,7 @@ impl RunInitConfigAndIO = - match external_config.data_type { + match model_extra_data.data_type { model::LLMTrainingDataType::Finetuning => { #[cfg(feature = "parallelism")] { @@ -556,7 +556,7 @@ impl RunInitConfigAndIO None, }; - let raw_loaded_model_type: RawLoadedModelType = match external_config + let raw_loaded_model_type: RawLoadedModelType = match model_extra_data .architecture { model::LLMArchitecture::HfAuto | model::LLMArchitecture::Torchtitan => { @@ -568,7 +568,7 @@ impl RunInitConfigAndIO RunInitConfigAndIO RunInitConfigAndIO RunInitConfigAndIO RunInitConfigAndIO, data_parallel: None, }, - external_config.lr_schedule, - external_config.optimizer, + model_extra_data.lr_schedule, + model_extra_data.optimizer, init_config.micro_batch_size, init_config.optim_stats_every_n_steps, init_config.grad_accum_in_fp32, @@ -877,8 +877,8 @@ impl RunInitConfigAndIO RunInitConfigAndIO RunInitConfigAndIO quantize_1bit, _ => false, }; diff --git a/shared/client/src/state/mod.rs b/shared/client/src/state/mod.rs index 6932edd6e..547bb6fef 100644 --- a/shared/client/src/state/mod.rs +++ b/shared/client/src/state/mod.rs @@ -13,7 +13,7 @@ mod train; mod warmup; mod witness; -pub use init::{ExternalModelConfig, InitRunError, RunInitConfig, RunInitConfigAndIO}; +pub use init::{InitRunError, ModelExtraData, RunInitConfig, RunInitConfigAndIO}; pub use psyche_data_provider::{GcsUploadInfo, HubUploadInfo}; pub use round_state::RoundState; pub use steps::{ApplyMessageOutcome, RunManager}; diff --git a/shared/coordinator/src/lib.rs b/shared/coordinator/src/lib.rs index fff25b812..aae450fb0 100644 --- a/shared/coordinator/src/lib.rs +++ b/shared/coordinator/src/lib.rs @@ -4,8 +4,8 @@ 
mod commitment; mod committee_selection; mod coordinator; mod data_selection; -pub mod external_config; pub mod model; +pub mod model_extra_data; pub use commitment::Commitment; pub use committee_selection::{ diff --git a/shared/coordinator/src/external_config.rs b/shared/coordinator/src/model_extra_data.rs similarity index 93% rename from shared/coordinator/src/external_config.rs rename to shared/coordinator/src/model_extra_data.rs index 6c76b3eeb..23b68b0de 100644 --- a/shared/coordinator/src/external_config.rs +++ b/shared/coordinator/src/model_extra_data.rs @@ -8,10 +8,9 @@ pub const CONFIG_PREFIX: &str = "config"; /// Filename for the model config pub const MODEL_CONFIG_FILENAME: &str = "model_config.json"; -/// External model configuration schema. -/// This is stored in an off-chain storage and fetched by clients. +/// Extra model data that is stored off-chain and fetched by clients. #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ExternalModelConfig { +pub struct ModelExtraData { #[serde(default)] pub version: u32, @@ -30,7 +29,7 @@ pub struct ExternalModelConfig { pub run_metadata: Option, } -impl Default for ExternalModelConfig { +impl Default for ModelExtraData { fn default() -> Self { Self { version: 1, @@ -63,7 +62,7 @@ pub struct RunMetadata { pub client_version: String, } -impl ExternalModelConfig { +impl ModelExtraData { pub fn to_json(&self) -> Result { serde_json::to_string_pretty(self) } diff --git a/shared/data-provider/src/hub.rs b/shared/data-provider/src/hub.rs index fd61ef760..c38236460 100644 --- a/shared/data-provider/src/hub.rs +++ b/shared/data-provider/src/hub.rs @@ -7,8 +7,8 @@ use hf_hub::{ tokio::{ApiError, UploadSource}, }, }; -use psyche_coordinator::external_config::ExternalModelConfig; use psyche_coordinator::model; +use psyche_coordinator::model_extra_data::ModelExtraData; use psyche_core::FixedString; use std::{path::PathBuf, time::Instant}; use tokio::sync::mpsc; @@ -223,11 +223,11 @@ pub async fn fetch_json_from_hub( serde_json::from_str(&content).map_err(DownloadError::Json) } -/// Upload a JSON-serializable value to HuggingFace Hub. -pub async fn upload_extra_config_to_hub( +/// Upload model extra data to HuggingFace Hub. 
+pub async fn upload_model_extra_data_to_hub( repo_id: &str, filename: &str, - external_config: &ExternalModelConfig, + model_extra_data: &ModelExtraData, token: Option, commit_message: Option, ) -> Result<(), UploadError> { @@ -241,7 +241,7 @@ pub async fn upload_extra_config_to_hub( let repo = Repo::model(repo_id.to_string()); let api_repo = api.repo(repo); - let json = serde_json::to_string_pretty(external_config)?; + let json = serde_json::to_string_pretty(model_extra_data)?; let data = json.into_bytes(); info!("Uploading JSON to {}/{} on HuggingFace", repo_id, filename); diff --git a/shared/data-provider/src/lib.rs b/shared/data-provider/src/lib.rs index 9c1c0b9f0..8899313e3 100644 --- a/shared/data-provider/src/lib.rs +++ b/shared/data-provider/src/lib.rs @@ -25,7 +25,7 @@ pub use gcs::{ pub use hub::{ HubUploadInfo, download_dataset_repo_async, download_dataset_repo_sync, download_model_repo_async, download_model_repo_sync, fetch_json_from_hub, - upload_extra_config_to_hub, upload_to_hub, + upload_model_extra_data_to_hub, upload_to_hub, }; pub use local::LocalDataProvider; pub use parquet::record::{ListAccessor, MapAccessor, RowAccessor}; diff --git a/tools/rust-tools/preview-lr/src/main.rs b/tools/rust-tools/preview-lr/src/main.rs index 43f015b81..51c52a3c6 100644 --- a/tools/rust-tools/preview-lr/src/main.rs +++ b/tools/rust-tools/preview-lr/src/main.rs @@ -1,6 +1,6 @@ use clap::Parser; use plotters::prelude::*; -use psyche_coordinator::{CoordinatorConfig, external_config::ExternalModelConfig}; +use psyche_coordinator::{CoordinatorConfig, model_extra_data::ModelExtraData}; use serde::Deserialize; use std::path::PathBuf; @@ -28,7 +28,7 @@ enum Commands { #[derive(Deserialize)] struct Config { pub config: CoordinatorConfig, - pub external_config: ExternalModelConfig, + pub model_extra_data: ModelExtraData, } fn main() -> anyhow::Result<()> { let args = Args::parse(); @@ -49,7 +49,7 @@ fn main() -> anyhow::Result<()> { let config: Config = toml::from_str(&std::fs::read_to_string(&config_path)?)?; let steps = config.config.total_steps; - let lr = config.external_config.lr_schedule; + let lr = config.model_extra_data.lr_schedule; let root = BitMapBackend::new("lr-plot.png", (steps.min(10_000), 1024)).into_drawing_area(); root.fill(&WHITE)?; diff --git a/tools/rust-tools/run-manager/src/commands/run/update_config.rs b/tools/rust-tools/run-manager/src/commands/run/update_config.rs index 1ccdc608e..7356de704 100644 --- a/tools/rust-tools/run-manager/src/commands/run/update_config.rs +++ b/tools/rust-tools/run-manager/src/commands/run/update_config.rs @@ -5,10 +5,9 @@ use std::path::PathBuf; use anyhow::{Context, Result, bail}; use clap::Args; use psyche_coordinator::{ - CoordinatorConfig, CoordinatorProgress, - external_config::{CONFIG_PREFIX, ExternalModelConfig, MODEL_CONFIG_FILENAME}, - get_data_index_for_step, + CoordinatorConfig, CoordinatorProgress, get_data_index_for_step, model::{Checkpoint, Model}, + model_extra_data::{CONFIG_PREFIX, MODEL_CONFIG_FILENAME, ModelExtraData}, }; use psyche_data_provider::upload_json_to_gcs; use psyche_solana_treasurer::logic::RunUpdateParams; @@ -45,7 +44,7 @@ pub struct CommandUpdateConfig { #[clap(long, env)] pub client_version: Option, #[clap(long, default_value_t = false, hide = true)] - pub skip_upload_external_config: bool, + pub skip_upload_model_extra_data: bool, /// HuggingFace token for uploading to Hub repos (can also use HF_TOKEN env var) #[clap(long, env = "HF_TOKEN")] @@ -66,7 +65,7 @@ impl Command for CommandUpdateConfig { 
             num_parameters,
             vocab_size,
             client_version,
-            skip_upload_external_config,
+            skip_upload_model_extra_data,
             hub_token,
         } = self;
 
@@ -81,13 +80,13 @@ impl Command for CommandUpdateConfig {
             .get_coordinator_account(&coordinator_account)
             .await?;
 
-        let (config, mut model, external_config) = match config_path {
+        let (config, mut model, model_extra_data) = match config_path {
             Some(config_path) => {
                 #[derive(Serialize, Deserialize)]
                 struct State {
                     pub config: CoordinatorConfig,
                     pub model: Model,
-                    pub external_config: ExternalModelConfig,
+                    pub model_extra_data: ModelExtraData,
                 }
                 let state: State = toml::from_str(std::str::from_utf8(
                     &std::fs::read(&config_path).with_context(|| {
@@ -99,7 +98,7 @@ impl Command for CommandUpdateConfig {
                 (
                     Some(state.config),
                     Some(state.model),
-                    Some(state.external_config),
+                    Some(state.model_extra_data),
                 )
             }
             None => (None, None, None),
@@ -152,47 +151,47 @@ impl Command for CommandUpdateConfig {
             coordinator_account_state.state.coordinator.model = model;
         }
 
-        // Upload external config to GCS or hub repo depending of the model checkpoint
-        if !skip_upload_external_config {
-            if let Some(external_config) = external_config {
+        // Upload model extra data to GCS or hub repo depending on the model checkpoint
+        if !skip_upload_model_extra_data {
+            if let Some(model_extra_data) = model_extra_data {
                 let Model::LLM(llm) = &coordinator_account_state.state.coordinator.model;
                 match llm.checkpoint {
                     Checkpoint::Gcs(ref gcs_repo) | Checkpoint::P2PGcs(ref gcs_repo) => {
                         let bucket = gcs_repo.bucket.to_string();
                         let path = format!("{}/{}", CONFIG_PREFIX, MODEL_CONFIG_FILENAME);
-                        info!("Uploading external config to gs://{}/{}", bucket, path);
-                        upload_json_to_gcs(&bucket, &path, &external_config)
+                        info!("Uploading model extra data to gs://{}/{}", bucket, path);
+                        upload_json_to_gcs(&bucket, &path, &model_extra_data)
                             .await
                             .with_context(|| {
                                 format!(
-                                    "failed to upload external config to gs://{}/{}",
+                                    "failed to upload model extra data to gs://{}/{}",
                                     bucket, path
                                 )
                             })?;
-                        println!("Uploaded external config to gs://{}/{}", bucket, path);
+                        println!("Uploaded model extra data to gs://{}/{}", bucket, path);
                     }
                     Checkpoint::Hub(ref hub_repo) | Checkpoint::P2P(ref hub_repo) => {
                         let repo_id = hub_repo.repo_id.to_string();
                         let path = format!("{}/{}", CONFIG_PREFIX, MODEL_CONFIG_FILENAME);
-                        psyche_data_provider::upload_extra_config_to_hub(
+                        psyche_data_provider::upload_model_extra_data_to_hub(
                             &repo_id,
                             &path,
-                            &external_config,
+                            &model_extra_data,
                             hub_token.clone(),
                             None,
                         )
                         .await
                         .with_context(|| {
                             format!(
-                                "failed to upload external config to Hub repo {}/{}",
+                                "failed to upload model extra data to Hub repo {}/{}",
                                 repo_id, path
                             )
                         })?;
-                        println!("Uploaded external config to Hub repo {}/{}", repo_id, path);
+                        println!("Uploaded model extra data to Hub repo {}/{}", repo_id, path);
                     }
                     _ => {
                         println!(
-                            "Warning: external_config provided but checkpoint is not GCS- or Hub-based, skipping upload"
+                            "Warning: model_extra_data provided but checkpoint is not GCS- or Hub-based, skipping upload"
                        );
                    }
                }
diff --git a/website/wasm/src/lib.rs b/website/wasm/src/lib.rs
index 1e84298bc..f57cf73da 100644
--- a/website/wasm/src/lib.rs
+++ b/website/wasm/src/lib.rs
@@ -40,7 +40,7 @@ pub struct DummyCoordinatorAccount(CoordinatorAccount);
 #[ts(export)]
 pub struct DummyClientId(ClientId);
 
-// Export types that are now in ExternalModelConfig but still needed by the website
+// Export types that are now in ModelExtraData but still needed by the website
 #[allow(dead_code)]
 #[derive(TS)]
 #[ts(export)]
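Taken together, patches 17 and 19 leave the client with a three-step resolution order for the off-chain data: CLI override first, then the checkpoint's storage backend, then the default. A condensed sketch of that order, with simplified stand-in types and stubbed fetchers rather than the real `psyche` APIs:

```rust
// Condensed sketch of the client-side resolution order after patches 17-19.
#[derive(Clone, Default)]
struct ModelExtraData { /* architecture, lr_schedule, optimizer, ... */ }

enum Checkpoint {
    Gcs(String),
    P2PGcs(String),
    Hub(String),
    P2P(String),
    Other,
}

async fn resolve_extra_data(
    override_from_cli: Option<ModelExtraData>,
    checkpoint: &Checkpoint,
) -> anyhow::Result<ModelExtraData> {
    // 1. Testing/debugging path: skip all network fetches.
    if let Some(data) = override_from_cli {
        return Ok(data);
    }
    // 2. Otherwise pick the backend that matches the checkpoint.
    match checkpoint {
        Checkpoint::Gcs(bucket) | Checkpoint::P2PGcs(bucket) => fetch_from_gcs(bucket).await,
        Checkpoint::Hub(repo) | Checkpoint::P2P(repo) => fetch_from_hub(repo).await,
        // 3. Fall back to the default for everything else.
        Checkpoint::Other => Ok(ModelExtraData::default()),
    }
}

async fn fetch_from_gcs(_bucket: &str) -> anyhow::Result<ModelExtraData> { todo!() }
async fn fetch_from_hub(_repo: &str) -> anyhow::Result<ModelExtraData> { todo!() }
```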
From 33edbfb9fea43e8c7e5e309242c3de4bcea3230b Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Mon, 2 Feb 2026 11:13:48 -0300 Subject: [PATCH 20/22] Fix flag to skip upload to external storage --- docker/psyche_client_create_run.sh | 2 +- docker/test/run_owner_entrypoint.sh | 2 +- scripts/create-permissionless-run.sh | 2 +- scripts/setup-test-run.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/psyche_client_create_run.sh b/docker/psyche_client_create_run.sh index b0e291372..61474c113 100755 --- a/docker/psyche_client_create_run.sh +++ b/docker/psyche_client_create_run.sh @@ -66,7 +66,7 @@ cargo run --release --bin run-manager -- \ --ws-rpc ${WS_RPC} \ --run-id ${RUN_ID} \ --config-path "${CONFIG_PATH}" \ - --skip-upload-external-config + --skip-upload-model-extra-data echo -e "\n[+] Model config uploaded successfully" diff --git a/docker/test/run_owner_entrypoint.sh b/docker/test/run_owner_entrypoint.sh index b863985cd..9d8320333 100644 --- a/docker/test/run_owner_entrypoint.sh +++ b/docker/test/run_owner_entrypoint.sh @@ -26,7 +26,7 @@ run-manager update-config \ --ws-rpc "${WS_RPC}" \ --run-id "${RUN_ID}" \ --config-path "/usr/local/config.toml" \ - --skip-upload-external-config + --skip-upload-model-extra-data run-manager set-paused \ --wallet-private-key-path ${WALLET_FILE} \ diff --git a/scripts/create-permissionless-run.sh b/scripts/create-permissionless-run.sh index f7e4bf0a7..125f62f4b 100755 --- a/scripts/create-permissionless-run.sh +++ b/scripts/create-permissionless-run.sh @@ -82,7 +82,7 @@ cargo run --release --bin run-manager -- \ --config-path ${CONFIG_FILE} \ --num-parameters 1100000000 \ --vocab-size 32768 \ - --skip-upload-external-config + --skip-upload-model-extra-data echo -e "\n[+] Unpause the training run..." cargo run --release --bin run-manager -- \ diff --git a/scripts/setup-test-run.sh b/scripts/setup-test-run.sh index d870c45e2..984757137 100755 --- a/scripts/setup-test-run.sh +++ b/scripts/setup-test-run.sh @@ -56,7 +56,7 @@ cargo run --release --bin run-manager -- \ --ws-rpc "${WS_RPC}" \ --run-id "${RUN_ID}" \ --config-path "config/solana-test/test-config.toml" \ - --skip-upload-external-config + --skip-upload-model-extra-data echo "[+] Unpausing run..." 
cargo run --release --bin run-manager -- \ From 8bba81dd9406d6d52478546ebbcbbfe3578d4824 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Mon, 2 Feb 2026 11:07:21 -0800 Subject: [PATCH 21/22] Fix argument to get local model extra data --- docker/test/client_test_entrypoint.sh | 4 ++-- scripts/train-solana-test.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/test/client_test_entrypoint.sh b/docker/test/client_test_entrypoint.sh index 74f35b611..09d16ec91 100644 --- a/docker/test/client_test_entrypoint.sh +++ b/docker/test/client_test_entrypoint.sh @@ -18,7 +18,7 @@ if [ "${PYTHON_ENABLED}" = "true" ]; then --run-id "${RUN_ID}" \ --data-parallelism 8 \ --sidecar-port "${SIDECAR_PORT}" \ - --external-config-toml "/usr/local/config.toml" \ + --model-extra-data-toml "/usr/local/config.toml" \ --logs "json" else echo "Starting client without Python features" @@ -27,6 +27,6 @@ else --rpc "${RPC}" \ --ws-rpc "${WS_RPC}" \ --run-id "${RUN_ID}" \ - --external-config-toml "/usr/local/config.toml" \ + --model-extra-data-toml "/usr/local/config.toml" \ --logs "json" fi diff --git a/scripts/train-solana-test.sh b/scripts/train-solana-test.sh index 01d04f551..4b5986ded 100755 --- a/scripts/train-solana-test.sh +++ b/scripts/train-solana-test.sh @@ -56,7 +56,7 @@ if [[ "$OTLP_METRICS_URL" == "" ]]; then --micro-batch-size ${BATCH_SIZE} \ --authorizer ${AUTHORIZER} \ --logs "console" \ - --external-config-toml ./config/solana-test/light-config.toml \ + --model-extra-data-toml ./config/solana-test/light-config.toml \ ${CHECKPOINT_ARGS[@]} \ "$@" else @@ -73,7 +73,7 @@ else --authorizer ${AUTHORIZER} \ --oltp-metrics-url "http://localhost:4318/v1/metrics" \ --oltp-logs-url "http://localhost:4318/v1/logs" \ - --external-config-toml ./config/solana-test/light-config.toml \ + --model-extra-data-toml ./config/solana-test/light-config.toml \ ${CHECKPOINT_ARGS[@]} \ "$@" fi From 96aa9203ef053a88d177c092af5c26ea64be0524 Mon Sep 17 00:00:00 2001 From: IAvecilla Date: Mon, 2 Feb 2026 17:30:52 -0300 Subject: [PATCH 22/22] Add P2PDummy alternative for testing purposes --- architectures/centralized/server/src/app.rs | 2 +- .../centralized/testing/tests/integration_tests.rs | 9 +++------ architectures/decentralized/solana-client/src/main.rs | 3 +++ shared/client/src/state/init.rs | 6 +++++- shared/coordinator/src/coordinator.rs | 8 ++++---- shared/coordinator/src/model.rs | 6 ++++-- 6 files changed, 20 insertions(+), 14 deletions(-) diff --git a/architectures/centralized/server/src/app.rs b/architectures/centralized/server/src/app.rs index 8d1fd1400..f2a4c7797 100644 --- a/architectures/centralized/server/src/app.rs +++ b/architectures/centralized/server/src/app.rs @@ -211,7 +211,7 @@ impl App { Checkpoint::Dummy(_) => { // ok! 
} - Checkpoint::P2P(_) | Checkpoint::P2PGcs(_) => { + Checkpoint::P2P(_) | Checkpoint::P2PDummy | Checkpoint::P2PGcs(_) => { bail!("Can't start up a run with a P2P checkpoint.") } Checkpoint::Gcs(gcs_repo) => { diff --git a/architectures/centralized/testing/tests/integration_tests.rs b/architectures/centralized/testing/tests/integration_tests.rs index 81943e827..656e6951a 100644 --- a/architectures/centralized/testing/tests/integration_tests.rs +++ b/architectures/centralized/testing/tests/integration_tests.rs @@ -9,10 +9,7 @@ use psyche_centralized_testing::{ spawn_clients_with_training_delay, }, }; -use psyche_coordinator::{ - RunState, - model::{Checkpoint, HubRepo}, -}; +use psyche_coordinator::{RunState, model::Checkpoint}; use tracing::info; #[test_log::test(tokio::test(flavor = "multi_thread"))] @@ -639,7 +636,7 @@ async fn client_join_in_training_and_get_model_using_p2p() { assert_with_retries( || server_handle.get_checkpoint(), - std::mem::discriminant(&Checkpoint::P2P(HubRepo::dummy())), + std::mem::discriminant(&Checkpoint::P2PDummy), ) .await; @@ -722,7 +719,7 @@ async fn two_clients_join_in_training_and_get_model_using_p2p() { assert_with_retries( || server_handle.get_checkpoint(), - std::mem::discriminant(&Checkpoint::P2P(HubRepo::dummy())), + std::mem::discriminant(&Checkpoint::P2PDummy), ) .await; diff --git a/architectures/decentralized/solana-client/src/main.rs b/architectures/decentralized/solana-client/src/main.rs index 547940b8f..d4f45c179 100644 --- a/architectures/decentralized/solana-client/src/main.rs +++ b/architectures/decentralized/solana-client/src/main.rs @@ -288,6 +288,9 @@ async fn async_main() -> Result<()> { Checkpoint::Ephemeral => { bail!("Can't predownload model with ephemeral checkpoint.") } + Checkpoint::P2PDummy => { + println!("P2PDummy checkpoint (for testing), nothing to predownload."); + } Checkpoint::Dummy(hub_repo) | Checkpoint::Hub(hub_repo) | Checkpoint::P2P(hub_repo) => { diff --git a/shared/client/src/state/init.rs b/shared/client/src/state/init.rs index 7953dbb5d..0484e795b 100644 --- a/shared/client/src/state/init.rs +++ b/shared/client/src/state/init.rs @@ -244,6 +244,7 @@ impl RunInitConfigAndIO ModelExtraData::default(), } }; @@ -370,6 +371,7 @@ impl RunInitConfigAndIO { let checkpoint = llm.checkpoint; @@ -427,7 +429,9 @@ impl RunInitConfigAndIO { + model::Checkpoint::P2P(_) + | model::Checkpoint::P2PDummy + | model::Checkpoint::P2PGcs(_) => { let (tx_model_config_response, rx_model_config_response) = oneshot::channel(); info!("Checkpoint is p2p, requesting model config over network"); diff --git a/shared/coordinator/src/coordinator.rs b/shared/coordinator/src/coordinator.rs index c726655e2..7db9a8f6d 100644 --- a/shared/coordinator/src/coordinator.rs +++ b/shared/coordinator/src/coordinator.rs @@ -1,6 +1,6 @@ use crate::{ Commitment, Committee, CommitteeProof, CommitteeSelection, WitnessProof, - model::{Checkpoint, Model}, + model::{Checkpoint, HubRepo, Model}, }; use anchor_lang::{AnchorDeserialize, AnchorSerialize, InitSpace, prelude::borsh}; @@ -936,6 +936,7 @@ impl Coordinator { match llm.checkpoint { Checkpoint::P2P(hub_repo) => llm.checkpoint = Checkpoint::Hub(hub_repo), Checkpoint::P2PGcs(gcs_repo) => llm.checkpoint = Checkpoint::Gcs(gcs_repo), + Checkpoint::P2PDummy => llm.checkpoint = Checkpoint::Dummy(HubRepo::dummy()), _ => {} } } @@ -1065,9 +1066,8 @@ impl Coordinator { // we've completed an epoch, switch to P2P from now on let Model::LLM(llm) = &mut self.model; match llm.checkpoint { - Checkpoint::Hub(hub_repo) | 
Checkpoint::Dummy(hub_repo) => { - llm.checkpoint = Checkpoint::P2P(hub_repo) - } + Checkpoint::Hub(hub_repo) => llm.checkpoint = Checkpoint::P2P(hub_repo), + Checkpoint::Dummy(_) => llm.checkpoint = Checkpoint::P2PDummy, Checkpoint::Gcs(gcs_repo) => llm.checkpoint = Checkpoint::P2PGcs(gcs_repo), _ => {} } diff --git a/shared/coordinator/src/model.rs b/shared/coordinator/src/model.rs index d0b0fe33b..5d3dbb958 100644 --- a/shared/coordinator/src/model.rs +++ b/shared/coordinator/src/model.rs @@ -270,6 +270,8 @@ pub enum Checkpoint { Dummy(HubRepo), Hub(HubRepo), P2P(HubRepo), + /// P2P checkpoint that originated from a Dummy checkpoint (for testing) + P2PDummy, Gcs(GcsRepo), P2PGcs(GcsRepo), } @@ -277,7 +279,7 @@ pub enum Checkpoint { impl std::fmt::Display for Checkpoint { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Checkpoint::Dummy(_hub_repo) => write!(f, "Dummy"), + Checkpoint::Dummy(_) | Checkpoint::P2PDummy => write!(f, "Dummy"), Checkpoint::Ephemeral => write!(f, "Ephemeral"), Checkpoint::Hub(hub_repo) => write!(f, "{}", &hub_repo.repo_id), Checkpoint::P2P(hub_repo) => { @@ -301,7 +303,7 @@ impl Model { } let bad_checkpoint = match llm.checkpoint { - Checkpoint::Dummy(_hub_repo) => false, + Checkpoint::Dummy(_) | Checkpoint::P2PDummy => false, Checkpoint::Ephemeral => true, Checkpoint::Hub(hub_repo) => hub_repo.repo_id.is_empty(), Checkpoint::P2P(hub_repo) => hub_repo.repo_id.is_empty(),
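---

Note on PATCH 20/22 through 22/22: the first two are mechanical flag renames
(run-manager now takes --skip-upload-model-extra-data instead of
--skip-upload-external-config, and the clients take --model-extra-data-toml
instead of --external-config-toml), while PATCH 22/22 is a behavioral change:
a dedicated Checkpoint::P2PDummy variant lets runs that start from a
Checkpoint::Dummy(..) switch to P2P distribution without fabricating a
HubRepo, replacing the old Checkpoint::P2P(HubRepo::dummy()) pattern in the
tests.

The sketch below models the resulting checkpoint transitions in isolation. It
is a minimal illustration, not the real psyche_coordinator code: HubRepo and
GcsRepo are stubbed as plain Strings, the "dummy" string stands in for
HubRepo::dummy(), and the two free functions approximate the coordinator
methods that rewrite llm.checkpoint in place.

#[derive(Debug, Clone, PartialEq)]
#[allow(dead_code)]
enum Checkpoint {
    Ephemeral,
    Dummy(String),  // `HubRepo` in the real code
    Hub(String),    // `HubRepo`
    P2P(String),    // `HubRepo`
    P2PDummy,       // new in PATCH 22/22: a P2P checkpoint that came from Dummy
    Gcs(String),    // `GcsRepo`
    P2PGcs(String), // `GcsRepo`
}

// Stand-in for the transition the coordinator applies once an epoch
// completes and model distribution moves to P2P.
fn to_p2p(c: Checkpoint) -> Checkpoint {
    match c {
        Checkpoint::Hub(repo) => Checkpoint::P2P(repo),
        Checkpoint::Dummy(_) => Checkpoint::P2PDummy,
        Checkpoint::Gcs(repo) => Checkpoint::P2PGcs(repo),
        other => other,
    }
}

// Stand-in for the reverse transition, where a P2P checkpoint falls back to
// its source. The real code maps P2PDummy to Dummy(HubRepo::dummy()); the
// "dummy" string here is a placeholder for that.
fn from_p2p(c: Checkpoint) -> Checkpoint {
    match c {
        Checkpoint::P2P(repo) => Checkpoint::Hub(repo),
        Checkpoint::P2PGcs(repo) => Checkpoint::Gcs(repo),
        Checkpoint::P2PDummy => Checkpoint::Dummy("dummy".to_string()),
        other => other,
    }
}

fn main() {
    // A test run starts from a Dummy checkpoint...
    let c = Checkpoint::Dummy("test-model".to_string());

    // ...and after the first completed epoch it becomes P2PDummy, so tests
    // can name the variant without fabricating a HubRepo.
    let c = to_p2p(c);
    assert_eq!(c, Checkpoint::P2PDummy);
    assert_eq!(
        std::mem::discriminant(&c),
        std::mem::discriminant(&Checkpoint::P2PDummy)
    );

    // Falling back from P2P restores a Dummy checkpoint.
    assert!(matches!(from_p2p(c), Checkpoint::Dummy(_)));
    println!("checkpoint transitions behave as expected");
}

As in the updated integration tests, asserting on
std::mem::discriminant(&Checkpoint::P2PDummy) is enough to check which
variant the run is in without constructing any repo payload.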