Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
1679fcc
wip
pefontana Dec 28, 2025
472d2d6
crendential wip
pefontana Dec 28, 2025
fa3da9c
Merge remote-tracking branch 'origin/main' into gcs-2
pefontana Jan 5, 2026
ff4ea2d
implement Checkpoint::Gcs(gcs_repo)
pefontana Jan 6, 2026
5ea32b1
light-config-gcs.toml
pefontana Jan 6, 2026
f8b193a
can_join command for gcs
pefontana Jan 6, 2026
c1f0e1f
gcs centralized version
pefontana Jan 7, 2026
6ddfe52
remove fn download_model_from_gcs_sync
pefontana Jan 7, 2026
c0aa10f
Revert "remove fn download_model_from_gcs_sync"
pefontana Jan 7, 2026
3b707ac
handle errors
pefontana Jan 7, 2026
12af6c3
Merge branch 'main' into gcs-2
pefontana Jan 7, 2026
4b4a96d
remove progress_bar
pefontana Jan 7, 2026
6162937
use default cache dir
pefontana Jan 7, 2026
1bf03ab
refactor loop
pefontana Jan 7, 2026
dcebbde
Merge branch 'main' into gcs-2
pefontana Jan 8, 2026
39fd70e
rm custom PSYCHE_CACHE_DIR
pefontana Jan 8, 2026
712f7cb
GcsRepo prefix: Option<FixedString<{ SOLANA_MAX_STRING_LEN }>>
pefontana Jan 8, 2026
2296e3f
implement P2PGCS checkpoint
pefontana Jan 8, 2026
d066a87
Merge branch 'main' into gcs-2
pefontana Jan 8, 2026
d1324e0
Add tracing to evaluate crate
pefontana Jan 8, 2026
85ed271
Merge branch 'main' into gcs-2
pefontana Jan 9, 2026
f43c18e
Merge branch 'main' into gcs-2
IAvecilla Jan 12, 2026
607335c
Merge branch 'main' into gcs-2
pefontana Jan 14, 2026
037cc08
Model checkpoint upload with GCS (#476)
IAvecilla Jan 14, 2026
8d6749b
Merge branch 'main' into gcs-2
IAvecilla Jan 14, 2026
48c1bd0
Merge branch 'main' into gcs-2
pefontana Jan 15, 2026
b4db519
generation: Some(file_entry.generation)
pefontana Jan 15, 2026
29dab79
Merge branch 'main' into gcs-2
pefontana Jan 15, 2026
78516e2
Merge branch 'main' into gcs-2
pefontana Jan 16, 2026
e2d87ef
fix tcp send_checkpoint
pefontana Jan 16, 2026
45864ca
HF_TOKEN: secrets.HF_TOKEN
pefontana Jan 16, 2026
6e7ecff
Merge branch 'main' into gcs-2
pefontana Jan 21, 2026
f4987c7
Merge branch 'main' into gcs-2
entropidelic Jan 21, 2026
150ce35
Merge branch 'main' into gcs-2
pefontana Jan 22, 2026
34198e1
Merge branch 'main' into gcs-2
entropidelic Jan 26, 2026
eb043ba
Merge branch 'main' into gcs-2
pefontana Jan 27, 2026
68e0252
Merge branch 'main' into gcs-2
pefontana Jan 28, 2026
916d3ae
add button for GCS checkpoint if bucket is public
Jan 28, 2026
2d1e527
fix fakedata.ts
Jan 28, 2026
4437389
Merge branch 'main' into gcs-2
pefontana Feb 2, 2026
699c4fc
clippy
pefontana Feb 2, 2026
39d5a4b
add GCS documentation for bucket creation and manifest file explanation
entropidelic Feb 2, 2026
a1cfcf4
Merge branch 'main' into gcs-2
pefontana Feb 4, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 12 additions & 9 deletions architectures/centralized/client/src/app.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ use anyhow::{Error, Result};
use bytemuck::Zeroable;
use hf_hub::Repo;
use psyche_centralized_shared::{ClientId, ClientToServerMessage, ServerToClientMessage};
use psyche_client::HubUploadInfo;
use psyche_client::UploadInfo;
use psyche_client::{
Client, ClientTUI, ClientTUIState, NC, RunInitConfig, TrainArgs, read_identity_secret_key,
};
Expand Down Expand Up @@ -29,7 +31,7 @@ pub type TabsData = <Tabs as CustomWidget>::Data;
pub enum ToSend {
Witness(Box<OpportunisticData>),
HealthCheck(HealthChecks<ClientId>),
Checkpoint(model::HubRepo),
Checkpoint(model::Checkpoint),
}

struct Backend {
Expand Down Expand Up @@ -67,7 +69,7 @@ impl WatcherBackend<ClientId> for Backend {
Ok(())
}

async fn send_checkpoint(&mut self, checkpoint: model::HubRepo) -> Result<()> {
async fn send_checkpoint(&mut self, checkpoint: model::Checkpoint) -> Result<()> {
self.tx.send(ToSend::Checkpoint(checkpoint))?;
Ok(())
}
Expand Down Expand Up @@ -176,18 +178,19 @@ impl App {
) -> Result<()> {
// sanity checks
if let Some(checkpoint_config) = &state_options.checkpoint_config {
if let Some(hub_upload) = &checkpoint_config.hub_upload {
if let Some(UploadInfo::Hub(HubUploadInfo {
hub_repo,
hub_token,
})) = &checkpoint_config.upload_info
{
let api = hf_hub::api::tokio::ApiBuilder::new()
.with_token(Some(hub_upload.hub_token.clone()))
.with_token(Some(hub_token.clone()))
.build()?;
let repo_api = api.repo(Repo::new(
hub_upload.hub_repo.clone(),
hf_hub::RepoType::Model,
));
let repo_api = api.repo(Repo::new(hub_repo.clone(), hf_hub::RepoType::Model));
if !repo_api.is_writable().await {
anyhow::bail!(
"Checkpoint upload repo {} is not writable with the passed API key.",
hub_upload.hub_repo
hub_repo
)
}
}
Expand Down
13 changes: 10 additions & 3 deletions architectures/centralized/server/src/app.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ use psyche_coordinator::{

use psyche_core::{FixedVec, Shuffle, SizedIterator, TokenSize};
use psyche_data_provider::{
DataProviderTcpServer, DataServerTui, LocalDataProvider, download_model_repo_async,
DataProviderTcpServer, DataServerTui, LocalDataProvider, download_model_from_gcs_async,
download_model_repo_async,
};
use psyche_network::{ClientNotification, TcpServer};
use psyche_tui::{
Expand Down Expand Up @@ -80,7 +81,7 @@ impl psyche_watcher::Backend<ClientId> for ChannelCoordinatorBackend {
bail!("Server does not send health checks");
}

async fn send_checkpoint(&mut self, _checkpoint: model::HubRepo) -> Result<()> {
async fn send_checkpoint(&mut self, _checkpoint: model::Checkpoint) -> Result<()> {
bail!("Server does not send checkpoints");
}
}
Expand Down Expand Up @@ -201,9 +202,15 @@ impl App {
Checkpoint::Dummy(_) => {
// ok!
}
Checkpoint::P2P(_) => {
Checkpoint::P2P(_) | Checkpoint::P2PGcs(_) => {
bail!("Can't start up a run with a P2P checkpoint.")
}
Checkpoint::Gcs(gcs_repo) => {
let bucket: String = (&gcs_repo.bucket).into();
let prefix: Option<String> =
gcs_repo.prefix.map(|p| (&p).into());
download_model_from_gcs_async(&bucket, prefix.as_deref()).await?;
}
}

let server_addr: SocketAddr = String::from(url).parse().map_err(|e| {
Expand Down
2 changes: 1 addition & 1 deletion architectures/centralized/shared/src/protocol.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ pub enum ClientToServerMessage {
Join { run_id: String },
Witness(Box<OpportunisticData>),
HealthCheck(HealthChecks<ClientId>),
Checkpoint(model::HubRepo),
Checkpoint(model::Checkpoint),
}

#[derive(Serialize, Deserialize, Debug, Clone)]
Expand Down
58 changes: 37 additions & 21 deletions architectures/decentralized/solana-client/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -283,34 +283,50 @@ async fn async_main() -> Result<()> {
bail!("Model is not an LLM, unsure how to predownload.");
};

let checkpoint = match model_config.checkpoint {
match model_config.checkpoint {
Checkpoint::Ephemeral => {
bail!("Can't predownload model with ephemeral checkpoint.")
}
Checkpoint::Dummy(hub_repo)
| Checkpoint::Hub(hub_repo)
| Checkpoint::P2P(hub_repo) => hub_repo,
| Checkpoint::P2P(hub_repo) => {
let repo_id = hub_repo.repo_id.to_string();
let revision = hub_repo.revision.map(|s| s.to_string());
println!(
"Predownloading model {repo_id} revision {}",
revision.as_ref().unwrap_or(&"main".to_string())
);

let hub_read_token = std::env::var("HF_TOKEN").ok();
let cache_folder = None; // Uses HF_HOME env var

psyche_data_provider::download_model_repo_async(
&repo_id,
revision,
cache_folder,
hub_read_token,
Some(hub_max_concurrent_downloads),
true,
)
.await?;
}
Checkpoint::Gcs(gcs_repo) | Checkpoint::P2PGcs(gcs_repo) => {
let bucket = gcs_repo.bucket.to_string();
let prefix: Option<String> = gcs_repo.prefix.map(|p| p.to_string());
println!(
"Predownloading model from gs://{}/{}",
bucket,
prefix.as_deref().unwrap_or("")
);

psyche_data_provider::download_model_from_gcs_async(
&bucket,
prefix.as_deref(),
)
.await?;
}
};

let repo_id = checkpoint.repo_id.to_string();
let revision = checkpoint.revision.map(|s| s.to_string());
println!(
"Predownloading model {repo_id} revision {}",
revision.as_ref().unwrap_or(&"main".to_string())
);

let hub_read_token = std::env::var("HF_TOKEN").ok();
let cache_folder = None; // Uses HF_HOME env var

psyche_data_provider::download_model_repo_async(
&repo_id,
revision,
cache_folder,
hub_read_token,
Some(hub_max_concurrent_downloads),
true,
)
.await?;
println!("Model predownloaded successfully.");
}

Expand Down
7 changes: 4 additions & 3 deletions architectures/decentralized/solana-common/src/backend.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ use anchor_client::{
};
use anyhow::{Context, Result, anyhow};
use futures_util::StreamExt;
use psyche_coordinator::{CommitteeProof, Coordinator, HealthChecks, model::HubRepo};
use psyche_coordinator::model::{self, Checkpoint};
use psyche_coordinator::{CommitteeProof, Coordinator, HealthChecks};
use psyche_core::IntegrationTestLogMarker;
use psyche_watcher::{Backend as WatcherBackend, OpportunisticData};
use solana_account_decoder_client_types::{UiAccount, UiAccountEncoding};
Expand Down Expand Up @@ -334,7 +335,7 @@ impl SolanaBackend {
&self,
coordinator_instance: Pubkey,
coordinator_account: Pubkey,
repo: HubRepo,
repo: Checkpoint,
) {
let user = self.get_payer();
let instruction = instructions::coordinator_checkpoint(
Expand Down Expand Up @@ -604,7 +605,7 @@ impl WatcherBackend<psyche_solana_coordinator::ClientId> for SolanaBackendRunner
Ok(())
}

async fn send_checkpoint(&mut self, checkpoint: HubRepo) -> Result<()> {
async fn send_checkpoint(&mut self, checkpoint: model::Checkpoint) -> Result<()> {
self.backend
.send_checkpoint(self.instance, self.account, checkpoint);
Ok(())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ pub fn coordinator_checkpoint(
coordinator_instance: &Pubkey,
coordinator_account: &Pubkey,
user: &Pubkey,
repo: psyche_coordinator::model::HubRepo,
repo: psyche_coordinator::model::Checkpoint,
) -> Instruction {
anchor_instruction(
psyche_solana_coordinator::ID,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use psyche_coordinator::RunState;
use psyche_coordinator::SOLANA_MAX_STRING_LEN;
use psyche_coordinator::TickResult;
use psyche_coordinator::Witness;
use psyche_coordinator::model::HubRepo;
use psyche_coordinator::model::Checkpoint;
use psyche_coordinator::model::Model;
use psyche_core::FixedString;
use psyche_core::SmallBoolean;
Expand Down Expand Up @@ -389,7 +389,11 @@ impl CoordinatorInstanceState {
self.tick()
}

pub fn checkpoint(&mut self, payer: &Pubkey, repo: HubRepo) -> Result<()> {
pub fn checkpoint(
&mut self,
payer: &Pubkey,
repo: Checkpoint,
) -> Result<()> {
// O(n) on clients, reconsider
let id = self.clients_state.find_signer(payer)?;
let index = self
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ use psyche_coordinator::Witness;
use psyche_coordinator::WitnessBloom;
use psyche_coordinator::WitnessMetadata;
use psyche_coordinator::WitnessProof;
use psyche_coordinator::model::{HubRepo, Model};
use psyche_coordinator::model::Model;
use psyche_core::MerkleRoot;
use serde::Deserialize;
use serde::Serialize;
Expand Down Expand Up @@ -313,7 +313,7 @@ pub mod psyche_solana_coordinator {

pub fn checkpoint(
ctx: Context<PermissionlessCoordinatorAccounts>,
repo: HubRepo,
repo: psyche_coordinator::model::Checkpoint,
) -> Result<()> {
let mut account = ctx.accounts.coordinator_account.load_mut()?;
account.increment_nonce();
Expand Down
2 changes: 1 addition & 1 deletion architectures/decentralized/solana-mining-pool/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

44 changes: 44 additions & 0 deletions config/solana-test/light-config-gcs.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Lightweight test run configuration (config/solana-test) whose model
# checkpoint is read from a Google Cloud Storage bucket instead of a
# Hugging Face Hub repo -- see [model.LLM.checkpoint.Gcs] below.

# Coordinator-level run parameters.
# NOTE(review): the *_time values are presumably durations in seconds -- confirm
# against the coordinator config type before relying on the unit.
[config]
warmup_time = 30
cooldown_time = 30
epoch_time = 60
max_round_train_time = 15
round_witness_time = 1
# A single client is enough both to initialize and to keep the run going.
min_clients = 1
init_min_clients = 1
verification_percent = 0
witness_nodes = 0
# Start and end batch sizes are identical and the warmup token count is 0,
# so presumably the global batch size stays fixed at 8 -- TODO confirm.
global_batch_size_start = 8
global_batch_size_end = 8
global_batch_size_warmup_tokens = 0
# Same value as model.LLM.lr_schedule.Cosine.total_steps further down.
total_steps = 25000
waiting_for_members_extra_time = 3

# Model definition; the sub-tables below select one variant each for
# checkpoint, data location, LR schedule and optimizer.
[model.LLM]
architecture = "HfLlama"
data_type = "Pretraining"
max_seq_len = 2048
cold_start_warmup_steps = 0

# Checkpoint source: a GCS bucket. Only `bucket` is set here; the GCS
# checkpoint type also carries an optional prefix, which is omitted in this file.
[model.LLM.checkpoint.Gcs]
bucket = "llama220minit"

# Training data is fetched over HTTP from the GCP bucket/directory named below.
[model.LLM.data_location.Http]
token_size_in_bytes = "TwoBytes"
shuffle = "DontShuffle"
[model.LLM.data_location.Http.location.Gcp]
bucket_name = "nous-pretraining-public-us"
filter_directory = "fineweb-edu-tokenized-llama2"

# Cosine learning-rate schedule: base_lr 4.0e-4, final_lr 4.0e-5, with
# 250 warmup steps starting from warmup_init_lr 0.0.
[model.LLM.lr_schedule.Cosine]
base_lr = 4.0e-4
warmup_steps = 250
warmup_init_lr = 0.0
total_steps = 25000
final_lr = 4.0e-5
# Distro optimizer settings.
# NOTE(review): the meaning of the compression_* fields is not visible here --
# check the optimizer definition before tuning them.
[model.LLM.optimizer.Distro]
clip_grad_norm = 1.0
compression_decay = 0.999
compression_chunk = 64
compression_topk = 8
quantize_1bit = true
1 change: 1 addition & 0 deletions nix/lib.nix
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ let
python312
pkg-config
perl
cargo-nextest
];

buildInputs =
Expand Down
1 change: 1 addition & 0 deletions psyche-book/src/SUMMARY.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
- [General workflow](./explain/general-workflow.md)
- [Data provider](./explain/data-provider.md)
- [Model sharing](./explain/model-sharing.md)
- [GCS Checkpoints](./explain/gcs-checkpoints.md)
- [Rewards](./explain/rewards.md)
- [Glossary](./explain/glossary.md)

Expand Down
Loading