|
34 | 34 | import orbax.checkpoint as ocp |
35 | 35 |
|
36 | 36 |
|
| 37 | +CHECKPOINT_TTL = 'ttl=180d' |
| 38 | + |
| 39 | + |
37 | 40 | class BaseTrainer(metaclass=abc.ABCMeta): |
38 | 41 | """Abstract parent class for all trainers.""" |
39 | 42 |
|
@@ -142,6 +145,7 @@ def __init__( |
142 | 145 | del loss_name |
143 | 146 | del metrics_name |
144 | 147 | self._train_dir = train_dir |
| 148 | + self._checkpoint_dir = os.path.join(train_dir, CHECKPOINT_TTL) |
145 | 149 | self._model = model |
146 | 150 | self._dataset_builder = dataset_builder |
147 | 151 | self._data_selector = data_selector |
@@ -200,7 +204,9 @@ def __init__( |
200 | 204 |
|
201 | 205 |  # Only used if checkpoints_steps is non-empty. Standard checkpoints are |
202 | 206 |  # saved in self._checkpoint_dir (train_dir joined with the TTL suffix); |
203 | | - self._checkpoint_dir = os.path.join(self._train_dir, 'checkpoints') |
| 207 | + self._extra_checkpoint_dir = os.path.join( |
| 208 | + self._checkpoint_dir, 'checkpoints' |
| 209 | + ) |
204 | 210 |
|
205 | 211 | # During eval, we can donate the 'batch' buffer. We don't donate the |
206 | 212 | # 'params' and 'batch_stats' buffers as we don't re-assign those values in |
@@ -256,7 +262,7 @@ def maybe_restore_from_checkpoint(self, |
256 | 262 | unreplicated_params, |
257 | 263 | unreplicated_batch_stats, |
258 | 264 | unreplicated_metrics_state, |
259 | | - train_dir=self._train_dir, |
| 265 | + train_dir=self._checkpoint_dir, |
260 | 266 | external_checkpoint_path=self._external_checkpoint_path, |
261 | 267 | orbax_checkpointer=self._orbax_checkpointer, |
262 | 268 | ) |
@@ -405,7 +411,7 @@ def _eval(self, start_step, start_time, save=True): |
405 | 411 |
|
406 | 412 | Has the side-effects of: |
407 | 413 | - synchronizing self._batch_stats across hosts |
408 | | - - checkpointing via self._save(self._train_dir) |
| 414 | + - checkpointing via self._save(self._checkpoint_dir) |
409 | 415 | - resetting self._sum_train_cost to jnp.zeros |
410 | 416 | - resetting self._time_at_prev_eval_end to the current time |
411 | 417 | - resetting self._prev_eval_step to self._global_step |
@@ -440,7 +446,7 @@ def _eval(self, start_step, start_time, save=True): |
440 | 446 | ) |
441 | 447 | self._run_eval_callbacks(report) |
442 | 448 | if save: |
443 | | - self._save(self._train_dir) |
| 449 | + self._save(self._checkpoint_dir) |
444 | 450 | steps_since_last_eval = self._global_step - self._prev_eval_step |
445 | 451 | steps_per_sec_no_eval = steps_since_last_eval / time_since_last_eval |
446 | 452 | run_time = time.time() - self._time_at_prev_eval_end |
@@ -635,7 +641,7 @@ def train(self): |
635 | 641 | self._prev_eval_step = self._global_step |
636 | 642 |
|
637 | 643 | if self._global_step in self._checkpoint_steps: |
638 | | - self._save(self._checkpoint_dir, max_to_keep=None) |
| 644 | + self._save(self._extra_checkpoint_dir, max_to_keep=None) |
639 | 645 |
|
640 | 646 | for _ in range(start_step, self._num_train_steps): |
641 | 647 | with jax.profiler.StepTraceAnnotation( |
@@ -671,7 +677,7 @@ def train(self): |
671 | 677 | self._sum_train_cost, |
672 | 678 | ) |
673 | 679 | if self._global_step in self._checkpoint_steps: |
674 | | - self._save(self._checkpoint_dir, max_to_keep=None) |
| 680 | + self._save(self._extra_checkpoint_dir, max_to_keep=None) |
675 | 681 |
|
676 | 682 | # TODO(gdahl, gilmer): consider moving this test up. |
677 | 683 | # NB: Since this test is after we increment self._global_step, having 0 |
|
0 commit comments