diff --git a/egomimic/pl_utils/pl_model.py b/egomimic/pl_utils/pl_model.py index 487381be..5beaf9a1 100644 --- a/egomimic/pl_utils/pl_model.py +++ b/egomimic/pl_utils/pl_model.py @@ -78,19 +78,23 @@ def on_validation_start(self): self.model.device = self.device if self.trainer.is_global_zero: - os.makedirs( - os.path.join(self.video_dir(), f"epoch_{self.trainer.current_epoch}"), - exist_ok=True, - ) + os.makedirs(os.path.join(self.video_dir(), f"epoch_{self.trainer.current_epoch}"),exist_ok=True) + - @rank_zero_only def validation_step(self, batch, batch_idx, dataloader_idx=0): """ Run a validation step on the batch, and save that batch of images into the val_image_buffer. Once the buffer hits 1000 images, save that as a 30fps video using torchvision.io.write_video. """ + print(f"[VAL_STEP] rank={self.global_rank}, batch_idx={batch_idx}",flush=True) + batch = self.model.process_batch_for_training(batch) metrics, images_dict = self.model.forward_eval_logging(batch) + metrics = { + k: (v.to(self.device) if torch.is_tensor(v) else torch.tensor(v, device=self.device)) + for k, v in metrics.items() + } + ## images is now a dict for key, images in images_dict.items(): os.makedirs( @@ -117,10 +121,10 @@ def validation_step(self, batch, batch_idx, dataloader_idx=0): self.val_image_buffer[key].clear() self.val_counter[key] += 1 - self.log_dict(metrics) + self.log_dict(metrics, sync_dist=True) - @rank_zero_only def on_validation_end(self): + print(f"[ON_VALIDATION_END] rank={self.global_rank}",flush=True) for key, buffer in self.val_image_buffer.items(): os.makedirs( os.path.join(