7 changes: 7 additions & 0 deletions run.py
@@ -21,6 +21,7 @@

parser.add_argument('-nw', '--num-workers', nargs='?', type=int, default=DEFAULTS.num_workers, help="Number of workers for DataLoader.")
parser.add_argument('--data-parallel', nargs='+', default=DEFAULTS.data_parallel, help='Device list for DataParallel in Pytorch.')
parser.add_argument('-logk', '--log-every-k-steps', nargs='?', type=int, default=DEFAULTS.log_every_k_steps, help="Stepwise logging.")
parser.add_argument('--verbose', action="store_true", help="Verbose mode.")
parser.add_argument('--force-deterministic', action="store_true", help="Use deterministic mode in Pytorch. Might require setting environment variables.")

@@ -31,6 +32,7 @@ def run_one(exp_id: str,
device: str=DEFAULTS.device,
num_workers: int=DEFAULTS.num_workers,
data_parallel: Union[list, None]=DEFAULTS.data_parallel,
log_every_k_steps: Union[int, None]=DEFAULTS.log_every_k_steps,
verbose: bool=DEFAULTS.verbose,
force_deterministic: bool=DEFAULTS.force_deterministic
):
@@ -55,6 +57,9 @@ def run_one(exp_id: str,
data_parallel : Union[list, None], optional
If not None, this specifies the device ids for DataParallel mode in Pytorch.
See https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html.
log_every_k_steps: Union[int, None], optional
If not None, log batch loss and grad_norm every k steps. Careful: this results in larger output files.
By default None (no stepwise logging).
verbose : bool, optional
Verbose mode flag.
If True, prints progress bars, model architecture and other useful information.
@@ -84,6 +89,7 @@ def run_one(exp_id: str,
data_dir=data_dir,
num_workers=num_workers,
data_parallel=data_parallel,
log_every_k_steps=log_every_k_steps,
verbose=verbose)

B.setup()
@@ -107,6 +113,7 @@ def run_one(exp_id: str,
device=args.device,
num_workers=args.num_workers,
data_parallel=args.data_parallel,
log_every_k_steps=args.log_every_k_steps,
verbose=args.verbose,
force_deterministic=args.force_deterministic)

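A minimal sketch of how the new option reaches run_one when calling it from Python rather than the CLI; the exp_id value and the other settings are illustrative, and any parameters hidden in the collapsed parts of run.py are assumed to keep their defaults.

# Illustrative only: exp_id and the other values are made up;
# log_every_k_steps is the parameter added by this PR.
from run import run_one

run_one(exp_id="mnist_sgd",            # hypothetical experiment id
        device="cuda",
        num_workers=2,
        data_parallel=None,
        log_every_k_steps=10,          # log batch loss and grad_norm every 10 steps
        verbose=True,
        force_deterministic=False)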
99 changes: 76 additions & 23 deletions stepback/base.py
@@ -25,6 +25,7 @@ def __init__(self, name: str,
data_dir: str=DEFAULTS.data_dir,
num_workers: int=DEFAULTS.num_workers,
data_parallel: Union[list, None]=DEFAULTS.data_parallel,
log_every_k_steps: Union[int, None]=DEFAULTS.log_every_k_steps,
verbose: bool=DEFAULTS.verbose):
"""The main class. Performs one single training run plus evaluation.

@@ -37,14 +38,17 @@ def __init__(self, name: str,
Needs to have the keys ['loss_func', 'score_func', 'opt'].
device : str, optional
Device string, by default 'cuda'
If 'cuda' is specified but not available on the system, it switches to CPU.
If 'cuda' is specified but not available on the system, it switches to MPS or CPU.
data_dir : str, optional
Directory where datasets can be found, by default 'data/'
num_workers : int, optional
Number of workers for DataLoader, by default 0
data_parallel : Union[list, None], optional
If not None, this specifies the device ids for DataParallel mode in Pytorch. By default None.
See https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html.
log_every_k_steps: Union[int, None], optional
If not None, log batch loss and grad_norm every k steps. Careful: this results in larger output files.
By default None (no stepwise logging).
verbose : bool, optional
Verbose mode flag, by default False.
If True, prints progress bars, model architecture and other useful information.
@@ -55,18 +59,27 @@ def __init__(self, name: str,
self.data_dir = data_dir
self.num_workers = num_workers
self.data_parallel = data_parallel
self.log_every_k_steps = log_every_k_steps
self.verbose = verbose


# Device handling
print("CUDA available? ", torch.cuda.is_available())

if torch.cuda.is_available():
self.device = torch.device(device)
else:
self.device = torch.device('cpu')
# use MPS if available
try:
if torch.backends.mps.is_available() and device != "cpu":
self.device = torch.device('mps')
else:
self.device = torch.device('cpu')
except:
self.device = torch.device('cpu')

print("Using device: ", self.device)

# Seeding
self.seed = 1234567
self.run_seed = 456789 + config.get('run_id', 0)
torch.backends.cudnn.benchmark = False
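The MPS fallback added above can be read as the following standalone sketch; the helper name resolve_device is not part of the PR, and except AttributeError replaces the bare except only to make the guarded failure mode explicit.

import torch

def resolve_device(requested: str = "cuda") -> torch.device:
    # Prefer CUDA, then MPS (Apple silicon), then CPU, mirroring Base.__init__ above.
    if torch.cuda.is_available():
        return torch.device(requested)
    try:
        if torch.backends.mps.is_available() and requested != "cpu":
            return torch.device("mps")
    except AttributeError:
        # Older torch builds have no torch.backends.mps attribute.
        pass
    return torch.device("cpu")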
@@ -112,9 +125,13 @@ def _setup_data(self):
# construct train loader
_gen = torch.Generator()
_gen.manual_seed(self.run_seed)
self.train_loader = DataLoader(self.train_set, drop_last=True, shuffle=True, generator=_gen,
self.train_loader = DataLoader(self.train_set,
drop_last=True,
shuffle=True,
generator=_gen,
batch_size=self.config['batch_size'],
num_workers=self.num_workers)
num_workers=self.num_workers
)

return

@@ -155,14 +172,15 @@ def setup(self):
opt_obj, hyperp = get_optimizer(self.config['opt'])

self._init_opt(opt_obj, hyperp)
self.sched = get_scheduler(self.config['opt'], self.opt)

self.sched, self._step_scheduler_on_epoch = get_scheduler(self.config['opt'], self.opt)

#============ Results ==============
opt_val = self._compute_opt_value()


# Store useful information as summary
self.results['summary']['num_batches_per_epoch'] = len(self.train_loader)
self.results['summary']['step_scheduler_on_epoch'] = self._step_scheduler_on_epoch
if opt_val is not None:
self.results['summary']['opt_val'] = opt_val
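setup() now unpacks a pair from get_scheduler: the scheduler object and a flag saying whether it is stepped once per epoch or after every batch. get_scheduler itself is not part of this diff; the sketch below only illustrates that implied contract, and the 'lr_schedule' key and schedule names are assumptions.

import torch

def get_scheduler_sketch(opt_config: dict, opt: torch.optim.Optimizer):
    # Returns (scheduler, step_on_epoch); Base.train_epoch steps the scheduler
    # after every batch when step_on_epoch is False, otherwise once per epoch.
    if opt_config.get("lr_schedule") == "cosine-per-step":    # assumed config key/value
        sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=1000)
        step_on_epoch = False
    else:
        sched = torch.optim.lr_scheduler.LambdaLR(opt, lambda t: 1.0)  # constant lr
        step_on_epoch = True
    return sched, step_on_epoch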

@@ -180,7 +198,12 @@ def run(self):
start_time = str(datetime.datetime.now())
score_list = []
self._epochs_trained = 0


_init_model_norm = l2_norm(self.model)
self.results['summary']['init_model_norm'] = _init_model_norm
if self.verbose:
print(f"Initial model L2-norm: ", _init_model_norm)

for epoch in range(self.config['max_epoch']):

print(f"Epoch {epoch}, current learning rate", self.sched.get_last_lr()[0])
@@ -198,7 +221,10 @@ def run(self):
score_dict['train_epoch_time'] = e_time - s_time
score_dict['model_norm'] = l2_norm(self.model)
score_dict['grad_norm'] = grad_norm(self.model)


if self.log_every_k_steps is not None:
score_dict['step_logs'] = copy.deepcopy(self._log_stepwise)

# Validation
with torch.no_grad():

@@ -207,11 +233,11 @@

train_dict = self.evaluate(self.train_set,
metric_dict = metric_dict,
)
)

val_dict = self.evaluate(self.val_set,
metric_dict = metric_dict,
)
)

# Record metrics
score_dict.update(train_dict)
@@ -233,11 +259,13 @@
self._epochs_trained += 1

end_time = str(datetime.datetime.now())

# ==== store =====================
self.results['history'] = copy.deepcopy(score_list)
self.results['summary']['start_time'] = start_time
self.results['summary']['end_time'] = end_time
self.results['summary']['final_model_norm'] = l2_norm(self.model)

return

def train_epoch(self):
@@ -253,7 +281,13 @@ def train_epoch(self):

t0 = time.time()

for batch in pbar:
# Reset logging dictionaries
if self.log_every_k_steps is not None:
self._log_stepwise = {"loss": {}, "grad_norm": {}}
if not self._step_scheduler_on_epoch:
self._log_stepwise["lr"] = {}

for step_counter, batch in enumerate(pbar):
# Move batch to device
data, targets = batch['data'].to(device=self.device), batch['targets'].to(device=self.device)

@@ -268,24 +302,37 @@
closure = lambda: self.training_loss.compute(out, targets)

# see optim/README.md for explanation
if hasattr(self.opt,"prestep"):
if hasattr(self.opt, "prestep"):
ind = batch['ind'].to(device=self.device) # indices of batch members
self.opt.prestep(out, targets, ind, self.training_loss.name)

# Here the magic happens
loss_val = self.opt.step(closure=closure)

if self.device != torch.device('cpu'):
torch.cuda.synchronize()
self._sync_device()

timings_dataloader.append(t1-t0)
t0 = time.time() # model timing ends
timings_model.append(t0-t1)

pbar.set_description(f'Training - loss={loss_val:.3f} - time data: last={timings_dataloader[-1]:.3f},(mean={np.mean(timings_dataloader):.3f}) - time model+step: last={timings_model[-1]:.3f}(mean={np.mean(timings_model):.3f})')

# Log loss_val and grad_norm every k steps
if self.log_every_k_steps is not None:
total_step_counter = len(self.train_loader) * self._epochs_trained + step_counter
if step_counter % self.log_every_k_steps == 0:
self._log_stepwise["loss"][total_step_counter] = loss_val.item()
self._log_stepwise["grad_norm"][total_step_counter] = grad_norm(self.model)
if not self._step_scheduler_on_epoch:
self._log_stepwise["lr"][total_step_counter] = self.sched.get_last_lr()[0]


if not self._step_scheduler_on_epoch:
self.sched.step()

# update learning rate
self.sched.step()
if self._step_scheduler_on_epoch:
self.sched.step()

return
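For reference, the shape of the per-epoch step log that train_epoch builds and run() copies into score_dict['step_logs']; the values below are invented, and the keys are the global step counters computed above, here with log_every_k_steps=10.

# Illustrative only: keys follow
# total_step_counter = len(train_loader) * epochs_trained + step_counter.
example_step_logs = {
    "loss":      {200: 0.693, 210: 0.651, 220: 0.640},
    "grad_norm": {200: 1.42,  210: 1.31,  220: 1.27},
    # "lr" is only recorded when the scheduler is stepped per batch
    "lr":        {200: 0.0100, 210: 0.0099, 220: 0.0098},
}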

@@ -323,17 +370,16 @@ def evaluate(self, dataset, metric_dict):
# metric takes average over batch ==> multiply with batch size
score_dict[_met] += _met_fun.compute(out, targets).item() * data.shape[0]

timings_dataloader.append(t1-t0)
if self.device != torch.device('cpu'):
torch.cuda.synchronize()
timings_dataloader.append(t1-t0)
self._sync_device()

t0 = time.time()
timings_model.append(t0-t1)

pbar.set_description(f'Validating {dataset.split}')
pbar.set_description(f'Validating {dataset.split} - time data: last={timings_dataloader[-1]:.3f}(mean={np.mean(timings_dataloader):.3f}) - time model: last={timings_model[-1]:.3f}(mean={np.mean(timings_model):.3f})')




for _met in metric_dict.keys():
# Get from sum to average
score_dict[_met] = float(score_dict[_met] / len(dl.dataset))
@@ -353,6 +399,13 @@ def save_checkpoint(self, path):

return

def _sync_device(self):
if self.device == torch.device('mps'):
torch.mps.synchronize()
elif self.device != torch.device('cpu'):
torch.cuda.synchronize()

def _compute_opt_value(self):
"""
For a linear model, the problem is convex and we can compute the optimal value
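The _sync_device helper added above exists because CUDA and MPS kernels run asynchronously: reading time.time() right after opt.step() would mostly measure the kernel launch, not the work. A hedged illustration of the same idea; the helper name timed_step is not part of the PR.

import time
import torch

def timed_step(step_fn, device: torch.device):
    # step_fn could be e.g. lambda: opt.step(closure=closure)
    t0 = time.time()
    out = step_fn()
    # Block until queued kernels finish so the elapsed time is meaningful.
    if device.type == "mps":
        torch.mps.synchronize()
    elif device.type == "cuda":
        torch.cuda.synchronize()
    return out, time.time() - t0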
77 changes: 54 additions & 23 deletions stepback/datasets/libsvm.py
@@ -8,58 +8,89 @@

SPLIT_SEED = 12345678

LIBSVM_URL = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/"
LIBSVM_BINARY_URL = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/"
LIBSVM_MULTICLASS_URL = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/"

# mapping libsvm names to download links
LIBSVM_NAME_MAP = {"rcv1" : "rcv1_train.binary.bz2",
MULTICLASS_NAME_MAP = {"dna": "dna.scale",
"wine": "wine.scale"
}

BINARY_NAME_MAP = {"rcv1" : "rcv1_train.binary.bz2",
"mushrooms" : "mushrooms",
"a1a" : "a1a",
"ijcnn" : "ijcnn1.tr.bz2",
"breast-cancer" : "breast-cancer_scale"
}
}
LIBSVM_NAMES = list(MULTICLASS_NAME_MAP.keys()) + list(BINARY_NAME_MAP.keys())


def get_libsvm(split, name, path, train_size=0.8):
X, y = load_libsvm(name, path + '/libsvm')

if np.all(np.isin(y, [0,1])):
y = y*2 - 1 # go from 0,1 to -1,1

if name == 'breast-cancer':
y[y==2] = 1
y[y==4] = -1

labels = np.unique(y)
assert np.all(np.isin(y, [-1,1])), f"Sth went wrong with class labels, have {labels}."
if name in BINARY_NAME_MAP.keys():
multiclass = False
elif name in MULTICLASS_NAME_MAP.keys():
multiclass = True
else:
raise KeyError(f"Unknwon dataset name {name} from LIBSVM. Need to be added to name mapping.")

libsvm_path = os.path.join(path, "libsvm")
X, y = load_libsvm(name, libsvm_path, multiclass)

if not multiclass:
# use -1, 1 labels in binary case
if np.all(np.isin(y, [0,1])):
y = y*2 - 1

# manual label fix for breast cancer dataset
if name == 'breast-cancer':
y[y==2] = 1
y[y==4] = -1

unique_labels = [-1, 1]
assert np.all(np.isin(y, unique_labels)), f"Something went wrong with class labels, have {np.unique(y)}."
else:
unique_labels = list(np.unique(y).astype("int"))
# try to map class labels into [0, C)
if min(unique_labels) > 0:
y = y - min(unique_labels)

print(f"Dataset labels (before split): {unique_labels}")

# use fixed seed for train/val split
X_train, X_test, Y_train, Y_test = train_test_split(X, y,
X_train, X_test, Y_train, Y_test = train_test_split(X,
y,
train_size=train_size,
shuffle=True,
random_state=SPLIT_SEED)

random_state=SPLIT_SEED
)

# for multiclass, we need Long Tensors
# for binary, FloatTensor is ok
if split == 'train':
X = torch.FloatTensor(X_train.toarray())
Y = torch.FloatTensor(Y_train)
Y = torch.FloatTensor(Y_train) if not multiclass else torch.LongTensor(Y_train)
else:
X = torch.FloatTensor(X_test.toarray())
Y = torch.FloatTensor(Y_test)
Y = torch.FloatTensor(Y_test) if not multiclass else torch.LongTensor(Y_test)

ds = torch.utils.data.TensorDataset(X, Y)

return ds



def load_libsvm(name, path):
def load_libsvm(name, path, multiclass=False):
if not os.path.exists(path):
os.mkdir(path)

fn = LIBSVM_NAME_MAP[name]
if multiclass:
fn = MULTICLASS_NAME_MAP[name]
else:
fn = BINARY_NAME_MAP[name]
filename = os.path.join(path, fn)

_url = LIBSVM_MULTICLASS_URL if multiclass else LIBSVM_BINARY_URL
if not os.path.exists(filename):
url = urllib.parse.urljoin(LIBSVM_URL, fn)
url = urllib.parse.urljoin(_url, fn)
print("Downloading from %s" % url)
urllib.request.urlretrieve(url, filename)
print("Download complete.")
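A hedged usage sketch of the extended loader for one of the new multiclass datasets; the 'data' root directory is an assumption, while the call itself follows the get_libsvm signature above. For a dataset whose raw labels start at 1, the labels are shifted so classes start at 0, and multiclass targets come back as a LongTensor.

# Illustrative only: 'data' as dataset root is assumed; 'dna' is one of the
# multiclass names registered in MULTICLASS_NAME_MAP above.
from stepback.datasets.libsvm import get_libsvm

train_ds = get_libsvm(split="train", name="dna", path="data", train_size=0.8)
X, y = train_ds.tensors                # TensorDataset keeps the two tensors
print(X.dtype, y.dtype)                # expected: torch.float32 torch.int64
print(int(y.min()), int(y.max()))      # expected: labels in [0, num_classes)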