diff --git a/run.py b/run.py
index 3b37cdf..2424dba 100644
--- a/run.py
+++ b/run.py
@@ -21,6 +21,7 @@
 parser.add_argument('-nw', '--num-workers', nargs='?', type=int, default=DEFAULTS.num_workers, help="Number of workers for DataLoader.")
 parser.add_argument('--data-parallel', nargs='+', default=DEFAULTS.data_parallel, help='Device list for DataParallel in Pytorch.')
+parser.add_argument('-logk', '--log-every-k-steps', nargs='?', type=int, default=DEFAULTS.log_every_k_steps, help="Stepwise logging.")
 parser.add_argument('--verbose', action="store_true", help="Verbose mode.")
 parser.add_argument('--force-deterministic', action="store_true", help="Use deterministic mode in Pytorch. Might require setting environment variables.")
 
@@ -31,6 +32,7 @@ def run_one(exp_id: str,
             device: str=DEFAULTS.device,
             num_workers: int=DEFAULTS.num_workers,
             data_parallel: Union[list, None]=DEFAULTS.data_parallel,
+            log_every_k_steps: Union[int, None]=DEFAULTS.log_every_k_steps,
             verbose: bool=DEFAULTS.verbose,
             force_deterministic: bool=DEFAULTS.force_deterministic
             ):
@@ -55,6 +57,9 @@ def run_one(exp_id: str,
     data_parallel : Union[list, None], optional
         If not None, this specifies the device ids for DataParallel mode in Pytorch.
         See https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html.
+    log_every_k_steps: Union[int, None], optional
+        If not None, log batch loss and grad_norm every k steps. Careful: this results in larger output files.
+        By default None (no stepwise logging).
     verbose : bool, optional
         Verbose mode flag.
         If True, prints progress bars, model architecture and other useful information.
@@ -84,6 +89,7 @@ def run_one(exp_id: str,
              data_dir=data_dir,
              num_workers=num_workers,
              data_parallel=data_parallel,
+             log_every_k_steps=log_every_k_steps,
              verbose=verbose)
 
     B.setup()
@@ -107,6 +113,7 @@ def run_one(exp_id: str,
             device=args.device,
             num_workers=args.num_workers,
             data_parallel=args.data_parallel,
+            log_every_k_steps=args.log_every_k_steps,
             verbose=args.verbose,
             force_deterministic=args.force_deterministic)
diff --git a/stepback/base.py b/stepback/base.py
index d4061c5..ffe4262 100644
--- a/stepback/base.py
+++ b/stepback/base.py
@@ -25,6 +25,7 @@ def __init__(self, name: str,
                  data_dir: str=DEFAULTS.data_dir,
                  num_workers: int=DEFAULTS.num_workers,
                  data_parallel: Union[list, None]=DEFAULTS.data_parallel,
+                 log_every_k_steps: Union[int, None]=DEFAULTS.log_every_k_steps,
                  verbose: bool=DEFAULTS.verbose):
         """The main class. Performs one single training run plus evaluation.
 
@@ -37,7 +38,7 @@ def __init__(self, name: str,
             Needs to have the keys ['loss_func', 'score_func', 'opt'].
         device : str, optional
             Device string, by default 'cuda'
-            If 'cuda' is specified, but not available on system, it switches to CPU.
+            If 'cuda' is specified, but not available on system, it switches to MPS or CPU.
         data_dir : str, optional
             Directory where datasets can be found, by default 'data/'
         num_workers : int, optional
@@ -45,6 +46,9 @@ def __init__(self, name: str,
         data_parallel : Union[list, None], optional
             If not None, this specifies the device ids for DataParallel mode in Pytorch. By default None.
             See https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html.
+        log_every_k_steps: Union[int, None], optional
+            If not None, log batch loss and grad_norm every k steps. Careful: this results in larger output files.
+            By default None (no stepwise logging).
         verbose : bool, optional
             Verbose mode flag, by default False.
             If True, prints progress bars, model architecture and other useful information.
@@ -55,18 +59,27 @@ def __init__(self, name: str,
         self.data_dir = data_dir
         self.num_workers = num_workers
         self.data_parallel = data_parallel
+        self.log_every_k_steps = log_every_k_steps
         self.verbose = verbose
-        
+        # Device handling
         print("CUDA available? ", torch.cuda.is_available())
         if torch.cuda.is_available():
             self.device = torch.device(device)
         else:
-            self.device = torch.device('cpu')
+            # use MPS if available
+            try:
+                if torch.backends.mps.is_available() and device != "cpu":
+                    self.device = torch.device('mps')
+                else:
+                    self.device = torch.device('cpu')
+            except:
+                self.device = torch.device('cpu')
         print("Using device: ", self.device)
 
+        # Seeding
         self.seed = 1234567
         self.run_seed = 456789 + config.get('run_id', 0)
         torch.backends.cudnn.benchmark = False
@@ -112,9 +125,13 @@ def _setup_data(self):
         # construct train loader
         _gen = torch.Generator()
         _gen.manual_seed(self.run_seed)
-        self.train_loader = DataLoader(self.train_set, drop_last=True, shuffle=True, generator=_gen,
+        self.train_loader = DataLoader(self.train_set,
+                                       drop_last=True,
+                                       shuffle=True,
+                                       generator=_gen,
                                        batch_size=self.config['batch_size'],
-                                       num_workers=self.num_workers)
+                                       num_workers=self.num_workers
+                                       )
 
         return
 
@@ -155,14 +172,15 @@ def setup(self):
         opt_obj, hyperp = get_optimizer(self.config['opt'])
         self._init_opt(opt_obj, hyperp)
-        
-        self.sched = get_scheduler(self.config['opt'], self.opt)
+
+        self.sched, self._step_scheduler_on_epoch = get_scheduler(self.config['opt'], self.opt)
 
         #============ Results ==============
         opt_val = self._compute_opt_value()
-        # Store useful information as summary
+        self.results['summary']['num_batches_per_epoch'] = len(self.train_loader)
+        self.results['summary']['step_scheduler_on_epoch'] = self._step_scheduler_on_epoch
         if opt_val is not None:
             self.results['summary']['opt_val'] = opt_val
@@ -180,7 +198,12 @@ def run(self):
         start_time = str(datetime.datetime.now())
         score_list = []
         self._epochs_trained = 0
-        
+
+        _init_model_norm = l2_norm(self.model)
+        self.results['summary']['init_model_norm'] = _init_model_norm
+        if self.verbose:
+            print(f"Initial model L2-norm: ", _init_model_norm)
+
         for epoch in range(self.config['max_epoch']):
             print(f"Epoch {epoch}, current learning rate", self.sched.get_last_lr()[0])
@@ -198,7 +221,10 @@ def run(self):
             score_dict['train_epoch_time'] = e_time - s_time
             score_dict['model_norm'] = l2_norm(self.model)
             score_dict['grad_norm'] = grad_norm(self.model)
-            
+
+            if self.log_every_k_steps is not None:
+                score_dict['step_logs'] = copy.deepcopy(self._log_stepwise)
+
             # Validation
             with torch.no_grad():
@@ -207,11 +233,11 @@ def run(self):
                 train_dict = self.evaluate(self.train_set,
                                            metric_dict = metric_dict,
-                                          )
+                                           )
                 val_dict = self.evaluate(self.val_set,
                                          metric_dict = metric_dict,
-                                        )
+                                         )
 
             # Record metrics
             score_dict.update(train_dict)
@@ -233,11 +259,13 @@ def run(self):
             self._epochs_trained += 1
 
         end_time = str(datetime.datetime.now())
-        
+
         # ==== store =====================
         self.results['history'] = copy.deepcopy(score_list)
         self.results['summary']['start_time'] = start_time
         self.results['summary']['end_time'] = end_time
+        self.results['summary']['final_model_norm'] = l2_norm(self.model)
+
         return
 
     def train_epoch(self):
@@ -253,7 +281,13 @@ def train_epoch(self):
 
         t0 = time.time()
 
-        for batch in pbar:
+        # Reset logging dictionaries
+        if self.log_every_k_steps is not None:
+            self._log_stepwise = {"loss": {}, "grad_norm": {}}
+            if not self._step_scheduler_on_epoch:
+                self._log_stepwise["lr"] = {}
+
+        for step_counter, batch in enumerate(pbar):
 
             # Move batch to device
             data, targets = batch['data'].to(device=self.device), batch['targets'].to(device=self.device)
@@ -268,24 +302,37 @@ def train_epoch(self):
             closure = lambda: self.training_loss.compute(out, targets)
 
             # see optim/README.md for explanation
-            if hasattr(self.opt,"prestep"):
+            if hasattr(self.opt, "prestep"):
                 ind = batch['ind'].to(device=self.device) # indices of batch members
                 self.opt.prestep(out, targets, ind, self.training_loss.name)
 
             # Here the magic happens
             loss_val = self.opt.step(closure=closure)
-            if self.device != torch.device('cpu'):
-                torch.cuda.synchronize()
+            self._sync_device()
+
             timings_dataloader.append(t1-t0)
             t0 = time.time() # model timing ends
             timings_model.append(t0-t1)
 
             pbar.set_description(f'Training - loss={loss_val:.3f} - time data: last={timings_dataloader[-1]:.3f},(mean={np.mean(timings_dataloader):.3f}) - time model+step: last={timings_model[-1]:.3f}(mean={np.mean(timings_model):.3f})')
 
+            # Log loss_val and grad_norm every k steps
+            if self.log_every_k_steps is not None:
+                total_step_counter = len(self.train_loader) * self._epochs_trained + step_counter
+                if step_counter % self.log_every_k_steps == 0:
+                    self._log_stepwise["loss"][total_step_counter] = loss_val.item()
+                    self._log_stepwise["grad_norm"][total_step_counter] = grad_norm(self.model)
+                    if not self._step_scheduler_on_epoch:
+                        self._log_stepwise["lr"][total_step_counter] = self.sched.get_last_lr()[0]
+
+
+            if not self._step_scheduler_on_epoch:
+                self.sched.step()
 
         # update learning rate
-        self.sched.step()
+        if self._step_scheduler_on_epoch:
+            self.sched.step()
 
         return
 
@@ -323,17 +370,16 @@ def evaluate(self, dataset, metric_dict):
 
                 # metric takes average over batch ==> multiply with batch size
                 score_dict[_met] += _met_fun.compute(out, targets).item() * data.shape[0]
 
-            timings_dataloader.append(t1-t0)
-            if self.device != torch.device('cpu'):
-                torch.cuda.synchronize()
+            timings_dataloader.append(t1-t0)
+            self._sync_device()
+
             t0 = time.time()
             timings_model.append(t0-t1)
 
             pbar.set_description(f'Validating {dataset.split}')
             pbar.set_description(f'Validating {dataset.split} - time data: last={timings_dataloader[-1]:.3f}(mean={np.mean(timings_dataloader):.3f}) - time model: last={timings_model[-1]:.3f}(mean={np.mean(timings_model):.3f})')
-
-
+
         for _met in metric_dict.keys():
             # Get from sum to average
             score_dict[_met] = float(score_dict[_met] / len(dl.dataset))
@@ -353,6 +399,13 @@ def save_checkpoint(self, path):
 
         return
 
+    def _sync_device(self):
+        if self.device == torch.device('mps'):
+            torch.mps.synchronize()
+        else:
+            if self.device != torch.device('cpu'):
+                torch.cuda.synchronize()
+
     def _compute_opt_value(self):
         """
         For linear model, the problem is convex and we can compute the optimal value
diff --git a/stepback/datasets/libsvm.py b/stepback/datasets/libsvm.py
index 405e2cd..f2e133e 100644
--- a/stepback/datasets/libsvm.py
+++ b/stepback/datasets/libsvm.py
@@ -8,58 +8,89 @@
 
 SPLIT_SEED = 12345678
 
-LIBSVM_URL = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/"
+LIBSVM_BINARY_URL = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/"
+LIBSVM_MULTICLASS_URL = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/"
 
 # mapping libsvm names to download links
-LIBSVM_NAME_MAP = {"rcv1" : "rcv1_train.binary.bz2",
+MULTICLASS_NAME_MAP = {"dna": "dna.scale",
+                       "wine": "wine.scale"
+}
+
+BINARY_NAME_MAP = {"rcv1" : "rcv1_train.binary.bz2",
                    "mushrooms" : "mushrooms",
                    "a1a" : "a1a",
                    "ijcnn" : "ijcnn1.tr.bz2",
                    "breast-cancer" : "breast-cancer_scale"
-                   }
+}
+LIBSVM_NAMES = list(MULTICLASS_NAME_MAP.keys()) + list(BINARY_NAME_MAP.keys())
 
 def get_libsvm(split, name, path, train_size=0.8):
-    X, y = load_libsvm(name, path + '/libsvm')
-
-    if np.all(np.isin(y, [0,1])):
-        y = y*2 - 1 # go from 0,1 to -1,1
-
-    if name == 'breast-cancer':
-        y[y==2] = 1
-        y[y==4] = -1
-
-    labels = np.unique(y)
-    assert np.all(np.isin(y, [-1,1])), f"Sth went wrong with class labels, have {labels}."
+    if name in BINARY_NAME_MAP.keys():
+        multiclass = False
+    elif name in MULTICLASS_NAME_MAP.keys():
+        multiclass = True
+    else:
+        raise KeyError(f"Unknown dataset name {name} from LIBSVM. Need to be added to name mapping.")
+    libsvm_path = os.path.join(path, "libsvm")
+    X, y = load_libsvm(name, libsvm_path, multiclass)
+
+    if not multiclass:
+        # use -1, 1 labels in binary case
+        if np.all(np.isin(y, [0,1])):
+            y = y*2 - 1
+
+        # manual label fix for breast cancer dataset
+        if name == 'breast-cancer':
+            y[y==2] = 1
+            y[y==4] = -1
+
+        unique_labels = [-1, 1]
+        assert np.all(np.isin(y, unique_labels)), f"Sth went wrong with class labels, have {np.unique(y)}."
+    else:
+        unique_labels = list(np.unique(y).astype("int"))
+        # try to achieve class labels [0,C)
+        if min(unique_labels) > 0:
+            y = y - min(unique_labels)
+
+    print(f"Dataset labels (before split): {unique_labels}")
+
     # use fixed seed for train/val split
-    X_train, X_test, Y_train, Y_test = train_test_split(X, y,
+    X_train, X_test, Y_train, Y_test = train_test_split(X,
+                                                        y,
                                                         train_size=train_size,
                                                         shuffle=True,
-                                                        random_state=SPLIT_SEED)
-
+                                                        random_state=SPLIT_SEED
+                                                        )
+
+    # for multiclass, we need Long Tensors
+    # for binary, FloatTensor is ok
     if split == 'train':
         X = torch.FloatTensor(X_train.toarray())
-        Y = torch.FloatTensor(Y_train)
+        Y = torch.FloatTensor(Y_train) if not multiclass else torch.LongTensor(Y_train)
     else:
         X = torch.FloatTensor(X_test.toarray())
-        Y = torch.FloatTensor(Y_test)
+        Y = torch.FloatTensor(Y_test) if not multiclass else torch.LongTensor(Y_test)
 
     ds = torch.utils.data.TensorDataset(X, Y)
     return ds
 
-
-def load_libsvm(name, path):
+def load_libsvm(name, path, multiclass=False):
     if not os.path.exists(path):
         os.mkdir(path)
 
-    fn = LIBSVM_NAME_MAP[name]
+    if multiclass:
+        fn = MULTICLASS_NAME_MAP[name]
+    else:
+        fn = BINARY_NAME_MAP[name]
 
     filename = os.path.join(path, fn)
+    _url = LIBSVM_MULTICLASS_URL if multiclass else LIBSVM_BINARY_URL
 
     if not os.path.exists(filename):
-        url = urllib.parse.urljoin(LIBSVM_URL, fn)
+        url = urllib.parse.urljoin(_url, fn)
         print("Downloading from %s" % url)
         urllib.request.urlretrieve(url, filename)
         print("Download complete.")
diff --git a/stepback/datasets/main.py b/stepback/datasets/main.py
index d1f92a7..73e2307 100644
--- a/stepback/datasets/main.py
+++ b/stepback/datasets/main.py
@@ -7,7 +7,7 @@
 from .cifar import get_cifar10, get_cifar100
 from .mnist import get_mnist
 from .synthetic import get_synthetic_matrix_fac, get_synthetic_linear
-from .libsvm import LIBSVM_NAME_MAP, get_libsvm
+from .libsvm import LIBSVM_NAMES, get_libsvm
 from .sensor import get_sensor
 from .imagenet32 import get_imagenet32
 from .imagenet import get_imagenet
@@ -65,7 +65,7 @@ def get_dataset(config: dict, split: str, seed: int, path: str) -> DataClass:
         ds = get_synthetic_linear(classify=classify, split=split, seed=seed, **kwargs)
 
-    elif name in LIBSVM_NAME_MAP.keys():
+    elif name in LIBSVM_NAMES:
         ds = get_libsvm(split=split, name=name, path=path, **kwargs)
 
     elif name == 'imagenet32':
diff --git a/stepback/defaults.py b/stepback/defaults.py
index e24b263..04dc21c 100644
--- a/stepback/defaults.py
+++ b/stepback/defaults.py
@@ -11,6 +11,7 @@ class Dotdict(dict):
             'device': 'cuda',
             'num_workers': 0,
             'data_parallel': None,
+            'log_every_k_steps': None,
             'verbose': False,
             'force_deterministic': False
             }
diff --git a/stepback/optim/main.py b/stepback/optim/main.py
index 0dbffd4..8b66210 100644
--- a/stepback/optim/main.py
+++ b/stepback/optim/main.py
@@ -1,5 +1,5 @@
 import torch
-from torch.optim.lr_scheduler import LambdaLR, StepLR
+from torch.optim.lr_scheduler import LambdaLR, StepLR, SequentialLR
 import warnings
 from typing import Tuple
 
@@ -154,21 +154,26 @@ def get_scheduler(config: dict, opt: torch.optim.Optimizer) -> torch.optim.lr_sc
     """
     # if not specified, use constant step sizes
     name = config.get('lr_schedule', 'constant')
+
+    # default is to step scheduler end of epoch
+    # but with this arg we can step scheduler after each step
+    step_on_epoch = not config.get('stepwise_schedule')
+
+    warmup_steps = config.get('warmup_steps', 0)
+    # value is multiplied with initial lr in all cases
     if name == 'constant':
-        lr_fun = lambda epoch: 1 # this value is multiplied with initial lr
-        scheduler = LambdaLR(opt, lr_lambda=lr_fun)
-
-    elif name == 'linear':
-        lr_fun = lambda epoch: 1/(epoch+1) # this value is multiplied with initial lr
+        #lr_fun = lambda t: warmup_lr + (1-warmup_lr)*t/warmup_steps if t < warmup_steps else 1
+        lr_fun = lambda t: 1
         scheduler = LambdaLR(opt, lr_lambda=lr_fun)
 
     elif name == 'sqrt':
-        lr_fun = lambda epoch: (epoch+1)**(-1/2) # this value is multiplied with initial lr
+        #lr_fun = lambda t: warmup_lr + (1-warmup_lr)*t/warmup_steps if t < warmup_steps else (t-warmup_steps+1)**(-1/2)
+        lr_fun = lambda t: (t+1)**(-1/2)
         scheduler = LambdaLR(opt, lr_lambda=lr_fun)
 
     elif 'exponential' in name:
-        # use sth like 'exponential_60_0.5': decay by factor 0.5 every 60 epochs
+        # use sth like 'exponential_60_0.5': decay by factor 0.5 every 60 epochs/steps
        step_size = int(name.split('_')[1])
        gamma = float(name.split('_')[2])
        scheduler = StepLR(opt, step_size=step_size, gamma=gamma)
@@ -176,4 +181,10 @@ def get_scheduler(config: dict, opt: torch.optim.Optimizer) -> torch.optim.lr_sc
     else:
         raise ValueError(f"Unknown learning rate schedule name {name}.")
 
-    return scheduler
\ No newline at end of file
+    if warmup_steps > 0:
+        warmup_lr = 1e-10
+        _warmup = lambda t: warmup_lr + (1-warmup_lr)*t/warmup_steps
+        warmup_scheduler = LambdaLR(opt, lr_lambda=_warmup)
+        scheduler = SequentialLR(opt, [warmup_scheduler, scheduler], milestones=[warmup_steps])
+
+    return scheduler, step_on_epoch
\ No newline at end of file
diff --git a/stepback/record.py b/stepback/record.py
index 3d05828..b093a5e 100644
--- a/stepback/record.py
+++ b/stepback/record.py
@@ -6,7 +6,7 @@
 import copy
 import itertools
 import os
-from typing import Union
+from typing import Union, Optional, Tuple
 import warnings
 
 from pandas.api.types import is_numeric_dtype
@@ -14,7 +14,6 @@
 from .defaults import DEFAULTS
 
-
 SCORE_NAMES = {'train_loss': 'Training loss',
                'val_loss': 'Validation loss',
                'train_score': 'Training score',
@@ -22,9 +21,9 @@
                'model_norm': r'$\|x^k\|$',
                'grad_norm': r'$\|g_k\|$',
                'fstar': r'$f_*^k$'
-               }
+}
 
-AES = { 'sgd': {'color': '#7fb285', 'markevery': 15, 'zorder': 7},
+AES = { 'sgd': {'color': "#f68427", 'markevery': 15, 'zorder': 7},
         'sgd-m': {'color': '#de9151', 'markevery': 8, 'zorder': 8},
         'adam': {'color': '#f34213', 'markevery': 10, 'zorder': 9},
         'adamw': {'color': '#f34213', 'markevery': 10, 'zorder': 9},
@@ -37,16 +36,13 @@
         'adabound': {'color': '#4f9d69', 'markevery': 10, 'zorder': 5},
         'lion': {'color': '#dbabab', 'markevery': 10, 'zorder': 4},
         'default': {'color': 'grey','markevery': 3, 'zorder': 1},
-        }
+}
 
 # more colors:
-#F7CE5B
-#4FB0C6
-#648381
-#F7B801
-#7ea2aa
+#7fb285
 
 ALL_MARKER = ('o', 'v', 'H', 's', '>', '<' , '^', 'D', 'x')
+nan_mean_fun = lambda x: x.mean(skipna=False)
 
 class Record:
     def __init__(self,
@@ -83,30 +79,41 @@ def __init__(self,
         self.base_df = self._build_base_df(agg='mean')
 
         return
 
-    def filter(self, drop=dict(), keep=dict()):
-        """Filter out by columns in id_df. Drops if any condition is true.
-        For example, use exclude = {'name': 'adam'} to drop all results from Adam.
+    def filter(self, drop=dict(), keep=dict(), any=False):
+        """Filter out by columns in id_df.
+
+        if any=False (default):
+            * Drops if any drop condition is true.
+            * Keeps only if all keep conditions are true.
+        if any=True:
+            * Drops only if all drop conditions are true.
+            * Keeps if any keep condition is true.
+
+        For example, use drop={'name': 'adam'} to drop all results from Adam.
         NOTE: This overwrites the base_df and id_df object.
""" all_ix = list() - for k,v in drop.items(): + for k, v in drop.items(): if not isinstance(v, list): v = [v] # if single value is given convert to list ix = ~self.id_df[k].isin(v) # indices to drop --> negate all_ix.append(ix) - for k,v in keep.items(): + for k, v in keep.items(): if not isinstance(v, list): v = [v] # if single value is given convert to list ix = self.id_df[k].isin(v) # indices to keep all_ix.append(ix) - ixx = pd.concat(all_ix, axis=1).all(axis=1) # keep where all True - + if not any: + ixx = pd.concat(all_ix, axis=1).all(axis=1) # keep where all True + else: + ixx = pd.concat(all_ix, axis=1).any(axis=1) # keep where any True + ids_to_keep = ixx.index[ixx.values] self.base_df = self.base_df[self.base_df.id.isin(ids_to_keep)] @@ -131,12 +138,41 @@ def _build_raw_df(self): this_df['id'] = ':'.join(id) # identifier given by all opt specifications this_df['run_id'] = r['config']['run_id'] # differentiating multiple runs + + # convert step-wise logs to series + # pandas has a bug that we cannot insert a series into this_df.loc[row_ix, ..] + # but we can insert into row, so we concat the modified rows + if 'step_logs' in this_df.columns: + HAVE_STEPWISE_LR = bool(opt_dict.get("stepwise_schedule")) + + # some sanity checks/warnings + if HAVE_STEPWISE_LR and not ("stepwise_lr" not in this_df.columns): + warnings.warn(f"ID {id}: Expected stepwise LR log, but none found. This might cause errors in analysis.") + if not HAVE_STEPWISE_LR and ("stepwise_lr" in this_df.columns): + warnings.warn(f"ID {id}: Found stepwise LR log, but no stepwise LR scheduling. Will be overwritten.") + + new_rows = list() + for row_ix, row in this_df.iterrows(): + for k, v in row['step_logs'].items(): + row['stepwise_' + k] = pd.Series(v, name=k) + + # reconstruct when no stepwise LR scheduling + if not HAVE_STEPWISE_LR: + # use step indices from last item, and ffill the learning rate from this epoch + _reconstructed_lr = dict(zip(v.keys(), row['learning_rate'] * np.ones(len(v)))) + row['stepwise_lr'] = pd.Series(_reconstructed_lr, name='lr') + + new_rows.append(row) + + this_df = pd.DataFrame(new_rows) + this_df = this_df.drop(columns=['step_logs']) + df_list.append(this_df) - df = pd.concat(df_list) - df = df.reset_index(drop=True) + df = pd.concat(df_list) df.insert(0, 'id', df.pop('id')) # move id column to front - + df = df.reset_index(drop=True) + # raise error if duplicates if df.duplicated(subset=['id', 'epoch', 'run_id']).any(): raise KeyError("There seem to be duplicates (by id, epoch, run_id). Please check the output data.") @@ -161,11 +197,22 @@ def _build_base_df(self, agg='mean'): # compute mean for each id and(!) epoch if agg == 'mean': - # if column numeric, take mean else take first - nan_mean_fun = lambda x: x.mean(skipna=False) - agg_dict = dict([(c, nan_mean_fun) if is_numeric_dtype(raw_df[c]) else (c, 'first') for c in raw_df.columns]) - agg_dict.pop('id') - agg_dict.pop('epoch') + # Create an aggregation dictionary first + # 1) if column numeric, take mean + # 2) for stepwise logs, each element is a series --> use custom function + # 3) else take first (e.g. 
+            # 3) else take first (e.g. for step_size_list)
+            agg_dict = dict()
+            for c in raw_df.columns:
+                if c in ["id", "epoch"]:
+                    continue
+                if is_numeric_dtype(raw_df[c]):
+                    agg_dict[c] = nan_mean_fun
+                elif isinstance(raw_df[c][0], pd.Series):
+                    agg_dict[c] = average_series_and_wrap
+                else:
+                    agg_dict[c] = "first"
+
+            self._base_df_agg_dict = agg_dict
 
             df = raw_df.groupby(['id', 'epoch'], sort=False).agg(agg_dict).drop('run_id',axis=1)
@@ -175,7 +222,7 @@
             df2 = raw_df.groupby(['id', 'epoch'], sort=False)[std_columns].std().drop('run_id',axis=1)
             df2.columns = [c+'_std' for c in df2.columns]
 
-            df = pd.concat([df,df2], axis=1)
+            df = pd.concat([df,df2], axis=1)
             df = df.reset_index(level=-1) # moves epoch out of index
 
         elif agg == 'first':
@@ -189,7 +236,12 @@
 
         return df
 
-    def build_sweep_df(self, score='val_score', xaxis='lr', ignore_columns=list(), cutoff=None):
+    def build_sweep_df(self,
+                       score: str='val_score',
+                       xaxis: str='lr',
+                       ignore_columns: list=list(),
+                       cutoff: Optional[Tuple]=None
+                       ):
 
         base_df = self.base_df.copy()
         id_df = self.id_df.copy()
@@ -201,7 +253,8 @@
         if cutoff is None:
             cutoff_epoch = (max_epoch[0], max_epoch[0])
         else:
-            cutoff_epoch = (cutoff, max_epoch[0])
+            assert len(cutoff) == 2, f"Cutoff needs to be a tuple of length 2, but is given as {cutoff}."
+            cutoff_epoch = (cutoff[0], cutoff[1])
 
         # filter epochs
         sub_df = base_df[(base_df.epoch >= cutoff_epoch[0])
@@ -330,7 +383,28 @@ def plot_metric(self, s, df=None, log_scale=False, ylim=None, legend=True, figsi
 
         fig.tight_layout()
         return fig, ax
-
 
+def average_series_by_index(series_of_series):
+    """
+    Aggregates a Series of Series by averaging values at overlapping indices.
+
+    Args:
+        series_of_series: A pandas Series where each element is itself a pandas Series.
+                          This is what a groupby().agg() operation passes to the function.
+    Returns:
+        A single pandas Series with the aggregated and averaged values.
+    """
+    concatenated_series = pd.concat(list(series_of_series))
+    avg_series = concatenated_series.groupby(concatenated_series.index).mean()
+    return avg_series
+
+def average_series_and_wrap(series_of_series):
+    """
+    This is only a trick, as .agg() inserts a list and not a Series.
+    Thanks to Gemini.
+    """
+    result_series = average_series_by_index(series_of_series)
+    return [result_series]
+
 def id_to_dict(id):
     """utility for creating a dictionary from the identifier"""
     tmp = id.split(':')
diff --git a/tests/test_resnet.py b/tests/test_resnet.py
index ea3f73d..c433f40 100644
--- a/tests/test_resnet.py
+++ b/tests/test_resnet.py
@@ -23,7 +23,7 @@ def _template_resnet_init(config):
     """test that model initialization is fixed"""
-    B = Base('test_resnet', config)
+    B = Base('test_resnet', config, device='cpu')
     B._setup_model() # only load model as we do not want to download dataset
 
     B.model.conv1.weight