7 changes: 7 additions & 0 deletions run.py
@@ -21,6 +21,7 @@

parser.add_argument('-nw', '--num-workers', nargs='?', type=int, default=DEFAULTS.num_workers, help="Number of workers for DataLoader.")
parser.add_argument('--data-parallel', nargs='+', default=DEFAULTS.data_parallel, help='Device list for DataParallel in Pytorch.')
parser.add_argument('-logk', '--log-every-k-steps', nargs='?', type=int, default=DEFAULTS.log_every_k_steps, help="Stepwise logging.")
parser.add_argument('--verbose', action="store_true", help="Verbose mode.")
parser.add_argument('--force-deterministic', action="store_true", help="Use deterministic mode in Pytorch. Might require setting environment variables.")

@@ -31,6 +32,7 @@ def run_one(exp_id: str,
device: str=DEFAULTS.device,
num_workers: int=DEFAULTS.num_workers,
data_parallel: Union[list, None]=DEFAULTS.data_parallel,
log_every_k_steps: Union[int, None]=DEFAULTS.log_every_k_steps,
verbose: bool=DEFAULTS.verbose,
force_deterministic: bool=DEFAULTS.force_deterministic
):
@@ -55,6 +57,9 @@ def run_one(exp_id: str,
data_parallel : Union[list, None], optional
If not None, this specifies the device ids for DataParallel mode in Pytorch.
See https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html.
log_every_k_steps: Union[int, None], optional
If not None, log batch loss and grad_norm every k steps. Careful: this results in larger output files.
By default None (no stepwise logging).
verbose : bool, optional
Verbose mode flag.
If True, prints progress bars, model architecture and other useful information.
@@ -84,6 +89,7 @@ def run_one(exp_id: str,
data_dir=data_dir,
num_workers=num_workers,
data_parallel=data_parallel,
log_every_k_steps=log_every_k_steps,
verbose=verbose)

B.setup()
@@ -107,6 +113,7 @@ def run_one(exp_id: str,
device=args.device,
num_workers=args.num_workers,
data_parallel=args.data_parallel,
log_every_k_steps=args.log_every_k_steps,
verbose=args.verbose,
force_deterministic=args.force_deterministic)

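A minimal sketch of how the new option reaches run_one when calling it from Python rather than the CLI; the exp_id value and the other settings are illustrative, and any parameters hidden in the collapsed parts of run.py are assumed to keep their defaults.

# Illustrative only: exp_id and the other values are made up;
# log_every_k_steps is the parameter added by this PR.
from run import run_one

run_one(exp_id="mnist_sgd",            # hypothetical experiment id
        device="cuda",
        num_workers=2,
        data_parallel=None,
        log_every_k_steps=10,          # log batch loss and grad_norm every 10 steps
        verbose=True,
        force_deterministic=False)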
99 changes: 76 additions & 23 deletions stepback/base.py
@@ -25,6 +25,7 @@ def __init__(self, name: str,
data_dir: str=DEFAULTS.data_dir,
num_workers: int=DEFAULTS.num_workers,
data_parallel: Union[list, None]=DEFAULTS.data_parallel,
log_every_k_steps: Union[int, None]=DEFAULTS.log_every_k_steps,
verbose: bool=DEFAULTS.verbose):
"""The main class. Performs one single training run plus evaluation.

@@ -37,14 +38,17 @@ def __init__(self, name: str,
Needs to have the keys ['loss_func', 'score_func', 'opt'].
device : str, optional
Device string, by default 'cuda'
If 'cuda' is specified but not available on the system, it switches to CPU.
If 'cuda' is specified but not available on the system, it switches to MPS or CPU.
data_dir : str, optional
Directory where datasets can be found, by default 'data/'
num_workers : int, optional
Number of workers for DataLoader, by default 0
data_parallel : Union[list, None], optional
If not None, this specifies the device ids for DataParallel mode in Pytorch. By default None.
See https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html.
log_every_k_steps: Union[int, None], optional
If not None, log batch loss and grad_norm every k steps. Careful: this results in larger output files.
By default None (no stepwise logging).
verbose : bool, optional
Verbose mode flag, by default False.
If True, prints progress bars, model architecture and other useful information.
@@ -55,18 +59,27 @@ def __init__(self, name: str,
self.data_dir = data_dir
self.num_workers = num_workers
self.data_parallel = data_parallel
self.log_every_k_steps = log_every_k_steps
self.verbose = verbose


# Device handling
print("CUDA available? ", torch.cuda.is_available())

if torch.cuda.is_available():
self.device = torch.device(device)
else:
self.device = torch.device('cpu')
# use MPS if available
try:
if torch.backends.mps.is_available() and device != "cpu":
self.device = torch.device('mps')
else:
self.device = torch.device('cpu')
except:
self.device = torch.device('cpu')

print("Using device: ", self.device)

# Seeding
self.seed = 1234567
self.run_seed = 456789 + config.get('run_id', 0)
torch.backends.cudnn.benchmark = False
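The MPS fallback added above can be read as the following standalone sketch; the helper name resolve_device is not part of the PR, and except AttributeError replaces the bare except only to make the guarded failure mode explicit.

import torch

def resolve_device(requested: str = "cuda") -> torch.device:
    # Prefer CUDA, then MPS (Apple silicon), then CPU, mirroring Base.__init__ above.
    if torch.cuda.is_available():
        return torch.device(requested)
    try:
        if torch.backends.mps.is_available() and requested != "cpu":
            return torch.device("mps")
    except AttributeError:
        # Older torch builds have no torch.backends.mps attribute.
        pass
    return torch.device("cpu")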
@@ -112,9 +125,13 @@ def _setup_data(self):
# construct train loader
_gen = torch.Generator()
_gen.manual_seed(self.run_seed)
self.train_loader = DataLoader(self.train_set, drop_last=True, shuffle=True, generator=_gen,
self.train_loader = DataLoader(self.train_set,
drop_last=True,
shuffle=True,
generator=_gen,
batch_size=self.config['batch_size'],
num_workers=self.num_workers)
num_workers=self.num_workers
)

return

@@ -155,14 +172,15 @@ def setup(self):
opt_obj, hyperp = get_optimizer(self.config['opt'])

self._init_opt(opt_obj, hyperp)
self.sched = get_scheduler(self.config['opt'], self.opt)

self.sched, self._step_scheduler_on_epoch = get_scheduler(self.config['opt'], self.opt)

#============ Results ==============
opt_val = self._compute_opt_value()


# Store useful information as summary
self.results['summary']['num_batches_per_epoch'] = len(self.train_loader)
self.results['summary']['step_scheduler_on_epoch'] = self._step_scheduler_on_epoch
if opt_val is not None:
self.results['summary']['opt_val'] = opt_val
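setup() now unpacks a pair from get_scheduler: the scheduler object and a flag saying whether it is stepped once per epoch or after every batch. get_scheduler itself is not part of this diff; the sketch below only illustrates that implied contract, and the 'lr_schedule' key and schedule names are assumptions.

import torch

def get_scheduler_sketch(opt_config: dict, opt: torch.optim.Optimizer):
    # Returns (scheduler, step_on_epoch); Base.train_epoch steps the scheduler
    # after every batch when step_on_epoch is False, otherwise once per epoch.
    if opt_config.get("lr_schedule") == "cosine-per-step":    # assumed config key/value
        sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=1000)
        step_on_epoch = False
    else:
        sched = torch.optim.lr_scheduler.LambdaLR(opt, lambda t: 1.0)  # constant lr
        step_on_epoch = True
    return sched, step_on_epoch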

@@ -180,7 +198,12 @@ def run(self):
start_time = str(datetime.datetime.now())
score_list = []
self._epochs_trained = 0


_init_model_norm = l2_norm(self.model)
self.results['summary']['init_model_norm'] = _init_model_norm
if self.verbose:
print(f"Initial model L2-norm: ", _init_model_norm)

for epoch in range(self.config['max_epoch']):

print(f"Epoch {epoch}, current learning rate", self.sched.get_last_lr()[0])
@@ -198,7 +221,10 @@ def run(self):
score_dict['train_epoch_time'] = e_time - s_time
score_dict['model_norm'] = l2_norm(self.model)
score_dict['grad_norm'] = grad_norm(self.model)


if self.log_every_k_steps is not None:
score_dict['step_logs'] = copy.deepcopy(self._log_stepwise)

# Validation
with torch.no_grad():

@@ -207,11 +233,11 @@

train_dict = self.evaluate(self.train_set,
metric_dict = metric_dict,
)
)

val_dict = self.evaluate(self.val_set,
metric_dict = metric_dict,
)
)

# Record metrics
score_dict.update(train_dict)
@@ -233,11 +259,13 @@
self._epochs_trained += 1

end_time = str(datetime.datetime.now())

# ==== store =====================
self.results['history'] = copy.deepcopy(score_list)
self.results['summary']['start_time'] = start_time
self.results['summary']['end_time'] = end_time
self.results['summary']['final_model_norm'] = l2_norm(self.model)

return

def train_epoch(self):
@@ -253,7 +281,13 @@ def train_epoch(self):

t0 = time.time()

for batch in pbar:
# Reset logging dictionaries
if self.log_every_k_steps is not None:
self._log_stepwise = {"loss": {}, "grad_norm": {}}
if not self._step_scheduler_on_epoch:
self._log_stepwise["lr"] = {}

for step_counter, batch in enumerate(pbar):
# Move batch to device
data, targets = batch['data'].to(device=self.device), batch['targets'].to(device=self.device)

@@ -268,24 +302,37 @@
closure = lambda: self.training_loss.compute(out, targets)

# see optim/README.md for explanation
if hasattr(self.opt,"prestep"):
if hasattr(self.opt, "prestep"):
ind = batch['ind'].to(device=self.device) # indices of batch members
self.opt.prestep(out, targets, ind, self.training_loss.name)

# Here the magic happens
loss_val = self.opt.step(closure=closure)

if self.device != torch.device('cpu'):
torch.cuda.synchronize()
self._sync_device()

timings_dataloader.append(t1-t0)
t0 = time.time() # model timing ends
timings_model.append(t0-t1)

pbar.set_description(f'Training - loss={loss_val:.3f} - time data: last={timings_dataloader[-1]:.3f},(mean={np.mean(timings_dataloader):.3f}) - time model+step: last={timings_model[-1]:.3f}(mean={np.mean(timings_model):.3f})')

# Log loss_val and grad_norm every k steps
if self.log_every_k_steps is not None:
total_step_counter = len(self.train_loader) * self._epochs_trained + step_counter
if step_counter % self.log_every_k_steps == 0:
self._log_stepwise["loss"][total_step_counter] = loss_val.item()
self._log_stepwise["grad_norm"][total_step_counter] = grad_norm(self.model)
if not self._step_scheduler_on_epoch:
self._log_stepwise["lr"][total_step_counter] = self.sched.get_last_lr()[0]


if not self._step_scheduler_on_epoch:
self.sched.step()

# update learning rate
self.sched.step()
if self._step_scheduler_on_epoch:
self.sched.step()

return
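For reference, the shape of the per-epoch step log that train_epoch builds and run() copies into score_dict['step_logs']; the values below are invented, and the keys are the global step counters computed above, here with log_every_k_steps=10.

# Illustrative only: keys follow
# total_step_counter = len(train_loader) * epochs_trained + step_counter.
example_step_logs = {
    "loss":      {200: 0.693, 210: 0.651, 220: 0.640},
    "grad_norm": {200: 1.42,  210: 1.31,  220: 1.27},
    # "lr" is only recorded when the scheduler is stepped per batch
    "lr":        {200: 0.0100, 210: 0.0099, 220: 0.0098},
}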

@@ -323,17 +370,16 @@ def evaluate(self, dataset, metric_dict):
# metric takes average over batch ==> multiply with batch size
score_dict[_met] += _met_fun.compute(out, targets).item() * data.shape[0]

timings_dataloader.append(t1-t0)
if self.device != torch.device('cpu'):
torch.cuda.synchronize()
timings_dataloader.append(t1-t0)
self._sync_device()

t0 = time.time()
timings_model.append(t0-t1)

pbar.set_description(f'Validating {dataset.split}')
pbar.set_description(f'Validating {dataset.split} - time data: last={timings_dataloader[-1]:.3f}(mean={np.mean(timings_dataloader):.3f}) - time model: last={timings_model[-1]:.3f}(mean={np.mean(timings_model):.3f})')




for _met in metric_dict.keys():
# Get from sum to average
score_dict[_met] = float(score_dict[_met] / len(dl.dataset))
@@ -353,6 +399,13 @@ def save_checkpoint(self, path):

return

def _sync_device(self):
if self.device == torch.device('mps'):
torch.mps.synchronize()
elif self.device != torch.device('cpu'):
torch.cuda.synchronize()

def _compute_opt_value(self):
"""
For a linear model, the problem is convex and we can compute the optimal value
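The _sync_device helper added above exists because CUDA and MPS kernels run asynchronously: reading time.time() right after opt.step() would mostly measure the kernel launch, not the work. A hedged illustration of the same idea; the helper name timed_step is not part of the PR.

import time
import torch

def timed_step(step_fn, device: torch.device):
    # step_fn could be e.g. lambda: opt.step(closure=closure)
    t0 = time.time()
    out = step_fn()
    # Block until queued kernels finish so the elapsed time is meaningful.
    if device.type == "mps":
        torch.mps.synchronize()
    elif device.type == "cuda":
        torch.cuda.synchronize()
    return out, time.time() - t0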
77 changes: 54 additions & 23 deletions stepback/datasets/libsvm.py
@@ -8,58 +8,89 @@

SPLIT_SEED = 12345678

LIBSVM_URL = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/"
LIBSVM_BINARY_URL = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/"
LIBSVM_MULTICLASS_URL = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/"

# mapping libsvm names to download links
LIBSVM_NAME_MAP = {"rcv1" : "rcv1_train.binary.bz2",
MULTICLASS_NAME_MAP = {"dna": "dna.scale",
"wine": "wine.scale"
}

BINARY_NAME_MAP = {"rcv1" : "rcv1_train.binary.bz2",
"mushrooms" : "mushrooms",
"a1a" : "a1a",
"ijcnn" : "ijcnn1.tr.bz2",
"breast-cancer" : "breast-cancer_scale"
}
}
LIBSVM_NAMES = list(MULTICLASS_NAME_MAP.keys()) + list(BINARY_NAME_MAP.keys())


def get_libsvm(split, name, path, train_size=0.8):
X, y = load_libsvm(name, path + '/libsvm')

if np.all(np.isin(y, [0,1])):
y = y*2 - 1 # go from 0,1 to -1,1

if name == 'breast-cancer':
y[y==2] = 1
y[y==4] = -1

labels = np.unique(y)
assert np.all(np.isin(y, [-1,1])), f"Sth went wrong with class labels, have {labels}."
if name in BINARY_NAME_MAP.keys():
multiclass = False
elif name in MULTICLASS_NAME_MAP.keys():
multiclass = True
else:
raise KeyError(f"Unknwon dataset name {name} from LIBSVM. Need to be added to name mapping.")

libsvm_path = os.path.join(path, "libsvm")
X, y = load_libsvm(name, libsvm_path, multiclass)

if not multiclass:
# use -1, 1 labels in binary case
if np.all(np.isin(y, [0,1])):
y = y*2 - 1

# manual label fix for breast cancer dataset
if name == 'breast-cancer':
y[y==2] = 1
y[y==4] = -1

unique_labels = [-1, 1]
assert np.all(np.isin(y, unique_labels)), f"Something went wrong with class labels, have {np.unique(y)}."
else:
unique_labels = list(np.unique(y).astype("int"))
# try to map class labels into [0, C)
if min(unique_labels) > 0:
y = y - min(unique_labels)

print(f"Dataset labels (before split): {unique_labels}")

# use fixed seed for train/val split
X_train, X_test, Y_train, Y_test = train_test_split(X, y,
X_train, X_test, Y_train, Y_test = train_test_split(X,
y,
train_size=train_size,
shuffle=True,
random_state=SPLIT_SEED)

random_state=SPLIT_SEED
)

# for multiclass, we need Long Tensors
# for binary, FloatTensor is ok
if split == 'train':
X = torch.FloatTensor(X_train.toarray())
Y = torch.FloatTensor(Y_train)
Y = torch.FloatTensor(Y_train) if not multiclass else torch.LongTensor(Y_train)
else:
X = torch.FloatTensor(X_test.toarray())
Y = torch.FloatTensor(Y_test)
Y = torch.FloatTensor(Y_test) if not multiclass else torch.LongTensor(Y_test)

ds = torch.utils.data.TensorDataset(X, Y)

return ds



def load_libsvm(name, path):
def load_libsvm(name, path, multiclass=False):
if not os.path.exists(path):
os.mkdir(path)

fn = LIBSVM_NAME_MAP[name]
if multiclass:
fn = MULTICLASS_NAME_MAP[name]
else:
fn = BINARY_NAME_MAP[name]
filename = os.path.join(path, fn)

_url = LIBSVM_MULTICLASS_URL if multiclass else LIBSVM_BINARY_URL
if not os.path.exists(filename):
url = urllib.parse.urljoin(LIBSVM_URL, fn)
url = urllib.parse.urljoin(_url, fn)
print("Downloading from %s" % url)
urllib.request.urlretrieve(url, filename)
print("Download complete.")
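A hedged usage sketch of the extended loader for one of the new multiclass datasets; the 'data' root directory is an assumption, while the call itself follows the get_libsvm signature above. For a dataset whose raw labels start at 1, the labels are shifted so classes start at 0, and multiclass targets come back as a LongTensor.

# Illustrative only: 'data' as dataset root is assumed; 'dna' is one of the
# multiclass names registered in MULTICLASS_NAME_MAP above.
from stepback.datasets.libsvm import get_libsvm

train_ds = get_libsvm(split="train", name="dna", path="data", train_size=0.8)
X, y = train_ds.tensors                # TensorDataset keeps the two tensors
print(X.dtype, y.dtype)                # expected: torch.float32 torch.int64
print(int(y.min()), int(y.max()))      # expected: labels in [0, num_classes)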