From 498db462cc9626905ba443d03d2f1a817fd78bad Mon Sep 17 00:00:00 2001 From: ganik Date: Mon, 20 Jul 2020 22:35:18 +0000 Subject: [PATCH 01/13] Replace with Dropout and Softmax --- DeBERTa/deberta/disentangled_attention.py | 4 +++- DeBERTa/deberta/ops.py | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/DeBERTa/deberta/disentangled_attention.py b/DeBERTa/deberta/disentangled_attention.py index 262425b..6f281ca 100644 --- a/DeBERTa/deberta/disentangled_attention.py +++ b/DeBERTa/deberta/disentangled_attention.py @@ -174,7 +174,9 @@ def linear(w,b,x): if self.talking_head: attention_scores = self.head_logits_proj(attention_scores.permute(0,2,3,1)).permute(0,3,1,2) - attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1) + #attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1) + nodex = torch.nn.Softmax(-1) + attention_probs = nodex(attention_scores) attention_probs = self.dropout(attention_probs) if self.talking_head: attention_probs = self.head_weights_proj(attention_probs.permute(0,2,3,1)).permute(0,3,1,2) diff --git a/DeBERTa/deberta/ops.py b/DeBERTa/deberta/ops.py index a18515f..08afda1 100644 --- a/DeBERTa/deberta/ops.py +++ b/DeBERTa/deberta/ops.py @@ -115,7 +115,11 @@ def backward(ctx, grad_output): else: return grad_output, None -class StableDropout(torch.nn.Module): +class StableDropout(torch.nn.Dropout): + def __init__(self, drop_prob): + super().__init__() + +class StableDropout1(torch.nn.Module): """ Optimized dropout module for stabilizing the training Args: From 3be82890d9502c435093685996bd89f83ebe2c85 Mon Sep 17 00:00:00 2001 From: ganik Date: Tue, 21 Jul 2020 23:04:16 +0000 Subject: [PATCH 02/13] mask attention scores in Softmax --- DeBERTa/deberta/disentangled_attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DeBERTa/deberta/disentangled_attention.py b/DeBERTa/deberta/disentangled_attention.py index 6f281ca..2905084 100644 --- a/DeBERTa/deberta/disentangled_attention.py +++ b/DeBERTa/deberta/disentangled_attention.py @@ -176,7 +176,7 @@ def linear(w,b,x): #attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1) nodex = torch.nn.Softmax(-1) - attention_probs = nodex(attention_scores) + attention_probs = nodex(attention_scores + 10000.0*(attention_mask -1)) attention_probs = self.dropout(attention_probs) if self.talking_head: attention_probs = self.head_weights_proj(attention_probs.permute(0,2,3,1)).permute(0,3,1,2) From dab83afd9d44fbc5f86d36e38218df355cac4914 Mon Sep 17 00:00:00 2001 From: ganik Date: Sun, 2 Aug 2020 05:42:45 +0000 Subject: [PATCH 03/13] onnx conversion and training --- .gitignore | 1 + DeBERTa/apps/sequence_classification.py | 4 +- DeBERTa/apps/train.py | 74 +++++++++++++++++++++++-- 3 files changed, 72 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index b6e4761..a4ba5c7 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,4 @@ dmypy.json # Pyre type checker .pyre/ +tmp/ diff --git a/DeBERTa/apps/sequence_classification.py b/DeBERTa/apps/sequence_classification.py index 218aed2..11d9b39 100644 --- a/DeBERTa/apps/sequence_classification.py +++ b/DeBERTa/apps/sequence_classification.py @@ -46,7 +46,7 @@ def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, positi pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) - loss = 0 + loss = torch.tensor(0).to(logits) if labels is not None: if self.num_labels ==1: # regression task @@ -68,4 +68,4 @@ def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, positi label_confidence = 1 loss = -((log_softmax(logits)*labels).sum(-1)*label_confidence).mean() - return (logits,loss) + return (loss, logits) diff --git a/DeBERTa/apps/train.py b/DeBERTa/apps/train.py index e9218d1..3bffee5 100644 --- a/DeBERTa/apps/train.py +++ b/DeBERTa/apps/train.py @@ -24,9 +24,10 @@ from ..utils import * from ..utils import xtqdm as tqdm from .task_registry import tasks +from onnxruntime.capi.ort_trainer import ORTTrainer, IODescription, ModelDescription, LossScaler from ..training import DistributedTrainer, initialize_distributed, batch_to, set_random_seed,kill_children -from ..data import DistributedBatchSampler, SequentialSampler, BatchSampler, AsyncDataLoader +from ..data import DistributedBatchSampler, SequentialSampler, BatchSampler, RandomSampler, AsyncDataLoader def create_model(args, num_labels, model_class_fn): # Prepare model @@ -217,9 +218,63 @@ def run_predict(args, model, device, eval_data, prefix=None): if predict_fn: predict_fn(predicts, args.output_dir, name, prefix) +def deberta_model_description(args): + vocab_size = 30528 + # set concrete input sizes to permit optimization + input_ids_desc = IODescription('input_ids', [args.train_batch_size, args.max_seq_length], torch.int32, num_classes=vocab_size) + type_ids_desc = IODescription('type_ids', [args.train_batch_size, args.max_seq_length], torch.int32) # num_classes=? + position_ids_desc = IODescription('position_ids', [args.train_batch_size, args.max_seq_length], torch.int32) # num_classes=? + input_mask_desc = IODescription('input_mask', [args.train_batch_size, args.max_seq_length], torch.int32) # num_classes=? + labels_desc = IODescription('labels', [args.train_batch_size, args.max_seq_length], torch.float32) # num_classes=? + + loss_desc = IODescription('loss', [], torch.float32) + return ModelDescription([input_ids_desc, type_ids_desc, position_ids_desc, input_mask_desc, labels_desc], [loss_desc]) + +def create_ort_trainer(args, device, model): + # default initial settings: b1=0.9, b2=0.999, e=1e-6 + def map_optimizer_attributes(name): + no_decay_keys = ["bias", "gamma", "beta", "LayerNorm"] + no_decay = False + for no_decay_key in no_decay_keys: + if no_decay_key in name: + no_decay = True + break + if no_decay: + return {"alpha": 0.9, "beta": 0.999, "lambda": 0.0, "epsilon": 1e-6} + else: + return {"alpha": 0.9, "beta": 0.999, "lambda": 0.01, "epsilon": 1e-6} + + # we request ORTTrainer to create a LambOptimizer with given optimizer_attributes. + # train_step does forward, backward, and optimize step. + model = ORTTrainer(model, None, deberta_model_description(args), "LambOptimizer", + map_optimizer_attributes, + IODescription('Learning_Rate', [1,], torch.float32), + device, + _opset_version = 10) + + return model + +def run_onnx_training(args, model, device, train_data, prefix=None): + # runs training in ONNX + trainer = create_ort_trainer(args, device, model) + train_sampler = RandomSampler(len(train_data)) + batch_sampler = BatchSampler(train_sampler, args.train_batch_size) + batch_sampler = DistributedBatchSampler(batch_sampler, rank=args.rank, world_size=args.world_size) + train_dataloader = DataLoader(train_data, batch_sampler=batch_sampler, num_workers=args.workers, pin_memory=True) + torch.cuda.empty_cache() + for step, batch in enumerate(AsyncDataLoader(train_dataloader, 100)): + #import pdb + #pdb.set_trace() + batch = batch_to(batch, device) + with torch.no_grad(): + trainer.train_step(batch['input_ids'], batch['type_ids'], batch['position_ids'], batch['input_mask'], batch['labels']) + # conversion fails now with: + # site-packages/torch/onnx/utils.py:617: UserWarning: ONNX export failed on ATen operator broadcast_tensors + # because torch.onnx.symbolic_opset10.broadcast_tensors does not exist + def main(args): - if not args.do_train and not args.do_eval and not args.do_predict: - raise ValueError("At least one of `do_train` or `do_eval` or `do_predict` must be True.") + if not args.do_train and not args.do_eval and not args.do_predict and not args.do_onnx: + raise ValueError("At least one of `do_train` or `do_eval` or `do_predict` or `do_onnx` must be True.") os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() random.seed(args.seed) @@ -236,11 +291,11 @@ def main(args): test_data = processor.test_data(max_seq_len=args.max_seq_length) logger.info(" Prediction batch size = %d", args.predict_batch_size) - if args.do_train: + if args.do_train or args.do_onnx: train_data = processor.train_data(max_seq_len=args.max_seq_length, mask_gen = None, debug=args.debug) model_class_fn = processor.get_model_class_fn() model = create_model(args, len(label_list), model_class_fn) - if args.do_train: + if args.do_train or args.do_onnx: with open(os.path.join(args.output_dir, 'model_config.json'), 'w', encoding='utf-8') as fs: fs.write(model.config.to_json_string() + '\n') logger.info("Model config {}".format(model.config)) @@ -257,6 +312,10 @@ def main(args): if args.do_predict: run_predict(args, model, device, test_data, prefix=args.tag) + # trains in ONNX + if args.do_onnx: + run_onnx_training(args, model, device, train_data, prefix=args.tag) + def build_argument_parser(): parser = argparse.ArgumentParser() @@ -437,6 +496,11 @@ def build_argument_parser(): default=None, type=str, help="The path of pre-trained RoBERTa model") + + parser.add_argument("--do_onnx", + default=False, + action='store_true', + help="Whether to run training in ONNX") return parser if __name__ == "__main__": From 64f068c6afdb57087b4439859143aa46593134e7 Mon Sep 17 00:00:00 2001 From: ganik Date: Sun, 2 Aug 2020 05:53:11 +0000 Subject: [PATCH 04/13] TBD tight coupling with torch 1.3 --- DeBERTa/optims/fp16_optimizer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/DeBERTa/optims/fp16_optimizer.py b/DeBERTa/optims/fp16_optimizer.py index dea2b0b..cc58afb 100755 --- a/DeBERTa/optims/fp16_optimizer.py +++ b/DeBERTa/optims/fp16_optimizer.py @@ -20,6 +20,8 @@ from ..utils import get_logger logger=get_logger() +# Lines below tightly couple DeBerta with torch 1.3 +# TBD refactor or port to torch 1.6 lib = ctypes.cdll.LoadLibrary(None) lib.THCudaHalfTensor_normall.argtypes=[ctypes.c_void_p, ctypes.c_void_p] lib.THCudaHalfTensor_normall.restype = ctypes.c_float From 1ce5cc19cefecacd7cf742baeb28b6e4e43bb926 Mon Sep 17 00:00:00 2001 From: ganik Date: Fri, 7 Aug 2020 00:58:05 +0000 Subject: [PATCH 05/13] opset 12, expand attention mask --- DeBERTa/apps/train.py | 2 +- DeBERTa/deberta/bert.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DeBERTa/apps/train.py b/DeBERTa/apps/train.py index 3bffee5..4a8c691 100644 --- a/DeBERTa/apps/train.py +++ b/DeBERTa/apps/train.py @@ -250,7 +250,7 @@ def map_optimizer_attributes(name): map_optimizer_attributes, IODescription('Learning_Rate', [1,], torch.float32), device, - _opset_version = 10) + _opset_version = 12) return model diff --git a/DeBERTa/deberta/bert.py b/DeBERTa/deberta/bert.py index 5b2b2b2..f9ccbb7 100644 --- a/DeBERTa/deberta/bert.py +++ b/DeBERTa/deberta/bert.py @@ -160,7 +160,7 @@ def get_attention_mask(self, attention_mask): if attention_mask.dim()<=2: extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) attention_mask = extended_attention_mask*extended_attention_mask.squeeze(-2).unsqueeze(-1) - attention_mask = attention_mask.byte() + attention_mask = attention_mask.int() elif attention_mask.dim()==3: attention_mask = attention_mask.unsqueeze(1) From b301b4641409821b1acc992f1ba4daf8f24464d9 Mon Sep 17 00:00:00 2001 From: ganik Date: Sat, 8 Aug 2020 00:00:51 +0000 Subject: [PATCH 06/13] loss is first --- DeBERTa/apps/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/DeBERTa/apps/train.py b/DeBERTa/apps/train.py index 4a8c691..71d0163 100644 --- a/DeBERTa/apps/train.py +++ b/DeBERTa/apps/train.py @@ -56,7 +56,7 @@ def eval_fn(trainer, model, device, tag): return eval_metric def loss_fn(trainer, model, data): - _, loss = model(**data) + loss, _ = model(**data) return loss.mean(), data['input_ids'].size(0) trainer = DistributedTrainer(args, model, device, data_fn, loss_fn = loss_fn, eval_fn = eval_fn, dump_interval = args.dump_interval) @@ -161,7 +161,7 @@ def run_eval(args, model, device, eval_data, prefix=None, tag=None, steps=None): for batch in tqdm(AsyncDataLoader(eval_dataloader), ncols=80, desc='Evaluating: {}'.format(prefix), disable=no_tqdm): batch = batch_to(batch, device) with torch.no_grad(): - logits, tmp_eval_loss = model(**batch) + tmp_eval_loss, logits = model(**batch) label_ids = batch['labels'].to(device) predicts.append(logits) labels.append(label_ids) @@ -196,7 +196,7 @@ def run_predict(args, model, device, eval_data, prefix=None): for batch in tqdm(AsyncDataLoader(eval_dataloader), ncols=80, desc='Evaluating: {}'.format(prefix), disable=args.rank>0): batch = batch_to(batch, device) with torch.no_grad(): - logits, _ = model(**batch) + _, logits = model(**batch) if args.world_size>1: logits_all = [torch.zeros_like(logits) for _ in range(args.world_size)] torch.distributed.all_gather(logits_all, logits) From d2fa9fdf1cc2537fc3f7f9349457c0877c5732c6 Mon Sep 17 00:00:00 2001 From: ganik Date: Sat, 8 Aug 2020 00:53:30 +0000 Subject: [PATCH 07/13] commenting out v_ and q_ biases as they are always const --- DeBERTa/apps/train.py | 3 ++- DeBERTa/deberta/disentangled_attention.py | 11 +++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/DeBERTa/apps/train.py b/DeBERTa/apps/train.py index 71d0163..28808fb 100644 --- a/DeBERTa/apps/train.py +++ b/DeBERTa/apps/train.py @@ -265,9 +265,10 @@ def run_onnx_training(args, model, device, train_data, prefix=None): for step, batch in enumerate(AsyncDataLoader(train_dataloader, 100)): #import pdb #pdb.set_trace() + lr = torch.tensor([0.0000000e+00]).to(device) batch = batch_to(batch, device) with torch.no_grad(): - trainer.train_step(batch['input_ids'], batch['type_ids'], batch['position_ids'], batch['input_mask'], batch['labels']) + trainer.train_step(batch['input_ids'], batch['type_ids'], batch['position_ids'], batch['input_mask'], batch['labels'], lr) # conversion fails now with: # site-packages/torch/onnx/utils.py:617: UserWarning: ONNX export failed on ATen operator broadcast_tensors # because torch.onnx.symbolic_opset10.broadcast_tensors does not exist diff --git a/DeBERTa/deberta/disentangled_attention.py b/DeBERTa/deberta/disentangled_attention.py index 2905084..1369bf5 100644 --- a/DeBERTa/deberta/disentangled_attention.py +++ b/DeBERTa/deberta/disentangled_attention.py @@ -77,8 +77,9 @@ def __init__(self, config): self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.in_proj = torch.nn.Linear(config.hidden_size, self.all_head_size*3, bias=False) - self.q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float)) - self.v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float)) + # Looks like params below are never updated and const, so removing them + #self.q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float)) + #self.v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float)) self.pos_att_type = [x.strip() for x in getattr(config, 'pos_att_type', 'none').lower().split('|')] # c2p|p2c self.relative_attention = getattr(config, 'relative_attention', False) @@ -148,8 +149,10 @@ def linear(w,b,x): k,v = [linear(qkvw[i], qkvb[i], hidden_states) for i in range(1,3)] query_layer, key_layer, value_layer = [self.transpose_for_scores(x) for x in [q,k,v]] - query_layer += self.transpose_for_scores(self.q_bias.unsqueeze(0).unsqueeze(0)) - value_layer += self.transpose_for_scores(self.v_bias.unsqueeze(0).unsqueeze(0)) + q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float)) + v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float)) + query_layer += self.transpose_for_scores(q_bias.unsqueeze(0).unsqueeze(0)) + value_layer += self.transpose_for_scores(v_bias.unsqueeze(0).unsqueeze(0)) rel_att = None # Take the dot product between "query" and "key" to get the raw attention scores. From e4793b87a7cbf392bdf3537e394e8e61c0cc99ab Mon Sep 17 00:00:00 2001 From: ganik Date: Fri, 14 Aug 2020 19:28:24 +0000 Subject: [PATCH 08/13] Fix Dropout model regression issue --- DeBERTa/apps/multi_choice.py | 2 +- DeBERTa/apps/ner.py | 2 +- DeBERTa/apps/sequence_classification.py | 2 +- DeBERTa/deberta/bert.py | 6 +++--- DeBERTa/deberta/config.py | 2 ++ DeBERTa/deberta/disentangled_attention.py | 16 +++++++--------- DeBERTa/deberta/ops.py | 7 +------ DeBERTa/deberta/pooling.py | 4 +++- 8 files changed, 19 insertions(+), 22 deletions(-) diff --git a/DeBERTa/apps/multi_choice.py b/DeBERTa/apps/multi_choice.py index 253f7c3..5611d57 100644 --- a/DeBERTa/apps/multi_choice.py +++ b/DeBERTa/apps/multi_choice.py @@ -27,7 +27,7 @@ def __init__(self, config, num_labels = 2, drop_out=None, **kwargs): self.num_labels = num_labels self.classifier = nn.Linear(config.hidden_size, 1) drop_out = config.hidden_dropout_prob if drop_out is None else drop_out - self.dropout = StableDropout(drop_out) + self.dropout = StableDropout(drop_out) if config.use_xdropout else nn.Dropout(drop_out) self.apply(self.init_weights) def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, position_ids=None, **kwargs): diff --git a/DeBERTa/apps/ner.py b/DeBERTa/apps/ner.py index 812b042..8749e4d 100644 --- a/DeBERTa/apps/ner.py +++ b/DeBERTa/apps/ner.py @@ -27,7 +27,7 @@ def __init__(self, config, num_labels = 2, drop_out=None, **kwargs): self.proj = nn.Linear(config.hidden_size, config.hidden_size) self.classifier = nn.Linear(config.hidden_size, self.num_labels) drop_out = config.hidden_dropout_prob if drop_out is None else drop_out - self.dropout = StableDropout(drop_out) + self.dropout = StableDropout(drop_out) if config.use_xdropout else nn.Dropout(drop_out) self.apply(self.init_weights) def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, position_ids=None, **kwargs): diff --git a/DeBERTa/apps/sequence_classification.py b/DeBERTa/apps/sequence_classification.py index 11d9b39..c9d272a 100644 --- a/DeBERTa/apps/sequence_classification.py +++ b/DeBERTa/apps/sequence_classification.py @@ -35,7 +35,7 @@ def __init__(self, config, num_labels=2, drop_out=None, pre_trained=None): self.classifier = torch.nn.Linear(output_dim, num_labels) drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out - self.dropout = StableDropout(drop_out) + self.dropout = StableDropout(drop_out) if config.use_xdropout else torch.nn.Dropout(drop_out) self.apply(self.init_weights) self.bert.apply_state() diff --git a/DeBERTa/deberta/bert.py b/DeBERTa/deberta/bert.py index f9ccbb7..ba817a0 100644 --- a/DeBERTa/deberta/bert.py +++ b/DeBERTa/deberta/bert.py @@ -63,7 +63,7 @@ def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.LayerNorm = BertLayerNorm(config.hidden_size, config.layer_norm_eps) - self.dropout = StableDropout(config.hidden_dropout_prob) + self.dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else nn.Dropout(config.hidden_dropout_prob) self.config = config def forward(self, hidden_states, input_states, mask=None): @@ -110,7 +110,7 @@ def __init__(self, config): super(BertOutput, self).__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) self.LayerNorm = BertLayerNorm(config.hidden_size, config.layer_norm_eps) - self.dropout = StableDropout(config.hidden_dropout_prob) + self.dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else nn.Dropout(config.hidden_dropout_prob) self.config = config def forward(self, hidden_states, input_states, mask=None): @@ -229,7 +229,7 @@ def __init__(self, config): if self.embedding_size != config.hidden_size: self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False) self.LayerNorm = BertLayerNorm(config.hidden_size, config.layer_norm_eps) - self.dropout = StableDropout(config.hidden_dropout_prob) + self.dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else nn.Dropout(config.hidden_dropout_prob) self.output_to_half = False self.config = config diff --git a/DeBERTa/deberta/config.py b/DeBERTa/deberta/config.py index 11f23aa..a324c97 100644 --- a/DeBERTa/deberta/config.py +++ b/DeBERTa/deberta/config.py @@ -15,6 +15,8 @@ def from_dict(cls, json_object): if isinstance(value, dict): value = AbsModelConfig.from_dict(value) config.__dict__[key] = value + config.use_xdropout = True + config.use_xsoftmax = True return config @classmethod diff --git a/DeBERTa/deberta/disentangled_attention.py b/DeBERTa/deberta/disentangled_attention.py index 1369bf5..8f7801c 100644 --- a/DeBERTa/deberta/disentangled_attention.py +++ b/DeBERTa/deberta/disentangled_attention.py @@ -77,9 +77,9 @@ def __init__(self, config): self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.in_proj = torch.nn.Linear(config.hidden_size, self.all_head_size*3, bias=False) - # Looks like params below are never updated and const, so removing them - #self.q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float)) - #self.v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float)) + # ONNX graph builder thinks params below are not used for loss calcualtion + self.q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float)) + self.v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float)) self.pos_att_type = [x.strip() for x in getattr(config, 'pos_att_type', 'none').lower().split('|')] # c2p|p2c self.relative_attention = getattr(config, 'relative_attention', False) @@ -93,14 +93,14 @@ def __init__(self, config): self.max_relative_positions = getattr(config, 'max_relative_positions', -1) if self.max_relative_positions <1: self.max_relative_positions = config.max_position_embeddings - self.pos_dropout = StableDropout(config.hidden_dropout_prob) + self.pos_dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else torch.nn.Dropout(config.hidden_dropout_prob) if 'c2p' in self.pos_att_type or 'p2p' in self.pos_att_type: self.pos_proj = torch.nn.Linear(config.hidden_size, self.all_head_size, bias=False) if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type: self.pos_q_proj = torch.nn.Linear(config.hidden_size, self.all_head_size) - self.dropout = StableDropout(config.attention_probs_dropout_prob) + self.dropout = StableDropout(config.attention_probs_dropout_prob) if config.use_xdropout else torch.nn.Dropout(config.attention_probs_dropout_prob) def transpose_for_scores(self, x): new_x_shape = x.size()[:-1] + (self.num_attention_heads, -1) @@ -149,10 +149,8 @@ def linear(w,b,x): k,v = [linear(qkvw[i], qkvb[i], hidden_states) for i in range(1,3)] query_layer, key_layer, value_layer = [self.transpose_for_scores(x) for x in [q,k,v]] - q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float)) - v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float)) - query_layer += self.transpose_for_scores(q_bias.unsqueeze(0).unsqueeze(0)) - value_layer += self.transpose_for_scores(v_bias.unsqueeze(0).unsqueeze(0)) + query_layer += self.transpose_for_scores(self.q_bias.unsqueeze(0).unsqueeze(0)) + value_layer += self.transpose_for_scores(self.v_bias.unsqueeze(0).unsqueeze(0)) rel_att = None # Take the dot product between "query" and "key" to get the raw attention scores. diff --git a/DeBERTa/deberta/ops.py b/DeBERTa/deberta/ops.py index 08afda1..a1ba5fd 100644 --- a/DeBERTa/deberta/ops.py +++ b/DeBERTa/deberta/ops.py @@ -6,7 +6,6 @@ # Author: penhe@microsoft.com # Date: 01/15/2020 # - import math from packaging import version import torch @@ -115,11 +114,7 @@ def backward(ctx, grad_output): else: return grad_output, None -class StableDropout(torch.nn.Dropout): - def __init__(self, drop_prob): - super().__init__() - -class StableDropout1(torch.nn.Module): +class StableDropout(torch.nn.Module): """ Optimized dropout module for stabilizing the training Args: diff --git a/DeBERTa/deberta/pooling.py b/DeBERTa/deberta/pooling.py index 16b9aaa..4fb4f43 100644 --- a/DeBERTa/deberta/pooling.py +++ b/DeBERTa/deberta/pooling.py @@ -58,6 +58,8 @@ def __init__(self, config=None): self.hidden_size = 768 self.dropout = 0 self.hidden_act = 'gelu' + self.use_xdropout = True + self.use_xsoftmax = True if config: pool_config = getattr(config, 'pooling', config) if isinstance(pool_config, dict): @@ -70,7 +72,7 @@ class ContextPooler(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.dropout = StableDropout(config.dropout) + self.dropout = StableDropout(config.dropout) if config.use_xdropout else nn.Dropout(config.dropout) self.config = config def forward(self, hidden_states, mask = None): From 155d96609d4be0466c1f918ac198b5aec114dd04 Mon Sep 17 00:00:00 2001 From: ganik Date: Fri, 14 Aug 2020 19:48:00 +0000 Subject: [PATCH 09/13] Use nn.dropout and nn.softmax by default --- DeBERTa/deberta/config.py | 4 ++-- DeBERTa/deberta/disentangled_attention.py | 9 ++++++--- DeBERTa/deberta/pooling.py | 4 ++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/DeBERTa/deberta/config.py b/DeBERTa/deberta/config.py index a324c97..eb5b2c5 100644 --- a/DeBERTa/deberta/config.py +++ b/DeBERTa/deberta/config.py @@ -15,8 +15,8 @@ def from_dict(cls, json_object): if isinstance(value, dict): value = AbsModelConfig.from_dict(value) config.__dict__[key] = value - config.use_xdropout = True - config.use_xsoftmax = True + config.use_xdropout = False + config.use_xsoftmax = False return config @classmethod diff --git a/DeBERTa/deberta/disentangled_attention.py b/DeBERTa/deberta/disentangled_attention.py index 8f7801c..87f280b 100644 --- a/DeBERTa/deberta/disentangled_attention.py +++ b/DeBERTa/deberta/disentangled_attention.py @@ -101,6 +101,7 @@ def __init__(self, config): self.pos_q_proj = torch.nn.Linear(config.hidden_size, self.all_head_size) self.dropout = StableDropout(config.attention_probs_dropout_prob) if config.use_xdropout else torch.nn.Dropout(config.attention_probs_dropout_prob) + self.use_xsoftmax = config.use_xsoftmax def transpose_for_scores(self, x): new_x_shape = x.size()[:-1] + (self.num_attention_heads, -1) @@ -175,9 +176,11 @@ def linear(w,b,x): if self.talking_head: attention_scores = self.head_logits_proj(attention_scores.permute(0,2,3,1)).permute(0,3,1,2) - #attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1) - nodex = torch.nn.Softmax(-1) - attention_probs = nodex(attention_scores + 10000.0*(attention_mask -1)) + if self.use_xsoftmax: + attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1) + else: + nodex = torch.nn.Softmax(-1) + attention_probs = nodex(attention_scores + 10000.0*(attention_mask -1)) attention_probs = self.dropout(attention_probs) if self.talking_head: attention_probs = self.head_weights_proj(attention_probs.permute(0,2,3,1)).permute(0,3,1,2) diff --git a/DeBERTa/deberta/pooling.py b/DeBERTa/deberta/pooling.py index 4fb4f43..d6cce03 100644 --- a/DeBERTa/deberta/pooling.py +++ b/DeBERTa/deberta/pooling.py @@ -58,8 +58,8 @@ def __init__(self, config=None): self.hidden_size = 768 self.dropout = 0 self.hidden_act = 'gelu' - self.use_xdropout = True - self.use_xsoftmax = True + self.use_xdropout = False + self.use_xsoftmax = False if config: pool_config = getattr(config, 'pooling', config) if isinstance(pool_config, dict): From 95ec7ad1e0ca533786690ca1170c637ae5184fa7 Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Tue, 29 Sep 2020 18:06:26 +0000 Subject: [PATCH 10/13] Added ORT Glue based tests --- DeBERTa/apps/orttrain.py | 234 ++++++++++ DeBERTa/deberta/__init__.py | 2 +- DeBERTa/deberta/bert.py | 1 + DeBERTa/deberta/gpt2_tokenizer.py | 38 +- DeBERTa/onnx/__init__.py | 5 + DeBERTa/onnx/orttraining_deberta.py | 167 ++++++++ .../onnx/orttraining_test_bert_postprocess.py | 5 + .../orttraining_test_layer_norm_transform.py | 177 ++++++++ .../onnx/orttraining_test_model_transform.py | 106 +++++ .../onnx/orttraining_transformer_trainer.py | 405 ++++++++++++++++++ 10 files changed, 1138 insertions(+), 2 deletions(-) create mode 100644 DeBERTa/apps/orttrain.py create mode 100644 DeBERTa/onnx/__init__.py create mode 100644 DeBERTa/onnx/orttraining_deberta.py create mode 100644 DeBERTa/onnx/orttraining_test_bert_postprocess.py create mode 100644 DeBERTa/onnx/orttraining_test_layer_norm_transform.py create mode 100644 DeBERTa/onnx/orttraining_test_model_transform.py create mode 100644 DeBERTa/onnx/orttraining_transformer_trainer.py diff --git a/DeBERTa/apps/orttrain.py b/DeBERTa/apps/orttrain.py new file mode 100644 index 0000000..2683fc6 --- /dev/null +++ b/DeBERTa/apps/orttrain.py @@ -0,0 +1,234 @@ +# Copyright (c) Microsoft, Inc. 2020 +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# + +import os +import argparse +import random + +import numpy as np +import torch +from ..deberta import GPT2Tokenizer, DebertaPreTrainedTokenizer +from ..onnx import ORTGlueTest +from ..utils import * +from .task_registry import tasks +from onnxruntime.capi._pybind_state import get_mpi_context_local_rank, get_mpi_context_local_size, get_mpi_context_world_rank, get_mpi_context_world_size + +def create_model(args, num_labels, model_class_fn): + # Prepare model + rank = getattr(args, 'rank', 0) + init_model = args.init_model if rank<1 else None + model = model_class_fn(init_model, args.model_config, num_labels=num_labels, \ + drop_out=args.cls_drop_out, \ + pre_trained = args.pre_trained) + if args.fp16: + model = model.half() + return model + +def main(args): + os.makedirs(args.output_dir, exist_ok=True) + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + + # load model based on task + tokenizer = GPT2Tokenizer() + processor = tasks[args.task_name.lower()](tokenizer = tokenizer, max_seq_len = args.max_seq_length, data_dir = args.data_dir) + label_list = processor.get_labels() + model_class_fn = processor.get_model_class_fn() + model = create_model(args, len(label_list), model_class_fn) + logger.info("Model config {}".format(model.config)) + + # train with ORT + test = ORTGlueTest() + test.setUp(args) + test.local_rank = get_mpi_context_local_rank() + test.world_size = get_mpi_context_world_size() + print("mpirun launch, local_rank / world_size: ", test.local_rank, test.world_size) + os.environ['RANK'] = str(test.local_rank) + os.environ['WORLD_SIZE'] = str(test.world_size) + os.environ['MASTER_ADDR'] = '127.0.0.1' + os.environ['MASTER_PORT'] = '29501' + test.model = model + test.tokenizer = DebertaPreTrainedTokenizer() + test.run_glue(task_name=args.task_name, fp16=False, use_new_api=True) + +def build_argument_parser(): + parser = argparse.ArgumentParser() + + ## Required parameters + parser.add_argument("--data_dir", + default=None, + type=str, + required=True, + help="The input data dir. Should contain the .tsv files (or other data files) for the task.") + parser.add_argument("--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train.") + parser.add_argument("--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model checkpoints will be written.") + parser.add_argument("--cache_dir", + default=None, + type=str, + required=True, + help="The directory to store the pretrained models downloaded from s3.") + + ## Other parameters + parser.add_argument("--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. \n" + "Sequences longer than this will be truncated, and sequences shorter \n" + "than this will be padded.") + parser.add_argument("--train_batch_size", + default=32, + type=int, + help="Total batch size for training.") + parser.add_argument("--eval_batch_size", + default=32, + type=int, + help="Total batch size for eval.") + parser.add_argument("--max_grad_norm", + default=1, + type=float, + help="The clip threshold of global gradient norm") + parser.add_argument("--learning_rate", + default=5e-5, + type=float, + help="The initial learning rate for Adam.") + parser.add_argument("--epsilon", + default=1e-6, + type=float, + help="epsilon setting for Adam.") + parser.add_argument("--adam_beta1", + default=0.9, + type=float, + help="The beta1 parameter for Adam.") + parser.add_argument("--adam_beta2", + default=0.999, + type=float, + help="The beta2 parameter for Adam.") + parser.add_argument("--num_train_epochs", + default=3.0, + type=float, + help="Total number of training epochs to perform.") + parser.add_argument("--warmup_proportion", + default=0.1, + type=float, + help="Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10%% of training.") + parser.add_argument("--lr_schedule_ends", + default=0, + type=float, + help="The ended learning rate scale for learning rate scheduling") + parser.add_argument("--lr_schedule", + default='warmup_linear', + type=str, + help="The learning rate scheduler used for traning. " + "E.g. warmup_linear, warmup_linear_shift, warmup_cosine, warmup_constant. Default, warmup_linear") + + parser.add_argument("--local_rank", + type=int, + default=-1, + help="local_rank for distributed training on gpus") + + parser.add_argument('--seed', + type=int, + default=1234, + help="random seed for initialization") + + parser.add_argument('--accumulative_update', + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.") + + parser.add_argument('--fp16', + default=False, + type=boolean_string, + help="Whether to use 16-bit float precision instead of 32-bit") + + parser.add_argument('--loss_scale', + type=float, default=256, + help='Loss scaling, positive power of 2 values can improve fp16 convergence.') + + parser.add_argument('--scale_steps', + type=int, default=1000, + help='The steps to wait to increase the loss scale.') + + parser.add_argument('--init_model', + type=str, + help="The model state file used to initialize the model weights.") + + parser.add_argument('--model_config', + type=str, + help="The config file of bert model.") + + parser.add_argument('--cls_drop_out', + type=float, + default=None, + help="The config file model initialization and fine tuning.") + parser.add_argument('--weight_decay', + type=float, + default=0.01, + help="The weight decay rate") + + parser.add_argument('--tag', + type=str, + default='final', + help="The tag name of current prediction/runs.") + + parser.add_argument("--dump_interval", + default=10000, + type=int, + help="Interval steps for generating checkpoint.") + + parser.add_argument('--lookahead_k', + default=-1, + type=int, + help="lookahead k parameter") + + parser.add_argument('--lookahead_alpha', + default=0.5, + type=float, + help="lookahead alpha parameter") + + parser.add_argument('--opt_type', + type=str.lower, + default='adam', + choices=['adam', 'admax'], + help="The optimizer to be used.") + + parser.add_argument('--workers', + type=int, + default=2, + help="The workers to load data.") + + parser.add_argument('--pre_trained', + default=None, + type=str, + help="The path of pre-trained RoBERTa model") + + return parser + +if __name__ == "__main__": + parser = build_argument_parser() + args = parser.parse_args() + logger = set_logger(args.task_name, os.path.join(args.output_dir, 'training_{}.log'.format(args.task_name))) + logger.info(args) + try: + main(args) + except Exception as ex: + try: + logger.exception(f'Uncatched exception happened during execution.') + import atexit + atexit._run_exitfuncs() + except: + pass + os._exit(-1) diff --git a/DeBERTa/deberta/__init__.py b/DeBERTa/deberta/__init__.py index 87d22dd..6450486 100644 --- a/DeBERTa/deberta/__init__.py +++ b/DeBERTa/deberta/__init__.py @@ -17,5 +17,5 @@ from .disentangled_attention import * from .ops import * from .bert import * -from .gpt2_tokenizer import GPT2Tokenizer +from .gpt2_tokenizer import GPT2Tokenizer, DebertaPreTrainedTokenizer from .config import * diff --git a/DeBERTa/deberta/bert.py b/DeBERTa/deberta/bert.py index ba817a0..c7f1cb6 100644 --- a/DeBERTa/deberta/bert.py +++ b/DeBERTa/deberta/bert.py @@ -145,6 +145,7 @@ class BertEncoder(nn.Module): def __init__(self, config): super().__init__() layer = BertLayer(config) + # Set number of layers here self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) self.relative_attention = getattr(config, 'relative_attention', False) if self.relative_attention: diff --git a/DeBERTa/deberta/gpt2_tokenizer.py b/DeBERTa/deberta/gpt2_tokenizer.py index 20acb75..006cad9 100644 --- a/DeBERTa/deberta/gpt2_tokenizer.py +++ b/DeBERTa/deberta/gpt2_tokenizer.py @@ -15,8 +15,9 @@ import os from .gpt2_bpe_utils import get_encoder,_is_control,_is_whitespace,_is_punctuation from .cache_utils import load_vocab +from transformers import PreTrainedTokenizer -__all__ = ['GPT2Tokenizer'] +__all__ = ['GPT2Tokenizer', 'DebertaPreTrainedTokenizer'] class GPT2Tokenizer(object): """ A wrapper of GPT2 tokenizer with similar interface as BERT tokenizer @@ -214,3 +215,38 @@ def add_symbol(self, word, n=1): def save_pretrained(self, path: str): torch.save(self.gpt2_encoder, path) + +class DebertaPreTrainedTokenizer(PreTrainedTokenizer): + def __init__( + self, + vocab_file=None, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + super().__init__( + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs, + ) + + self.GPT2Tokenizer = GPT2Tokenizer(vocab_file, do_lower_case, **kwargs) + + def _convert_token_to_id(self, token): + return self.GPT2Tokenizer.id(token) + + def _tokenize(self, text, **kwargs): + """ + Converts a string in a sequence of tokens (string), using the tokenizer. + Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies + (BPE/SentencePieces/WordPieces). + Do NOT take care of added tokens. + """ + return self.GPT2Tokenizer.tokenize(text) diff --git a/DeBERTa/onnx/__init__.py b/DeBERTa/onnx/__init__.py new file mode 100644 index 0000000..7181044 --- /dev/null +++ b/DeBERTa/onnx/__init__.py @@ -0,0 +1,5 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from .orttraining_deberta import ORTGlueTest \ No newline at end of file diff --git a/DeBERTa/onnx/orttraining_deberta.py b/DeBERTa/onnx/orttraining_deberta.py new file mode 100644 index 0000000..4cb5c27 --- /dev/null +++ b/DeBERTa/onnx/orttraining_deberta.py @@ -0,0 +1,167 @@ +# adapted from run_glue.py of huggingface transformers + +import dataclasses +import logging +import os +from dataclasses import dataclass, field +from typing import Dict, Optional +import unittest +import numpy as np +from numpy.testing import assert_allclose + +from transformers import ( + AutoConfig, + AutoModelForSequenceClassification, + AutoTokenizer, + EvalPrediction, + GlueDataset, + GlueDataTrainingArguments, + TrainingArguments, + glue_compute_metrics, + glue_output_modes, + glue_tasks_num_labels, + set_seed, +) + +import onnxruntime +from onnxruntime.capi.ort_trainer import ORTTrainer, LossScaler, ModelDescription, IODescription +from onnxruntime.capi._pybind_state import get_mpi_context_local_rank, get_mpi_context_local_size, get_mpi_context_world_rank, get_mpi_context_world_size + +from .orttraining_transformer_trainer import ORTTransformerTrainer + +import torch + +logger = logging.getLogger(__name__) + +class ORTGlueTest(unittest.TestCase): + + def setUp(self, args): + # configurations not to be changed accoss tests + self.max_seq_length = args.max_seq_length + self.train_batch_size = args.train_batch_size + self.learning_rate = args.learning_rate + self.num_train_epochs = args.num_train_epochs + self.local_rank = -1 + self.world_size = 1 + self.overwrite_output_dir = True + self.gradient_accumulation_steps = 1 + self.data_dir = args.data_dir + self.output_dir = args.output_dir + self.cache_dir = args.cache_dir + self.logging_steps = 100 + self.rtol = 1e-02 + self.seed = args.seed + + def model_to_desc(self): + batch_size = int(self.train_batch_size) # * self.world_size) + new_model_desc = { + 'inputs': [ + ('input_ids', ['batch', 'max_seq_len_in_batch'],), + ('token_type_ids', ['batch', 'max_seq_len_in_batch'],), + ('attention_mask', ['batch', 'max_seq_len_in_batch'],), + ('labels', ['batch', ],)], + 'outputs': [('loss', [], True), + ('logits', ['batch',])]} + model_desc = ModelDescription([ + IODescription('input_ids', ['batch', 'max_seq_len_in_batch']), + IODescription('token_type_ids', ['batch', 'max_seq_len_in_batch']), + #IODescription('position_ids', [batch_size, self.max_seq_length]), + IODescription('attention_mask', ['batch', 'max_seq_len_in_batch']), + IODescription('labels', ['batch',])], [ + IODescription('loss', []), + IODescription('logits', ['batch',])]) + + return model_desc, new_model_desc + + def run_glue(self, task_name, fp16, use_new_api): + data_args = GlueDataTrainingArguments( + task_name=task_name, data_dir=os.path.join(self.data_dir, task_name), + max_seq_length=self.max_seq_length) + + training_args = TrainingArguments( + output_dir=os.path.join(self.output_dir, task_name), do_train=True, do_eval=True, + per_gpu_train_batch_size=self.train_batch_size, + per_gpu_eval_batch_size = self.train_batch_size, + learning_rate=self.learning_rate, num_train_epochs=self.num_train_epochs, + local_rank=self.local_rank, + overwrite_output_dir=self.overwrite_output_dir, gradient_accumulation_steps=self.gradient_accumulation_steps, + fp16=fp16, logging_steps=self.logging_steps, + seed=self.seed) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + training_args.local_rank, + training_args.device, + training_args.n_gpu, + bool(training_args.local_rank != -1), + training_args.fp16, + ) + logger.info("Training/evaluation parameters %s", training_args) + + set_seed(training_args.seed) + onnxruntime.set_seed(training_args.seed) + + try: + num_labels = glue_tasks_num_labels[data_args.task_name] + output_mode = glue_output_modes[data_args.task_name] + except KeyError: + raise ValueError("Task not found: %s" % (data_args.task_name)) + + train_dataset = ( + GlueDataset(data_args, tokenizer=self.tokenizer) + if training_args.do_train + else None + ) + + eval_dataset = ( + GlueDataset(data_args, tokenizer=self.tokenizer, mode="dev") + if training_args.do_eval + else None + ) + + def compute_metrics(p: EvalPrediction) -> Dict: + if output_mode == "classification": + preds = np.argmax(p.predictions, axis=1) + elif output_mode == "regression": + preds = np.squeeze(p.predictions) + return glue_compute_metrics(data_args.task_name, preds, p.label_ids) + + model_desc, new_model_desc = self.model_to_desc() + # Initialize the ORTTrainer within ORTTransformerTrainer + trainer = ORTTransformerTrainer( + model=self.model, + model_desc=model_desc, + new_model_desc=new_model_desc, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + compute_metrics=compute_metrics, + use_new_api=use_new_api, + world_size=self.world_size, + ) + + # Training + if training_args.do_train: + trainer.train() + trainer.save_model() + + # Evaluation + results = {} + if training_args.do_eval and training_args.local_rank in [-1, 0]: + logger.info("*** Evaluate ***") + + result = trainer.evaluate() + + logger.info("***** Eval results {} *****".format(data_args.task_name)) + for key, value in result.items(): + logger.info(" %s = %s", key, value) + + results.update(result) + + return results \ No newline at end of file diff --git a/DeBERTa/onnx/orttraining_test_bert_postprocess.py b/DeBERTa/onnx/orttraining_test_bert_postprocess.py new file mode 100644 index 0000000..890db47 --- /dev/null +++ b/DeBERTa/onnx/orttraining_test_bert_postprocess.py @@ -0,0 +1,5 @@ +from .orttraining_test_model_transform import add_name, fix_transpose, add_expand_shape +from .orttraining_test_layer_norm_transform import layer_norm_transform + +def postprocess_model(model): + add_name(model) diff --git a/DeBERTa/onnx/orttraining_test_layer_norm_transform.py b/DeBERTa/onnx/orttraining_test_layer_norm_transform.py new file mode 100644 index 0000000..883d738 --- /dev/null +++ b/DeBERTa/onnx/orttraining_test_layer_norm_transform.py @@ -0,0 +1,177 @@ +import onnx + +def find_node(graph_proto, op_type): + nodes = [] + map_input_node = {} + for node in graph_proto.node: + if node.op_type == op_type: + map_input_node[node.input[0]] = node + if op_type == 'Div' or op_type == 'Mul': + map_input_node[node.input[1]] = node + nodes.append(node) + return nodes, map_input_node + +def gen_attribute(key, value): + attr = AttributeProto() + attr.name = key + attr.ints.extend(int(v) for v in value) + attr.type = AttributeProto.INTS + return attr + +def layer_norm_transform(model_proto): + # a layer norm subgraph + # input + # | + # ReduceMean + # __|____ + # | | + # Sub Sub + # | | + # | Pow + # | | + # | ReduceMean + # | | + # | Add + # | | + # |__ __Sqrt + # | | + # Div + # | + # Mul + # | + # Add + # | + # output + + graph_proto = model_proto.graph + + _, map_input_Div = find_node(graph_proto, 'Div') + + _, map_input_Sqrt = find_node(graph_proto, 'Sqrt') + + _, map_input_Add = find_node(graph_proto, 'Add') + + nodes_ReduceMean, map_input_ReduceMean = find_node(graph_proto, 'ReduceMean') + + _, map_input_Pow = find_node(graph_proto, 'Pow') + + _, map_input_Mul = find_node(graph_proto, 'Mul') + + # find right side Sub (see the layer norm subgrapg) + nodes_Sub = [] + map_input_Sub = {} + for node in graph_proto.node: + if node.op_type == 'Sub': + if node.output[0] in map_input_Pow: + nodes_Sub.append(node) + map_input_Sub[node.input[1]] = node + + # find first ReduceMean + first_ReduceMean = [] + first_ReduceMean_outputs = [] + for node in nodes_ReduceMean: + if node.output[0] in map_input_Sub: + first_ReduceMean.append(node) + first_ReduceMean_outputs.append(node.output[0]) + + # find constant node + nodes_Constant = [] + map_output_Constant = {} + for node in graph_proto.node: + if node.op_type == 'Constant': + nodes_Constant.append(node) + map_output_Constant[node.output[0]] = node + + id = 0 + removed_nodes = [] + layer_norm_nodes = [] + # Replace with layer norm + for node in first_ReduceMean: + layer_norm_input = [] + layer_norm_output = [] + layer_norm_input.append(node.input[0]) + + # collect nodes within a layer norm subgraph. + # skip building layer norm node if there is a pattern miss-match. + if node.output[0] not in map_input_Sub: + continue + + node_sub = map_input_Sub[node.output[0]] + if node_sub.output[0] not in map_input_Pow: + continue + + node_pow = map_input_Pow[node_sub.output[0]] + if node_pow.output[0] not in map_input_ReduceMean: + continue + + node_reduce = map_input_ReduceMean[node_pow.output[0]] + if node_reduce.output[0] not in map_input_Add: + continue + + node_Add = map_input_Add[node_reduce.output[0]] + if node_Add.output[0] not in map_input_Sqrt: + continue + + node_Sqrt = map_input_Sqrt[node_Add.output[0]] + if node_Sqrt.output[0] not in map_input_Div: + continue + + node_Div = map_input_Div[node_Sqrt.output[0]] + if node_Div.output[0] not in map_input_Mul: + continue + + node_Mul = map_input_Mul[node_Div.output[0]] + + if node_Mul.input[0] != node_Div.output[0]: + layer_norm_input.append(node_Mul.input[0]) + else: + layer_norm_input.append(node_Mul.input[1]) + + if node_Mul.output[0] not in map_input_Add: + continue + + node_Add1 = map_input_Add[node_Mul.output[0]] + layer_norm_input.append(node_Add1.input[1]) + + removed_nodes.append(node) + removed_nodes.append(node_sub) + removed_nodes.append(node_pow) + removed_nodes.append(node_reduce) + removed_nodes.append(node_Add) + removed_nodes.append(node_Sqrt) + removed_nodes.append(node_Div) + removed_nodes.append(node_Mul) + removed_nodes.append(node_Add1) + removed_nodes.append(map_output_Constant[node_pow.input[1]]) + + removed_nodes.append(map_output_Constant[node_Add.input[1]]) + layer_norm_output.append(node_Add1.output[0]) + id = id + 1 + layer_norm_output.append('saved_mean_' + str(id)) + id = id + 1 + layer_norm_output.append('saved_inv_std_var_' + str(id)) + layer_norm = onnx.helper.make_node("LayerNormalization", + layer_norm_input, + layer_norm_output, + "LayerNormalization_" + str(id), + None, + axis = node_reduce.attribute[0].ints[0], + epsilon = 9.999999960041972e-13) + layer_norm_nodes.append(layer_norm) + + # remove left side Subs + for node in graph_proto.node: + if node.op_type == 'Sub': + if node.input[1] in first_ReduceMean_outputs: + removed_nodes.append(node) + + all_nodes = [] + for node in graph_proto.node: + if node not in removed_nodes: + all_nodes.append(node) + + for node in layer_norm_nodes: + all_nodes.append(node) + + graph_proto.ClearField("node") + graph_proto.node.extend(all_nodes) diff --git a/DeBERTa/onnx/orttraining_test_model_transform.py b/DeBERTa/onnx/orttraining_test_model_transform.py new file mode 100644 index 0000000..9ef92aa --- /dev/null +++ b/DeBERTa/onnx/orttraining_test_model_transform.py @@ -0,0 +1,106 @@ +from onnx import numpy_helper + +def add_name(model): + i = 0 + for node in model.graph.node: + node.name = '%s_%d' %(node.op_type, i) + i += 1 + +def find_single_output_node(model, arg): + result = [] + for node in model.graph.node: + for input in node.input: + if input == arg: + result.append(node) + return result[0] if len(result) == 1 else None + +def find_input_as_initializer(model, arg): + for initializer in model.graph.initializer: + if initializer.name == arg: + return initializer + return None + +def get_node_index(model, node): + for i, n in enumerate(model.graph.node): + if n == node: + return i + return None + +def replace_input_arg(model, arg, new_arg): + for node in model.graph.node: + for i in range(len(node.input)): + if node.input[i] == arg: + node.input[i] = new_arg + +def find_weight_index(model, name): + for index, w in enumerate(model.graph.initializer): + if w.name == name: + return index + index += 1 + return None + +def fix_transpose(model): + """ + remove transpose node if its input is a 2d weight which only feeds to the node. + """ + + # Find transpose nodes with initializer weight as input. + # The input weight needs to be only feeded into the transpose node. + # Collect these nodes and weights. + transpose = [] + for node in model.graph.node: + if node.op_type == 'Transpose': + weight = find_input_as_initializer(model, node.input[0]) + if weight is not None: + result = [] + for n in model.graph.node: + for input in n.input: + if input == weight.name: + result.append(n) + if len(result) > 1: + continue + perm = node.attribute[0] + assert perm.name == 'perm' + perm = perm.ints + assert len(perm) == 2 and perm[0] == 1 and perm[1] == 0 + transpose.append((get_node_index(model, node), weight)) + + # Transpose collected weights and add it to the model initializers. + # The transposed weight initializers become inputs to the transpose nodes' recipient nodes. + for t in transpose: + node = model.graph.node[t[0]] + weight = numpy_helper.to_array(t[1]) + assert len(weight.shape) == 2 + weight = weight.transpose(perm) + new_weight = numpy_helper.from_array(weight, "%s_transposed" % t[1].name) + model.graph.initializer.extend([new_weight]) + replace_input_arg(model, node.output[0], new_weight.name) + + # collected transpose nodes can be removed. + transpose.sort(reverse=True) + for t in transpose: + del model.graph.node[t[0]] + + # the original weight initializer can be removed. + # (remember that a wight needs only to be feeded into the transpose node when collecting wights) + old_ws = [] + for t in transpose: + if find_single_output_node(model, t[1].name) is None: + old_ws.append(find_weight_index(model, t[1].name)) + old_ws.sort(reverse=True) + for w_i in old_ws: + del model.graph.initializer[w_i] + +def add_expand_shape(model): + """ + this method is very specific to the Bert model where there is a solo Expand op. + training backend requires the op's output shape. it is the same as the shape of the model (single) input. + """ + + expand_node = [n for n in model.graph.node if n.op_type == 'Expand'] + if len(expand_node) != 1: + raise "cannot find the single expand node in the BERT model." + return + expand_out = model.graph.value_info.add() + expand_out.name = expand_node[0].output[0] # base: '421' # tiny: '85' + expand_out.type.CopyFrom(model.graph.input[0].type) \ No newline at end of file diff --git a/DeBERTa/onnx/orttraining_transformer_trainer.py b/DeBERTa/onnx/orttraining_transformer_trainer.py new file mode 100644 index 0000000..ffd81e6 --- /dev/null +++ b/DeBERTa/onnx/orttraining_transformer_trainer.py @@ -0,0 +1,405 @@ +# adapted from Trainer.py of huggingface transformers + +import json +import logging +import os +import random + +from typing import Callable, Dict, List, NamedTuple, Optional, Tuple + +import numpy as np +import torch +from torch import nn +from torch.utils.data.dataloader import DataLoader +from torch.utils.data.dataset import Dataset +from torch.utils.data.distributed import DistributedSampler +from torch.utils.data.sampler import RandomSampler, SequentialSampler +from tqdm import tqdm, trange + +from transformers.data.data_collator import DataCollator, DefaultDataCollator +from transformers.modeling_utils import PreTrainedModel +from transformers.training_args import TrainingArguments + +import onnxruntime +from .orttraining_test_bert_postprocess import postprocess_model +from onnxruntime.capi.ort_trainer import ORTTrainer, LossScaler, ModelDescription, IODescription + +from onnxruntime.training import _utils, amp, optim, orttrainer, TrainStepInfo,\ + model_desc_validation as md_val,\ + orttrainer_options as orttrainer_options +from onnxruntime.training.optim import LinearWarmupLRScheduler, _LRScheduler + +try: + from torch.utils.tensorboard import SummaryWriter + + _has_tensorboard = True +except ImportError: + try: + from tensorboardX import SummaryWriter + + _has_tensorboard = True + except ImportError: + _has_tensorboard = False + + +def is_tensorboard_available(): + return _has_tensorboard + + +logger = logging.getLogger(__name__) + + +def set_seed(seed: int): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + onnxruntime.set_seed(seed) + +class EvalPrediction(NamedTuple): + predictions: np.ndarray + label_ids: np.ndarray + + +class PredictionOutput(NamedTuple): + predictions: np.ndarray + label_ids: Optional[np.ndarray] + metrics: Optional[Dict[str, float]] + + +class TrainOutput(NamedTuple): + global_step: int + training_loss: float + +def get_linear_schedule_with_warmup(num_warmup_steps, num_training_steps, base_lr): + + def lr_lambda_linear(current_step): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + return max( + 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)) + ) + + def lambda_lr_get_lr(current_global_step): + # LambdaLR increment self.last_epoch at evert sept() + return base_lr * lr_lambda_linear(current_global_step) + + return lambda_lr_get_lr + + +class ORTTransformerTrainer: + """ + """ + + model: PreTrainedModel + args: TrainingArguments + train_dataset: Dataset + eval_dataset: Dataset + compute_metrics: Callable[[EvalPrediction], Dict] + + def __init__( + self, + model: PreTrainedModel, + model_desc: ModelDescription, + new_model_desc: dict, + args: TrainingArguments, + train_dataset: Dataset, + eval_dataset: Dataset, + compute_metrics: Callable[[EvalPrediction], Dict], + world_size: Optional[int] = 1, + use_new_api : Optional[bool] = False, + ): + """ + """ + + self.model = model + self.model_desc = model_desc + self.new_model_desc = new_model_desc + self.args = args + self.world_size = world_size + self.data_collator = DefaultDataCollator() + self.train_dataset = train_dataset + self.eval_dataset = eval_dataset + self.compute_metrics = compute_metrics + set_seed(self.args.seed) + # Create output directory if needed + if self.args.local_rank in [-1, 0]: + os.makedirs(self.args.output_dir, exist_ok=True) + + self.use_new_api = use_new_api + + def get_train_dataloader(self) -> DataLoader: + if self.train_dataset is None: + raise ValueError("Trainer: training requires a train_dataset.") + train_sampler = ( + SequentialSampler(self.train_dataset) if self.args.local_rank == -1 else DistributedSampler(self.train_dataset) + ) + return DataLoader( + self.train_dataset, + batch_size=self.args.train_batch_size, + sampler=train_sampler, + #drop_last=True, + collate_fn=self.data_collator.collate_batch, + ) + + def get_eval_dataloader(self) -> DataLoader: + return DataLoader( + self.eval_dataset, + batch_size=self.args.eval_batch_size, + shuffle=False, + #drop_last=True, + collate_fn=self.data_collator.collate_batch, + ) + + def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: + # We use the same batch_size as for eval. + return DataLoader( + test_dataset, + batch_size=self.args.eval_batch_size, + shuffle=False, + #drop_last=True, + collate_fn=self.data_collator.collate_batch, + ) + + + def train(self): + """ + Main training entry point. + """ + train_dataloader = self.get_train_dataloader() + + if self.args.max_steps > 0: + t_total = self.args.max_steps + num_train_epochs = ( + self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1 + ) + else: + t_total = int(len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs) + num_train_epochs = self.args.num_train_epochs + + if self.use_new_api: + lr_scheduler = orttrainer.optim.LinearWarmupLRScheduler(t_total, self.args.warmup_steps/float(t_total)) + + loss_scaler = amp.DynamicLossScaler() if self.args.fp16 else None + device = self.args.device.type + + device = f'{device}:{self.args.device.index}' if self.args.device.index else f'{device}:0' + options = orttrainer.ORTTrainerOptions({'batch' : { + 'gradient_accumulation_steps' : self.args.gradient_accumulation_steps}, + 'device': {'id': device}, + 'mixed_precision': { + 'enabled': self.args.fp16, + 'loss_scaler': loss_scaler}, + 'debug': {'deterministic_compute': True, }, + 'utils': { + 'grad_norm_clip': False}, + 'distributed': { + # we are running single node multi gpu test. thus world_rank = local_rank + # and world_size = self.args.n_gpu + 'world_rank': max(0, self.args.local_rank), + 'world_size': int(self.world_size), + 'local_rank': max(0, self.args.local_rank), + 'allreduce_post_accumulation': True}, + 'lr_scheduler': lr_scheduler + }) + + param_optimizer = list(self.model.named_parameters()) + params = [{ + 'params': [n for n, p in param_optimizer if "bias" in n or "LayerNorm.weight" in n], + "weight_decay_mode": 1, }, { + 'params': [n for n, p in param_optimizer if not ("bias" in n or "LayerNorm.weight" in n)], + "weight_decay_mode": 1, } + ] + + optim_config = optim.AdamConfig(params=params, lr=2e-5, do_bias_correction=True) + self.model = orttrainer.ORTTrainer(self.model, self.new_model_desc, optim_config, options=options) + else: + def map_optimizer_attributes(name): + no_decay = "bias" in name or "LayerNorm.weight" in name + if no_decay: + return {"weight_decay_mode" : 1} + else: + return {"weight_decay_mode" : 1} + get_lr_this_step = get_linear_schedule_with_warmup(self.args.warmup_steps, t_total, self.args.learning_rate) + loss_scaler = LossScaler('loss_scale_input_name', True, up_scale_window=2000) if self.args.fp16 else None + self.model = ORTTrainer(self.model, None, + self.model_desc, + "AdamOptimizer", + map_optimizer_attributes=map_optimizer_attributes, + learning_rate_description=IODescription('Learning_Rate', [1,], torch.float32), + device=self.args.device, + gradient_accumulation_steps=self.args.gradient_accumulation_steps, + world_rank=max(0, self.args.local_rank), + world_size=int(self.world_size), + use_mixed_precision=self.args.fp16, + allreduce_post_accumulation=True, + get_lr_this_step=get_lr_this_step, + loss_scaler=loss_scaler, + enable_grad_norm_clip=False, + _opset_version=12, + _use_deterministic_compute=True) + + # Train! + logger.info("***** Running training *****") + logger.info(" Num examples = %d", len(train_dataloader.dataset)) + logger.info(" Num Epochs = %d", num_train_epochs) + logger.info(" Instantaneous batch size per GPU = %d", self.args.per_gpu_train_batch_size) + logger.info( + " Total train batch size (w. parallel, distributed & accumulation) = %d", + self.args.train_batch_size + * self.args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1), + ) + logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps) + logger.info(" Total optimization steps = %d", t_total) + + global_step = 0 + epochs_trained = 0 + steps_trained_in_current_epoch = 0 + + tr_loss = 0.0 + logging_loss = 0.0 + train_iterator = trange( + epochs_trained, int(num_train_epochs), desc="Epoch", disable=self.args.local_rank not in [-1, 0], + ) + + for epoch in train_iterator: + epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=self.args.local_rank not in [-1, 0]) + for step, inputs in enumerate(epoch_iterator): + + # Skip past any already trained steps if resuming training + if steps_trained_in_current_epoch > 0: + steps_trained_in_current_epoch -= 1 + continue + + tr_loss += self._training_step(self.model, inputs) + + if (step + 1) % self.args.gradient_accumulation_steps == 0 or ( + len(epoch_iterator) <= self.args.gradient_accumulation_steps + and (step + 1) == len(epoch_iterator) + ): + global_step += 1 + + if self.args.local_rank in [-1, 0]: + if (self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0) or ( + global_step == 1 and self.args.logging_first_step + ): + logs = {} + if self.args.evaluate_during_training: + results = self.evaluate() + for key, value in results.items(): + eval_key = "eval_{}".format(key) + logs[eval_key] = value + + loss_scalar = (tr_loss - logging_loss) / self.args.logging_steps + if not self.use_new_api: + learning_rate_scalar = get_lr_this_step(global_step) + logs["learning_rate"] = learning_rate_scalar + logs["loss"] = loss_scalar + logging_loss = tr_loss + + epoch_iterator.write(json.dumps({**logs, **{"step": global_step}})) + + if self.args.max_steps > 0 and global_step > self.args.max_steps: + epoch_iterator.close() + break + if self.args.max_steps > 0 and global_step > self.args.max_steps: + train_iterator.close() + break + + logger.info("\n\nTraining completed. \n\n") + return TrainOutput(global_step, tr_loss / global_step) + + def _training_step( + self, model, inputs: Dict[str, torch.Tensor]) -> float: + for k, v in inputs.items(): + inputs[k] = v.to(self.args.device) + + outputs = model.train_step(**inputs) + loss = outputs[0] # model outputs are always tuple in transformers (see doc) + + return loss.item() + + def save_model(self, output_dir: Optional[str] = None): + output_dir = output_dir if output_dir is not None else self.args.output_dir + os.makedirs(output_dir, exist_ok=True) + self.model.save_as_onnx(os.path.join(output_dir, "transformer.onnx")) + + def evaluate(self) -> Dict[str, float]: + """ + Run evaluation and return metrics. + + Returns: + A dict containing: + - the eval loss + - the potential metrics computed from the predictions + """ + eval_dataloader = self.get_eval_dataloader() + + output = self._prediction_loop(eval_dataloader, description="Evaluation") + return output.metrics + + def predict(self, test_dataset: Dataset) -> PredictionOutput: + """ + Run prediction and return predictions and potential metrics. + + Depending on the dataset and your use case, your test dataset may contain labels. + In that case, this method will also return metrics, like in evaluate(). + """ + test_dataloader = self.get_test_dataloader(test_dataset) + return self._prediction_loop(test_dataloader, description="Prediction") + + def _prediction_loop( + self, dataloader: DataLoader, description: str + ) -> PredictionOutput: + """ + Prediction/evaluation loop, shared by `evaluate()` and `predict()`. + + Works both with or without labels. + """ + + logger.info("***** Running %s *****", description) + logger.info(" Num examples = %d", len(dataloader.dataset)) + logger.info(" Batch size = %d", dataloader.batch_size) + eval_losses: List[float] = [] + preds: np.ndarray = None + label_ids: np.ndarray = None + + if not self.use_new_api: + self.model.eval() + + for inputs in tqdm(dataloader, desc=description): + has_labels = any(inputs.get(k) is not None for k in ["labels", "masked_lm_labels"]) + + for k, v in inputs.items(): + inputs[k] = v.to(self.args.device) + + with torch.no_grad(): + if self.use_new_api: + outputs = self.model.eval_step(**inputs) + else: + outputs = self.model(**inputs) + if has_labels: + step_eval_loss, logits = outputs[:2] + eval_losses += [step_eval_loss.mean().item()] + else: + logits = outputs[0] + + if preds is None: + preds = logits.detach().cpu().numpy() + else: + preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) + if inputs.get("labels") is not None: + if label_ids is None: + label_ids = inputs["labels"].detach().cpu().numpy() + else: + label_ids = np.append(label_ids, inputs["labels"].detach().cpu().numpy(), axis=0) + + if self.compute_metrics is not None and preds is not None and label_ids is not None: + metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) + else: + metrics = {} + if len(eval_losses) > 0: + metrics["loss"] = np.mean(eval_losses) + + return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics) From bf8a3cee885030a61b694ab30a466fbcb3b40e1c Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Tue, 29 Sep 2020 18:15:54 +0000 Subject: [PATCH 11/13] remove onnx path in train.py --- DeBERTa/apps/train.py | 74 ++--------------------- DeBERTa/deberta/disentangled_attention.py | 1 - DeBERTa/deberta/ops.py | 1 + DeBERTa/onnx/__init__.py | 2 +- 4 files changed, 7 insertions(+), 71 deletions(-) diff --git a/DeBERTa/apps/train.py b/DeBERTa/apps/train.py index 28808fb..de96c46 100644 --- a/DeBERTa/apps/train.py +++ b/DeBERTa/apps/train.py @@ -24,10 +24,9 @@ from ..utils import * from ..utils import xtqdm as tqdm from .task_registry import tasks -from onnxruntime.capi.ort_trainer import ORTTrainer, IODescription, ModelDescription, LossScaler from ..training import DistributedTrainer, initialize_distributed, batch_to, set_random_seed,kill_children -from ..data import DistributedBatchSampler, SequentialSampler, BatchSampler, RandomSampler, AsyncDataLoader +from ..data import DistributedBatchSampler, SequentialSampler, BatchSampler, AsyncDataLoader def create_model(args, num_labels, model_class_fn): # Prepare model @@ -218,64 +217,9 @@ def run_predict(args, model, device, eval_data, prefix=None): if predict_fn: predict_fn(predicts, args.output_dir, name, prefix) -def deberta_model_description(args): - vocab_size = 30528 - # set concrete input sizes to permit optimization - input_ids_desc = IODescription('input_ids', [args.train_batch_size, args.max_seq_length], torch.int32, num_classes=vocab_size) - type_ids_desc = IODescription('type_ids', [args.train_batch_size, args.max_seq_length], torch.int32) # num_classes=? - position_ids_desc = IODescription('position_ids', [args.train_batch_size, args.max_seq_length], torch.int32) # num_classes=? - input_mask_desc = IODescription('input_mask', [args.train_batch_size, args.max_seq_length], torch.int32) # num_classes=? - labels_desc = IODescription('labels', [args.train_batch_size, args.max_seq_length], torch.float32) # num_classes=? - - loss_desc = IODescription('loss', [], torch.float32) - return ModelDescription([input_ids_desc, type_ids_desc, position_ids_desc, input_mask_desc, labels_desc], [loss_desc]) - -def create_ort_trainer(args, device, model): - # default initial settings: b1=0.9, b2=0.999, e=1e-6 - def map_optimizer_attributes(name): - no_decay_keys = ["bias", "gamma", "beta", "LayerNorm"] - no_decay = False - for no_decay_key in no_decay_keys: - if no_decay_key in name: - no_decay = True - break - if no_decay: - return {"alpha": 0.9, "beta": 0.999, "lambda": 0.0, "epsilon": 1e-6} - else: - return {"alpha": 0.9, "beta": 0.999, "lambda": 0.01, "epsilon": 1e-6} - - # we request ORTTrainer to create a LambOptimizer with given optimizer_attributes. - # train_step does forward, backward, and optimize step. - model = ORTTrainer(model, None, deberta_model_description(args), "LambOptimizer", - map_optimizer_attributes, - IODescription('Learning_Rate', [1,], torch.float32), - device, - _opset_version = 12) - - return model - -def run_onnx_training(args, model, device, train_data, prefix=None): - # runs training in ONNX - trainer = create_ort_trainer(args, device, model) - train_sampler = RandomSampler(len(train_data)) - batch_sampler = BatchSampler(train_sampler, args.train_batch_size) - batch_sampler = DistributedBatchSampler(batch_sampler, rank=args.rank, world_size=args.world_size) - train_dataloader = DataLoader(train_data, batch_sampler=batch_sampler, num_workers=args.workers, pin_memory=True) - torch.cuda.empty_cache() - for step, batch in enumerate(AsyncDataLoader(train_dataloader, 100)): - #import pdb - #pdb.set_trace() - lr = torch.tensor([0.0000000e+00]).to(device) - batch = batch_to(batch, device) - with torch.no_grad(): - trainer.train_step(batch['input_ids'], batch['type_ids'], batch['position_ids'], batch['input_mask'], batch['labels'], lr) - # conversion fails now with: - # site-packages/torch/onnx/utils.py:617: UserWarning: ONNX export failed on ATen operator broadcast_tensors - # because torch.onnx.symbolic_opset10.broadcast_tensors does not exist - def main(args): - if not args.do_train and not args.do_eval and not args.do_predict and not args.do_onnx: - raise ValueError("At least one of `do_train` or `do_eval` or `do_predict` or `do_onnx` must be True.") + if not args.do_train and not args.do_eval and not args.do_predict: + raise ValueError("At least one of `do_train` or `do_eval` or `do_predict` must be True.") os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() random.seed(args.seed) @@ -292,11 +236,11 @@ def main(args): test_data = processor.test_data(max_seq_len=args.max_seq_length) logger.info(" Prediction batch size = %d", args.predict_batch_size) - if args.do_train or args.do_onnx: + if args.do_train: train_data = processor.train_data(max_seq_len=args.max_seq_length, mask_gen = None, debug=args.debug) model_class_fn = processor.get_model_class_fn() model = create_model(args, len(label_list), model_class_fn) - if args.do_train or args.do_onnx: + if args.do_train: with open(os.path.join(args.output_dir, 'model_config.json'), 'w', encoding='utf-8') as fs: fs.write(model.config.to_json_string() + '\n') logger.info("Model config {}".format(model.config)) @@ -313,10 +257,6 @@ def main(args): if args.do_predict: run_predict(args, model, device, test_data, prefix=args.tag) - # trains in ONNX - if args.do_onnx: - run_onnx_training(args, model, device, train_data, prefix=args.tag) - def build_argument_parser(): parser = argparse.ArgumentParser() @@ -498,10 +438,6 @@ def build_argument_parser(): type=str, help="The path of pre-trained RoBERTa model") - parser.add_argument("--do_onnx", - default=False, - action='store_true', - help="Whether to run training in ONNX") return parser if __name__ == "__main__": diff --git a/DeBERTa/deberta/disentangled_attention.py b/DeBERTa/deberta/disentangled_attention.py index 87f280b..ec18cf4 100644 --- a/DeBERTa/deberta/disentangled_attention.py +++ b/DeBERTa/deberta/disentangled_attention.py @@ -77,7 +77,6 @@ def __init__(self, config): self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size self.in_proj = torch.nn.Linear(config.hidden_size, self.all_head_size*3, bias=False) - # ONNX graph builder thinks params below are not used for loss calcualtion self.q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float)) self.v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float)) self.pos_att_type = [x.strip() for x in getattr(config, 'pos_att_type', 'none').lower().split('|')] # c2p|p2c diff --git a/DeBERTa/deberta/ops.py b/DeBERTa/deberta/ops.py index a1ba5fd..a18515f 100644 --- a/DeBERTa/deberta/ops.py +++ b/DeBERTa/deberta/ops.py @@ -6,6 +6,7 @@ # Author: penhe@microsoft.com # Date: 01/15/2020 # + import math from packaging import version import torch diff --git a/DeBERTa/onnx/__init__.py b/DeBERTa/onnx/__init__.py index 7181044..a821423 100644 --- a/DeBERTa/onnx/__init__.py +++ b/DeBERTa/onnx/__init__.py @@ -2,4 +2,4 @@ from __future__ import division from __future__ import print_function -from .orttraining_deberta import ORTGlueTest \ No newline at end of file +from .orttraining_deberta import ORTGlueTest From c71818bafb68a4339e8aef423ecd285dfcdd7772 Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Tue, 29 Sep 2020 18:54:53 +0000 Subject: [PATCH 12/13] Add Readme --- DeBERTa/apps/orttrain.py | 92 ++++++++--------------------- DeBERTa/apps/train.py | 1 - DeBERTa/onnx/README.md | 47 +++++++++++++++ DeBERTa/onnx/orttraining_deberta.py | 3 +- 4 files changed, 75 insertions(+), 68 deletions(-) create mode 100644 DeBERTa/onnx/README.md diff --git a/DeBERTa/apps/orttrain.py b/DeBERTa/apps/orttrain.py index 2683fc6..237057a 100644 --- a/DeBERTa/apps/orttrain.py +++ b/DeBERTa/apps/orttrain.py @@ -79,8 +79,7 @@ def build_argument_parser(): type=str, required=True, help="The directory to store the pretrained models downloaded from s3.") - - ## Other parameters + ## Other parameters, parser.add_argument("--max_seq_length", default=128, type=int, @@ -95,14 +94,35 @@ def build_argument_parser(): default=32, type=int, help="Total batch size for eval.") - parser.add_argument("--max_grad_norm", - default=1, - type=float, - help="The clip threshold of global gradient norm") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") + parser.add_argument("--num_train_epochs", + default=3.0, + type=float, + help="Total number of training epochs to perform.") + parser.add_argument('--seed', + type=int, + default=1234, + help="random seed for initialization") + parser.add_argument('--fp16', + default=False, + type=boolean_string, + help="Whether to use 16-bit float precision instead of 32-bit") + parser.add_argument('--init_model', + type=str, + help="The model state file used to initialize the model weights.") + parser.add_argument('--pre_trained', + default=None, + type=str, + help="The path of pre-trained RoBERTa model") + + ## TBD: review params below + parser.add_argument("--max_grad_norm", + default=1, + type=float, + help="The clip threshold of global gradient norm") parser.add_argument("--epsilon", default=1e-6, type=float, @@ -115,10 +135,6 @@ def build_argument_parser(): default=0.999, type=float, help="The beta2 parameter for Adam.") - parser.add_argument("--num_train_epochs", - default=3.0, - type=float, - help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, @@ -133,43 +149,19 @@ def build_argument_parser(): type=str, help="The learning rate scheduler used for traning. " "E.g. warmup_linear, warmup_linear_shift, warmup_cosine, warmup_constant. Default, warmup_linear") - - parser.add_argument("--local_rank", - type=int, - default=-1, - help="local_rank for distributed training on gpus") - - parser.add_argument('--seed', - type=int, - default=1234, - help="random seed for initialization") - parser.add_argument('--accumulative_update', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") - - parser.add_argument('--fp16', - default=False, - type=boolean_string, - help="Whether to use 16-bit float precision instead of 32-bit") - parser.add_argument('--loss_scale', type=float, default=256, help='Loss scaling, positive power of 2 values can improve fp16 convergence.') - parser.add_argument('--scale_steps', type=int, default=1000, help='The steps to wait to increase the loss scale.') - - parser.add_argument('--init_model', - type=str, - help="The model state file used to initialize the model weights.") - parser.add_argument('--model_config', type=str, help="The config file of bert model.") - parser.add_argument('--cls_drop_out', type=float, default=None, @@ -178,43 +170,11 @@ def build_argument_parser(): type=float, default=0.01, help="The weight decay rate") - - parser.add_argument('--tag', - type=str, - default='final', - help="The tag name of current prediction/runs.") - - parser.add_argument("--dump_interval", - default=10000, - type=int, - help="Interval steps for generating checkpoint.") - - parser.add_argument('--lookahead_k', - default=-1, - type=int, - help="lookahead k parameter") - - parser.add_argument('--lookahead_alpha', - default=0.5, - type=float, - help="lookahead alpha parameter") - parser.add_argument('--opt_type', type=str.lower, default='adam', choices=['adam', 'admax'], help="The optimizer to be used.") - - parser.add_argument('--workers', - type=int, - default=2, - help="The workers to load data.") - - parser.add_argument('--pre_trained', - default=None, - type=str, - help="The path of pre-trained RoBERTa model") - return parser if __name__ == "__main__": diff --git a/DeBERTa/apps/train.py b/DeBERTa/apps/train.py index de96c46..d3c5c77 100644 --- a/DeBERTa/apps/train.py +++ b/DeBERTa/apps/train.py @@ -437,7 +437,6 @@ def build_argument_parser(): default=None, type=str, help="The path of pre-trained RoBERTa model") - return parser if __name__ == "__main__": diff --git a/DeBERTa/onnx/README.md b/DeBERTa/onnx/README.md new file mode 100644 index 0000000..4965593 --- /dev/null +++ b/DeBERTa/onnx/README.md @@ -0,0 +1,47 @@ +# DeBERTa: Fine-tuning with ONNX Runtime. + +## Requirements +- All the DeBERTA requirements +- onnx +- onnxruntime + +### Workaround fixes +- The workaround is needed until MSE operator becomes available in ORT + vi $PYTHONPATH/site-packages/torch/nn/functional.py + search for "def mse_loss" + proceed to lines + else: + expanded_input, expanded_target = torch.broadcast_tensors(input, target) + ret = torch._C._nn.mse_loss(expanded_input, expanded_target, _Reduction.get_enum(reduction)) + and change them to: + expanded_input = input + expanded_target = target + t = expanded_input - expanded_target + t = t ** 2 + ret = torch.mean(t) + +- The workaround is needed until fix is available to disable Unsqueeze optimization for trainable weights in ORT + Changes in onnx runtime code: + Open onnxruntime/onnxruntime/core/graph/graph_utils.cc + Hardcode to return false to disable Unsqueeze optimization for DeBERTa, see below + if (output_name_is_changing) { + std::vector output_edges = GetNodeOutputEdges(node); + can_remove = CanUpdateImplicitInputNameInSubgraphs(graph, output_edges, initializer_name, logger); + can_remove = false; // <- Put this line in + +## Run task + +``` bash +task=STS-B +OUTPUT=/tmp/DeBERTa/exps/$task +python3 -m DeBERTa.apps.orttrain --task_name $task \ + --data_dir $cache_dir/glue_tasks/$task \ + --output_dir $OUTPUT \ + --eval_batch_size 128 \ + --train_batch_size 32 \ + --num_train_epochs 6 \ + --learning_rate 2e-5 \ + --max_seq_len 128 \ + --init_model base \ + --seed 123 +``` \ No newline at end of file diff --git a/DeBERTa/onnx/orttraining_deberta.py b/DeBERTa/onnx/orttraining_deberta.py index 4cb5c27..5bf7893 100644 --- a/DeBERTa/onnx/orttraining_deberta.py +++ b/DeBERTa/onnx/orttraining_deberta.py @@ -39,6 +39,7 @@ def setUp(self, args): # configurations not to be changed accoss tests self.max_seq_length = args.max_seq_length self.train_batch_size = args.train_batch_size + self.eval_batch_size = args.eval_batch_size self.learning_rate = args.learning_rate self.num_train_epochs = args.num_train_epochs self.local_rank = -1 @@ -81,7 +82,7 @@ def run_glue(self, task_name, fp16, use_new_api): training_args = TrainingArguments( output_dir=os.path.join(self.output_dir, task_name), do_train=True, do_eval=True, per_gpu_train_batch_size=self.train_batch_size, - per_gpu_eval_batch_size = self.train_batch_size, + per_gpu_eval_batch_size = self.eval_batch_size, learning_rate=self.learning_rate, num_train_epochs=self.num_train_epochs, local_rank=self.local_rank, overwrite_output_dir=self.overwrite_output_dir, gradient_accumulation_steps=self.gradient_accumulation_steps, From 7eff1fd75b8c626a0cb506334cc1f1280e152596 Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Tue, 29 Sep 2020 19:30:35 +0000 Subject: [PATCH 13/13] Use random seed by default --- DeBERTa/apps/orttrain.py | 3 ++- DeBERTa/onnx/orttraining_transformer_trainer.py | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/DeBERTa/apps/orttrain.py b/DeBERTa/apps/orttrain.py index 237057a..89cca11 100644 --- a/DeBERTa/apps/orttrain.py +++ b/DeBERTa/apps/orttrain.py @@ -29,6 +29,7 @@ def create_model(args, num_labels, model_class_fn): def main(args): os.makedirs(args.output_dir, exist_ok=True) + logger.info("Using seed " + str(args.seed)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) @@ -104,7 +105,7 @@ def build_argument_parser(): help="Total number of training epochs to perform.") parser.add_argument('--seed', type=int, - default=1234, + default=random.randint(0, 2**32 - 1), help="random seed for initialization") parser.add_argument('--fp16', default=False, diff --git a/DeBERTa/onnx/orttraining_transformer_trainer.py b/DeBERTa/onnx/orttraining_transformer_trainer.py index ffd81e6..540cfef 100644 --- a/DeBERTa/onnx/orttraining_transformer_trainer.py +++ b/DeBERTa/onnx/orttraining_transformer_trainer.py @@ -138,7 +138,6 @@ def get_train_dataloader(self) -> DataLoader: self.train_dataset, batch_size=self.args.train_batch_size, sampler=train_sampler, - #drop_last=True, collate_fn=self.data_collator.collate_batch, ) @@ -147,7 +146,6 @@ def get_eval_dataloader(self) -> DataLoader: self.eval_dataset, batch_size=self.args.eval_batch_size, shuffle=False, - #drop_last=True, collate_fn=self.data_collator.collate_batch, ) @@ -157,7 +155,6 @@ def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: test_dataset, batch_size=self.args.eval_batch_size, shuffle=False, - #drop_last=True, collate_fn=self.data_collator.collate_batch, )