From 498db462cc9626905ba443d03d2f1a817fd78bad Mon Sep 17 00:00:00 2001
From: ganik <ganinz@hotmail.com>
Date: Mon, 20 Jul 2020 22:35:18 +0000
Subject: [PATCH 01/13] Replace with Dropout and Softmax

---
 DeBERTa/deberta/disentangled_attention.py | 4 +++-
 DeBERTa/deberta/ops.py                    | 6 +++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/DeBERTa/deberta/disentangled_attention.py b/DeBERTa/deberta/disentangled_attention.py
index 262425b..6f281ca 100644
--- a/DeBERTa/deberta/disentangled_attention.py
+++ b/DeBERTa/deberta/disentangled_attention.py
@@ -174,7 +174,9 @@ def linear(w,b,x):
         if self.talking_head:
             attention_scores = self.head_logits_proj(attention_scores.permute(0,2,3,1)).permute(0,3,1,2)
 
-        attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)
+        #attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)
+        nodex = torch.nn.Softmax(-1)
+        attention_probs = nodex(attention_scores)
         attention_probs = self.dropout(attention_probs)
         if self.talking_head:
             attention_probs = self.head_weights_proj(attention_probs.permute(0,2,3,1)).permute(0,3,1,2)
diff --git a/DeBERTa/deberta/ops.py b/DeBERTa/deberta/ops.py
index a18515f..08afda1 100644
--- a/DeBERTa/deberta/ops.py
+++ b/DeBERTa/deberta/ops.py
@@ -115,7 +115,11 @@ def backward(ctx, grad_output):
     else:
       return grad_output, None
 
-class StableDropout(torch.nn.Module):
+class StableDropout(torch.nn.Dropout):
+  def __init__(self, drop_prob):
+      super().__init__()
+
+class StableDropout1(torch.nn.Module):
   """ Optimized dropout module for stabilizing the training
 
   Args:

From 3be82890d9502c435093685996bd89f83ebe2c85 Mon Sep 17 00:00:00 2001
From: ganik <ganinz@hotmail.com>
Date: Tue, 21 Jul 2020 23:04:16 +0000
Subject: [PATCH 02/13] mask attention scores in Softmax

---
 DeBERTa/deberta/disentangled_attention.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DeBERTa/deberta/disentangled_attention.py b/DeBERTa/deberta/disentangled_attention.py
index 6f281ca..2905084 100644
--- a/DeBERTa/deberta/disentangled_attention.py
+++ b/DeBERTa/deberta/disentangled_attention.py
@@ -176,7 +176,7 @@ def linear(w,b,x):
 
         #attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)
         nodex = torch.nn.Softmax(-1)
-        attention_probs = nodex(attention_scores)
+        attention_probs = nodex(attention_scores + 10000.0*(attention_mask -1))
         attention_probs = self.dropout(attention_probs)
         if self.talking_head:
             attention_probs = self.head_weights_proj(attention_probs.permute(0,2,3,1)).permute(0,3,1,2)

From dab83afd9d44fbc5f86d36e38218df355cac4914 Mon Sep 17 00:00:00 2001
From: ganik <ganinz@hotmail.com>
Date: Sun, 2 Aug 2020 05:42:45 +0000
Subject: [PATCH 03/13] onnx conversion and training

---
 .gitignore                              |  1 +
 DeBERTa/apps/sequence_classification.py |  4 +-
 DeBERTa/apps/train.py                   | 74 +++++++++++++++++++++++--
 3 files changed, 72 insertions(+), 7 deletions(-)

diff --git a/.gitignore b/.gitignore
index b6e4761..a4ba5c7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,3 +127,4 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+tmp/
diff --git a/DeBERTa/apps/sequence_classification.py b/DeBERTa/apps/sequence_classification.py
index 218aed2..11d9b39 100644
--- a/DeBERTa/apps/sequence_classification.py
+++ b/DeBERTa/apps/sequence_classification.py
@@ -46,7 +46,7 @@ def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, positi
     pooled_output = self.dropout(pooled_output)
     logits = self.classifier(pooled_output)
 
-    loss = 0
+    loss = torch.tensor(0).to(logits)
     if labels is not None:
       if self.num_labels ==1:
         # regression task
@@ -68,4 +68,4 @@ def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, positi
         label_confidence = 1
         loss = -((log_softmax(logits)*labels).sum(-1)*label_confidence).mean()
 
-    return (logits,loss)
+    return (loss, logits)
diff --git a/DeBERTa/apps/train.py b/DeBERTa/apps/train.py
index e9218d1..3bffee5 100644
--- a/DeBERTa/apps/train.py
+++ b/DeBERTa/apps/train.py
@@ -24,9 +24,10 @@
 from ..utils import *
 from ..utils import xtqdm as tqdm
 from .task_registry import tasks
+from onnxruntime.capi.ort_trainer import ORTTrainer, IODescription, ModelDescription, LossScaler
 
 from ..training import DistributedTrainer, initialize_distributed, batch_to, set_random_seed,kill_children
-from ..data import DistributedBatchSampler, SequentialSampler, BatchSampler, AsyncDataLoader
+from ..data import DistributedBatchSampler, SequentialSampler, BatchSampler, RandomSampler, AsyncDataLoader
 
 def create_model(args, num_labels, model_class_fn):
   # Prepare model
@@ -217,9 +218,63 @@ def run_predict(args, model, device, eval_data, prefix=None):
       if predict_fn:
         predict_fn(predicts, args.output_dir, name, prefix)
 
+def deberta_model_description(args):
+    vocab_size = 30528
+    # set concrete input sizes to permit optimization
+    input_ids_desc = IODescription('input_ids', [args.train_batch_size, args.max_seq_length], torch.int32, num_classes=vocab_size)
+    type_ids_desc = IODescription('type_ids', [args.train_batch_size, args.max_seq_length], torch.int32) # num_classes=?
+    position_ids_desc = IODescription('position_ids', [args.train_batch_size, args.max_seq_length], torch.int32) # num_classes=?
+    input_mask_desc = IODescription('input_mask', [args.train_batch_size, args.max_seq_length], torch.int32) # num_classes=?
+    labels_desc = IODescription('labels', [args.train_batch_size, args.max_seq_length], torch.float32) # num_classes=?
+    
+    loss_desc = IODescription('loss', [], torch.float32)
+    return ModelDescription([input_ids_desc, type_ids_desc, position_ids_desc, input_mask_desc, labels_desc], [loss_desc])
+
+def create_ort_trainer(args, device, model):
+    # default initial settings: b1=0.9, b2=0.999, e=1e-6
+    def map_optimizer_attributes(name):
+        no_decay_keys = ["bias", "gamma", "beta", "LayerNorm"]
+        no_decay = False
+        for no_decay_key in no_decay_keys:
+            if no_decay_key in name:
+                no_decay = True
+                break
+        if no_decay:
+            return {"alpha": 0.9, "beta": 0.999, "lambda": 0.0, "epsilon": 1e-6}
+        else:
+            return {"alpha": 0.9, "beta": 0.999, "lambda": 0.01, "epsilon": 1e-6}
+
+    # we request ORTTrainer to create a LambOptimizer with given optimizer_attributes. 
+    # train_step does forward, backward, and optimize step.
+    model = ORTTrainer(model, None, deberta_model_description(args), "LambOptimizer", 
+        map_optimizer_attributes,
+        IODescription('Learning_Rate', [1,], torch.float32),
+        device,
+        _opset_version = 10)
+
+    return model
+
+def run_onnx_training(args, model, device, train_data, prefix=None):
+  # runs training in ONNX
+  trainer = create_ort_trainer(args, device, model)
+  train_sampler = RandomSampler(len(train_data))
+  batch_sampler = BatchSampler(train_sampler, args.train_batch_size)
+  batch_sampler = DistributedBatchSampler(batch_sampler, rank=args.rank, world_size=args.world_size)
+  train_dataloader = DataLoader(train_data, batch_sampler=batch_sampler, num_workers=args.workers, pin_memory=True)
+  torch.cuda.empty_cache()
+  for step, batch in enumerate(AsyncDataLoader(train_dataloader, 100)):
+    #import pdb
+    #pdb.set_trace()
+    batch = batch_to(batch, device)
+    with torch.no_grad():
+      trainer.train_step(batch['input_ids'], batch['type_ids'], batch['position_ids'], batch['input_mask'], batch['labels'])
+      # conversion fails now with:
+      # site-packages/torch/onnx/utils.py:617: UserWarning: ONNX export failed on ATen operator broadcast_tensors
+      # because torch.onnx.symbolic_opset10.broadcast_tensors does not exist
+
 def main(args):
-  if not args.do_train and not args.do_eval and not args.do_predict:
-    raise ValueError("At least one of `do_train` or `do_eval` or `do_predict` must be True.")
+  if not args.do_train and not args.do_eval and not args.do_predict and not args.do_onnx:
+    raise ValueError("At least one of `do_train` or `do_eval` or `do_predict` or `do_onnx` must be True.")
   os.makedirs(args.output_dir, exist_ok=True)
   task_name = args.task_name.lower()
   random.seed(args.seed)
@@ -236,11 +291,11 @@ def main(args):
     test_data = processor.test_data(max_seq_len=args.max_seq_length)
     logger.info("  Prediction batch size = %d", args.predict_batch_size)
 
-  if args.do_train:
+  if args.do_train or args.do_onnx:
     train_data = processor.train_data(max_seq_len=args.max_seq_length, mask_gen = None, debug=args.debug)
   model_class_fn = processor.get_model_class_fn()
   model = create_model(args, len(label_list), model_class_fn)
-  if args.do_train:
+  if args.do_train or args.do_onnx:
     with open(os.path.join(args.output_dir, 'model_config.json'), 'w', encoding='utf-8') as fs:
       fs.write(model.config.to_json_string() + '\n')
   logger.info("Model config {}".format(model.config))
@@ -257,6 +312,10 @@ def main(args):
   if args.do_predict:
     run_predict(args, model, device, test_data, prefix=args.tag)
 
+  # trains in ONNX
+  if args.do_onnx:
+    run_onnx_training(args, model, device, train_data, prefix=args.tag)
+
 def build_argument_parser():
   parser = argparse.ArgumentParser()
 
@@ -437,6 +496,11 @@ def build_argument_parser():
             default=None,
             type=str,
             help="The path of pre-trained RoBERTa model")
+  
+  parser.add_argument("--do_onnx",
+            default=False,
+            action='store_true',
+            help="Whether to run training in ONNX")
   return parser
 
 if __name__ == "__main__":

From 64f068c6afdb57087b4439859143aa46593134e7 Mon Sep 17 00:00:00 2001
From: ganik <ganinz@hotmail.com>
Date: Sun, 2 Aug 2020 05:53:11 +0000
Subject: [PATCH 04/13] TBD tight coupling with torch 1.3

---
 DeBERTa/optims/fp16_optimizer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/DeBERTa/optims/fp16_optimizer.py b/DeBERTa/optims/fp16_optimizer.py
index dea2b0b..cc58afb 100755
--- a/DeBERTa/optims/fp16_optimizer.py
+++ b/DeBERTa/optims/fp16_optimizer.py
@@ -20,6 +20,8 @@
 from ..utils import get_logger
 logger=get_logger()
 
+# Lines below tightly couple DeBerta with torch 1.3 
+# TBD refactor or port to torch 1.6
 lib = ctypes.cdll.LoadLibrary(None)
 lib.THCudaHalfTensor_normall.argtypes=[ctypes.c_void_p, ctypes.c_void_p]
 lib.THCudaHalfTensor_normall.restype = ctypes.c_float

From 1ce5cc19cefecacd7cf742baeb28b6e4e43bb926 Mon Sep 17 00:00:00 2001
From: ganik <ganinz@hotmail.com>
Date: Fri, 7 Aug 2020 00:58:05 +0000
Subject: [PATCH 05/13] opset 12, expand attention mask

---
 DeBERTa/apps/train.py   | 2 +-
 DeBERTa/deberta/bert.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/DeBERTa/apps/train.py b/DeBERTa/apps/train.py
index 3bffee5..4a8c691 100644
--- a/DeBERTa/apps/train.py
+++ b/DeBERTa/apps/train.py
@@ -250,7 +250,7 @@ def map_optimizer_attributes(name):
         map_optimizer_attributes,
         IODescription('Learning_Rate', [1,], torch.float32),
         device,
-        _opset_version = 10)
+        _opset_version = 12)
 
     return model
 
diff --git a/DeBERTa/deberta/bert.py b/DeBERTa/deberta/bert.py
index 5b2b2b2..f9ccbb7 100644
--- a/DeBERTa/deberta/bert.py
+++ b/DeBERTa/deberta/bert.py
@@ -160,7 +160,7 @@ def get_attention_mask(self, attention_mask):
     if attention_mask.dim()<=2:
       extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
       attention_mask = extended_attention_mask*extended_attention_mask.squeeze(-2).unsqueeze(-1)
-      attention_mask = attention_mask.byte()
+      attention_mask = attention_mask.int()
     elif attention_mask.dim()==3:
       attention_mask = attention_mask.unsqueeze(1)
 

From b301b4641409821b1acc992f1ba4daf8f24464d9 Mon Sep 17 00:00:00 2001
From: ganik <ganinz@hotmail.com>
Date: Sat, 8 Aug 2020 00:00:51 +0000
Subject: [PATCH 06/13] loss is first

---
 DeBERTa/apps/train.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/DeBERTa/apps/train.py b/DeBERTa/apps/train.py
index 4a8c691..71d0163 100644
--- a/DeBERTa/apps/train.py
+++ b/DeBERTa/apps/train.py
@@ -56,7 +56,7 @@ def eval_fn(trainer, model, device, tag):
     return eval_metric
 
   def loss_fn(trainer, model, data):
-    _, loss = model(**data)
+    loss, _ = model(**data)
     return loss.mean(), data['input_ids'].size(0)
 
   trainer = DistributedTrainer(args, model, device, data_fn, loss_fn = loss_fn, eval_fn = eval_fn, dump_interval = args.dump_interval)
@@ -161,7 +161,7 @@ def run_eval(args, model, device, eval_data, prefix=None, tag=None, steps=None):
     for batch in tqdm(AsyncDataLoader(eval_dataloader), ncols=80, desc='Evaluating: {}'.format(prefix), disable=no_tqdm):
       batch = batch_to(batch, device)
       with torch.no_grad():
-        logits, tmp_eval_loss = model(**batch)
+        tmp_eval_loss, logits = model(**batch)
       label_ids = batch['labels'].to(device)
       predicts.append(logits)
       labels.append(label_ids)
@@ -196,7 +196,7 @@ def run_predict(args, model, device, eval_data, prefix=None):
     for batch in tqdm(AsyncDataLoader(eval_dataloader), ncols=80, desc='Evaluating: {}'.format(prefix), disable=args.rank>0):
       batch = batch_to(batch, device)
       with torch.no_grad():
-        logits, _ = model(**batch)
+        _, logits = model(**batch)
       if args.world_size>1:
         logits_all = [torch.zeros_like(logits) for _ in range(args.world_size)]
         torch.distributed.all_gather(logits_all, logits)

From d2fa9fdf1cc2537fc3f7f9349457c0877c5732c6 Mon Sep 17 00:00:00 2001
From: ganik <ganinz@hotmail.com>
Date: Sat, 8 Aug 2020 00:53:30 +0000
Subject: [PATCH 07/13] commenting out v_ and q_ biases as they are always
 const

---
 DeBERTa/apps/train.py                     |  3 ++-
 DeBERTa/deberta/disentangled_attention.py | 11 +++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/DeBERTa/apps/train.py b/DeBERTa/apps/train.py
index 71d0163..28808fb 100644
--- a/DeBERTa/apps/train.py
+++ b/DeBERTa/apps/train.py
@@ -265,9 +265,10 @@ def run_onnx_training(args, model, device, train_data, prefix=None):
   for step, batch in enumerate(AsyncDataLoader(train_dataloader, 100)):
     #import pdb
     #pdb.set_trace()
+    lr = torch.tensor([0.0000000e+00]).to(device)
     batch = batch_to(batch, device)
     with torch.no_grad():
-      trainer.train_step(batch['input_ids'], batch['type_ids'], batch['position_ids'], batch['input_mask'], batch['labels'])
+      trainer.train_step(batch['input_ids'], batch['type_ids'], batch['position_ids'], batch['input_mask'], batch['labels'], lr)
       # conversion fails now with:
       # site-packages/torch/onnx/utils.py:617: UserWarning: ONNX export failed on ATen operator broadcast_tensors
       # because torch.onnx.symbolic_opset10.broadcast_tensors does not exist
diff --git a/DeBERTa/deberta/disentangled_attention.py b/DeBERTa/deberta/disentangled_attention.py
index 2905084..1369bf5 100644
--- a/DeBERTa/deberta/disentangled_attention.py
+++ b/DeBERTa/deberta/disentangled_attention.py
@@ -77,8 +77,9 @@ def __init__(self, config):
         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
         self.all_head_size = self.num_attention_heads * self.attention_head_size
         self.in_proj = torch.nn.Linear(config.hidden_size, self.all_head_size*3, bias=False)
-        self.q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
-        self.v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
+        # Looks like params below are never updated and const, so removing them
+        #self.q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
+        #self.v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
         self.pos_att_type = [x.strip() for x in getattr(config, 'pos_att_type', 'none').lower().split('|')] # c2p|p2c
         
         self.relative_attention = getattr(config, 'relative_attention', False)
@@ -148,8 +149,10 @@ def linear(w,b,x):
             k,v = [linear(qkvw[i], qkvb[i], hidden_states) for i in range(1,3)]
             query_layer, key_layer, value_layer = [self.transpose_for_scores(x) for x in [q,k,v]]
 
-        query_layer += self.transpose_for_scores(self.q_bias.unsqueeze(0).unsqueeze(0))
-        value_layer += self.transpose_for_scores(self.v_bias.unsqueeze(0).unsqueeze(0))
+        q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
+        v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
+        query_layer += self.transpose_for_scores(q_bias.unsqueeze(0).unsqueeze(0))
+        value_layer += self.transpose_for_scores(v_bias.unsqueeze(0).unsqueeze(0))
 
         rel_att = None
         # Take the dot product between "query" and "key" to get the raw attention scores.

From e4793b87a7cbf392bdf3537e394e8e61c0cc99ab Mon Sep 17 00:00:00 2001
From: ganik <ganinz@hotmail.com>
Date: Fri, 14 Aug 2020 19:28:24 +0000
Subject: [PATCH 08/13] Fix Dropout model regression issue

---
 DeBERTa/apps/multi_choice.py              |  2 +-
 DeBERTa/apps/ner.py                       |  2 +-
 DeBERTa/apps/sequence_classification.py   |  2 +-
 DeBERTa/deberta/bert.py                   |  6 +++---
 DeBERTa/deberta/config.py                 |  2 ++
 DeBERTa/deberta/disentangled_attention.py | 16 +++++++---------
 DeBERTa/deberta/ops.py                    |  7 +------
 DeBERTa/deberta/pooling.py                |  4 +++-
 8 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/DeBERTa/apps/multi_choice.py b/DeBERTa/apps/multi_choice.py
index 253f7c3..5611d57 100644
--- a/DeBERTa/apps/multi_choice.py
+++ b/DeBERTa/apps/multi_choice.py
@@ -27,7 +27,7 @@ def __init__(self, config, num_labels = 2, drop_out=None, **kwargs):
     self.num_labels = num_labels
     self.classifier = nn.Linear(config.hidden_size, 1)
     drop_out = config.hidden_dropout_prob if drop_out is None else drop_out
-    self.dropout = StableDropout(drop_out)
+    self.dropout = StableDropout(drop_out) if config.use_xdropout else nn.Dropout(drop_out)
     self.apply(self.init_weights)
 
   def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, position_ids=None, **kwargs):
diff --git a/DeBERTa/apps/ner.py b/DeBERTa/apps/ner.py
index 812b042..8749e4d 100644
--- a/DeBERTa/apps/ner.py
+++ b/DeBERTa/apps/ner.py
@@ -27,7 +27,7 @@ def __init__(self, config, num_labels = 2, drop_out=None, **kwargs):
     self.proj = nn.Linear(config.hidden_size, config.hidden_size)
     self.classifier = nn.Linear(config.hidden_size, self.num_labels)
     drop_out = config.hidden_dropout_prob if drop_out is None else drop_out
-    self.dropout = StableDropout(drop_out)
+    self.dropout = StableDropout(drop_out) if config.use_xdropout else nn.Dropout(drop_out)
     self.apply(self.init_weights)
 
   def forward(self, input_ids, type_ids=None, input_mask=None, labels=None, position_ids=None, **kwargs):
diff --git a/DeBERTa/apps/sequence_classification.py b/DeBERTa/apps/sequence_classification.py
index 11d9b39..c9d272a 100644
--- a/DeBERTa/apps/sequence_classification.py
+++ b/DeBERTa/apps/sequence_classification.py
@@ -35,7 +35,7 @@ def __init__(self, config, num_labels=2, drop_out=None, pre_trained=None):
 
     self.classifier = torch.nn.Linear(output_dim, num_labels)
     drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out
-    self.dropout = StableDropout(drop_out)
+    self.dropout = StableDropout(drop_out) if config.use_xdropout else  torch.nn.Dropout(drop_out)
     self.apply(self.init_weights)
     self.bert.apply_state()
 
diff --git a/DeBERTa/deberta/bert.py b/DeBERTa/deberta/bert.py
index f9ccbb7..ba817a0 100644
--- a/DeBERTa/deberta/bert.py
+++ b/DeBERTa/deberta/bert.py
@@ -63,7 +63,7 @@ def __init__(self, config):
     super().__init__()
     self.dense = nn.Linear(config.hidden_size, config.hidden_size)
     self.LayerNorm = BertLayerNorm(config.hidden_size, config.layer_norm_eps)
-    self.dropout = StableDropout(config.hidden_dropout_prob)
+    self.dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else nn.Dropout(config.hidden_dropout_prob)
     self.config = config
 
   def forward(self, hidden_states, input_states, mask=None):
@@ -110,7 +110,7 @@ def __init__(self, config):
     super(BertOutput, self).__init__()
     self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
     self.LayerNorm = BertLayerNorm(config.hidden_size, config.layer_norm_eps)
-    self.dropout = StableDropout(config.hidden_dropout_prob)
+    self.dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else nn.Dropout(config.hidden_dropout_prob)
     self.config = config
 
   def forward(self, hidden_states, input_states, mask=None):
@@ -229,7 +229,7 @@ def __init__(self, config):
     if self.embedding_size != config.hidden_size:
       self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False)
     self.LayerNorm = BertLayerNorm(config.hidden_size, config.layer_norm_eps)
-    self.dropout = StableDropout(config.hidden_dropout_prob)
+    self.dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else nn.Dropout(config.hidden_dropout_prob)
     self.output_to_half = False
     self.config = config
 
diff --git a/DeBERTa/deberta/config.py b/DeBERTa/deberta/config.py
index 11f23aa..a324c97 100644
--- a/DeBERTa/deberta/config.py
+++ b/DeBERTa/deberta/config.py
@@ -15,6 +15,8 @@ def from_dict(cls, json_object):
             if isinstance(value, dict):
                 value = AbsModelConfig.from_dict(value)
             config.__dict__[key] = value
+        config.use_xdropout = True
+        config.use_xsoftmax = True
         return config
 
     @classmethod
diff --git a/DeBERTa/deberta/disentangled_attention.py b/DeBERTa/deberta/disentangled_attention.py
index 1369bf5..8f7801c 100644
--- a/DeBERTa/deberta/disentangled_attention.py
+++ b/DeBERTa/deberta/disentangled_attention.py
@@ -77,9 +77,9 @@ def __init__(self, config):
         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
         self.all_head_size = self.num_attention_heads * self.attention_head_size
         self.in_proj = torch.nn.Linear(config.hidden_size, self.all_head_size*3, bias=False)
-        # Looks like params below are never updated and const, so removing them
-        #self.q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
-        #self.v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
+        # ONNX graph builder thinks params below are not used for loss calculation
+        self.q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
+        self.v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
         self.pos_att_type = [x.strip() for x in getattr(config, 'pos_att_type', 'none').lower().split('|')] # c2p|p2c
         
         self.relative_attention = getattr(config, 'relative_attention', False)
@@ -93,14 +93,14 @@ def __init__(self, config):
             self.max_relative_positions = getattr(config, 'max_relative_positions', -1)
             if self.max_relative_positions <1:
                 self.max_relative_positions = config.max_position_embeddings
-            self.pos_dropout = StableDropout(config.hidden_dropout_prob)
+            self.pos_dropout = StableDropout(config.hidden_dropout_prob) if config.use_xdropout else torch.nn.Dropout(config.hidden_dropout_prob)
 
             if 'c2p' in self.pos_att_type or 'p2p' in self.pos_att_type:
                 self.pos_proj = torch.nn.Linear(config.hidden_size, self.all_head_size, bias=False)
             if 'p2c' in self.pos_att_type or 'p2p' in self.pos_att_type:
                 self.pos_q_proj = torch.nn.Linear(config.hidden_size, self.all_head_size)
 
-        self.dropout = StableDropout(config.attention_probs_dropout_prob)
+        self.dropout = StableDropout(config.attention_probs_dropout_prob) if config.use_xdropout else torch.nn.Dropout(config.attention_probs_dropout_prob)
 
     def transpose_for_scores(self, x):
         new_x_shape = x.size()[:-1] + (self.num_attention_heads, -1)
@@ -149,10 +149,8 @@ def linear(w,b,x):
             k,v = [linear(qkvw[i], qkvb[i], hidden_states) for i in range(1,3)]
             query_layer, key_layer, value_layer = [self.transpose_for_scores(x) for x in [q,k,v]]
 
-        q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
-        v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
-        query_layer += self.transpose_for_scores(q_bias.unsqueeze(0).unsqueeze(0))
-        value_layer += self.transpose_for_scores(v_bias.unsqueeze(0).unsqueeze(0))
+        query_layer += self.transpose_for_scores(self.q_bias.unsqueeze(0).unsqueeze(0))
+        value_layer += self.transpose_for_scores(self.v_bias.unsqueeze(0).unsqueeze(0))
 
         rel_att = None
         # Take the dot product between "query" and "key" to get the raw attention scores.
diff --git a/DeBERTa/deberta/ops.py b/DeBERTa/deberta/ops.py
index 08afda1..a1ba5fd 100644
--- a/DeBERTa/deberta/ops.py
+++ b/DeBERTa/deberta/ops.py
@@ -6,7 +6,6 @@
 # Author: penhe@microsoft.com
 # Date: 01/15/2020
 #
-
 import math
 from packaging import version
 import torch
@@ -115,11 +114,7 @@ def backward(ctx, grad_output):
     else:
       return grad_output, None
 
-class StableDropout(torch.nn.Dropout):
-  def __init__(self, drop_prob):
-      super().__init__()
-
-class StableDropout1(torch.nn.Module):
+class StableDropout(torch.nn.Module):
   """ Optimized dropout module for stabilizing the training
 
   Args:
diff --git a/DeBERTa/deberta/pooling.py b/DeBERTa/deberta/pooling.py
index 16b9aaa..4fb4f43 100644
--- a/DeBERTa/deberta/pooling.py
+++ b/DeBERTa/deberta/pooling.py
@@ -58,6 +58,8 @@ def __init__(self, config=None):
         self.hidden_size = 768
         self.dropout = 0
         self.hidden_act = 'gelu'
+        self.use_xdropout = True
+        self.use_xsoftmax = True
         if config:
             pool_config = getattr(config, 'pooling', config)
             if isinstance(pool_config, dict):
@@ -70,7 +72,7 @@ class ContextPooler(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        self.dropout = StableDropout(config.dropout)
+        self.dropout = StableDropout(config.dropout) if config.use_xdropout else nn.Dropout(config.dropout) 
         self.config = config
 
     def forward(self, hidden_states, mask = None):

From 155d96609d4be0466c1f918ac198b5aec114dd04 Mon Sep 17 00:00:00 2001
From: ganik <ganinz@hotmail.com>
Date: Fri, 14 Aug 2020 19:48:00 +0000
Subject: [PATCH 09/13] Use nn.dropout and nn.softmax by default

---
 DeBERTa/deberta/config.py                 | 4 ++--
 DeBERTa/deberta/disentangled_attention.py | 9 ++++++---
 DeBERTa/deberta/pooling.py                | 4 ++--
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/DeBERTa/deberta/config.py b/DeBERTa/deberta/config.py
index a324c97..eb5b2c5 100644
--- a/DeBERTa/deberta/config.py
+++ b/DeBERTa/deberta/config.py
@@ -15,8 +15,8 @@ def from_dict(cls, json_object):
             if isinstance(value, dict):
                 value = AbsModelConfig.from_dict(value)
             config.__dict__[key] = value
-        config.use_xdropout = True
-        config.use_xsoftmax = True
+        config.use_xdropout = False
+        config.use_xsoftmax = False
         return config
 
     @classmethod
diff --git a/DeBERTa/deberta/disentangled_attention.py b/DeBERTa/deberta/disentangled_attention.py
index 8f7801c..87f280b 100644
--- a/DeBERTa/deberta/disentangled_attention.py
+++ b/DeBERTa/deberta/disentangled_attention.py
@@ -101,6 +101,7 @@ def __init__(self, config):
                 self.pos_q_proj = torch.nn.Linear(config.hidden_size, self.all_head_size)
 
         self.dropout = StableDropout(config.attention_probs_dropout_prob) if config.use_xdropout else torch.nn.Dropout(config.attention_probs_dropout_prob)
+        self.use_xsoftmax = config.use_xsoftmax
 
     def transpose_for_scores(self, x):
         new_x_shape = x.size()[:-1] + (self.num_attention_heads, -1)
@@ -175,9 +176,11 @@ def linear(w,b,x):
         if self.talking_head:
             attention_scores = self.head_logits_proj(attention_scores.permute(0,2,3,1)).permute(0,3,1,2)
 
-        #attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)
-        nodex = torch.nn.Softmax(-1)
-        attention_probs = nodex(attention_scores + 10000.0*(attention_mask -1))
+        if self.use_xsoftmax:
+            attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1)
+        else:
+            nodex = torch.nn.Softmax(-1)
+            attention_probs = nodex(attention_scores + 10000.0*(attention_mask -1))
         attention_probs = self.dropout(attention_probs)
         if self.talking_head:
             attention_probs = self.head_weights_proj(attention_probs.permute(0,2,3,1)).permute(0,3,1,2)
diff --git a/DeBERTa/deberta/pooling.py b/DeBERTa/deberta/pooling.py
index 4fb4f43..d6cce03 100644
--- a/DeBERTa/deberta/pooling.py
+++ b/DeBERTa/deberta/pooling.py
@@ -58,8 +58,8 @@ def __init__(self, config=None):
         self.hidden_size = 768
         self.dropout = 0
         self.hidden_act = 'gelu'
-        self.use_xdropout = True
-        self.use_xsoftmax = True
+        self.use_xdropout = False
+        self.use_xsoftmax = False
         if config:
             pool_config = getattr(config, 'pooling', config)
             if isinstance(pool_config, dict):

From 95ec7ad1e0ca533786690ca1170c637ae5184fa7 Mon Sep 17 00:00:00 2001
From: Gani Nazirov
 <ganaziro@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
Date: Tue, 29 Sep 2020 18:06:26 +0000
Subject: [PATCH 10/13] Added ORT Glue based tests

---
 DeBERTa/apps/orttrain.py                      | 234 ++++++++++
 DeBERTa/deberta/__init__.py                   |   2 +-
 DeBERTa/deberta/bert.py                       |   1 +
 DeBERTa/deberta/gpt2_tokenizer.py             |  38 +-
 DeBERTa/onnx/__init__.py                      |   5 +
 DeBERTa/onnx/orttraining_deberta.py           | 167 ++++++++
 .../onnx/orttraining_test_bert_postprocess.py |   5 +
 .../orttraining_test_layer_norm_transform.py  | 177 ++++++++
 .../onnx/orttraining_test_model_transform.py  | 106 +++++
 .../onnx/orttraining_transformer_trainer.py   | 405 ++++++++++++++++++
 10 files changed, 1138 insertions(+), 2 deletions(-)
 create mode 100644 DeBERTa/apps/orttrain.py
 create mode 100644 DeBERTa/onnx/__init__.py
 create mode 100644 DeBERTa/onnx/orttraining_deberta.py
 create mode 100644 DeBERTa/onnx/orttraining_test_bert_postprocess.py
 create mode 100644 DeBERTa/onnx/orttraining_test_layer_norm_transform.py
 create mode 100644 DeBERTa/onnx/orttraining_test_model_transform.py
 create mode 100644 DeBERTa/onnx/orttraining_transformer_trainer.py

diff --git a/DeBERTa/apps/orttrain.py b/DeBERTa/apps/orttrain.py
new file mode 100644
index 0000000..2683fc6
--- /dev/null
+++ b/DeBERTa/apps/orttrain.py
@@ -0,0 +1,234 @@
+# Copyright (c) Microsoft, Inc. 2020
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+import os
+import argparse
+import random
+
+import numpy as np
+import torch
+from ..deberta import GPT2Tokenizer, DebertaPreTrainedTokenizer
+from ..onnx import ORTGlueTest
+from ..utils import *
+from .task_registry import tasks
+from onnxruntime.capi._pybind_state import get_mpi_context_local_rank, get_mpi_context_local_size, get_mpi_context_world_rank, get_mpi_context_world_size
+
+def create_model(args, num_labels, model_class_fn):
+  """Build the task model via `model_class_fn`; cast it to half precision if args.fp16 is set."""
+  # only ranks below 1 (or a missing `rank` attribute) pass the init checkpoint; others pass None
+  checkpoint = args.init_model if getattr(args, 'rank', 0) < 1 else None
+  model = model_class_fn(
+      checkpoint, args.model_config, num_labels=num_labels,
+      drop_out=args.cls_drop_out, pre_trained=args.pre_trained)
+  if not args.fp16:
+    return model
+  return model.half()
+
+def main(args):
+  os.makedirs(args.output_dir, exist_ok=True)
+  random.seed(args.seed)
+  np.random.seed(args.seed)
+  torch.manual_seed(args.seed)
+  
+  # load model based on task
+  tokenizer = GPT2Tokenizer()
+  processor = tasks[args.task_name.lower()](tokenizer = tokenizer, max_seq_len = args.max_seq_length, data_dir = args.data_dir)
+  label_list = processor.get_labels()
+  model_class_fn = processor.get_model_class_fn()
+  model = create_model(args, len(label_list), model_class_fn)
+  logger.info("Model config {}".format(model.config))
+  
+  # train with ORT
+  test = ORTGlueTest()
+  test.setUp(args)
+  test.local_rank = get_mpi_context_local_rank()
+  test.world_size = get_mpi_context_world_size()
+  print("mpirun launch, local_rank / world_size: ", test.local_rank, test.world_size)
+  os.environ['RANK'] = str(test.local_rank)
+  os.environ['WORLD_SIZE'] = str(test.world_size)
+  os.environ['MASTER_ADDR'] = '127.0.0.1'
+  os.environ['MASTER_PORT'] = '29501'
+  test.model = model
+  test.tokenizer = DebertaPreTrainedTokenizer()
+  test.run_glue(task_name=args.task_name, fp16=False, use_new_api=True)
+
+def build_argument_parser():
+  parser = argparse.ArgumentParser()
+
+  ## Required parameters
+  parser.add_argument("--data_dir",
+            default=None,
+            type=str,
+            required=True,
+            help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+  parser.add_argument("--task_name",
+            default=None,
+            type=str,
+            required=True,
+            help="The name of the task to train.")
+  parser.add_argument("--output_dir",
+            default=None,
+            type=str,
+            required=True,
+            help="The output directory where the model checkpoints will be written.")
+  parser.add_argument("--cache_dir",
+            default=None,
+            type=str,
+            required=True,
+            help="The directory to store the pretrained models downloaded from s3.")
+
+  ## Other parameters
+  parser.add_argument("--max_seq_length",
+            default=128,
+            type=int,
+            help="The maximum total input sequence length after WordPiece tokenization. \n"
+              "Sequences longer than this will be truncated, and sequences shorter \n"
+              "than this will be padded.")
+  parser.add_argument("--train_batch_size",
+            default=32,
+            type=int,
+            help="Total batch size for training.")
+  parser.add_argument("--eval_batch_size",
+            default=32,
+            type=int,
+            help="Total batch size for eval.")
+  parser.add_argument("--max_grad_norm",
+            default=1,
+            type=float,
+            help="The clip threshold of global gradient norm")
+  parser.add_argument("--learning_rate",
+            default=5e-5,
+            type=float,
+            help="The initial learning rate for Adam.")
+  parser.add_argument("--epsilon",
+            default=1e-6,
+            type=float,
+            help="epsilon setting for Adam.")
+  parser.add_argument("--adam_beta1",
+            default=0.9,
+            type=float,
+            help="The beta1 parameter for Adam.")
+  parser.add_argument("--adam_beta2",
+            default=0.999,
+            type=float,
+            help="The beta2 parameter for Adam.")
+  parser.add_argument("--num_train_epochs",
+            default=3.0,
+            type=float,
+            help="Total number of training epochs to perform.")
+  parser.add_argument("--warmup_proportion",
+            default=0.1,
+            type=float,
+            help="Proportion of training to perform linear learning rate warmup for. "
+              "E.g., 0.1 = 10%% of training.")
+  parser.add_argument("--lr_schedule_ends",
+            default=0,
+            type=float,
+            help="The ended learning rate scale for learning rate scheduling")
+  parser.add_argument("--lr_schedule",
+            default='warmup_linear',
+            type=str,
+            help="The learning rate scheduler used for traning. "
+              "E.g. warmup_linear, warmup_linear_shift, warmup_cosine, warmup_constant. Default, warmup_linear")
+
+  parser.add_argument("--local_rank",
+            type=int,
+            default=-1,
+            help="local_rank for distributed training on gpus")
+
+  parser.add_argument('--seed',
+            type=int,
+            default=1234,
+            help="random seed for initialization")
+
+  parser.add_argument('--accumulative_update',
+            type=int,
+            default=1,
+            help="Number of updates steps to accumulate before performing a backward/update pass.")
+
+  parser.add_argument('--fp16',
+            default=False,
+            type=boolean_string,
+            help="Whether to use 16-bit float precision instead of 32-bit")
+
+  parser.add_argument('--loss_scale',
+            type=float, default=256,
+            help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
+
+  parser.add_argument('--scale_steps',
+            type=int, default=1000,
+            help='The steps to wait to increase the loss scale.')
+
+  parser.add_argument('--init_model',
+            type=str,
+            help="The model state file used to initialize the model weights.")
+
+  parser.add_argument('--model_config',
+            type=str,
+            help="The config file of bert model.")
+
+  parser.add_argument('--cls_drop_out',
+            type=float,
+            default=None,
+            help="The config file model initialization and fine tuning.")
+  parser.add_argument('--weight_decay',
+            type=float,
+            default=0.01,
+            help="The weight decay rate")
+
+  parser.add_argument('--tag',
+            type=str,
+            default='final',
+            help="The tag name of current prediction/runs.")
+
+  parser.add_argument("--dump_interval",
+            default=10000,
+            type=int,
+            help="Interval steps for generating checkpoint.")
+
+  parser.add_argument('--lookahead_k',
+            default=-1,
+            type=int,
+            help="lookahead k parameter")
+
+  parser.add_argument('--lookahead_alpha',
+            default=0.5,
+            type=float,
+            help="lookahead alpha parameter")
+
+  parser.add_argument('--opt_type',
+            type=str.lower,
+            default='adam',
+            choices=['adam', 'admax'],
+            help="The optimizer to be used.")
+
+  parser.add_argument('--workers',
+            type=int,
+            default=2,
+            help="The workers to load data.")
+
+  parser.add_argument('--pre_trained',
+            default=None,
+            type=str,
+            help="The path of pre-trained RoBERTa model")
+  
+  return parser
+
+if __name__ == "__main__":
+  parser = build_argument_parser()
+  args = parser.parse_args()
+  logger = set_logger(args.task_name, os.path.join(args.output_dir, 'training_{}.log'.format(args.task_name)))
+  logger.info(args)
+  try:
+    main(args)
+  except Exception as ex:
+    try:
+      logger.exception(f'Uncatched exception happened during execution.')
+      import atexit
+      atexit._run_exitfuncs()
+    except:
+      pass
+    os._exit(-1)
diff --git a/DeBERTa/deberta/__init__.py b/DeBERTa/deberta/__init__.py
index 87d22dd..6450486 100644
--- a/DeBERTa/deberta/__init__.py
+++ b/DeBERTa/deberta/__init__.py
@@ -17,5 +17,5 @@
 from .disentangled_attention import *
 from .ops import *
 from .bert import *
-from .gpt2_tokenizer import GPT2Tokenizer
+from .gpt2_tokenizer import GPT2Tokenizer, DebertaPreTrainedTokenizer
 from .config import *
diff --git a/DeBERTa/deberta/bert.py b/DeBERTa/deberta/bert.py
index ba817a0..c7f1cb6 100644
--- a/DeBERTa/deberta/bert.py
+++ b/DeBERTa/deberta/bert.py
@@ -145,6 +145,7 @@ class BertEncoder(nn.Module):
   def __init__(self, config):
     super().__init__()
     layer = BertLayer(config)
+    # Set number of layers here
     self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
     self.relative_attention = getattr(config, 'relative_attention', False)
     if self.relative_attention:
diff --git a/DeBERTa/deberta/gpt2_tokenizer.py b/DeBERTa/deberta/gpt2_tokenizer.py
index 20acb75..006cad9 100644
--- a/DeBERTa/deberta/gpt2_tokenizer.py
+++ b/DeBERTa/deberta/gpt2_tokenizer.py
@@ -15,8 +15,9 @@
 import os
 from .gpt2_bpe_utils import get_encoder,_is_control,_is_whitespace,_is_punctuation
 from .cache_utils import load_vocab
+from transformers import PreTrainedTokenizer
 
-__all__ = ['GPT2Tokenizer']
+__all__ = ['GPT2Tokenizer', 'DebertaPreTrainedTokenizer']
 
 class GPT2Tokenizer(object):
   """ A wrapper of GPT2 tokenizer with similar interface as BERT tokenizer
@@ -214,3 +215,38 @@ def add_symbol(self, word, n=1):
 
   def save_pretrained(self, path: str):
     torch.save(self.gpt2_encoder, path)
+
+class DebertaPreTrainedTokenizer(PreTrainedTokenizer):
+  def __init__(
+        self,
+        vocab_file=None,
+        do_lower_case=True,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        **kwargs
+    ):
+        super().__init__(
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            **kwargs,
+        )
+
+        self.GPT2Tokenizer = GPT2Tokenizer(vocab_file, do_lower_case, **kwargs)
+  
+  def _convert_token_to_id(self, token):
+    return self.GPT2Tokenizer.id(token)
+
+  def _tokenize(self, text, **kwargs):
+    """
+    Converts a string in a sequence of tokens (string), using the tokenizer.
+    Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
+    (BPE/SentencePieces/WordPieces).
+    Do NOT take care of added tokens.
+    """
+    return self.GPT2Tokenizer.tokenize(text)
diff --git a/DeBERTa/onnx/__init__.py b/DeBERTa/onnx/__init__.py
new file mode 100644
index 0000000..7181044
--- /dev/null
+++ b/DeBERTa/onnx/__init__.py
@@ -0,0 +1,5 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from .orttraining_deberta import ORTGlueTest
\ No newline at end of file
diff --git a/DeBERTa/onnx/orttraining_deberta.py b/DeBERTa/onnx/orttraining_deberta.py
new file mode 100644
index 0000000..4cb5c27
--- /dev/null
+++ b/DeBERTa/onnx/orttraining_deberta.py
@@ -0,0 +1,167 @@
+# adapted from run_glue.py of huggingface transformers
+
+import dataclasses
+import logging
+import os
+from dataclasses import dataclass, field
+from typing import Dict, Optional
+import unittest
+import numpy as np
+from numpy.testing import assert_allclose
+
+from transformers import (
+    AutoConfig,
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    EvalPrediction,
+    GlueDataset,
+    GlueDataTrainingArguments,
+    TrainingArguments,
+    glue_compute_metrics,
+    glue_output_modes,
+    glue_tasks_num_labels,
+    set_seed,
+)
+
+import onnxruntime
+from onnxruntime.capi.ort_trainer import ORTTrainer, LossScaler, ModelDescription, IODescription
+from onnxruntime.capi._pybind_state import get_mpi_context_local_rank, get_mpi_context_local_size, get_mpi_context_world_rank, get_mpi_context_world_size
+
+from .orttraining_transformer_trainer import ORTTransformerTrainer
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+class ORTGlueTest(unittest.TestCase):
+
+    def setUp(self, args):
+        # configurations not to be changed across tests
+        self.max_seq_length = args.max_seq_length
+        self.train_batch_size = args.train_batch_size
+        self.learning_rate = args.learning_rate
+        self.num_train_epochs = args.num_train_epochs
+        self.local_rank = -1
+        self.world_size = 1
+        self.overwrite_output_dir = True
+        self.gradient_accumulation_steps = 1
+        self.data_dir = args.data_dir
+        self.output_dir = args.output_dir
+        self.cache_dir = args.cache_dir
+        self.logging_steps = 100
+        self.rtol = 1e-02
+        self.seed = args.seed
+
+    def model_to_desc(self):
+        batch_size = int(self.train_batch_size) # * self.world_size)
+        new_model_desc = {
+            'inputs': [
+                ('input_ids', ['batch', 'max_seq_len_in_batch'],),
+                ('token_type_ids', ['batch', 'max_seq_len_in_batch'],),
+                ('attention_mask', ['batch', 'max_seq_len_in_batch'],),
+                ('labels', ['batch', ],)],
+            'outputs': [('loss', [], True),
+                        ('logits', ['batch',])]}
+        model_desc = ModelDescription([
+            IODescription('input_ids', ['batch', 'max_seq_len_in_batch']),
+            IODescription('token_type_ids', ['batch', 'max_seq_len_in_batch']),
+            #IODescription('position_ids', [batch_size, self.max_seq_length]),
+            IODescription('attention_mask', ['batch', 'max_seq_len_in_batch']),
+            IODescription('labels', ['batch',])], [
+            IODescription('loss', []),
+            IODescription('logits', ['batch',])])
+
+        return model_desc, new_model_desc
+
+    def run_glue(self, task_name, fp16, use_new_api):
+        data_args = GlueDataTrainingArguments(
+            task_name=task_name, data_dir=os.path.join(self.data_dir, task_name),
+            max_seq_length=self.max_seq_length)
+
+        training_args = TrainingArguments(
+            output_dir=os.path.join(self.output_dir, task_name), do_train=True, do_eval=True,
+            per_gpu_train_batch_size=self.train_batch_size,
+            per_gpu_eval_batch_size = self.train_batch_size,
+            learning_rate=self.learning_rate, num_train_epochs=self.num_train_epochs,
+            local_rank=self.local_rank,
+            overwrite_output_dir=self.overwrite_output_dir, gradient_accumulation_steps=self.gradient_accumulation_steps,
+            fp16=fp16, logging_steps=self.logging_steps,
+            seed=self.seed)
+
+        # Setup logging
+        logging.basicConfig(
+            format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+            datefmt="%m/%d/%Y %H:%M:%S",
+            level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
+        )
+        logger.warning(
+            "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+            training_args.local_rank,
+            training_args.device,
+            training_args.n_gpu,
+            bool(training_args.local_rank != -1),
+            training_args.fp16,
+        )
+        logger.info("Training/evaluation parameters %s", training_args)
+
+        set_seed(training_args.seed)
+        onnxruntime.set_seed(training_args.seed)
+
+        try:
+            num_labels = glue_tasks_num_labels[data_args.task_name]
+            output_mode = glue_output_modes[data_args.task_name]
+        except KeyError:
+            raise ValueError("Task not found: %s" % (data_args.task_name))
+
+        train_dataset = (
+            GlueDataset(data_args, tokenizer=self.tokenizer)
+            if training_args.do_train
+            else None
+        )
+
+        eval_dataset = (
+            GlueDataset(data_args, tokenizer=self.tokenizer, mode="dev")
+            if training_args.do_eval
+            else None
+        )
+
+        def compute_metrics(p: EvalPrediction) -> Dict:
+            if output_mode == "classification":
+                preds = np.argmax(p.predictions, axis=1)
+            elif output_mode == "regression":
+                preds = np.squeeze(p.predictions)
+            return glue_compute_metrics(data_args.task_name, preds, p.label_ids)
+
+        model_desc, new_model_desc = self.model_to_desc()
+        # Initialize the ORTTrainer within ORTTransformerTrainer
+        trainer = ORTTransformerTrainer(
+            model=self.model,
+            model_desc=model_desc,
+            new_model_desc=new_model_desc,
+            args=training_args,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            compute_metrics=compute_metrics,
+            use_new_api=use_new_api,
+            world_size=self.world_size,
+        )
+
+        # Training
+        if training_args.do_train:
+            trainer.train()
+            trainer.save_model()
+
+        # Evaluation
+        results = {}
+        if training_args.do_eval and training_args.local_rank in [-1, 0]:
+            logger.info("*** Evaluate ***")
+
+            result = trainer.evaluate()
+
+            logger.info("***** Eval results {} *****".format(data_args.task_name))
+            for key, value in result.items():
+               logger.info("  %s = %s", key, value)
+
+            results.update(result)
+
+        return results
\ No newline at end of file
diff --git a/DeBERTa/onnx/orttraining_test_bert_postprocess.py b/DeBERTa/onnx/orttraining_test_bert_postprocess.py
new file mode 100644
index 0000000..890db47
--- /dev/null
+++ b/DeBERTa/onnx/orttraining_test_bert_postprocess.py
@@ -0,0 +1,5 @@
+from .orttraining_test_model_transform import add_name, fix_transpose, add_expand_shape
+from .orttraining_test_layer_norm_transform import layer_norm_transform
+
+def postprocess_model(model):
+    add_name(model)
diff --git a/DeBERTa/onnx/orttraining_test_layer_norm_transform.py b/DeBERTa/onnx/orttraining_test_layer_norm_transform.py
new file mode 100644
index 0000000..883d738
--- /dev/null
+++ b/DeBERTa/onnx/orttraining_test_layer_norm_transform.py
@@ -0,0 +1,177 @@
+import onnx
+
+def find_node(graph_proto, op_type):
+    """Collect the nodes of type `op_type` and index them by the tensors they consume.
+
+    Returns (nodes, by_input): `by_input` maps a node's first input name to the node;
+    for 'Div' and 'Mul' the second input name is mapped too. Later nodes win on clashes.
+    """
+    nodes = [node for node in graph_proto.node if node.op_type == op_type]
+    indexed_slots = (0, 1) if op_type in ('Div', 'Mul') else (0,)
+    by_input = {node.input[slot]: node for node in nodes for slot in indexed_slots}
+    return nodes, by_input
+
+def gen_attribute(key, value):
+    """Build an INTS-typed ONNX AttributeProto named `key` holding the ints in `value`."""
+    attr = onnx.AttributeProto()  # qualified: this module only does `import onnx`
+    attr.name, attr.type = key, onnx.AttributeProto.INTS
+    attr.ints.extend(int(v) for v in value)
+    return attr
+
+def layer_norm_transform(model_proto):
+    # a layer norm subgraph
+    # input
+    #   |
+    # ReduceMean
+    #  __|____
+    # |       |
+    # Sub     Sub
+    # |       |
+    # |       Pow
+    # |        |
+    # |        ReduceMean
+    # |        |
+    # |        Add
+    # |        |
+    # |__    __Sqrt
+    #    |  |
+    #     Div
+    #     |
+    #     Mul
+    #     |
+    #     Add
+    #     |
+    #     output
+
+    graph_proto = model_proto.graph
+
+    _,  map_input_Div = find_node(graph_proto, 'Div')
+
+    _,  map_input_Sqrt = find_node(graph_proto, 'Sqrt')
+
+    _,  map_input_Add = find_node(graph_proto, 'Add')
+
+    nodes_ReduceMean,  map_input_ReduceMean = find_node(graph_proto, 'ReduceMean')
+
+    _,  map_input_Pow = find_node(graph_proto, 'Pow')
+
+    _,  map_input_Mul = find_node(graph_proto, 'Mul')
+
+    # find right side Sub (see the layer norm subgraph)
+    nodes_Sub = []
+    map_input_Sub = {}
+    for node in graph_proto.node:
+        if node.op_type == 'Sub':
+            if node.output[0] in map_input_Pow:
+                nodes_Sub.append(node)
+                map_input_Sub[node.input[1]] = node
+
+    # find first ReduceMean
+    first_ReduceMean = []
+    first_ReduceMean_outputs = []
+    for node in nodes_ReduceMean:
+        if node.output[0] in map_input_Sub:
+            first_ReduceMean.append(node)
+            first_ReduceMean_outputs.append(node.output[0])
+
+    # find constant node
+    nodes_Constant = []
+    map_output_Constant = {}
+    for node in graph_proto.node:
+        if node.op_type == 'Constant':
+            nodes_Constant.append(node)
+            map_output_Constant[node.output[0]] = node
+
+    id = 0
+    removed_nodes = []
+    layer_norm_nodes = []
+    # Replace with layer norm
+    for node in first_ReduceMean:
+        layer_norm_input = []
+        layer_norm_output = []
+        layer_norm_input.append(node.input[0])
+
+        # collect nodes within a layer norm subgraph.
+        # skip building layer norm node if there is a pattern mismatch.
+        if node.output[0] not in map_input_Sub:
+            continue
+
+        node_sub = map_input_Sub[node.output[0]]
+        if node_sub.output[0] not in map_input_Pow:
+            continue
+
+        node_pow = map_input_Pow[node_sub.output[0]]
+        if node_pow.output[0] not in map_input_ReduceMean:
+            continue
+
+        node_reduce = map_input_ReduceMean[node_pow.output[0]]
+        if node_reduce.output[0] not in map_input_Add:
+            continue
+
+        node_Add = map_input_Add[node_reduce.output[0]]
+        if node_Add.output[0] not in map_input_Sqrt:
+            continue
+
+        node_Sqrt = map_input_Sqrt[node_Add.output[0]]
+        if node_Sqrt.output[0] not in map_input_Div:
+            continue
+ 
+        node_Div = map_input_Div[node_Sqrt.output[0]]
+        if node_Div.output[0] not in map_input_Mul:
+            continue
+
+        node_Mul = map_input_Mul[node_Div.output[0]]
+
+        if node_Mul.input[0] != node_Div.output[0]:
+            layer_norm_input.append(node_Mul.input[0])
+        else:
+            layer_norm_input.append(node_Mul.input[1])
+
+        if node_Mul.output[0] not in map_input_Add:
+            continue
+
+        node_Add1 = map_input_Add[node_Mul.output[0]]
+        layer_norm_input.append(node_Add1.input[1])
+
+        removed_nodes.append(node)
+        removed_nodes.append(node_sub)
+        removed_nodes.append(node_pow)
+        removed_nodes.append(node_reduce)
+        removed_nodes.append(node_Add)
+        removed_nodes.append(node_Sqrt)
+        removed_nodes.append(node_Div)
+        removed_nodes.append(node_Mul)
+        removed_nodes.append(node_Add1)
+        removed_nodes.append(map_output_Constant[node_pow.input[1]])
+
+        removed_nodes.append(map_output_Constant[node_Add.input[1]])
+        layer_norm_output.append(node_Add1.output[0])
+        id = id + 1
+        layer_norm_output.append('saved_mean_' + str(id))
+        id = id + 1
+        layer_norm_output.append('saved_inv_std_var_' + str(id))
+        layer_norm = onnx.helper.make_node("LayerNormalization",
+                                        layer_norm_input,
+                                        layer_norm_output,
+                                        "LayerNormalization_" + str(id),
+                                        None,
+                                        axis = node_reduce.attribute[0].ints[0],
+                                        epsilon = 9.999999960041972e-13)
+        layer_norm_nodes.append(layer_norm)
+
+    # remove left side Subs
+    for node in graph_proto.node:
+        if node.op_type == 'Sub':
+            if node.input[1] in first_ReduceMean_outputs:
+                removed_nodes.append(node)
+
+    all_nodes = []
+    for node in graph_proto.node:
+        if node not in removed_nodes:
+            all_nodes.append(node)
+
+    for node in layer_norm_nodes:
+        all_nodes.append(node)
+
+    graph_proto.ClearField("node")
+    graph_proto.node.extend(all_nodes)
diff --git a/DeBERTa/onnx/orttraining_test_model_transform.py b/DeBERTa/onnx/orttraining_test_model_transform.py
new file mode 100644
index 0000000..9ef92aa
--- /dev/null
+++ b/DeBERTa/onnx/orttraining_test_model_transform.py
@@ -0,0 +1,106 @@
+from onnx import numpy_helper
+
+def add_name(model):
+    """Name every graph node '<op_type>_<index>', indexed by its position in the graph."""
+    # enumerate supplies the counter that would otherwise be incremented by hand
+    for position, graph_node in enumerate(model.graph.node):
+        graph_node.name = '%s_%d' % (graph_node.op_type, position)
+
+def find_single_output_node(model, arg):
+    """Return the only node consuming `arg`; None if there are zero or several uses.
+
+    Each matching input slot counts as a use, so one node reading `arg` twice yields None.
+    """
+    uses = [node for node in model.graph.node for name in node.input if name == arg]
+    return uses[0] if len(uses) == 1 else None
+
+def find_input_as_initializer(model, arg):
+    """Return the graph initializer named `arg`, or None if no initializer has that name."""
+    # next() with a default stops at the first match, like the explicit loop it replaces
+    matches = (init for init in model.graph.initializer if init.name == arg)
+    return next(matches, None)
+
+def get_node_index(model, node):
+    """Return the index of the first graph node that compares equal to `node`, or None."""
+    # equality (==), not identity, decides the match, exactly as in the loop form
+    hits = (pos for pos, candidate in enumerate(model.graph.node) if candidate == node)
+    return next(hits, None)
+
+def replace_input_arg(model, arg, new_arg):
+    """Point every node input that equals `arg` at `new_arg` instead, editing nodes in place."""
+    for graph_node in model.graph.node:
+        for slot in [i for i, name in enumerate(graph_node.input) if name == arg]:
+            graph_node.input[slot] = new_arg
+
+def find_weight_index(model, name):
+    """Return the position of the initializer called `name` in model.graph.initializer, or None."""
+    for index, w in enumerate(model.graph.initializer):
+        if w.name == name:
+            return index
+    return None
+
+def fix_transpose(model):
+    """
+    remove transpose node if its input is a 2d weight which only feeds to the node.
+    """
+
+    # Find transpose nodes with initializer weight as input. 
+    # The input weight must feed only the transpose node.
+    # Collect these nodes and weights.
+    transpose = []
+    for node in model.graph.node:
+        if node.op_type == 'Transpose':
+            weight = find_input_as_initializer(model, node.input[0])
+            if weight is not None:
+                result = []
+                for n in model.graph.node:
+                    for input in n.input:
+                        if input == weight.name:
+                            result.append(n)
+                if len(result) > 1:
+                    continue
+                perm = node.attribute[0]
+                assert perm.name == 'perm'
+                perm = perm.ints
+                assert len(perm) == 2 and perm[0] == 1 and perm[1] == 0
+                transpose.append((get_node_index(model, node), weight))
+
+    # Transpose collected weights and add it to the model initializers. 
+    # The transposed weight initializers become inputs to the transpose nodes' recipient nodes.
+    for t in transpose:
+        node = model.graph.node[t[0]]
+        weight = numpy_helper.to_array(t[1])
+        assert len(weight.shape) == 2
+        weight = weight.transpose(perm)
+        new_weight = numpy_helper.from_array(weight, "%s_transposed" % t[1].name)
+        model.graph.initializer.extend([new_weight])
+        replace_input_arg(model, node.output[0], new_weight.name)
+
+    # collected transpose nodes can be removed.
+    transpose.sort(reverse=True)
+    for t in transpose:
+        del model.graph.node[t[0]]
+
+    # the original weight initializer can be removed. 
+    # (remember that a weight must feed only the transpose node to be collected above)
+    old_ws = []
+    for t in transpose:
+        if find_single_output_node(model, t[1].name) is None:
+            old_ws.append(find_weight_index(model, t[1].name))
+    old_ws.sort(reverse=True)
+    for w_i in old_ws:
+        del model.graph.initializer[w_i]
+
+def add_expand_shape(model):
+    """
+    Annotate the output of the model's single Expand op with the type of the first graph input.
+    This is very specific to the BERT model; the training backend requires the op's output shape.
+    Raises ValueError when the graph does not contain exactly one Expand node.
+    """
+    expand_node = [n for n in model.graph.node if n.op_type == 'Expand']
+    if len(expand_node) != 1:
+        # A bare string cannot be raised in Python 3 (TypeError); raise a real exception instead.
+        raise ValueError("cannot find the single expand node in the BERT model.")
+    expand_out = model.graph.value_info.add()
+    expand_out.name = expand_node[0].output[0] # base: '421' # tiny: '85'
+    expand_out.type.CopyFrom(model.graph.input[0].type)
\ No newline at end of file
diff --git a/DeBERTa/onnx/orttraining_transformer_trainer.py b/DeBERTa/onnx/orttraining_transformer_trainer.py
new file mode 100644
index 0000000..ffd81e6
--- /dev/null
+++ b/DeBERTa/onnx/orttraining_transformer_trainer.py
@@ -0,0 +1,405 @@
+# adapted from Trainer.py of huggingface transformers
+
+import json
+import logging
+import os
+import random
+
+from typing import Callable, Dict, List, NamedTuple, Optional, Tuple
+
+import numpy as np
+import torch
+from torch import nn
+from torch.utils.data.dataloader import DataLoader
+from torch.utils.data.dataset import Dataset
+from torch.utils.data.distributed import DistributedSampler
+from torch.utils.data.sampler import RandomSampler, SequentialSampler
+from tqdm import tqdm, trange
+
+from transformers.data.data_collator import DataCollator, DefaultDataCollator
+from transformers.modeling_utils import PreTrainedModel
+from transformers.training_args import TrainingArguments
+
+import onnxruntime
+from .orttraining_test_bert_postprocess import postprocess_model
+from onnxruntime.capi.ort_trainer import ORTTrainer, LossScaler, ModelDescription, IODescription
+
+from onnxruntime.training import _utils, amp, optim, orttrainer, TrainStepInfo,\
+                                      model_desc_validation as md_val,\
+                                      orttrainer_options as orttrainer_options
+from onnxruntime.training.optim import LinearWarmupLRScheduler, _LRScheduler
+
+try:
+    from torch.utils.tensorboard import SummaryWriter
+
+    _has_tensorboard = True
+except ImportError:
+    try:
+        from tensorboardX import SummaryWriter
+
+        _has_tensorboard = True
+    except ImportError:
+        _has_tensorboard = False
+
+
+def is_tensorboard_available() -> bool:
+    return _has_tensorboard  # True when SummaryWriter was importable (torch.utils.tensorboard or tensorboardX)
+
+
+logger = logging.getLogger(__name__)
+
+
+def set_seed(seed: int) -> None:
+    random.seed(seed)  # Python's built-in RNG
+    np.random.seed(seed)  # NumPy global RNG
+    torch.manual_seed(seed)  # PyTorch RNG
+    torch.cuda.manual_seed_all(seed)  # PyTorch RNGs on all CUDA devices
+    onnxruntime.set_seed(seed)  # ONNX Runtime seed
+
+class EvalPrediction(NamedTuple):
+    predictions: np.ndarray  # model outputs gathered over all batches of the prediction loop
+    label_ids: np.ndarray  # the "labels" inputs gathered over the same batches
+
+
+class PredictionOutput(NamedTuple):
+    predictions: np.ndarray  # model outputs gathered over all batches
+    label_ids: Optional[np.ndarray]  # None when the batches carried no "labels" entry
+    metrics: Optional[Dict[str, float]]  # compute_metrics results plus "loss" when losses were collected
+
+
+class TrainOutput(NamedTuple):
+    global_step: int  # number of optimization steps counted by train()
+    training_loss: float  # accumulated per-step loss divided by global_step
+
+def get_linear_schedule_with_warmup(num_warmup_steps, num_training_steps, base_lr):
+    """Return a step -> learning-rate function: linear warmup to base_lr, then linear decay to 0."""
+    def lr_lambda_linear(current_step):
+        # Multiplier on base_lr: rises linearly during warmup, then falls linearly and is clamped at 0.
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1, num_warmup_steps))
+        decay_span = float(max(1, num_training_steps - num_warmup_steps))
+        return max(0.0, float(num_training_steps - current_step) / decay_span)
+
+    def lambda_lr_get_lr(current_global_step):
+        # LambdaLR increments self.last_epoch at every step()
+        return base_lr * lr_lambda_linear(current_global_step)
+
+    return lambda_lr_get_lr
+
+
+class ORTTransformerTrainer:
+    """
+    """
+
+    model: PreTrainedModel
+    args: TrainingArguments
+    train_dataset: Dataset
+    eval_dataset: Dataset
+    compute_metrics: Callable[[EvalPrediction], Dict]
+
+    def __init__(
+        self,
+        model: PreTrainedModel,
+        model_desc: ModelDescription,
+        new_model_desc: dict,
+        args: TrainingArguments,
+        train_dataset: Dataset,
+        eval_dataset: Dataset,
+        compute_metrics: Callable[[EvalPrediction], Dict],
+        world_size: Optional[int] = 1,
+        use_new_api : Optional[bool] = False,
+    ):
+        """
+        Store the trainer state and seed the RNGs; `use_new_api` picks `new_model_desc` over `model_desc` in train().
+        """
+        self.model = model
+        self.model_desc = model_desc
+        self.new_model_desc = new_model_desc
+        self.args = args
+        self.world_size = world_size
+        self.data_collator = DefaultDataCollator()
+        self.train_dataset = train_dataset
+        self.eval_dataset = eval_dataset
+        self.compute_metrics = compute_metrics
+        set_seed(self.args.seed)
+        # Create output directory if needed
+        if self.args.local_rank in [-1, 0]:
+            os.makedirs(self.args.output_dir, exist_ok=True)
+
+        self.use_new_api = use_new_api
+
+    def get_train_dataloader(self) -> DataLoader:
+        """Build the training DataLoader: SequentialSampler when local_rank is -1, DistributedSampler otherwise."""
+        dataset = self.train_dataset
+        if dataset is None:
+            raise ValueError("Trainer: training requires a train_dataset.")
+        if self.args.local_rank == -1:
+            sampler = SequentialSampler(dataset)
+        else:
+            sampler = DistributedSampler(dataset)
+        # NOTE: drop_last is left at its default, so a final partial batch is kept.
+        return DataLoader(
+            dataset, batch_size=self.args.train_batch_size, sampler=sampler, collate_fn=self.data_collator.collate_batch,
+        )
+
+    def get_eval_dataloader(self) -> DataLoader:
+        """Build the evaluation DataLoader over `self.eval_dataset`, unshuffled."""
+        loader_kwargs = {
+            "batch_size": self.args.eval_batch_size,
+            "shuffle": False,
+            "collate_fn": self.data_collator.collate_batch,
+        }
+        return DataLoader(self.eval_dataset, **loader_kwargs)
+
+    def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
+        """Build the prediction DataLoader over `test_dataset`, unshuffled."""
+        # We use the same batch_size as for eval.
+        loader_kwargs = {
+            "batch_size": self.args.eval_batch_size,
+            "shuffle": False,
+            "collate_fn": self.data_collator.collate_batch,
+        }
+        return DataLoader(test_dataset, **loader_kwargs)
+
+
+    def train(self):
+        """
+        Main training entry point: wraps the model in an ORT trainer, runs the epochs and returns a TrainOutput.
+        """
+        train_dataloader = self.get_train_dataloader()
+
+        if self.args.max_steps > 0:
+            t_total = self.args.max_steps
+            num_train_epochs = (
+                self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1
+            )
+        else:
+            t_total = int(len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs)
+            num_train_epochs = self.args.num_train_epochs
+
+        if self.use_new_api:
+            lr_scheduler = orttrainer.optim.LinearWarmupLRScheduler(t_total, self.args.warmup_steps/float(t_total))
+
+            loss_scaler = amp.DynamicLossScaler() if self.args.fp16 else None
+            device = self.args.device.type
+            # Build an explicit '<type>:<index>' device id, defaulting to index 0.
+            device = f'{device}:{self.args.device.index}' if self.args.device.index else f'{device}:0'
+            options = orttrainer.ORTTrainerOptions({'batch' : {
+                                                        'gradient_accumulation_steps' : self.args.gradient_accumulation_steps},
+                                                    'device': {'id': device},
+                                                    'mixed_precision': {
+                                                        'enabled': self.args.fp16,
+                                                        'loss_scaler': loss_scaler},
+                                                    'debug': {'deterministic_compute': True, },
+                                                    'utils': {
+                                                        'grad_norm_clip': False},
+                                                    'distributed': {
+                                                        # we are running single node multi gpu test. thus world_rank = local_rank
+                                                        # and world_size = self.args.n_gpu
+                                                        'world_rank': max(0, self.args.local_rank),
+                                                        'world_size': int(self.world_size),
+                                                        'local_rank': max(0, self.args.local_rank),
+                                                        'allreduce_post_accumulation': True},
+                                                    'lr_scheduler': lr_scheduler
+                                                    })
+
+            param_optimizer = list(self.model.named_parameters())
+            params = [{
+                'params': [n for n, p in param_optimizer if "bias" in n or "LayerNorm.weight" in n],
+                "weight_decay_mode": 1, }, {
+                'params': [n for n, p in param_optimizer if not ("bias" in n or "LayerNorm.weight" in n)],
+                "weight_decay_mode": 1, }
+                ]
+            # NOTE(review): lr is hard-coded below while the legacy path uses self.args.learning_rate - confirm.
+            optim_config = optim.AdamConfig(params=params, lr=2e-5, do_bias_correction=True)
+            self.model = orttrainer.ORTTrainer(self.model, self.new_model_desc, optim_config, options=options)
+        else:
+            def map_optimizer_attributes(name):
+                no_decay = "bias" in name or "LayerNorm.weight" in name
+                if no_decay:
+                    return {"weight_decay_mode" : 1}
+                else:
+                    return {"weight_decay_mode" : 1}
+            get_lr_this_step = get_linear_schedule_with_warmup(self.args.warmup_steps, t_total, self.args.learning_rate)
+            loss_scaler = LossScaler('loss_scale_input_name', True, up_scale_window=2000) if self.args.fp16 else None
+            self.model = ORTTrainer(self.model, None,
+                self.model_desc,
+                "AdamOptimizer",
+                map_optimizer_attributes=map_optimizer_attributes,
+                learning_rate_description=IODescription('Learning_Rate', [1,], torch.float32),
+                device=self.args.device,
+                gradient_accumulation_steps=self.args.gradient_accumulation_steps,
+                world_rank=max(0, self.args.local_rank),
+                world_size=int(self.world_size),
+                use_mixed_precision=self.args.fp16,
+                allreduce_post_accumulation=True,
+                get_lr_this_step=get_lr_this_step,
+                loss_scaler=loss_scaler,
+                enable_grad_norm_clip=False,
+                _opset_version=12,
+                _use_deterministic_compute=True)
+
+        # Train!
+        logger.info("***** Running training *****")
+        logger.info("  Num examples = %d", len(train_dataloader.dataset))
+        logger.info("  Num Epochs = %d", num_train_epochs)
+        logger.info("  Instantaneous batch size per GPU = %d", self.args.per_gpu_train_batch_size)
+        logger.info(
+            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+            self.args.train_batch_size
+            * self.args.gradient_accumulation_steps
+            * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1),
+        )
+        logger.info("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
+        logger.info("  Total optimization steps = %d", t_total)
+
+        global_step = 0
+        epochs_trained = 0
+        steps_trained_in_current_epoch = 0
+
+        tr_loss = 0.0
+        logging_loss = 0.0
+        train_iterator = trange(
+            epochs_trained, int(num_train_epochs), desc="Epoch", disable=self.args.local_rank not in [-1, 0],
+        )
+
+        for epoch in train_iterator:
+            epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=self.args.local_rank not in [-1, 0])
+            for step, inputs in enumerate(epoch_iterator):
+
+                # Skip past any already trained steps if resuming training
+                if steps_trained_in_current_epoch > 0:
+                    steps_trained_in_current_epoch -= 1
+                    continue
+
+                tr_loss += self._training_step(self.model, inputs)
+
+                if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
+                    len(epoch_iterator) <= self.args.gradient_accumulation_steps
+                    and (step + 1) == len(epoch_iterator)
+                ):
+                    global_step += 1
+
+                    if self.args.local_rank in [-1, 0]:
+                        if (self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0) or (
+                            global_step == 1 and self.args.logging_first_step
+                        ):
+                            logs = {}
+                            if self.args.evaluate_during_training:
+                                results = self.evaluate()
+                                for key, value in results.items():
+                                    eval_key = "eval_{}".format(key)
+                                    logs[eval_key] = value
+                            # logging_steps can be 0 when only logging_first_step triggered this branch.
+                            loss_scalar = (tr_loss - logging_loss) / max(1, self.args.logging_steps)
+                            if not self.use_new_api:
+                                learning_rate_scalar = get_lr_this_step(global_step)
+                                logs["learning_rate"] = learning_rate_scalar
+                            logs["loss"] = loss_scalar
+                            logging_loss = tr_loss
+
+                            epoch_iterator.write(json.dumps({**logs, **{"step": global_step}}))
+
+                if self.args.max_steps > 0 and global_step > self.args.max_steps:
+                    epoch_iterator.close()
+                    break
+            if self.args.max_steps > 0 and global_step > self.args.max_steps:
+                train_iterator.close()
+                break
+        # global_step stays 0 when no optimization step ran; avoid dividing by zero below.
+        logger.info("\n\nTraining completed. \n\n")
+        return TrainOutput(global_step, tr_loss / max(global_step, 1))
+
+    def _training_step(self, model, inputs: Dict[str, torch.Tensor]) -> float:
+        """Move `inputs` to `self.args.device`, run one `model.train_step` and return the loss as a float."""
+        device = self.args.device
+        # Entries are reassigned in the caller's dict, so it ends up holding the moved tensors.
+        for name in list(inputs):
+            inputs[name] = inputs[name].to(device)
+        # The loss is the first element of the train_step outputs.
+        step_outputs = model.train_step(**inputs)
+        return step_outputs[0].item()
+
+    def save_model(self, output_dir: Optional[str] = None) -> None:
+        output_dir = output_dir if output_dir is not None else self.args.output_dir  # fall back to args.output_dir
+        os.makedirs(output_dir, exist_ok=True)  # create the directory if it is missing
+        self.model.save_as_onnx(os.path.join(output_dir, "transformer.onnx"))  # fixed file name inside output_dir
+
+    def evaluate(self) -> Dict[str, float]:
+        """
+        Evaluate on `self.eval_dataset` and return the resulting metrics.
+
+        Returns:
+            The `metrics` dict of the prediction loop:
+                - "loss": mean evaluation loss, present when the batches carry labels
+                - whatever `compute_metrics` returns for the gathered predictions
+        """
+        eval_loader = self.get_eval_dataloader()
+        # The loop is shared with predict(); only its metrics are handed back here.
+        prediction_output = self._prediction_loop(eval_loader, description="Evaluation")
+        return prediction_output.metrics
+
+    def predict(self, test_dataset: Dataset) -> PredictionOutput:
+        """
+        Run the prediction loop over `test_dataset`.
+
+        The returned PredictionOutput carries the predictions and, when the
+        dataset provides labels, the label ids and metrics as in evaluate().
+        """
+        loader = self.get_test_dataloader(test_dataset)
+        return self._prediction_loop(loader, description="Prediction")
+
+    def _prediction_loop(
+        self, dataloader: DataLoader, description: str
+    ) -> PredictionOutput:
+        """
+        Prediction/evaluation loop, shared by `evaluate()` and `predict()`.
+
+        Works both with or without labels: predictions are always gathered, label ids and
+        the mean loss only when the batches provide them. `description` labels the logs.
+        """
+        logger.info("***** Running %s *****", description)
+        logger.info("  Num examples = %d", len(dataloader.dataset))
+        logger.info("  Batch size = %d", dataloader.batch_size)
+        eval_losses: List[float] = []
+        # Per-batch arrays are collected in lists and concatenated once after the loop;
+        # calling np.append per batch would re-copy everything gathered so far.
+        pred_batches: List[np.ndarray] = []
+        label_batches: List[np.ndarray] = []
+
+        if not self.use_new_api:
+            self.model.eval()
+
+        for inputs in tqdm(dataloader, desc=description):
+            has_labels = any(inputs.get(k) is not None for k in ["labels", "masked_lm_labels"])
+
+            for k, v in inputs.items():
+                inputs[k] = v.to(self.args.device)
+
+            with torch.no_grad():
+                if self.use_new_api:
+                    outputs = self.model.eval_step(**inputs)
+                else:
+                    outputs = self.model(**inputs)
+                if has_labels:
+                    # With labels the outputs start with (loss, logits).
+                    step_eval_loss, logits = outputs[:2]
+                    eval_losses.append(step_eval_loss.mean().item())
+                else:
+                    logits = outputs[0]
+
+            pred_batches.append(logits.detach().cpu().numpy())
+            if inputs.get("labels") is not None:
+                label_batches.append(inputs["labels"].detach().cpu().numpy())
+
+        # None (rather than an empty array) is kept for "nothing gathered"; the metric check below relies on it.
+        preds = np.concatenate(pred_batches, axis=0) if pred_batches else None
+        label_ids = np.concatenate(label_batches, axis=0) if label_batches else None
+
+        if self.compute_metrics is not None and preds is not None and label_ids is not None:
+            metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
+        else:
+            metrics = {}
+        if eval_losses:
+            metrics["loss"] = np.mean(eval_losses)
+        return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)

From bf8a3cee885030a61b694ab30a466fbcb3b40e1c Mon Sep 17 00:00:00 2001
From: Gani Nazirov
 <ganaziro@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
Date: Tue, 29 Sep 2020 18:15:54 +0000
Subject: [PATCH 11/13] remove onnx path in train.py

---
 DeBERTa/apps/train.py                     | 74 ++---------------------
 DeBERTa/deberta/disentangled_attention.py |  1 -
 DeBERTa/deberta/ops.py                    |  1 +
 DeBERTa/onnx/__init__.py                  |  2 +-
 4 files changed, 7 insertions(+), 71 deletions(-)

diff --git a/DeBERTa/apps/train.py b/DeBERTa/apps/train.py
index 28808fb..de96c46 100644
--- a/DeBERTa/apps/train.py
+++ b/DeBERTa/apps/train.py
@@ -24,10 +24,9 @@
 from ..utils import *
 from ..utils import xtqdm as tqdm
 from .task_registry import tasks
-from onnxruntime.capi.ort_trainer import ORTTrainer, IODescription, ModelDescription, LossScaler
 
 from ..training import DistributedTrainer, initialize_distributed, batch_to, set_random_seed,kill_children
-from ..data import DistributedBatchSampler, SequentialSampler, BatchSampler, RandomSampler, AsyncDataLoader
+from ..data import DistributedBatchSampler, SequentialSampler, BatchSampler, AsyncDataLoader
 
 def create_model(args, num_labels, model_class_fn):
   # Prepare model
@@ -218,64 +217,9 @@ def run_predict(args, model, device, eval_data, prefix=None):
       if predict_fn:
         predict_fn(predicts, args.output_dir, name, prefix)
 
-def deberta_model_description(args):
-    vocab_size = 30528
-    # set concrete input sizes to permit optimization
-    input_ids_desc = IODescription('input_ids', [args.train_batch_size, args.max_seq_length], torch.int32, num_classes=vocab_size)
-    type_ids_desc = IODescription('type_ids', [args.train_batch_size, args.max_seq_length], torch.int32) # num_classes=?
-    position_ids_desc = IODescription('position_ids', [args.train_batch_size, args.max_seq_length], torch.int32) # num_classes=?
-    input_mask_desc = IODescription('input_mask', [args.train_batch_size, args.max_seq_length], torch.int32) # num_classes=?
-    labels_desc = IODescription('labels', [args.train_batch_size, args.max_seq_length], torch.float32) # num_classes=?
-    
-    loss_desc = IODescription('loss', [], torch.float32)
-    return ModelDescription([input_ids_desc, type_ids_desc, position_ids_desc, input_mask_desc, labels_desc], [loss_desc])
-
-def create_ort_trainer(args, device, model):
-    # default initial settings: b1=0.9, b2=0.999, e=1e-6
-    def map_optimizer_attributes(name):
-        no_decay_keys = ["bias", "gamma", "beta", "LayerNorm"]
-        no_decay = False
-        for no_decay_key in no_decay_keys:
-            if no_decay_key in name:
-                no_decay = True
-                break
-        if no_decay:
-            return {"alpha": 0.9, "beta": 0.999, "lambda": 0.0, "epsilon": 1e-6}
-        else:
-            return {"alpha": 0.9, "beta": 0.999, "lambda": 0.01, "epsilon": 1e-6}
-
-    # we request ORTTrainer to create a LambOptimizer with given optimizer_attributes. 
-    # train_step does forward, backward, and optimize step.
-    model = ORTTrainer(model, None, deberta_model_description(args), "LambOptimizer", 
-        map_optimizer_attributes,
-        IODescription('Learning_Rate', [1,], torch.float32),
-        device,
-        _opset_version = 12)
-
-    return model
-
-def run_onnx_training(args, model, device, train_data, prefix=None):
-  # runs training in ONNX
-  trainer = create_ort_trainer(args, device, model)
-  train_sampler = RandomSampler(len(train_data))
-  batch_sampler = BatchSampler(train_sampler, args.train_batch_size)
-  batch_sampler = DistributedBatchSampler(batch_sampler, rank=args.rank, world_size=args.world_size)
-  train_dataloader = DataLoader(train_data, batch_sampler=batch_sampler, num_workers=args.workers, pin_memory=True)
-  torch.cuda.empty_cache()
-  for step, batch in enumerate(AsyncDataLoader(train_dataloader, 100)):
-    #import pdb
-    #pdb.set_trace()
-    lr = torch.tensor([0.0000000e+00]).to(device)
-    batch = batch_to(batch, device)
-    with torch.no_grad():
-      trainer.train_step(batch['input_ids'], batch['type_ids'], batch['position_ids'], batch['input_mask'], batch['labels'], lr)
-      # conversion fails now with:
-      # site-packages/torch/onnx/utils.py:617: UserWarning: ONNX export failed on ATen operator broadcast_tensors
-      # because torch.onnx.symbolic_opset10.broadcast_tensors does not exist
-
 def main(args):
-  if not args.do_train and not args.do_eval and not args.do_predict and not args.do_onnx:
-    raise ValueError("At least one of `do_train` or `do_eval` or `do_predict` or `do_onnx` must be True.")
+  if not args.do_train and not args.do_eval and not args.do_predict:
+    raise ValueError("At least one of `do_train` or `do_eval` or `do_predict` must be True.")
   os.makedirs(args.output_dir, exist_ok=True)
   task_name = args.task_name.lower()
   random.seed(args.seed)
@@ -292,11 +236,11 @@ def main(args):
     test_data = processor.test_data(max_seq_len=args.max_seq_length)
     logger.info("  Prediction batch size = %d", args.predict_batch_size)
 
-  if args.do_train or args.do_onnx:
+  if args.do_train:
     train_data = processor.train_data(max_seq_len=args.max_seq_length, mask_gen = None, debug=args.debug)
   model_class_fn = processor.get_model_class_fn()
   model = create_model(args, len(label_list), model_class_fn)
-  if args.do_train or args.do_onnx:
+  if args.do_train:
     with open(os.path.join(args.output_dir, 'model_config.json'), 'w', encoding='utf-8') as fs:
       fs.write(model.config.to_json_string() + '\n')
   logger.info("Model config {}".format(model.config))
@@ -313,10 +257,6 @@ def main(args):
   if args.do_predict:
     run_predict(args, model, device, test_data, prefix=args.tag)
 
-  # trains in ONNX
-  if args.do_onnx:
-    run_onnx_training(args, model, device, train_data, prefix=args.tag)
-
 def build_argument_parser():
   parser = argparse.ArgumentParser()
 
@@ -498,10 +438,6 @@ def build_argument_parser():
             type=str,
             help="The path of pre-trained RoBERTa model")
   
-  parser.add_argument("--do_onnx",
-            default=False,
-            action='store_true',
-            help="Whether to run training in ONNX")
   return parser
 
 if __name__ == "__main__":
diff --git a/DeBERTa/deberta/disentangled_attention.py b/DeBERTa/deberta/disentangled_attention.py
index 87f280b..ec18cf4 100644
--- a/DeBERTa/deberta/disentangled_attention.py
+++ b/DeBERTa/deberta/disentangled_attention.py
@@ -77,7 +77,6 @@ def __init__(self, config):
         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
         self.all_head_size = self.num_attention_heads * self.attention_head_size
         self.in_proj = torch.nn.Linear(config.hidden_size, self.all_head_size*3, bias=False)
-        # ONNX graph builder thinks params below are not used for loss calcualtion
         self.q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
         self.v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float))
         self.pos_att_type = [x.strip() for x in getattr(config, 'pos_att_type', 'none').lower().split('|')] # c2p|p2c
diff --git a/DeBERTa/deberta/ops.py b/DeBERTa/deberta/ops.py
index a1ba5fd..a18515f 100644
--- a/DeBERTa/deberta/ops.py
+++ b/DeBERTa/deberta/ops.py
@@ -6,6 +6,7 @@
 # Author: penhe@microsoft.com
 # Date: 01/15/2020
 #
+
 import math
 from packaging import version
 import torch
diff --git a/DeBERTa/onnx/__init__.py b/DeBERTa/onnx/__init__.py
index 7181044..a821423 100644
--- a/DeBERTa/onnx/__init__.py
+++ b/DeBERTa/onnx/__init__.py
@@ -2,4 +2,4 @@
 from __future__ import division
 from __future__ import print_function
 
-from .orttraining_deberta import ORTGlueTest
\ No newline at end of file
+from .orttraining_deberta import ORTGlueTest

From c71818bafb68a4339e8aef423ecd285dfcdd7772 Mon Sep 17 00:00:00 2001
From: Gani Nazirov
 <ganaziro@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
Date: Tue, 29 Sep 2020 18:54:53 +0000
Subject: [PATCH 12/13] Add Readme

---
 DeBERTa/apps/orttrain.py            | 92 ++++++++---------------------
 DeBERTa/apps/train.py               |  1 -
 DeBERTa/onnx/README.md              | 47 +++++++++++++++
 DeBERTa/onnx/orttraining_deberta.py |  3 +-
 4 files changed, 75 insertions(+), 68 deletions(-)
 create mode 100644 DeBERTa/onnx/README.md

diff --git a/DeBERTa/apps/orttrain.py b/DeBERTa/apps/orttrain.py
index 2683fc6..237057a 100644
--- a/DeBERTa/apps/orttrain.py
+++ b/DeBERTa/apps/orttrain.py
@@ -79,8 +79,7 @@ def build_argument_parser():
             type=str,
             required=True,
             help="The directory to store the pretrained models downloaded from s3.")
-
-  ## Other parameters
+  ## Other parameters, 
   parser.add_argument("--max_seq_length",
             default=128,
             type=int,
@@ -95,14 +94,35 @@ def build_argument_parser():
             default=32,
             type=int,
             help="Total batch size for eval.")
-  parser.add_argument("--max_grad_norm",
-            default=1,
-            type=float,
-            help="The clip threshold of global gradient norm")
   parser.add_argument("--learning_rate",
             default=5e-5,
             type=float,
             help="The initial learning rate for Adam.")
+  parser.add_argument("--num_train_epochs",
+            default=3.0,
+            type=float,
+            help="Total number of training epochs to perform.")
+  parser.add_argument('--seed',
+            type=int,
+            default=1234,
+            help="random seed for initialization")
+  parser.add_argument('--fp16',
+            default=False,
+            type=boolean_string,
+            help="Whether to use 16-bit float precision instead of 32-bit")
+  parser.add_argument('--init_model',
+            type=str,
+            help="The model state file used to initialize the model weights.")
+  parser.add_argument('--pre_trained',
+            default=None,
+            type=str,
+            help="The path of pre-trained RoBERTa model")
+
+  ## TBD: review params below
+  parser.add_argument("--max_grad_norm",
+            default=1,
+            type=float,
+            help="The clip threshold of global gradient norm")
   parser.add_argument("--epsilon",
             default=1e-6,
             type=float,
@@ -115,10 +135,6 @@ def build_argument_parser():
             default=0.999,
             type=float,
             help="The beta2 parameter for Adam.")
-  parser.add_argument("--num_train_epochs",
-            default=3.0,
-            type=float,
-            help="Total number of training epochs to perform.")
   parser.add_argument("--warmup_proportion",
             default=0.1,
             type=float,
@@ -133,43 +149,19 @@ def build_argument_parser():
             type=str,
             help="The learning rate scheduler used for traning. "
               "E.g. warmup_linear, warmup_linear_shift, warmup_cosine, warmup_constant. Default, warmup_linear")
-
-  parser.add_argument("--local_rank",
-            type=int,
-            default=-1,
-            help="local_rank for distributed training on gpus")
-
-  parser.add_argument('--seed',
-            type=int,
-            default=1234,
-            help="random seed for initialization")
-
   parser.add_argument('--accumulative_update',
             type=int,
             default=1,
             help="Number of updates steps to accumulate before performing a backward/update pass.")
-
-  parser.add_argument('--fp16',
-            default=False,
-            type=boolean_string,
-            help="Whether to use 16-bit float precision instead of 32-bit")
-
   parser.add_argument('--loss_scale',
             type=float, default=256,
             help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
-
   parser.add_argument('--scale_steps',
             type=int, default=1000,
             help='The steps to wait to increase the loss scale.')
-
-  parser.add_argument('--init_model',
-            type=str,
-            help="The model state file used to initialize the model weights.")
-
   parser.add_argument('--model_config',
             type=str,
             help="The config file of bert model.")
-
   parser.add_argument('--cls_drop_out',
             type=float,
             default=None,
@@ -178,43 +170,11 @@ def build_argument_parser():
             type=float,
             default=0.01,
             help="The weight decay rate")
-
-  parser.add_argument('--tag',
-            type=str,
-            default='final',
-            help="The tag name of current prediction/runs.")
-
-  parser.add_argument("--dump_interval",
-            default=10000,
-            type=int,
-            help="Interval steps for generating checkpoint.")
-
-  parser.add_argument('--lookahead_k',
-            default=-1,
-            type=int,
-            help="lookahead k parameter")
-
-  parser.add_argument('--lookahead_alpha',
-            default=0.5,
-            type=float,
-            help="lookahead alpha parameter")
-
   parser.add_argument('--opt_type',
             type=str.lower,
             default='adam',
             choices=['adam', 'admax'],
             help="The optimizer to be used.")
-
-  parser.add_argument('--workers',
-            type=int,
-            default=2,
-            help="The workers to load data.")
-
-  parser.add_argument('--pre_trained',
-            default=None,
-            type=str,
-            help="The path of pre-trained RoBERTa model")
-  
   return parser
 
 if __name__ == "__main__":
diff --git a/DeBERTa/apps/train.py b/DeBERTa/apps/train.py
index de96c46..d3c5c77 100644
--- a/DeBERTa/apps/train.py
+++ b/DeBERTa/apps/train.py
@@ -437,7 +437,6 @@ def build_argument_parser():
             default=None,
             type=str,
             help="The path of pre-trained RoBERTa model")
-  
   return parser
 
 if __name__ == "__main__":
diff --git a/DeBERTa/onnx/README.md b/DeBERTa/onnx/README.md
new file mode 100644
index 0000000..4965593
--- /dev/null
+++ b/DeBERTa/onnx/README.md
@@ -0,0 +1,47 @@
+# DeBERTa: Fine-tuning with ONNX Runtime.
+
+## Requirements
+- All the DeBERTa requirements
+- onnx
+- onnxruntime
+
+### Workaround fixes
+- This workaround is needed until the MSE operator becomes available in ORT:
+  vi $PYTHONPATH/site-packages/torch/nn/functional.py
+  search for "def mse_loss"
+  proceed to lines
+     else:
+        expanded_input, expanded_target = torch.broadcast_tensors(input, target)
+        ret = torch._C._nn.mse_loss(expanded_input, expanded_target, _Reduction.get_enum(reduction))
+  and change them to:
+        expanded_input = input
+        expanded_target = target
+        t = expanded_input - expanded_target
+        t = t ** 2
+        ret = torch.mean(t)
+
+- This workaround is needed until a fix is available in ORT to disable the Unsqueeze optimization for trainable weights:
+  Changes in the ONNX Runtime code:
+  Open onnxruntime/onnxruntime/core/graph/graph_utils.cc
+  Force `can_remove` to false to disable the Unsqueeze optimization for DeBERTa, as shown below:
+  if (output_name_is_changing) {
+    std::vector<GraphEdge> output_edges = GetNodeOutputEdges(node);
+    can_remove = CanUpdateImplicitInputNameInSubgraphs(graph, output_edges, initializer_name, logger);
+    can_remove = false;  // <- Put this line in
+
+## Run task
+
+``` bash
+task=STS-B 
+OUTPUT=/tmp/DeBERTa/exps/$task
+python3 -m DeBERTa.apps.orttrain --task_name $task \
+  --data_dir $cache_dir/glue_tasks/$task \
+  --output_dir $OUTPUT \
+  --eval_batch_size 128 \
+  --train_batch_size 32 \
+  --num_train_epochs 6 \
+  --learning_rate 2e-5 \
+  --max_seq_len 128 \
+  --init_model base \
+  --seed 123
+```
\ No newline at end of file
diff --git a/DeBERTa/onnx/orttraining_deberta.py b/DeBERTa/onnx/orttraining_deberta.py
index 4cb5c27..5bf7893 100644
--- a/DeBERTa/onnx/orttraining_deberta.py
+++ b/DeBERTa/onnx/orttraining_deberta.py
@@ -39,6 +39,7 @@ def setUp(self, args):
         # configurations not to be changed accoss tests
         self.max_seq_length = args.max_seq_length
         self.train_batch_size = args.train_batch_size
+        self.eval_batch_size = args.eval_batch_size
         self.learning_rate = args.learning_rate
         self.num_train_epochs = args.num_train_epochs
         self.local_rank = -1
@@ -81,7 +82,7 @@ def run_glue(self, task_name, fp16, use_new_api):
         training_args = TrainingArguments(
             output_dir=os.path.join(self.output_dir, task_name), do_train=True, do_eval=True,
             per_gpu_train_batch_size=self.train_batch_size,
-            per_gpu_eval_batch_size = self.train_batch_size,
+            per_gpu_eval_batch_size = self.eval_batch_size,
             learning_rate=self.learning_rate, num_train_epochs=self.num_train_epochs,
             local_rank=self.local_rank,
             overwrite_output_dir=self.overwrite_output_dir, gradient_accumulation_steps=self.gradient_accumulation_steps,

From 7eff1fd75b8c626a0cb506334cc1f1280e152596 Mon Sep 17 00:00:00 2001
From: Gani Nazirov
 <ganaziro@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net>
Date: Tue, 29 Sep 2020 19:30:35 +0000
Subject: [PATCH 13/13] Use random seed by default

---
 DeBERTa/apps/orttrain.py                        | 3 ++-
 DeBERTa/onnx/orttraining_transformer_trainer.py | 3 ---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/DeBERTa/apps/orttrain.py b/DeBERTa/apps/orttrain.py
index 237057a..89cca11 100644
--- a/DeBERTa/apps/orttrain.py
+++ b/DeBERTa/apps/orttrain.py
@@ -29,6 +29,7 @@ def create_model(args, num_labels, model_class_fn):
 
 def main(args):
   os.makedirs(args.output_dir, exist_ok=True)
+  logger.info("Using seed " + str(args.seed))
   random.seed(args.seed)
   np.random.seed(args.seed)
   torch.manual_seed(args.seed)
@@ -104,7 +105,7 @@ def build_argument_parser():
             help="Total number of training epochs to perform.")
   parser.add_argument('--seed',
             type=int,
-            default=1234,
+            default=random.randint(0,  2**32 - 1),
             help="random seed for initialization")
   parser.add_argument('--fp16',
             default=False,
diff --git a/DeBERTa/onnx/orttraining_transformer_trainer.py b/DeBERTa/onnx/orttraining_transformer_trainer.py
index ffd81e6..540cfef 100644
--- a/DeBERTa/onnx/orttraining_transformer_trainer.py
+++ b/DeBERTa/onnx/orttraining_transformer_trainer.py
@@ -138,7 +138,6 @@ def get_train_dataloader(self) -> DataLoader:
             self.train_dataset,
             batch_size=self.args.train_batch_size,
             sampler=train_sampler,
-            #drop_last=True,
             collate_fn=self.data_collator.collate_batch,
         )
 
@@ -147,7 +146,6 @@ def get_eval_dataloader(self) -> DataLoader:
             self.eval_dataset,
             batch_size=self.args.eval_batch_size,
             shuffle=False,
-            #drop_last=True,
             collate_fn=self.data_collator.collate_batch,
         )
 
@@ -157,7 +155,6 @@ def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
             test_dataset,
             batch_size=self.args.eval_batch_size,
             shuffle=False,
-            #drop_last=True,
             collate_fn=self.data_collator.collate_batch,
         )