diff --git a/README.md b/README.md
index fa52322..8528753 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,9 @@
 # pytorch-micro-benchmarking
+
+This repo provides microbenchmarking script for training and inferencing models in pytorch, apex and torchaudio libraries on ROCm.
+
+## Pytorch
+
 We supply a small microbenchmarking script for PyTorch training on ROCm.
 
 To execute:
@@ -37,10 +42,10 @@ python3 micro_benchmarking_pytorch.py --device_ids=1 --network resnet50 --distri
 To run FlopsProfiler (with deepspeed.profiling.flops_profiler imported):
 `python micro_benchmarking_pytorch.py --network resnet50 --amp-opt-level=2 --batch-size=256 --iterations=20 --flops-prof-step 10`
 
-## Performance tuning
+### Performance tuning
 If performance on a specific card and/or model is found to be lacking, typically some gains can be made by tuning MIOpen. For this, `export MIOPEN_FIND_ENFORCE=3` prior to running the model. This will take some time if untuned configurations are encountered and write to a local performance database. More information on this can be found in the [MIOpen documentation](https://rocm.github.io/MIOpen/doc/html/perfdatabase.html).
 
-## PyTorch 2.0
+### PyTorch 2.0
 Added the `--compile` option opens up PyTorch 2.0 capabilities, which comes with several options. Here are some notes from upstream: 
 ```
     Optimizes given model/function using TorchDynamo and specified backend.
@@ -75,3 +80,26 @@ python micro_benchmarking_pytorch.py --network resnet50 --compile --compileConte
 python micro_benchmarking_pytorch.py --network resnet50 --compile --compileContext "{'options': {'static-memory': 'True', 'matmul-padding': 'True'}}"
 ```
 Note: you cannot pass the `mode` and `options` options together.
+
+## TorchAudio
+
+The script and parameters for torchaudio are similar to pytorch.
+
+To execute:
+`python micro_benchmarking_audio.py --network <network name> [--batch-size <batch size> ] [--iterations <number of iterations>] [--fp16 <0 or 1> ] [--distributed_dataparallel] [--device_ids <comma separated list (no spaces) of GPU indices (0-indexed) to run distributed_dataparallel api on>] `
+
+Possible network names are: `wav2vec2_base`, `deepspeech`, `hdemucs_low`, `tacotron2`, `wavernn`, `wav2letter`, `hubert_base` etc.
+
+## Apex
+
+The script and parameters for torchaudio are similar to pytorch. 
+
+To execute:
+`python micro_benchmarking_apex.py --network <network name> [--batch-size <batch size> ] [--iterations <number of iterations>] [--fp16 <0 or 1> ] [--distributed_dataparallel] [--device_ids <comma separated list (no spaces) of GPU indices (0-indexed) to run distributed_dataparallel api on>] [--sync_bn] [--keep-batchnorm-fp32 <true|false>] [--loss-scale <value|dynamic>]`
+
+There are three additional parameters.
+1. `--sync_bn`: Use apex synchronized batch normalization across GPUs (useful for multi-GPU training).
+2. `--keep-batchnorm-fp32`: Keep batch norm layers in FP32 when using AMP (e.g. `--keep-batchnorm-fp32 true`). Omit with opt_level O1.
+3. `--loss-scale`: Loss scale for mixed precision. It is a number (e.g. `1024`) for static scaling, or `dynamic` for adaptive scaling.
+
+Instead of amp flag (true/false), there  is a level of amp optimization used in apex. 
\ No newline at end of file
diff --git a/audio/audio_input.py b/audio/audio_input.py
new file mode 100644
index 0000000..22e1288
--- /dev/null
+++ b/audio/audio_input.py
@@ -0,0 +1,65 @@
+import torch
+from audio.audio_model import *
+
+
+def get_input_type(network_name):
+    if network_name in acoustic_models or network_name in source_separation_models or network_name in speech_quality_models:
+        return "waveform"
+    elif network_name in speech_recognition_models:
+        return "acoustic features"
+    elif network_name in speech_synthesis_models:
+        if "wavernn" in network_name:
+            return "waveform"
+        else:
+            return "tokens"
+
+
+def get_input(network_name, network, batch_size):
+    if network_name in acoustic_models:
+        inp = {"waveforms": torch.randn(batch_size, FRAME_COUNT, device="cuda")}
+    elif network_name in source_separation_models:
+        if "hdemucs" in network_name:
+            inp = {"input" : torch.randn(batch_size, 2, FRAME_COUNT, device="cuda")}
+        else:
+            inp = {"input" : torch.randn(batch_size, 1, FRAME_COUNT, device="cuda")}
+    elif network_name in speech_recognition_models:
+        if "deepspeech" in network_name:
+            #number of channels must be specified for deepspeech
+            inp = {"x" : torch.randn(batch_size, 1, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda")}
+        elif "wav2letter" in network_name:
+            inp = {"x" : torch.randn(batch_size, ACOUSTIC_FEATURES_SIZE, FRAME_COUNT, device="cuda")}
+        elif "emformer" in network_name:
+            inp = {"input" : torch.randn(batch_size, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda"),
+                   "lengths" : torch.randint(1, FRAME_COUNT, (batch_size,)).to(device="cuda")}
+        elif "conformer" in network_name:
+            lengths = torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda")
+            inp = {"input" : torch.rand(batch_size, int(lengths.max()), 80, device="cuda"),
+                   "lengths" : lengths}
+    elif network_name in speech_quality_models:
+        if "subjective" in network_name:
+            inp = {"waveform" : torch.randn(batch_size, FRAME_COUNT, device="cuda"),
+                   "reference" : torch.randn(batch_size, FRAME_COUNT, device="cuda")}
+        else:
+            inp = {"x" : torch.randn(batch_size, FRAME_COUNT, device="cuda")}
+    elif network_name in speech_synthesis_models:
+        if "wavernn" in network_name:
+            spec_frames = 64
+            waveform_length = HOP_LENGTH * (spec_frames - 4)
+            
+            inp = {"waveform" : torch.rand(batch_size, 1, waveform_length, device="cuda"),
+                   "specgram": torch.rand(batch_size, 1, N_FREQ, spec_frames, device="cuda")}
+        elif "tacotron2" in network_name:
+            n_mels = 80
+            max_mel_specgram_length = 300
+            max_text_length = 100
+            inp = {"tokens" : torch.randint(0, 148, (batch_size, max_text_length), dtype=torch.int32, device="cuda"),
+                   "token_lengths" : max_text_length * torch.ones((batch_size,), device="cuda"),
+                   "mel_specgram": torch.rand(batch_size, n_mels, max_mel_specgram_length, device="cuda"),
+                   "mel_specgram_lengths" : max_mel_specgram_length * torch.ones((batch_size,), dtype=torch.int32, device="cuda")}
+    elif network_name in speech_representation_models:
+        inp = {"waveforms" : torch.rand(batch_size, FRAME_COUNT, device="cuda"),
+               "labels" : torch.randint(0, 100, (batch_size, 2), dtype=torch.int32, device="cuda")}
+    else:
+        print (f"Input for {network_name} not defined")
+        sys.exit(1)
+    return inp
\ No newline at end of file
diff --git a/audio/audio_loss.py b/audio/audio_loss.py
new file mode 100644
index 0000000..817437a
--- /dev/null
+++ b/audio/audio_loss.py
@@ -0,0 +1,73 @@
+import torch
+from audio.languagemodels import LanguageModel
+import string
+from audio.hubert_loss import hubert_loss
+from audio.sdr import si_sdr_loss
+from torch import nn
+from audio.audio_model import *
+
+
+def get_criterion(network_name):
+    criterion = None
+    if network_name in speech_representation_models:
+        criterion = hubert_loss
+    elif network_name in speech_recognition_models or network_name in acoustic_models:
+        char_blank = "*"
+        char_space = " "
+        char_apostrophe = "'"
+        labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase
+        language_model = LanguageModel(labels, char_blank, char_space)
+        criterion = torch.nn.CTCLoss(blank=language_model.mapping[char_blank], zero_infinity=False)
+    elif "wavernn" in network_name:
+        criterion = nn.CrossEntropyLoss()
+    elif "conv_tasnet" in network_name:
+        criterion = si_sdr_loss
+    elif "tacotron2" in network_name:
+        criterion = nn.MSELoss()
+    elif "hdemucs" in network_name:
+        criterion = nn.L1Loss(reduction='none')
+    elif "squim" in network_name:
+        criterion = nn.L1Loss()
+    else:
+        print (f"Criterion for network name {network_name} not defined")
+        sys.exit(1)
+    return criterion
+
+    
+def calculate_loss(network_name, criterion, output, target, batch_size, input):
+    loss = 0
+    if network_name in speech_representation_models:
+        logit_m, logit_u, feature_penalty = output
+        loss = criterion(logit_m, logit_u, feature_penalty)
+    elif network_name in speech_recognition_models or network_name in acoustic_models:
+        output = output.transpose(-1, -2).transpose(0, 1)
+        T, N, C = output.shape
+        target, target_lengths = target
+        tensors_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long)
+        loss = criterion(output, target, tensors_lengths, target_lengths)
+    elif "wavernn" in network_name:
+        output = output.squeeze(1)
+        output = output.transpose(1, 2)
+        loss = criterion(output, target)
+    elif "conv_tasnet" in network_name:
+        target, mask = target
+        loss = criterion(output, target, mask)
+    elif "tacotron2" in network_name or "subjective" in network_name:
+        loss = criterion(output, target)
+    elif "objective" in network_name:
+        loss = 0
+        weights = [1, 2, 0.5, 2]
+        for index in range(len(output)):
+            if index == 0:
+                loss = criterion(output[index], target[index])
+            else:
+                loss += criterion(output[index], target[index])
+        loss += criterion(input["x"], target[3])
+    elif "hdemucs" in network_name:
+        dims = tuple(range(2, target.dim()))
+        loss = criterion(output, target)
+        loss = loss.mean(dims).mean(0)
+    else:
+        print (f"Loss function for {network_name} not defined")
+        sys.exit(1)
+    return loss
\ No newline at end of file
diff --git a/audio/audio_model.py b/audio/audio_model.py
new file mode 100644
index 0000000..e13787b
--- /dev/null
+++ b/audio/audio_model.py
@@ -0,0 +1,103 @@
+import torch
+import torchaudio
+import sys
+
+ACOUSTIC_FEATURES_SIZE = 32
+FRAME_COUNT = 1024
+HOP_LENGTH = 36
+N_FREQ = 128
+CLASSES_COUNT = 29
+
+#different audio tasks related models
+acoustic_models = {
+    "wav2vec2_base" : torchaudio.models.wav2vec2_base,
+    "wav2vec2_large" : torchaudio.models.wav2vec2_large,
+    "wav2vec2_large_lv60k" : torchaudio.models.wav2vec2_large_lv60k,
+    "wav2vec2_xlsr_300m" : torchaudio.models.wav2vec2_xlsr_300m,
+    "wav2vec2_xlsr_1b" : torchaudio.models.wav2vec2_xlsr_1b,
+    "wav2vec2_xlsr_2b" : torchaudio.models.wav2vec2_xlsr_2b,
+    "hubert_base" :  torchaudio.models.hubert_base,
+    "hubert_large" : torchaudio.models.hubert_large,
+    "hubert_xlarge" : torchaudio.models.hubert_xlarge,
+    "wavlm_base" : torchaudio.models.wavlm_base,
+    "wavlm_large" : torchaudio.models.wavlm_large,
+}
+
+speech_recognition_models = {
+    "conformer" : torchaudio.models.Conformer,
+    "deepspeech" : torchaudio.models.DeepSpeech,
+    "emformer" : torchaudio.models.Emformer,
+    "wav2letter" : torchaudio.models.Wav2Letter
+}
+
+source_separation_models = {
+    "conv_tasnet_base" : torchaudio.models.conv_tasnet_base,
+    "hdemucs_low" : torchaudio.models.hdemucs_low,
+    "hdemucs_medium" : torchaudio.models.hdemucs_medium,
+    "hdemucs_high" : torchaudio.models.hdemucs_high,
+}
+
+speech_quality_models = {
+    "squim_objective_base" : torchaudio.models.squim_objective_base,
+    "squim_subjective_base" : torchaudio.models.squim_subjective_base
+}
+
+speech_synthesis_models = {
+    "tacotron2" : torchaudio.models.Tacotron2,
+    "wavernn" : torchaudio.models.WaveRNN
+}
+
+
+speech_representation_models = {
+    "hubert_pretrain_base" : torchaudio.models.hubert_pretrain_base,
+    "hubert_pretrain_large" : torchaudio.models.hubert_pretrain_large,
+    "hubert_pretrain_xlarge" : torchaudio.models.hubert_pretrain_xlarge
+}
+
+def get_network_names():
+    return sorted(list(acoustic_models.keys()) +
+                  list(speech_recognition_models.keys()) +
+                  list(source_separation_models.keys()) +
+                  list(speech_quality_models.keys()) + 
+                  list(speech_synthesis_models.keys()) +
+                  list(speech_representation_models.keys()))
+
+
+def get_network(network_name):
+    if network_name in acoustic_models:
+        return acoustic_models[network_name](aux_num_out=CLASSES_COUNT).to(device="cuda")
+    elif network_name in source_separation_models:
+        if "hdemucs" in network_name:
+            return source_separation_models[network_name](sources = ["vocals"]).to(device="cuda")
+        else:
+            return source_separation_models[network_name]().to(device="cuda")
+    elif network_name in speech_recognition_models:
+        if "deepspeech" in network_name:
+            return speech_recognition_models[network_name](n_feature = ACOUSTIC_FEATURES_SIZE).to(device="cuda")
+        elif "wav2letter" in network_name:
+            return speech_recognition_models[network_name](num_features = ACOUSTIC_FEATURES_SIZE).to(device="cuda")
+        elif "emformer" in network_name:
+            return speech_recognition_models[network_name](input_dim = ACOUSTIC_FEATURES_SIZE,
+                                                           num_heads=8, 
+                                                           ffn_dim=1024, 
+                                                           num_layers=20,
+                                                           segment_length=4).to(device="cuda")
+        elif "conformer" in network_name:
+            return speech_recognition_models[network_name](input_dim = 80,
+                                                           num_heads=4, 
+                                                           ffn_dim=128, 
+                                                           num_layers=4,
+                                                           depthwise_conv_kernel_size=31).to(device="cuda")
+    elif network_name in speech_quality_models:
+        return speech_quality_models[network_name]().to(device="cuda")
+    elif network_name in speech_synthesis_models:
+        if "wavernn" in network_name:
+            return speech_synthesis_models[network_name](upsample_scales = [3, 3, 4], n_classes = 10, 
+                                                         hop_length = HOP_LENGTH, n_freq = 128).to(device="cuda")
+        else:
+            return speech_synthesis_models[network_name]().to(device="cuda")    
+    elif network_name in speech_representation_models:
+        return speech_representation_models[network_name]().to(device="cuda")                                       
+    else:
+        print ("ERROR: not a supported model '%s'" % network_name)
+        sys.exit(1)
\ No newline at end of file
diff --git a/audio/audio_output.py b/audio/audio_output.py
new file mode 100644
index 0000000..c5c5cb6
--- /dev/null
+++ b/audio/audio_output.py
@@ -0,0 +1,54 @@
+from audio.audio_model import *
+
+
+def get_output_selection(network_name):
+    if network_name in acoustic_models:
+        return 0
+    elif "conformer" in network_name or "emformer" in network_name:
+        return 0
+    elif "tacotron2" in network_name:
+        return 1
+    return None
+
+def create_target(network_name, network, input, batch_size):
+    
+    #get output
+    output = network(**input)
+    output_index = get_output_selection(network_name) 
+    if output_index is not None:
+        output = output[output_index]
+
+    target = None
+    if network_name in speech_recognition_models or network_name in acoustic_models:
+        output = output.transpose(-1, -2).transpose(0, 1)
+        T, N, C = output.shape
+        target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.long)
+        target = torch.randint(
+            low=1,
+            high=C,
+            size=(sum(target_lengths),),
+            dtype=torch.long,
+        )
+        target = [target, target_lengths]
+    elif "wavernn" in network_name:
+        target = torch.randn_like(output)
+        target = target.squeeze(1)
+        target = target.transpose(1, 2)
+    elif "conv_tasnet" in network_name:
+        batch, _, time = output.shape
+        mask = torch.randint(low=0, high=1, size=(batch,1,time), dtype=torch.long).cuda()
+        target = torch.randn_like(output)
+        target = [target, mask]
+    elif "tacotron2" in network_name:
+        target = torch.randn_like(output)
+    elif "hdemucs" in network_name or "subjective" in network_name:
+        target = torch.randn_like(output)
+    elif "objective" in network_name:
+        target = []
+        for index in range(len(output)):
+            target.append(torch.randn_like(output[index]))
+        target.append(torch.randn_like(input["x"]))
+    else:
+        print (f"Target for {network_name} not defined")
+        sys.exit(1)
+    return target
\ No newline at end of file
diff --git a/audio/hubert_loss.py b/audio/hubert_loss.py
new file mode 100644
index 0000000..1edaebd
--- /dev/null
+++ b/audio/hubert_loss.py
@@ -0,0 +1,36 @@
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+
+
+def hubert_loss(
+    logit_m: Optional[Tensor],
+    logit_u: Optional[Tensor],
+    feature_penalty: Tensor,
+    masked_weight: float = 1.0,
+    unmasked_weight: float = 0.0,
+    feature_weight: float = 10.0,
+    reduction: str = "sum",
+) -> Tensor:
+    """Compute the cross-entropy loss on HuBERT masked and non-masked logits.
+    Args:
+        logit_m (Tensor or None): The masked logit Tensor of dimension `(masked_frames, final_dim)`.
+        logit_u (Tensor or None): The non-masked logit Tensor of dimension `(unmasked_frames, final_dim)`.
+        feature_penalty (Tensor): The feature mean value for additional penalty loss.
+        masked_weight (float, optional): The weight for masked cross-entropy loss (Default: ``1.0``).
+        unmasked_weight (float, optional): The weight for non-masked cross-entropy loss (Default: ``0.0``).
+        feature_weight (float, optional): The weight for feature penalty loss (Default: ``10.0``).
+        reduction (str, optional): The reduction method for cross-entropy loss (Default: ``"sum"``).
+    """
+    loss = feature_penalty * feature_weight * logit_m.shape[0]
+    if logit_m is not None:
+        target_m = torch.zeros(logit_m.shape[0], dtype=torch.long, device=logit_m.device)
+        loss_m = F.cross_entropy(logit_m, target_m, reduction=reduction)
+        loss += loss_m * masked_weight
+    if logit_u is not None:
+        target_u = torch.zeros(logit_u.shape[0], dtype=torch.long, device=logit_m.device)
+        loss_u = F.cross_entropy(logit_u, target_u, reduction=reduction)
+        loss += loss_u * unmasked_weight
+    return loss
\ No newline at end of file
diff --git a/audio/languagemodels.py b/audio/languagemodels.py
new file mode 100644
index 0000000..8091568
--- /dev/null
+++ b/audio/languagemodels.py
@@ -0,0 +1,38 @@
+import collections
+import itertools
+
+
+class LanguageModel:
+    def __init__(self, labels, char_blank, char_space):
+
+        self.char_space = char_space
+        self.char_blank = char_blank
+
+        labels = list(labels)
+        self.length = len(labels)
+        enumerated = list(enumerate(labels))
+        flipped = [(sub[1], sub[0]) for sub in enumerated]
+
+        d1 = collections.OrderedDict(enumerated)
+        d2 = collections.OrderedDict(flipped)
+        self.mapping = {**d1, **d2}
+
+    def encode(self, iterable):
+        if isinstance(iterable, list):
+            return [self.encode(i) for i in iterable]
+        else:
+            return [self.mapping[i] + self.mapping[self.char_blank] for i in iterable]
+
+    def decode(self, tensor):
+        if len(tensor) > 0 and isinstance(tensor[0], list):
+            return [self.decode(t) for t in tensor]
+        else:
+            # not idempotent, since clean string
+            x = (self.mapping[i] for i in tensor)
+            x = "".join(i for i, _ in itertools.groupby(x))
+            x = x.replace(self.char_blank, "")
+            # x = x.strip()
+            return x
+
+    def __len__(self):
+        return self.length
\ No newline at end of file
diff --git a/audio/sdr.py b/audio/sdr.py
new file mode 100644
index 0000000..4053b6c
--- /dev/null
+++ b/audio/sdr.py
@@ -0,0 +1,218 @@
+import math
+from itertools import permutations
+from typing import Optional
+
+import torch
+
+
+def sdr(
+    estimate: torch.Tensor, reference: torch.Tensor, mask: Optional[torch.Tensor] = None, epsilon: float = 1e-8
+) -> torch.Tensor:
+    """Computes source-to-distortion ratio.
+
+    1. scale the reference signal with power(s_est * s_ref) / powr(s_ref * s_ref)
+    2. compute SNR between adjusted estimate and reference.
+
+    Args:
+        estimate (torch.Tensor): Estimtaed signal.
+            Shape: [batch, speakers (can be 1), time frame]
+        reference (torch.Tensor): Reference signal.
+            Shape: [batch, speakers, time frame]
+        mask (torch.Tensor or None, optional): Binary mask to indicate padded value (0) or valid value (1).
+            Shape: [batch, 1, time frame]
+        epsilon (float, optional): constant value used to stabilize division.
+
+    Returns:
+        torch.Tensor: scale-invariant source-to-distortion ratio.
+        Shape: [batch, speaker]
+
+    References:
+        - Single-channel multi-speaker separation using deep clustering
+          Y. Isik, J. Le Roux, Z. Chen, S. Watanabe, and J. R. Hershey,
+        - Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation
+          Luo, Yi and Mesgarani, Nima
+          https://arxiv.org/abs/1809.07454
+
+    Notes:
+        This function is tested to produce the exact same result as
+        https://github.com/naplab/Conv-TasNet/blob/e66d82a8f956a69749ec8a4ae382217faa097c5c/utility/sdr.py#L34-L56
+    """
+    reference_pow = reference.pow(2).mean(axis=2, keepdim=True)
+    mix_pow = (estimate * reference).mean(axis=2, keepdim=True)
+    scale = mix_pow / (reference_pow + epsilon)
+
+    reference = scale * reference
+    error = estimate - reference
+
+    reference_pow = reference.pow(2)
+    error_pow = error.pow(2)
+
+    if mask is None:
+        reference_pow = reference_pow.mean(axis=2)
+        error_pow = error_pow.mean(axis=2)
+    else:
+        denom = mask.sum(axis=2)
+        reference_pow = (mask * reference_pow).sum(axis=2) / denom
+        error_pow = (mask * error_pow).sum(axis=2) / denom
+
+    return 10 * torch.log10(reference_pow) - 10 * torch.log10(error_pow)
+
+
+class PIT(torch.nn.Module):
+    """Applies utterance-level speaker permutation
+
+    Computes the maxium possible value of the given utility function
+    over the permutations of the speakers.
+
+    Args:
+        utility_func (function):
+            Function that computes the utility (opposite of loss) with signature of
+            (extimate: torch.Tensor, reference: torch.Tensor) -> torch.Tensor
+            where input Tensors are shape of [batch, speakers, frame] and
+            the output Tensor is shape of [batch, speakers].
+
+    References:
+        - Multi-talker Speech Separation with Utterance-level Permutation Invariant Training of
+          Deep Recurrent Neural Networks
+          Morten Kolbæk, Dong Yu, Zheng-Hua Tan and Jesper Jensen
+          https://arxiv.org/abs/1703.06284
+    """
+
+    def __init__(self, utility_func):
+        super().__init__()
+        self.utility_func = utility_func
+
+    def forward(
+        self,
+        estimate: torch.Tensor,
+        reference: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        epsilon: float = 1e-8,
+    ) -> torch.Tensor:
+        """Compute utterance-level PIT Loss
+
+        Args:
+            estimate (torch.Tensor): Estimated source signals.
+                Shape: [bacth, speakers, time frame]
+            reference (torch.Tensor): Reference (original) source signals.
+                Shape: [batch, speakers, time frame]
+            mask (torch.Tensor or None, optional): Binary mask to indicate padded value (0) or valid value (1).
+                Shape: [batch, 1, time frame]
+            epsilon (float, optional): constant value used to stabilize division.
+
+        Returns:
+            torch.Tensor: Maximum criterion over the speaker permutation.
+                Shape: [batch, ]
+        """
+        assert estimate.shape == reference.shape
+
+        batch_size, num_speakers = reference.shape[:2]
+        num_permute = math.factorial(num_speakers)
+
+        util_mat = torch.zeros(batch_size, num_permute, dtype=estimate.dtype, device=estimate.device)
+        for i, idx in enumerate(permutations(range(num_speakers))):
+            util = self.utility_func(estimate, reference[:, idx, :], mask=mask, epsilon=epsilon)
+            util_mat[:, i] = util.mean(dim=1)  # take the average over speaker dimension
+        return util_mat.max(dim=1).values
+
+
+_sdr_pit = PIT(utility_func=sdr)
+
+
+def sdr_pit(
+    estimate: torch.Tensor, reference: torch.Tensor, mask: Optional[torch.Tensor] = None, epsilon: float = 1e-8
+):
+    """Computes scale-invariant source-to-distortion ratio.
+
+    1. adjust both estimate and reference to have 0-mean
+    2. scale the reference signal with power(s_est * s_ref) / powr(s_ref * s_ref)
+    3. compute SNR between adjusted estimate and reference.
+
+    Args:
+        estimate (torch.Tensor): Estimtaed signal.
+            Shape: [batch, speakers (can be 1), time frame]
+        reference (torch.Tensor): Reference signal.
+            Shape: [batch, speakers, time frame]
+        mask (torch.Tensor or None, optional): Binary mask to indicate padded value (0) or valid value (1).
+            Shape: [batch, 1, time frame]
+        epsilon (float, optional): constant value used to stabilize division.
+
+    Returns:
+        torch.Tensor: scale-invariant source-to-distortion ratio.
+        Shape: [batch, speaker]
+
+    References:
+        - Single-channel multi-speaker separation using deep clustering
+          Y. Isik, J. Le Roux, Z. Chen, S. Watanabe, and J. R. Hershey,
+        - Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation
+          Luo, Yi and Mesgarani, Nima
+          https://arxiv.org/abs/1809.07454
+
+    Notes:
+        This function is tested to produce the exact same result as the reference implementation,
+        *when the inputs have 0-mean*
+        https://github.com/naplab/Conv-TasNet/blob/e66d82a8f956a69749ec8a4ae382217faa097c5c/utility/sdr.py#L107-L153
+    """
+    return _sdr_pit(estimate, reference, mask, epsilon)
+
+
+def sdri(
+    estimate: torch.Tensor,
+    reference: torch.Tensor,
+    mix: torch.Tensor,
+    mask: Optional[torch.Tensor] = None,
+    epsilon: float = 1e-8,
+) -> torch.Tensor:
+    """Compute the improvement of SDR (SDRi).
+
+    This function compute how much SDR is improved if the estimation is changed from
+    the original mixture signal to the actual estimated source signals. That is,
+    ``SDR(estimate, reference) - SDR(mix, reference)``.
+
+    For computing ``SDR(estimate, reference)``, PIT (permutation invariant training) is applied,
+    so that best combination of sources between the reference signals and the esimate signals
+    are picked.
+
+    Args:
+        estimate (torch.Tensor): Estimated source signals.
+            Shape: [batch, speakers, time frame]
+        reference (torch.Tensor): Reference (original) source signals.
+            Shape: [batch, speakers, time frame]
+        mix (torch.Tensor): Mixed souce signals, from which the setimated signals were generated.
+            Shape: [batch, speakers == 1, time frame]
+        mask (torch.Tensor or None, optional): Binary mask to indicate padded value (0) or valid value (1).
+            Shape: [batch, 1, time frame]
+        epsilon (float, optional): constant value used to stabilize division.
+
+    Returns:
+        torch.Tensor: Improved SDR. Shape: [batch, ]
+
+    References:
+        - Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation
+          Luo, Yi and Mesgarani, Nima
+          https://arxiv.org/abs/1809.07454
+    """
+    sdr_ = sdr_pit(estimate, reference, mask=mask, epsilon=epsilon)  # [batch, ]
+    base_sdr = sdr(mix, reference, mask=mask, epsilon=epsilon)  # [batch, speaker]
+    return sdr_ - base_sdr.mean(dim=1)
+
+
+def si_sdr_loss(estimate: torch.Tensor, reference: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
+    """Compute the Si-SDR loss.
+
+    Args:
+        estimate (torch.Tensor): Estimated source signals.
+            Tensor of dimension (batch, speakers, time)
+        reference (torch.Tensor): Reference (original) source signals.
+            Tensor of dimension (batch, speakers, time)
+        mask (torch.Tensor): Mask to indicate padded value (0) or valid value (1).
+            Tensor of dimension (batch, 1, time)
+
+    Returns:
+        torch.Tensor: Si-SDR loss. Tensor of dimension (batch, )
+    """
+    estimate = estimate - estimate.mean(axis=2, keepdim=True)
+    reference = reference - reference.mean(axis=2, keepdim=True)
+
+    si_sdri = sdr_pit(estimate, reference, mask=mask)
+    return -si_sdri.mean()
\ No newline at end of file
diff --git a/micro_benchmarking_apex.py b/micro_benchmarking_apex.py
new file mode 100644
index 0000000..e137ecc
--- /dev/null
+++ b/micro_benchmarking_apex.py
@@ -0,0 +1,563 @@
+import torch
+import torchvision
+import random
+import time
+import argparse
+import os
+import sys
+import ast
+import copy
+import math
+import torch.nn as nn
+import torch.multiprocessing as mp
+try:
+    import apex
+except:
+    print ("ERROR: You must install apex to run apex microbenchmarking")
+    sys.exit(1)
+from apex.fp16_utils import FP16Model
+from shufflenet import shufflenet
+from shufflenet_v2 import shufflenet as shufflenet_v2
+from xception import xception
+import csv
+import json
+
+try:
+    import torch._dynamo
+    torch._dynamo.config.verbose=True
+    HAVE_DYNAMO = True
+except:
+    HAVE_DYNAMO = False
+
+IS_PT2 = hasattr(torch, "compile")
+
+is_torchrun = False
+if "LOCAL_RANK" in os.environ:
+    # this indicates we're using torchrun
+    is_torchrun = True
+
+def xform(m: nn.Module) -> nn.Module:
+    m = m.cuda()
+    m.to(memory_format=torch.channels_last)
+    return m
+
+def weight_init(m):
+    if isinstance(m, nn.Conv2d):
+        n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+        m.weight.data.normal_(0, math.sqrt(2. / n))
+        if m.bias is not None:
+            m.bias.data.zero_()
+    elif isinstance(m, nn.BatchNorm2d):
+        m.weight.data.fill_(1)
+        m.bias.data.zero_()
+
+# num_classes=1000
+models = {
+        "alexnet" :            torchvision.models.alexnet,
+        "densenet121" :        torchvision.models.densenet121,
+        "densenet161" :        torchvision.models.densenet161,
+        "densenet169" :        torchvision.models.densenet169,
+        "densenet201" :        torchvision.models.densenet201,
+        "googlenet" :          torchvision.models.googlenet,
+        "inception_v3" :       torchvision.models.inception_v3,
+        "mnasnet0_5" :         torchvision.models.mnasnet0_5,
+        "mnasnet0_75" :        torchvision.models.mnasnet0_75,
+        "mnasnet1_0" :         torchvision.models.mnasnet1_0,
+        "mnasnet1_3" :         torchvision.models.mnasnet1_3,
+        "mobilenet_v2" :       torchvision.models.mobilenet_v2,
+        "resnet18" :           torchvision.models.resnet18,
+        "resnet34" :           torchvision.models.resnet34,
+        "resnet50" :           torchvision.models.resnet50,
+        "resnet101" :          torchvision.models.resnet101,
+        "resnet152" :          torchvision.models.resnet152,
+        "resnext50" :          torchvision.models.resnext50_32x4d,
+        "resnext50_32x4d" :    torchvision.models.resnext50_32x4d,
+        "resnext101" :         torchvision.models.resnext101_32x8d,
+        "resnext101_32x8d" :   torchvision.models.resnext101_32x8d,
+        "shufflenet" :         shufflenet,
+        "shufflenet_v2" :      shufflenet_v2,
+        "shufflenet_v2_x05" :  torchvision.models.shufflenet_v2_x0_5,
+        "shufflenet_v2_x10" :  torchvision.models.shufflenet_v2_x1_0,
+        "shufflenet_v2_x15" :  torchvision.models.shufflenet_v2_x1_5,
+        "shufflenet_v2_x20" :  torchvision.models.shufflenet_v2_x2_0,
+        "shufflenet_v2_x0_5" : torchvision.models.shufflenet_v2_x0_5,
+        "shufflenet_v2_x1_0" : torchvision.models.shufflenet_v2_x1_0,
+        "shufflenet_v2_x1_5" : torchvision.models.shufflenet_v2_x1_5,
+        "shufflenet_v2_x2_0" : torchvision.models.shufflenet_v2_x2_0,
+        "SqueezeNet" :         torchvision.models.squeezenet1_0,
+        "squeezenet1_0" :      torchvision.models.squeezenet1_0,
+        "SqueezeNet1.1" :      torchvision.models.squeezenet1_1,
+        "squeezenet1_1" :      torchvision.models.squeezenet1_1,
+        "vgg11" :              torchvision.models.vgg11,
+        "vgg13" :              torchvision.models.vgg13,
+        "vgg16" :              torchvision.models.vgg16,
+        "vgg19" :              torchvision.models.vgg19,
+        "vgg11_bn" :           torchvision.models.vgg11_bn,
+        "vgg13_bn" :           torchvision.models.vgg13_bn,
+        "vgg16_bn" :           torchvision.models.vgg16_bn,
+        "vgg19_bn" :           torchvision.models.vgg19_bn,
+        "wide_resnet50_2" :    torchvision.models.wide_resnet50_2,
+        "wide_resnet101_2" :   torchvision.models.wide_resnet101_2,
+        "xception" :           xception,
+}
+
+# newer torchvision models, for backwards compat
+try:
+    models["swin_t"] = torchvision.models.swin_t
+    models["swin_s"] = torchvision.models.swin_s
+    models["swin_b"] = torchvision.models.swin_b
+    models["swin_v2_t"] = torchvision.models.swin_v2_t
+    models["swin_v2_s"] = torchvision.models.swin_v2_s
+    models["swin_v2_b"] = torchvision.models.swin_v2_b
+    models["vit_b_16"] = torchvision.models.vit_b_16
+    models["vit_b_32"] = torchvision.models.vit_b_32
+    models["vit_l_16"] = torchvision.models.vit_l_16
+    models["vit_l_32"] = torchvision.models.vit_l_32
+    models["vit_h_14"] = torchvision.models.vit_h_14
+    models["efficientnet_b0"] = torchvision.models.efficientnet_b0
+    models["efficientnet_b1"] = torchvision.models.efficientnet_b1
+    models["efficientnet_b2"] = torchvision.models.efficientnet_b2
+    models["efficientnet_b3"] = torchvision.models.efficientnet_b3
+    models["efficientnet_b4"] = torchvision.models.efficientnet_b4
+    models["efficientnet_b5"] = torchvision.models.efficientnet_b5
+    models["efficientnet_b6"] = torchvision.models.efficientnet_b6
+    models["efficientnet_b7"] = torchvision.models.efficientnet_b7
+    models["maxvit_t"] = torchvision.models.maxvit_t
+except AttributeError:
+    pass
+
+try:
+    models["mobilenet_v3_large"] = torchvision.models.mobilenet_v3_large
+    models["mobilenet_v3_small"] = torchvision.models.mobilenet_v3_small
+except AttributeError:
+    pass
+# segmentation models, num_classes=21
+segmentation_models = {
+        "fcn_resnet50" :        torchvision.models.segmentation.fcn_resnet50,
+        "fcn_resnet101" :       torchvision.models.segmentation.fcn_resnet101,
+        "deeplabv3_resnet50" :  torchvision.models.segmentation.deeplabv3_resnet50,
+        "deeplabv3_resnet101" : torchvision.models.segmentation.deeplabv3_resnet101,
+}
+
+# newer torchvision segmentation models, for backwards compat
+try:
+    segmentation_models["deeplabv3_mobilenet_v3_large"] = torchvision.models.segmentation.deeplabv3_mobilenet_v3_large
+    segmentation_models["lraspp_mobilenet_v3_large"] = torchvision.models.segmentation.lraspp_mobilenet_v3_large,
+except AttributeError:
+    pass
+
+def get_network_names():
+    return sorted(list(models.keys()) + list(segmentation_models.keys()))
+
+def get_network(net, params):
+    # aux_logits=False only used by inception_v3
+    if "inception_v3" == net:
+        if params.nhwc:
+            return xform(models[net](aux_logits=False))
+        return models[net](aux_logits=False).to(device="cuda")
+    elif net in models:
+        if params.nhwc:
+            return xform(models[net]())
+        return models[net]().to(device="cuda")
+    elif net in segmentation_models:
+        if params.nhwc:
+            return xform(segmentation_models[net]())
+        return segmentation_models[net]().to(device="cuda")
+    else:
+        print ("ERROR: not a supported model '%s'" % net)
+        sys.exit(1)
+
+def forwardbackward(inp, optimizer, network, params, target, step=0, opt_step=1, flops_prof_step=0):
+    if step % opt_step == 0:
+        optimizer.zero_grad()
+    if flops_prof_step:
+        prof = FlopsProfiler(network)
+        prof.start_profile()
+    out = network(inp)
+    # If using HuggingFace model outputs logits, we need to extract them
+    if hasattr(out, 'logits'):
+        logits = out.logits
+    else:
+        logits = out
+    loss_fn = torch.nn.CrossEntropyLoss().to(device="cuda")
+    if params.nhwc:
+        loss_fn = loss_fn.to(memory_format=torch.channels_last)
+    loss = loss_fn(logits, target)
+
+    if params.amp_opt_level:
+        with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
+            scaled_loss.backward()
+    else:
+        loss.backward()
+
+    if (step + 1) % opt_step == 0:
+        optimizer.step()
+        optimizer.zero_grad()
+
+    if flops_prof_step:
+        # End profiler here to profile both fwd and bwd passes
+        # flops = prof.get_total_flops(as_string=True)
+        # params = prof.get_total_params(as_string=True)
+        prof.print_model_profile(profile_step=flops_prof_step)
+        prof.end_profile()
+
+def forward(inp, optimizer, network, params, target, step=0, opt_step=1, flops_prof_step=0):
+
+    if flops_prof_step:
+        prof = FlopsProfiler(network)
+        prof.start_profile()
+    out = network(inp)
+    # If using HuggingFace model outputs logits, we need to extract them
+    if hasattr(out, 'logits'):
+        logits = out.logits
+    else:
+        logits = out
+
+    if flops_prof_step:
+        # End profiler here to profile both fwd and bwd passes
+        # flops = prof.get_total_flops(as_string=True)
+        # params = prof.get_total_params(as_string=True)
+        prof.print_model_profile(profile_step=flops_prof_step)
+        prof.end_profile()
+
+    return logits
+
+def rendezvous(distributed_parameters):
+    print("Initializing process group...")
+    torch.distributed.init_process_group(backend=distributed_parameters['dist_backend'], init_method=distributed_parameters['dist_url'], rank=distributed_parameters['rank'], world_size=distributed_parameters['world_size'])
+    print("Rendezvous complete. Created process group...")
+
+def run_benchmarking_wrapper(params):
+    params.flops_prof_step = max(0, min(params.flops_prof_step, params.iterations - 1))
+    if (params.device_ids):
+        params.device_ids = [int(x) for x in params.device_ids.split(",")]
+    else:
+        params.device_ids = None
+    params.distributed_parameters = {}
+    if is_torchrun:
+        params.distributed_parameters['rank'] = int(os.environ["LOCAL_RANK"])
+        params.distributed_parameters['world_size'] = int(os.environ["WORLD_SIZE"])
+        params.distributed_parameters['dist_backend'] = "nccl"
+        params.distributed_parameters['dist_url'] = 'tcp://' + os.environ["MASTER_ADDR"] + ":" + os.environ["MASTER_PORT"]
+    else:
+        params.distributed_parameters['rank'] = params.rank
+        params.distributed_parameters['world_size'] = params.world_size
+        params.distributed_parameters['dist_backend'] = params.dist_backend
+        params.distributed_parameters['dist_url'] = params.dist_url
+
+    # Some arguments are required for distributed_dataparallel
+    if params.distributed_dataparallel:
+        assert params.distributed_parameters['rank'] is not None and \
+               params.distributed_parameters['world_size'] is not None and \
+               params.distributed_parameters['dist_backend'] is not None and \
+               params.distributed_parameters['dist_url'] is not None, "rank, world-size, dist-backend and dist-url are required arguments for distributed_dataparallel"
+
+    if is_torchrun:
+        params.ngpus = params.distributed_parameters['world_size']
+    elif params.distributed_dataparallel:
+        params.ngpus = len(params.device_ids) if params.device_ids else torch.cuda.device_count()
+    else:
+        params.ngpus = 1
+
+    if is_torchrun:
+        run_benchmarking(params.distributed_parameters['rank'], params)
+    elif params.distributed_dataparallel:
+        # Assumption below that each process launched with --distributed_dataparallel has the same number of devices visible/specified
+        params.distributed_parameters['world_size'] = params.ngpus * params.distributed_parameters['world_size']
+        params.distributed_parameters['rank'] = params.ngpus * params.distributed_parameters['rank']
+        mp.spawn(run_benchmarking, nprocs=params.ngpus, args=(params,))
+    else:
+        run_benchmarking(0, params)
+
+def run_benchmarking(local_rank, params):
+    device_ids = params.device_ids
+    ngpus = params.ngpus
+    net = params.network
+    run_fp16 = params.fp16
+    amp_opt_level = params.amp_opt_level
+    distributed_dataparallel = params.distributed_dataparallel
+    distributed_parameters = params.distributed_parameters
+    batch_size = params.batch_size
+    kineto = params.kineto
+    iterations = params.iterations
+    autograd_profiler = params.autograd_profiler
+    flops_prof_step = params.flops_prof_step
+
+    if is_torchrun:
+        torch.cuda.set_device("cuda:%d" % local_rank)
+    elif device_ids:
+        assert ngpus == len(device_ids)
+        torch.cuda.set_device("cuda:%d" % device_ids[local_rank])
+    else:
+        torch.cuda.set_device("cuda:0")
+
+    network = get_network(net, params)
+    if "shufflenet" == net:
+        network.apply(weight_init)
+
+    if params.compile:
+        compile_ctx = {"mode": None,
+                       "dynamic": False,
+                       "fullgraph": False,
+                       "backend": "inductor",
+                       "options": None,
+                       "disable": False}
+        options = None  # needed for internal pytorch checks
+        if params.compileContext:
+            compile_ctx.update(ast.literal_eval(params.compileContext))
+            if compile_ctx["mode"] is not None and compile_ctx["options"] is not None:
+                raise RuntimeError("Cannot specify mode and options simultaneously")
+            if compile_ctx["options"] is not None:
+                options = {}  # needed to save multiple options
+                for compiler_pass in compile_ctx["options"].keys():
+                    options.update({compiler_pass: bool(compile_ctx["options"][compiler_pass])})
+        if IS_PT2:
+            network = torch.compile(network,
+                                    mode=compile_ctx["mode"],
+                                    dynamic=bool(compile_ctx["dynamic"]),
+                                    fullgraph=bool(compile_ctx["fullgraph"]),
+                                    backend=compile_ctx["backend"],
+                                    options=options,
+                                    disable=compile_ctx["disable"])
+        else:
+            print ("ERROR: requested torch.compile but this isn't pytorch 2.x")
+            sys.exit(1)
+
+    ## MLPerf Setting
+    sgd_opt_base_learning_rate = 0.01
+    sgd_opt_end_learning_rate = 1e-4
+    sgd_opt_learning_rate_decay_poly_power = 2
+    sgd_opt_weight_decay = 0.0001
+    sgd_opt_momentum = 0.9
+    opt_learning_rate_warmup_epochs = 5
+
+    total_epochs = params.iterations
+    optimizer = torch.optim.SGD(param_copy, lr = sgd_opt_base_learning_rate, momentum = sgd_opt_momentum, weight_decay=sgd_opt_weight_decay)
+
+    if (run_fp16):
+        network = FP16Model(network) 
+
+    #use apex syncbn
+    if args.sync_bn:
+        network = apex.parallel.convert_syncbn_model(network)
+    
+    optimizer = apex.optimizers.FusedSGD(network.parameters(), lr = 0.01, momentum = 0.9)
+
+    if (amp_opt_level):
+        network, optimizer = apex.amp.initialize(network, optimizer, opt_level="O%d"%amp_opt_level,
+                                                keep_batchnorm_fp32=args.keep_batchnorm_fp32,
+                                                loss_scale=args.loss_scale)
+
+    if is_torchrun:
+        rendezvous(distributed_parameters)
+        devices_to_run_on = [local_rank]
+        print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on)))
+        network = apex.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on)
+        batch_size = int(batch_size / ngpus)
+    elif (distributed_dataparallel):
+        distributed_parameters['rank'] += local_rank
+        rendezvous(distributed_parameters)
+        devices_to_run_on = [(device_ids[local_rank] if device_ids else local_rank)]
+        print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on)))
+        network = apex.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on)
+        batch_size = int(batch_size / ngpus)
+
+    if (net == "inception_v3"):
+        inp = torch.randn(batch_size, 3, 299, 299, device="cuda")
+    else:
+        inp = torch.randn(batch_size, 3, 224, 224, device="cuda")
+    if (run_fp16):
+        inp = inp.half()
+    if params.nhwc:
+        inp = inp.to(memory_format=torch.channels_last)
+    if net in models:
+        # number of classes is 1000 for imagenet
+        target = torch.randint(0, 1000, (batch_size,), device="cuda")
+    elif net in segmentation_models:
+        # number of classes is 21 for segmentation
+        target = torch.randint(0, 21, (batch_size,), device="cuda")
+
+    if params.mode == "training":
+        forward_fn = forwardbackward
+        network.train()
+    else:
+        forward_fn = forward
+        network.eval()
+
+    ## warmup.
+    print ("INFO: running forward and backward for warmup.")
+    for i in range(2):
+        forward_fn(inp, optimizer, network, params, target, step=0, opt_step=params.opt_step)
+
+    time.sleep(1)
+    torch.cuda.synchronize()
+
+    ## benchmark.
+    print ("INFO: running the benchmark..")
+    if kineto:
+        from torch.profiler import schedule, profile, ProfilerActivity, record_function
+        profiler_schedule = schedule(
+            skip_first = 0,
+            wait = 1,
+            warmup = 2,
+            active = 5,
+            repeat = 1,
+        )
+
+        def trace_ready_callback(prof):
+            rank = 0
+            if torch.distributed.is_available() and torch.distributed.is_initialized():
+                rank = torch.distributed.get_rank()
+            if rank == 0:
+                print("----------- Trace Ready -----------")
+                prof.export_chrome_trace(f"{params.profiler_output}.json")            
+            # print(f"----------- Rank {rank} Trace Ready -----------")
+            # prof.export_chrome_trace(f"{params.profiler_output}_rank{rank}.json")
+
+        tm = time.time()
+        with profile(
+            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+            schedule=profiler_schedule,
+            on_trace_ready=trace_ready_callback) as prof:
+            for i in range(iterations):
+                with record_function(f"iteration {i}"):
+                    forward_fn(inp, optimizer, network, params, target, step=i, opt_step=params.opt_step)
+                prof.step()
+            torch.cuda.synchronize()
+            print(prof.key_averages().table(sort_by="cuda_time_total"))
+    else:
+        tm = time.time()
+        with torch.autograd.profiler.emit_nvtx(enabled=autograd_profiler):
+            for i in range(iterations):
+                if i == flops_prof_step:
+                    forward_fn(inp, optimizer, network, params, target, step=i, opt_step=params.opt_step, 
+                        flops_prof_step = i)
+                else:
+                    forward_fn(inp, optimizer, network, params, target, step=i, opt_step=params.opt_step)
+        torch.cuda.synchronize()
+
+    tm2 = time.time()
+    time_per_batch = (tm2 - tm) / iterations
+
+    if run_fp16:
+        dtype = 'FP16'
+    elif amp_opt_level == 1:
+        dtype = 'AMP-O1: Insert automatic FP16 casts around safe Pytorch functions and Tensor methods.'
+    elif amp_opt_level == 2:
+        dtype = 'AMP-O2: FP16 training with FP32 batchnorm and FP32 master weights.'
+    elif amp_opt_level == 3:
+        dtype = 'AMP-O3: Pure FP16 training.'
+    elif amp_opt_level == 4:
+        dtype = 'AMP-O4: Insert automatic BFLOAT16 casts around safe Pytorch functions and Tensor methods.'
+    elif amp_opt_level == 5:
+        dtype = 'AMP-O5: BFLOAT16 training with FP32 batchnorm and FP32 master weights.'
+    else:
+        dtype = 'FP32'
+
+    result = None
+    if not params.output_dir:
+        params.output_dir = "."
+
+    print ("OK: finished running benchmark..")
+    print ("--------------------SUMMARY--------------------------")
+    print ("Microbenchmark for network : {}".format(net))
+    if distributed_dataparallel or is_torchrun:
+        print ("--------This process: rank " + str(distributed_parameters['rank']) + "--------");
+        print ("Num devices: 1")
+    else:
+        print ("Num devices: {}".format(ngpus))
+        result = {
+            "Name": params.output_file,
+            "GPUs": 1,
+            "Mini batch size [img]": batch_size,
+            "Mini batch size [img/gpu]": batch_size,
+            "Throughput [img/sec]": batch_size / time_per_batch,
+            "Time per mini-batch": time_per_batch
+        }
+        with open(f"{params.output_dir}/{params.output_file}.json", "w") as f:
+            json.dump(result, f, indent=2)
+
+    print ("Dtype: {}".format(dtype))
+    print ("Mini batch size [img] : {}".format(batch_size))
+    print ("Time per mini-batch : {}".format(time_per_batch))
+    print ("Throughput [img/sec] : {}".format(batch_size/time_per_batch))
+    if (distributed_dataparallel or is_torchrun) and distributed_parameters['rank'] == 0:
+        print ("")
+        print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------")
+        world_size = distributed_parameters['world_size']
+        print ("Num devices: {}".format(world_size))
+        print ("Dtype: {}".format(dtype))
+        print ("Mini batch size [img] : {}".format(batch_size*world_size))
+        print ("Time per mini-batch : {}".format(time_per_batch))
+        print ("Throughput [img/sec] : {}".format(batch_size*world_size/time_per_batch))
+        result = {
+            "Name": params.output_file,
+            "GPUs": distributed_parameters['world_size'],
+            "Mini batch size [img]": batch_size * distributed_parameters['world_size'],
+            "Mini batch size [img/gpu]": batch_size,
+            "Throughput [img/sec]": batch_size * distributed_parameters['world_size'] / time_per_batch,
+            "Time per mini-batch": time_per_batch
+        }
+        with open(f"{params.output_dir}/{params.output_file}.json", "w") as f:
+            json.dump(result, f, indent=2)
+
+    csv_filename = f"{params.output_dir}/benchmark_summary.csv"
+    if params.csv_file:
+        csv_filename = params.csv_file
+    file_exists = os.path.isfile(csv_filename)
+    if result:
+        with open(csv_filename, "a", newline='') as csvfile:
+            writer = csv.writer(csvfile)
+            if not file_exists:
+                writer.writerow(result.keys())
+            writer.writerow(result.values())
+        print(f"Benchmark result saved to {csv_filename}")
+
+def main():
+    run_benchmarking_wrapper(copy.deepcopy(args))
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--network", type=str, choices=get_network_names(), required=True, help="Network to run.")
+    parser.add_argument("--batch-size" , type=int, required=False, default=64, help="Batch size (will be split among devices used by this invocation)")
+    parser.add_argument("--iterations", type=int, required=False, default=20, help="Iterations")
+    parser.add_argument("--flops-prof-step", type=int, required=False, default=0, help="The flops profiling step")
+    parser.add_argument("--kineto", action='store_true', required=False, help="Turn kineto profiling on")
+    parser.add_argument("--autograd_profiler", action='store_true', required=False, help="Use PyTorch autograd (old) profiler")
+    parser.add_argument("--fp16", type=int, required=False, default=0,help="FP16 mixed precision benchmarking")
+    parser.add_argument("--amp-opt-level", type=int, required=False, default=0,help="apex.amp mixed precision benchmarking opt level")
+    parser.add_argument("--distributed_dataparallel", action='store_true', required=False, help="Use apex.parallel.DistributedDataParallel api to run on multiple processes/nodes. The multiple processes need to be launched manually, this script will only launch ONE process per invocation. Either use --distributed_dataparallel and manually launch multiple processes or launch this script with `torchrun`")
+    parser.add_argument("--device_ids", type=str, required=False, default=None, help="Comma-separated list (no spaces) to specify which HIP devices (0-indexed) to run distributedDataParallel api on. Might need to use HIP_VISIBLE_DEVICES to limit visiblity of devices to different processes.")
+    parser.add_argument("--rank", type=int, required=False, default=None, help="Rank of this process. Required for --distributed_dataparallel")
+    parser.add_argument("--world-size", type=int, required=False, default=None, help="Total number of ranks/processes. Required for --distributed_dataparallel")
+    parser.add_argument("--dist-backend", type=str, required=False, default=None, help="Backend used for distributed training. Can be one of 'nccl' or 'gloo'. Required for --distributed_dataparallel")
+    parser.add_argument("--dist-url", type=str, required=False, default=None, help="url used for rendezvous of processes in distributed training. Needs to contain IP and open port of master rank0 eg. 'tcp://172.23.2.1:54321'. Required for --distributed_dataparallel")
+    parser.add_argument("--compile", action='store_true', required=False, help="use pytorch 2.0")
+    parser.add_argument("--compileContext", default={}, required=False, help="additional compile options")
+    parser.add_argument('--sync_bn', action='store_true', help='enabling apex sync BN.')
+    parser.add_argument('--keep-batchnorm-fp32', type=str, default=None)
+    parser.add_argument('--loss-scale', type=str, default=None)
+    parser.add_argument("--csv-file", type=str, default=None, required=False, help="assign output csv file name.")
+    parser.add_argument("--mode", type=str, choices=['training', 'inference'], default="training", help="Select mode: training or inference")
+    parser.add_argument("--nhwc", action='store_true', default=False, help="Use nhwc format")
+    parser.add_argument("--opt-step", type=int, required=False, default=1, help="Optimizer update step")
+    parser.add_argument("--output-dir", type=str, default="", help="assign output directory name.")
+    parser.add_argument("--output-file", type=str, default="", help="assign output file name.")
+    parser.add_argument("--profiler-output", type=str, default="", help="assign profiler output name.") 
+
+
+    args = parser.parse_args()
+
+    if args.flops_prof_step:
+        try:
+            from deepspeed.profiling.flops_profiler import FlopsProfiler
+        except:
+            print("ERROR: You must install (or copy) deepspeed.profiling to use --flops-prof-step")
+            sys.exit(1)
+
+    if args.fp16 and args.amp_opt_level:
+        print ("ERROR: Cannot use both --fp16 and --amp-opt-level")
+        sys.exit(1)
+    
+
+    main()
diff --git a/micro_benchmarking_audio.py b/micro_benchmarking_audio.py
new file mode 100644
index 0000000..efb5915
--- /dev/null
+++ b/micro_benchmarking_audio.py
@@ -0,0 +1,414 @@
+import torch
+import time
+import argparse
+import os
+import sys
+import ast
+import copy
+import math
+import torch.nn as nn
+import torch.multiprocessing as mp
+from fp16util import network_to_half, get_param_copy
+import torch.nn.functional as F
+from audio.audio_model import get_network_names, get_network
+from audio.audio_loss import get_criterion, calculate_loss
+from audio.audio_input import get_input_type, get_input
+from audio.audio_output import get_output_selection, create_target
+import csv
+import json
+from torch.amp import autocast, GradScaler
+
+try:
+    import torch._dynamo
+    torch._dynamo.config.verbose=True
+    HAVE_DYNAMO = True
+except:
+    HAVE_DYNAMO = False
+
+IS_PT2 = hasattr(torch, "compile")
+
+is_torchrun = False
+if "LOCAL_RANK" in os.environ:
+    # this indicates we're using torchrun
+    is_torchrun = True
+
+
+def weight_init(m):
+    if isinstance(m, nn.Conv2d):
+        n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+        m.weight.data.normal_(0, math.sqrt(2. / n))
+        if m.bias is not None:
+            m.bias.data.zero_()
+    elif isinstance(m, nn.BatchNorm2d):
+        m.weight.data.fill_(1)
+        m.bias.data.zero_()
+
+
+def forwardbackward(inp, optimizer, network, params, network_name, batch_size, criterion, target, step=0, opt_step=1, flops_prof_step=0):
+    if step % opt_step == 0:
+        optimizer.zero_grad()
+    if flops_prof_step:
+        prof = FlopsProfiler(network)
+        prof.start_profile()
+
+    if params.amp:
+        with autocast('cuda'):
+            out = network(**inp)
+        output_index = get_output_selection(network_name) 
+        if output_index is not None:
+            out = out[output_index]
+
+        loss = calculate_loss(network_name, criterion, out, target, batch_size, inp)
+        scaler.scale(loss).backward()
+        if (step + 1) % opt_step == 0:
+            scaler.step(optimizer)
+            scaler.update()
+            optimizer.zero_grad()
+    else:
+        out = network(**inp)
+        output_index = get_output_selection(network_name) 
+        if output_index is not None:
+            out = out[output_index]
+        loss = calculate_loss(network_name, criterion, out, target, batch_size, inp)
+        loss.backward()
+        if (step + 1) % opt_step == 0:
+            optimizer.step()
+            optimizer.zero_grad()
+    
+    # End profiler here if only to profile forward pass
+
+    if flops_prof_step:
+        prof.print_model_profile(profile_step=flops_prof_step)
+        prof.end_profile()
+
+def forward(inp, optimizer, network, params, network_name, batch_size, criterion, target, step=0, opt_step=1, flops_prof_step=0):
+    if step % opt_step == 0:
+        optimizer.zero_grad()
+    if flops_prof_step:
+        prof = FlopsProfiler(network)
+        prof.start_profile()
+
+    if params.amp:
+        with autocast('cuda'):
+            out = network(**inp)
+        output_index = get_output_selection(network_name) 
+        if output_index is not None:
+            out = out[output_index]
+    else:
+        out = network(**inp)
+        output_index = get_output_selection(network_name) 
+        if output_index is not None:
+            out = out[output_index]
+    
+    # End profiler here if only to profile forward pass
+
+    if flops_prof_step:
+        prof.print_model_profile(profile_step=flops_prof_step)
+        prof.end_profile()
+
+
+def rendezvous(distributed_parameters):
+    print("Initializing process group...")
+    torch.distributed.init_process_group(backend=distributed_parameters['dist_backend'], init_method=distributed_parameters['dist_url'], rank=distributed_parameters['rank'], world_size=distributed_parameters['world_size'])
+    print("Rendezvous complete. Created process group...")
+
+def run_benchmarking_wrapper(params):
+    params.flops_prof_step = max(0, min(params.flops_prof_step, params.iterations - 1))
+    if (params.device_ids):
+        params.device_ids = [int(x) for x in params.device_ids.split(",")]
+    else:
+        params.device_ids = None
+    params.distributed_parameters = {}
+    if is_torchrun:
+        params.distributed_parameters['rank'] = int(os.environ["LOCAL_RANK"])
+        params.distributed_parameters['world_size'] = int(os.environ["WORLD_SIZE"])
+        params.distributed_parameters['dist_backend'] = "nccl"
+        params.distributed_parameters['dist_url'] = 'tcp://' + os.environ["MASTER_ADDR"] + ":" + os.environ["MASTER_PORT"]
+    else:
+        params.distributed_parameters['rank'] = params.rank
+        params.distributed_parameters['world_size'] = params.world_size
+        params.distributed_parameters['dist_backend'] = params.dist_backend
+        params.distributed_parameters['dist_url'] = params.dist_url
+
+    # Some arguments are required for distributed_dataparallel
+    if params.distributed_dataparallel:
+        assert params.distributed_parameters['rank'] is not None and \
+               params.distributed_parameters['world_size'] is not None and \
+               params.distributed_parameters['dist_backend'] is not None and \
+               params.distributed_parameters['dist_url'] is not None, "rank, world-size, dist-backend and dist-url are required arguments for distributed_dataparallel"
+
+    if is_torchrun:
+        params.ngpus = params.distributed_parameters['world_size']
+    elif params.distributed_dataparallel:
+        params.ngpus = len(params.device_ids) if params.device_ids else torch.cuda.device_count()
+    else:
+        params.ngpus = 1
+
+    if is_torchrun:
+        run_benchmarking(params.distributed_parameters['rank'], params)
+    elif params.distributed_dataparallel:
+        # Assumption below that each process launched with --distributed_dataparallel has the same number of devices visible/specified
+        params.distributed_parameters['world_size'] = params.ngpus * params.distributed_parameters['world_size']
+        params.distributed_parameters['rank'] = params.ngpus * params.distributed_parameters['rank']
+        mp.spawn(run_benchmarking, nprocs=params.ngpus, args=(params,))
+    else:
+        run_benchmarking(0, params)
+
+def run_benchmarking(local_rank, params):
+    device_ids = params.device_ids
+    ngpus = params.ngpus
+    net = params.network
+    run_fp16 = params.fp16
+    run_amp = params.amp
+    distributed_dataparallel = params.distributed_dataparallel
+    distributed_parameters = params.distributed_parameters
+    batch_size = params.batch_size
+    kineto = params.kineto
+    iterations = params.iterations
+    autograd_profiler = params.autograd_profiler
+    flops_prof_step = params.flops_prof_step
+
+    if is_torchrun:
+        torch.cuda.set_device("cuda:%d" % local_rank)
+    elif device_ids:
+        assert ngpus == len(device_ids)
+        torch.cuda.set_device("cuda:%d" % device_ids[local_rank])
+    else:
+        torch.cuda.set_device("cuda:0")
+
+    network = get_network(net)
+    criterion = get_criterion(net)
+    if "shufflenet" == net:
+        network.apply(weight_init)
+
+    if (run_fp16):
+        network = network_to_half(network)
+
+    if params.compile:
+        compile_ctx = {"mode": None,
+                       "dynamic": False,
+                       "fullgraph": False,
+                       "backend": "inductor",
+                       "options": None,
+                       "disable": False}
+        options = None  # needed for internal pytorch checks
+        if params.compileContext:
+            compile_ctx.update(ast.literal_eval(params.compileContext))
+            if compile_ctx["mode"] is not None and compile_ctx["options"] is not None:
+                raise RuntimeError("Cannot specify mode and options simultaneously")
+            if compile_ctx["options"] is not None:
+                options = {}  # needed to save multiple options
+                for compiler_pass in compile_ctx["options"].keys():
+                    options.update({compiler_pass: bool(compile_ctx["options"][compiler_pass])})
+        if IS_PT2:
+            network = torch.compile(network,
+                                    mode=compile_ctx["mode"],
+                                    dynamic=bool(compile_ctx["dynamic"]),
+                                    fullgraph=bool(compile_ctx["fullgraph"]),
+                                    backend=compile_ctx["backend"],
+                                    options=options,
+                                    disable=compile_ctx["disable"])
+        else:
+            print ("ERROR: requested torch.compile but this isn't pytorch 2.x")
+            sys.exit(1)
+
+    param_copy = network.parameters()
+    if (run_fp16):
+        param_copy = get_param_copy(network)
+    
+    ## MLPerf Setting
+    sgd_opt_base_learning_rate = 0.01
+    sgd_opt_end_learning_rate = 1e-4
+    sgd_opt_learning_rate_decay_poly_power = 2
+    sgd_opt_weight_decay = 0.0001
+    sgd_opt_momentum = 0.9
+    opt_learning_rate_warmup_epochs = 5
+
+    total_epochs = params.iterations
+    optimizer = torch.optim.SGD(param_copy, lr = sgd_opt_base_learning_rate, momentum = sgd_opt_momentum, weight_decay=sgd_opt_weight_decay)
+
+    if is_torchrun:
+        rendezvous(distributed_parameters)
+        devices_to_run_on = [local_rank]
+        print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on)))
+        network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on)
+        batch_size = int(batch_size / ngpus)
+    elif (distributed_dataparallel):
+        distributed_parameters['rank'] += local_rank
+        rendezvous(distributed_parameters)
+        devices_to_run_on = [(device_ids[local_rank] if device_ids else local_rank)]
+        print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on)))
+        network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on)
+        batch_size = int(batch_size / ngpus)
+
+    inp = get_input(net, network, batch_size)
+        
+    if (run_fp16):
+        inp = inp.half()
+
+    target = create_target(net, network, inp, batch_size)
+
+    if params.mode == "training":
+        forward_fn = forwardbackward
+        network.train()
+    else:
+        forward_fn = forward
+        network.eval()
+    
+    ## warmup.
+    print ("INFO: running forward and backward for warmup.")
+    for i in range(2):
+        forward_fn(inp, optimizer, network, params, net, batch_size, criterion, target, step=0, opt_step=params.opt_step)
+
+    time.sleep(1)
+    torch.cuda.synchronize()
+
+    ## benchmark.
+    print ("INFO: running the benchmark..")
+    if kineto:
+        from torch.profiler import schedule, profile, ProfilerActivity, record_function
+        profiler_schedule = schedule(
+            skip_first = 0,
+            wait = 1,
+            warmup = 2,
+            active = 5,
+            repeat = 1,
+        )
+
+        def trace_ready_callback(prof):
+            rank = 0
+            if torch.distributed.is_available() and torch.distributed.is_initialized():
+                rank = torch.distributed.get_rank()
+            if rank == 0:
+                print("----------- Trace Ready -----------")
+                prof.export_chrome_trace(f"{params.profiler_output}.json")            
+            # print(f"----------- Rank {rank} Trace Ready -----------")
+            # prof.export_chrome_trace(f"{params.profiler_output}_rank{rank}.json")
+
+        tm = time.time()
+        with profile(
+            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+            schedule=profiler_schedule,
+            on_trace_ready=trace_ready_callback) as prof:
+            for i in range(iterations):
+                with record_function(f"iteration {i}"):
+                    forward_fn(inp, optimizer, network, params, net, batch_size, criterion, target, step=i, opt_step=params.opt_step)
+                prof.step()
+            torch.cuda.synchronize()
+            print(prof.key_averages().table(sort_by="cuda_time_total"))
+    else:
+        tm = time.time()
+        with torch.autograd.profiler.emit_nvtx(enabled=autograd_profiler):
+            for i in range(iterations):
+                if i == flops_prof_step:
+                    forward_fn(inp, optimizer, network, params, net, batch_size, criterion, target, step=i, opt_step=params.opt_step, flops_prof_step=i)
+                else:
+                    forward_fn(inp, optimizer, network, params, net, batch_size, criterion, target, step=i, opt_step=params.opt_step)
+        torch.cuda.synchronize()
+
+    tm2 = time.time()
+    time_per_batch = (tm2 - tm) / iterations
+
+    if run_fp16:
+        dtype = 'FP16'
+    elif run_amp:
+        dtype = 'AMP: PyTorch Native Automatic Mixed Precision'
+    else:
+        dtype = 'FP32'
+
+    result = None
+    if not params.output_dir:
+        params.output_dir = "."
+
+    print ("OK: finished running benchmark..")
+    print ("--------------------SUMMARY--------------------------")
+    print ("Microbenchmark for network : {}".format(net))
+    if distributed_dataparallel or is_torchrun:
+        print ("--------This process: rank " + str(distributed_parameters['rank']) + "--------");
+        print ("Num devices: 1")
+    else:
+        print ("Num devices: {}".format(ngpus))
+        result = {
+            "Name": params.output_file,
+            "GPUs": 1,
+            "Mini batch size [img]": batch_size,
+            "Mini batch size [img/gpu]": batch_size,
+            "Throughput [img/sec]": batch_size / time_per_batch,
+            "Time per mini-batch": time_per_batch
+        }
+        with open(f"{params.output_dir}/{params.output_file}.json", "w") as f:
+            json.dump(result, f, indent=2)
+    print ("Dtype: {}".format(dtype))
+    print ("Mini batch size [", get_input_type(net), "] : {}".format(batch_size))
+    print ("Time per mini-batch : {}".format(time_per_batch))
+    print ("Throughput [", get_input_type(net), "/sec] : {}".format(batch_size/time_per_batch))
+    if (distributed_dataparallel or is_torchrun) and distributed_parameters['rank'] == 0:
+        print ("")
+        print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------")
+        world_size = distributed_parameters['world_size']
+        print ("Num devices: {}".format(world_size))
+        print ("Dtype: {}".format(dtype))
+        print ("Mini batch size [", get_input_type(net), "] : {}".format(batch_size*world_size))
+        print ("Time per mini-batch : {}".format(time_per_batch))
+        print ("Throughput [", get_input_type(net), "/sec] : {}".format(batch_size*world_size/time_per_batch))
+        result = {
+            "Name": params.output_file,
+            "GPUs": distributed_parameters['world_size'],
+            "Mini batch size [img]": batch_size * distributed_parameters['world_size'],
+            "Mini batch size [img/gpu]": batch_size,
+            "Throughput [img/sec]": batch_size * distributed_parameters['world_size'] / time_per_batch,
+            "Time per mini-batch": time_per_batch
+        }
+        with open(f"{params.output_dir}/{params.output_file}.json", "w") as f:
+            json.dump(result, f, indent=2)
+
+    csv_filename = f"{params.output_dir}/benchmark_summary.csv"
+    if params.csv_file:
+        csv_filename = params.csv_file
+    file_exists = os.path.isfile(csv_filename)
+    if result:
+        with open(csv_filename, "a", newline='') as csvfile:
+            writer = csv.writer(csvfile)
+            if not file_exists:
+                writer.writerow(result.keys())
+            writer.writerow(result.values())
+        print(f"Benchmark result saved to {csv_filename}")
+
+def main():
+    run_benchmarking_wrapper(copy.deepcopy(args))
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--network", type=str, choices=get_network_names(), required=True, help="Network to run.")
+    parser.add_argument("--batch-size" , type=int, required=False, default=64, help="Batch size (will be split among devices used by this invocation)")
+    parser.add_argument("--iterations", type=int, required=False, default=20, help="Iterations")
+    parser.add_argument("--flops-prof-step", type=int, required=False, default=0, help="The flops profiling step")
+    parser.add_argument("--kineto", action='store_true', required=False, help="Turn kineto profiling on")
+    parser.add_argument("--autograd_profiler", action='store_true', required=False, help="Use PyTorch autograd (old) profiler")
+    parser.add_argument("--fp16", type=int, required=False, default=0,help="FP16 mixed precision benchmarking")
+    parser.add_argument("--distributed_dataparallel", action='store_true', required=False, help="Use torch.nn.parallel.DistributedDataParallel api to run on multiple processes/nodes. The multiple processes need to be launched manually, this script will only launch ONE process per invocation. Either use --distributed_dataparallel and manually launch multiple processes or launch this script with `torchrun`")
+    parser.add_argument("--device_ids", type=str, required=False, default=None, help="Comma-separated list (no spaces) to specify which HIP devices (0-indexed) to run distributedDataParallel api on. Might need to use HIP_VISIBLE_DEVICES to limit visiblity of devices to different processes.")
+    parser.add_argument("--rank", type=int, required=False, default=None, help="Rank of this process. Required for --distributed_dataparallel")
+    parser.add_argument("--world-size", type=int, required=False, default=None, help="Total number of ranks/processes. Required for --distributed_dataparallel")
+    parser.add_argument("--dist-backend", type=str, required=False, default=None, help="Backend used for distributed training. Can be one of 'nccl' or 'gloo'. Required for --distributed_dataparallel")
+    parser.add_argument("--dist-url", type=str, required=False, default=None, help="url used for rendezvous of processes in distributed training. Needs to contain IP and open port of master rank0 eg. 'tcp://172.23.2.1:54321'. Required for --distributed_dataparallel")
+    parser.add_argument("--compile", action='store_true', required=False, help="use pytorch 2.0")
+    parser.add_argument("--compileContext", default={}, required=False, help="additional compile options")
+    parser.add_argument("--amp", action='store_true', default=False, required=False, help="Automatic mixed precision benchmarking")
+    parser.add_argument("--csv-file", type=str, default=None, required=False, help="assign output csv file name.")
+    parser.add_argument("--mode", type=str, choices=['training', 'inference'], default="training", help="Select mode: training or inference")
+    parser.add_argument("--opt-step", type=int, required=False, default=1, help="Optimizer update step")
+    parser.add_argument("--output-dir", type=str, default="", help="assign output directory name.")
+    parser.add_argument("--output-file", type=str, default="", help="assign output file name.")
+    parser.add_argument("--profiler-output", type=str, default="", help="assign profiler output name.") 
+
+    args = parser.parse_args()
+
+    if args.flops_prof_step:
+        try:
+            from deepspeed.profiling.flops_profiler import FlopsProfiler
+        except:
+            print("ERROR: You must install (or copy) deepspeed.profiling to use --flops-prof-step")
+            sys.exit(1)
+
+    main()
\ No newline at end of file