From fc751d9dc63378837aa18430419e8e97a618d48b Mon Sep 17 00:00:00 2001 From: skishore Date: Sat, 2 Aug 2025 07:42:22 +0000 Subject: [PATCH 01/22] replace pytorch calls to apex calls in microbenchmarking code --- micro_benchmarking_apex.py | 451 +++++++++++++++++++++++++++++++++++++ 1 file changed, 451 insertions(+) create mode 100644 micro_benchmarking_apex.py diff --git a/micro_benchmarking_apex.py b/micro_benchmarking_apex.py new file mode 100644 index 0000000..4c7e513 --- /dev/null +++ b/micro_benchmarking_apex.py @@ -0,0 +1,451 @@ +import torch +import torchvision +import random +import time +import argparse +import os +import sys +import ast +import copy +import math +import torch.nn as nn +import torch.multiprocessing as mp +try: + import apex +except: + print ("ERROR: You must install apex to run apex microbenchmarking") + sys.exit(1) +from apex.fp16_utils import FP16Model +from shufflenet import shufflenet +from shufflenet_v2 import shufflenet as shufflenet_v2 +from xception import xception +from apex.parallel import DistributedDataParallel as DDP + +try: + import torch._dynamo + torch._dynamo.config.verbose=True + HAVE_DYNAMO = True +except: + HAVE_DYNAMO = False + +IS_PT2 = hasattr(torch, "compile") + +is_torchrun = False +if "LOCAL_RANK" in os.environ: + # this indicates we're using torchrun + is_torchrun = True + + + +def weight_init(m): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. 
/ n)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + +# num_classes=1000 +models = { + "alexnet" : torchvision.models.alexnet, + "densenet121" : torchvision.models.densenet121, + "densenet161" : torchvision.models.densenet161, + "densenet169" : torchvision.models.densenet169, + "densenet201" : torchvision.models.densenet201, + "googlenet" : torchvision.models.googlenet, + "inception_v3" : torchvision.models.inception_v3, + "mnasnet0_5" : torchvision.models.mnasnet0_5, + "mnasnet0_75" : torchvision.models.mnasnet0_75, + "mnasnet1_0" : torchvision.models.mnasnet1_0, + "mnasnet1_3" : torchvision.models.mnasnet1_3, + "mobilenet_v2" : torchvision.models.mobilenet_v2, + "resnet18" : torchvision.models.resnet18, + "resnet34" : torchvision.models.resnet34, + "resnet50" : torchvision.models.resnet50, + "resnet101" : torchvision.models.resnet101, + "resnet152" : torchvision.models.resnet152, + "resnext50" : torchvision.models.resnext50_32x4d, + "resnext50_32x4d" : torchvision.models.resnext50_32x4d, + "resnext101" : torchvision.models.resnext101_32x8d, + "resnext101_32x8d" : torchvision.models.resnext101_32x8d, + "shufflenet" : shufflenet, + "shufflenet_v2" : shufflenet_v2, + "shufflenet_v2_x05" : torchvision.models.shufflenet_v2_x0_5, + "shufflenet_v2_x10" : torchvision.models.shufflenet_v2_x1_0, + "shufflenet_v2_x15" : torchvision.models.shufflenet_v2_x1_5, + "shufflenet_v2_x20" : torchvision.models.shufflenet_v2_x2_0, + "shufflenet_v2_x0_5" : torchvision.models.shufflenet_v2_x0_5, + "shufflenet_v2_x1_0" : torchvision.models.shufflenet_v2_x1_0, + "shufflenet_v2_x1_5" : torchvision.models.shufflenet_v2_x1_5, + "shufflenet_v2_x2_0" : torchvision.models.shufflenet_v2_x2_0, + "SqueezeNet" : torchvision.models.squeezenet1_0, + "squeezenet1_0" : torchvision.models.squeezenet1_0, + "SqueezeNet1.1" : torchvision.models.squeezenet1_1, + "squeezenet1_1" : torchvision.models.squeezenet1_1, + 
"vgg11" : torchvision.models.vgg11, + "vgg13" : torchvision.models.vgg13, + "vgg16" : torchvision.models.vgg16, + "vgg19" : torchvision.models.vgg19, + "vgg11_bn" : torchvision.models.vgg11_bn, + "vgg13_bn" : torchvision.models.vgg13_bn, + "vgg16_bn" : torchvision.models.vgg16_bn, + "vgg19_bn" : torchvision.models.vgg19_bn, + "wide_resnet50_2" : torchvision.models.wide_resnet50_2, + "wide_resnet101_2" : torchvision.models.wide_resnet101_2, + "xception" : xception, +} + +# newer torchvision models, for backwards compat +try: + models["swin_t"] = torchvision.models.swin_t + models["swin_s"] = torchvision.models.swin_s + models["swin_b"] = torchvision.models.swin_b + models["swin_v2_t"] = torchvision.models.swin_v2_t + models["swin_v2_s"] = torchvision.models.swin_v2_s + models["swin_v2_b"] = torchvision.models.swin_v2_b + models["vit_b_16"] = torchvision.models.vit_b_16 + models["vit_b_32"] = torchvision.models.vit_b_32 + models["vit_l_16"] = torchvision.models.vit_l_16 + models["vit_l_32"] = torchvision.models.vit_l_32 + models["vit_h_14"] = torchvision.models.vit_h_14 + models["efficientnet_b0"] = torchvision.models.efficientnet_b0 + models["efficientnet_b1"] = torchvision.models.efficientnet_b1 + models["efficientnet_b2"] = torchvision.models.efficientnet_b2 + models["efficientnet_b3"] = torchvision.models.efficientnet_b3 + models["efficientnet_b4"] = torchvision.models.efficientnet_b4 + models["efficientnet_b5"] = torchvision.models.efficientnet_b5 + models["efficientnet_b6"] = torchvision.models.efficientnet_b6 + models["efficientnet_b7"] = torchvision.models.efficientnet_b7 + models["maxvit_t"] = torchvision.models.maxvit_t +except AttributeError: + pass + +try: + models["mobilenet_v3_large"] = torchvision.models.mobilenet_v3_large + models["mobilenet_v3_small"] = torchvision.models.mobilenet_v3_small +except AttributeError: + pass +# segmentation models, num_classes=21 +segmentation_models = { + "fcn_resnet50" : torchvision.models.segmentation.fcn_resnet50, + 
"fcn_resnet101" : torchvision.models.segmentation.fcn_resnet101, + "deeplabv3_resnet50" : torchvision.models.segmentation.deeplabv3_resnet50, + "deeplabv3_resnet101" : torchvision.models.segmentation.deeplabv3_resnet101, +} + +# newer torchvision segmentation models, for backwards compat +try: + segmentation_models["deeplabv3_mobilenet_v3_large"] = torchvision.models.segmentation.deeplabv3_mobilenet_v3_large + segmentation_models["lraspp_mobilenet_v3_large"] = torchvision.models.segmentation.lraspp_mobilenet_v3_large, +except AttributeError: + pass + +def get_network_names(): + return sorted(list(models.keys()) + list(segmentation_models.keys())) + +def get_network(net): + # aux_logits=False only used by inception_v3 + if "inception_v3" == net: + return models[net](aux_logits=False).to(device="cuda") + elif net in models: + return models[net]().to(device="cuda") + elif net in segmentation_models: + return segmentation_models[net]().to(device="cuda") + else: + print ("ERROR: not a supported model '%s'" % net) + sys.exit(1) + +def forwardbackward(inp, optimizer, network, target, amp_opt_level, flops_prof_step=0): + optimizer.zero_grad() + if flops_prof_step: + prof = FlopsProfiler(network) + prof.start_profile() + out = network(inp) + # WIP: googlenet, deeplabv3_*, fcn_* missing log_softmax for this to work + loss = torch.nn.functional.cross_entropy(out, target) + # End profiler here if only to profile forward pass + + if amp_opt_level: + with apex.amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + + if flops_prof_step: + # End profiler here to profile both fwd and bwd passes + # flops = prof.get_total_flops(as_string=True) + # params = prof.get_total_params(as_string=True) + prof.print_model_profile(profile_step=flops_prof_step) + prof.end_profile() + + optimizer.step() + +def rendezvous(distributed_parameters): + print("Initializing process group...") + 
torch.distributed.init_process_group(backend=distributed_parameters['dist_backend'], init_method=distributed_parameters['dist_url'], rank=distributed_parameters['rank'], world_size=distributed_parameters['world_size']) + print("Rendezvous complete. Created process group...") + +def run_benchmarking_wrapper(params): + params.flops_prof_step = max(0, min(params.flops_prof_step, params.iterations - 1)) + if (params.device_ids): + params.device_ids = [int(x) for x in params.device_ids.split(",")] + else: + params.device_ids = None + params.distributed_parameters = {} + if is_torchrun: + params.distributed_parameters['rank'] = int(os.environ["LOCAL_RANK"]) + params.distributed_parameters['world_size'] = int(os.environ["WORLD_SIZE"]) + params.distributed_parameters['dist_backend'] = "nccl" + params.distributed_parameters['dist_url'] = 'tcp://' + os.environ["MASTER_ADDR"] + ":" + os.environ["MASTER_PORT"] + else: + params.distributed_parameters['rank'] = params.rank + params.distributed_parameters['world_size'] = params.world_size + params.distributed_parameters['dist_backend'] = params.dist_backend + params.distributed_parameters['dist_url'] = params.dist_url + + # Some arguments are required for distributed_dataparallel + if params.distributed_dataparallel: + assert params.distributed_parameters['rank'] is not None and \ + params.distributed_parameters['world_size'] is not None and \ + params.distributed_parameters['dist_backend'] is not None and \ + params.distributed_parameters['dist_url'] is not None, "rank, world-size, dist-backend and dist-url are required arguments for distributed_dataparallel" + + if is_torchrun: + params.ngpus = params.distributed_parameters['world_size'] + elif params.distributed_dataparallel: + params.ngpus = len(params.device_ids) if params.device_ids else torch.cuda.device_count() + else: + params.ngpus = 1 + + if is_torchrun: + run_benchmarking(params.distributed_parameters['rank'], params) + elif params.distributed_dataparallel: + # 
Assumption below that each process launched with --distributed_dataparallel has the same number of devices visible/specified + params.distributed_parameters['world_size'] = params.ngpus * params.distributed_parameters['world_size'] + params.distributed_parameters['rank'] = params.ngpus * params.distributed_parameters['rank'] + mp.spawn(run_benchmarking, nprocs=params.ngpus, args=(params,)) + else: + run_benchmarking(0, params) + +def run_benchmarking(local_rank, params): + device_ids = params.device_ids + ngpus = params.ngpus + net = params.network + run_fp16 = params.fp16 + amp_opt_level = params.amp_opt_level + distributed_dataparallel = params.distributed_dataparallel + distributed_parameters = params.distributed_parameters + batch_size = params.batch_size + kineto = params.kineto + iterations = params.iterations + autograd_profiler = params.autograd_profiler + flops_prof_step = params.flops_prof_step + + if is_torchrun: + torch.cuda.set_device("cuda:%d" % local_rank) + elif device_ids: + assert ngpus == len(device_ids) + torch.cuda.set_device("cuda:%d" % device_ids[local_rank]) + else: + torch.cuda.set_device("cuda:0") + + network = get_network(net) + if "shufflenet" == net: + network.apply(weight_init) + + if params.compile: + compile_ctx = {"mode": None, + "dynamic": False, + "fullgraph": False, + "backend": "inductor", + "options": None, + "disable": False} + options = None # needed for internal pytorch checks + if params.compileContext: + compile_ctx.update(ast.literal_eval(params.compileContext)) + if compile_ctx["mode"] is not None and compile_ctx["options"] is not None: + raise RuntimeError("Cannot specify mode and options simultaneously") + if compile_ctx["options"] is not None: + options = {} # needed to save multiple options + for compiler_pass in compile_ctx["options"].keys(): + options.update({compiler_pass: bool(compile_ctx["options"][compiler_pass])}) + if IS_PT2: + network = torch.compile(network, + mode=compile_ctx["mode"], + 
dynamic=bool(compile_ctx["dynamic"]), + fullgraph=bool(compile_ctx["fullgraph"]), + backend=compile_ctx["backend"], + options=options, + disable=compile_ctx["disable"]) + else: + print ("ERROR: requested torch.compile but this isn't pytorch 2.x") + sys.exit(1) + + if (run_fp16): + network = FP16Model(network) + + #use apex syncbn + if args.sync_bn: + network = apex.parallel.convert_syncbn_model(network) + + optimizer = torch.optim.SGD(network.parameters(), lr = 0.01, momentum = 0.9) + + if (amp_opt_level): + network, optimizer = apex.amp.initialize(network, optimizer, opt_level="O%d"%amp_opt_level, + keep_batchnorm_fp32=args.keep_batchnorm_fp32, + loss_scale=args.loss_scale) + + if is_torchrun: + rendezvous(distributed_parameters) + devices_to_run_on = [local_rank] + print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on))) + network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) + batch_size = int(batch_size / ngpus) + elif (distributed_dataparallel): + distributed_parameters['rank'] += local_rank + rendezvous(distributed_parameters) + devices_to_run_on = [(device_ids[local_rank] if device_ids else local_rank)] + print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on))) + network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) + batch_size = int(batch_size / ngpus) + + if (net == "inception_v3"): + inp = torch.randn(batch_size, 3, 299, 299, device="cuda") + else: + inp = torch.randn(batch_size, 3, 224, 224, device="cuda") + if (run_fp16): + inp = inp.half() + if net in models: + # number of classes is 1000 for imagenet + target = torch.randint(0, 1000, (batch_size,), device="cuda") + elif net in segmentation_models: + # number of classes is 21 for segmentation + target = torch.randint(0, 21, (batch_size,), device="cuda") + + ## warmup. 
+ print ("INFO: running forward and backward for warmup.") + forwardbackward(inp, optimizer, network, target, amp_opt_level) + forwardbackward(inp, optimizer, network, target, amp_opt_level) + + time.sleep(1) + torch.cuda.synchronize() + + ## benchmark. + print ("INFO: running the benchmark..") + if kineto: + from torch.profiler import schedule, profile, ProfilerActivity, record_function + profiler_schedule = schedule( + skip_first = 0, + wait = 1, + warmup = 2, + active = 2, + repeat = 1, + ) + + def trace_ready_callback(prof): + print("----------- Trace Ready -----------") + prof.export_chrome_trace(f"trace{prof.step_num}.json") + + tm = time.time() + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + schedule=profiler_schedule, + on_trace_ready=trace_ready_callback) as prof: + for i in range(iterations): + with record_function(f"iteration {i}"): + forwardbackward(inp, optimizer, network, target, amp_opt_level) + prof.step() + torch.cuda.synchronize() + print(prof.key_averages().table(sort_by="cuda_time_total")) + else: + tm = time.time() + with torch.autograd.profiler.emit_nvtx(enabled=autograd_profiler): + for i in range(iterations): + if i == flops_prof_step: + forwardbackward(inp, optimizer, network, target, amp_opt_level, i) + else: + forwardbackward(inp, optimizer, network, target, amp_opt_level) + torch.cuda.synchronize() + + tm2 = time.time() + time_per_batch = (tm2 - tm) / iterations + + if run_fp16: + dtype = 'FP16' + elif amp_opt_level == 1: + dtype = 'AMP-O1: Insert automatic FP16 casts around safe Pytorch functions and Tensor methods.' + elif amp_opt_level == 2: + dtype = 'AMP-O2: FP16 training with FP32 batchnorm and FP32 master weights.' + elif amp_opt_level == 3: + dtype = 'AMP-O3: Pure FP16 training.' + elif amp_opt_level == 4: + dtype = 'AMP-O4: Insert automatic BFLOAT16 casts around safe Pytorch functions and Tensor methods.' 
+ elif amp_opt_level == 5: + dtype = 'AMP-O5: BFLOAT16 training with FP32 batchnorm and FP32 master weights.' + else: + dtype = 'FP32' + + print ("OK: finished running benchmark..") + print ("--------------------SUMMARY--------------------------") + print ("Microbenchmark for network : {}".format(net)) + if distributed_dataparallel or is_torchrun: + print ("--------This process: rank " + str(distributed_parameters['rank']) + "--------"); + print ("Num devices: 1") + else: + print ("Num devices: {}".format(ngpus)) + print ("Dtype: {}".format(dtype)) + print ("Mini batch size [img] : {}".format(batch_size)) + print ("Time per mini-batch : {}".format(time_per_batch)) + print ("Throughput [img/sec] : {}".format(batch_size/time_per_batch)) + if (distributed_dataparallel or is_torchrun) and distributed_parameters['rank'] == 0: + print ("") + print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------") + world_size = distributed_parameters['world_size'] + print ("Num devices: {}".format(world_size)) + print ("Dtype: {}".format(dtype)) + print ("Mini batch size [img] : {}".format(batch_size*world_size)) + print ("Time per mini-batch : {}".format(time_per_batch)) + print ("Throughput [img/sec] : {}".format(batch_size*world_size/time_per_batch)) + +def main(): + run_benchmarking_wrapper(copy.deepcopy(args)) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--network", type=str, choices=get_network_names(), required=True, help="Network to run.") + parser.add_argument("--batch-size" , type=int, required=False, default=64, help="Batch size (will be split among devices used by this invocation)") + parser.add_argument("--iterations", type=int, required=False, default=20, help="Iterations") + parser.add_argument("--flops-prof-step", type=int, required=False, default=0, help="The flops profiling step") + parser.add_argument("--kineto", action='store_true', required=False, help="Turn kineto profiling on") + 
parser.add_argument("--autograd_profiler", action='store_true', required=False, help="Use PyTorch autograd (old) profiler") + parser.add_argument("--fp16", type=int, required=False, default=0,help="FP16 mixed precision benchmarking") + parser.add_argument("--amp-opt-level", type=int, required=False, default=0,help="apex.amp mixed precision benchmarking opt level") + parser.add_argument("--distributed_dataparallel", action='store_true', required=False, help="Use torch.nn.parallel.DistributedDataParallel api to run on multiple processes/nodes. The multiple processes need to be launched manually, this script will only launch ONE process per invocation. Either use --distributed_dataparallel and manually launch multiple processes or launch this script with `torchrun`") + parser.add_argument("--device_ids", type=str, required=False, default=None, help="Comma-separated list (no spaces) to specify which HIP devices (0-indexed) to run distributedDataParallel api on. Might need to use HIP_VISIBLE_DEVICES to limit visiblity of devices to different processes.") + parser.add_argument("--rank", type=int, required=False, default=None, help="Rank of this process. Required for --distributed_dataparallel") + parser.add_argument("--world-size", type=int, required=False, default=None, help="Total number of ranks/processes. Required for --distributed_dataparallel") + parser.add_argument("--dist-backend", type=str, required=False, default=None, help="Backend used for distributed training. Can be one of 'nccl' or 'gloo'. Required for --distributed_dataparallel") + parser.add_argument("--dist-url", type=str, required=False, default=None, help="url used for rendezvous of processes in distributed training. Needs to contain IP and open port of master rank0 eg. 'tcp://172.23.2.1:54321'. 
Required for --distributed_dataparallel") + parser.add_argument("--compile", action='store_true', required=False, help="use pytorch 2.0") + parser.add_argument("--compileContext", default={}, required=False, help="additional compile options") + parser.add_argument('--sync_bn', action='store_true', help='enabling apex sync BN.') + parser.add_argument('--keep-batchnorm-fp32', type=str, default=None) + parser.add_argument('--loss-scale', type=str, default=None) + + args = parser.parse_args() + + if args.flops_prof_step: + try: + from deepspeed.profiling.flops_profiler import FlopsProfiler + except: + print("ERROR: You must install (or copy) deepspeed.profiling to use --flops-prof-step") + sys.exit(1) + + if args.fp16 and args.amp_opt_level: + print ("ERROR: Cannot use both --fp16 and --amp-opt-level") + sys.exit(1) + + + main() From 80b38cd5d1c4c5c349b8d94e75237829cbf1ea25 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 9 Oct 2025 11:18:18 +0000 Subject: [PATCH 02/22] update the optimizer from torch to apex fused sgd optimizer --- micro_benchmarking_apex.py | 2 +- micro_benchmarking_audio.py | 355 ++++++++++++++++++++++++++++++++++++ 2 files changed, 356 insertions(+), 1 deletion(-) create mode 100644 micro_benchmarking_audio.py diff --git a/micro_benchmarking_apex.py b/micro_benchmarking_apex.py index 4c7e513..d94052a 100644 --- a/micro_benchmarking_apex.py +++ b/micro_benchmarking_apex.py @@ -290,7 +290,7 @@ def run_benchmarking(local_rank, params): if args.sync_bn: network = apex.parallel.convert_syncbn_model(network) - optimizer = torch.optim.SGD(network.parameters(), lr = 0.01, momentum = 0.9) + optimizer = apex.optimizers.FusedSGD(network.parameters(), lr = 0.01, momentum = 0.9) if (amp_opt_level): network, optimizer = apex.amp.initialize(network, optimizer, opt_level="O%d"%amp_opt_level, diff --git a/micro_benchmarking_audio.py b/micro_benchmarking_audio.py new file mode 100644 index 0000000..52c6035 --- /dev/null +++ b/micro_benchmarking_audio.py @@ -0,0 
+1,355 @@ +import torch +import torchaudio +import random +import time +import argparse +import os +import sys +import ast +import copy +import math +import torch.nn as nn +import torch.multiprocessing as mp +from fp16util import network_to_half, get_param_copy +import torch.nn.functional as F + +try: + import torch._dynamo + torch._dynamo.config.verbose=True + HAVE_DYNAMO = True +except: + HAVE_DYNAMO = False + +IS_PT2 = hasattr(torch, "compile") + +is_torchrun = False +if "LOCAL_RANK" in os.environ: + # this indicates we're using torchrun + is_torchrun = True + +try: + import apex + HAVE_APEX = True +except: + HAVE_APEX = False + +def weight_init(m): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + +#models that take waveforms as input +waveform_models = { + "wav2vec2_base" : torchaudio.models.hubert_base, +} + + + +def get_network_names(): + return sorted(list(waveform_models.keys())) + +def get_network(net): + if net in waveform_models: + return waveform_models[net](aux_num_out=29).to(device="cuda") + else: + print ("ERROR: not a supported model '%s'" % net) + sys.exit(1) + +def forwardbackward(inp, optimizer, network, target, amp_opt_level, flops_prof_step=0): + optimizer.zero_grad() + if flops_prof_step: + prof = FlopsProfiler(network) + prof.start_profile() + logits, _ = network(inp) + out = F.log_softmax(logits, dim=-1) + target = torch.randn_like(out) + print ("inp", inp.shape) + print ("out", out.shape) + print ("target", target.shape) + + + # WIP: googlenet, deeplabv3_*, fcn_* missing log_softmax for this to work + loss = torch.nn.functional.mse_loss(out, target) + # End profiler here if only to profile forward pass + + if amp_opt_level: + with apex.amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + 
loss.backward() + + if flops_prof_step: + # End profiler here to profile both fwd and bwd passes + # flops = prof.get_total_flops(as_string=True) + # params = prof.get_total_params(as_string=True) + prof.print_model_profile(profile_step=flops_prof_step) + prof.end_profile() + + optimizer.step() + +def rendezvous(distributed_parameters): + print("Initializing process group...") + torch.distributed.init_process_group(backend=distributed_parameters['dist_backend'], init_method=distributed_parameters['dist_url'], rank=distributed_parameters['rank'], world_size=distributed_parameters['world_size']) + print("Rendezvous complete. Created process group...") + +def run_benchmarking_wrapper(params): + params.flops_prof_step = max(0, min(params.flops_prof_step, params.iterations - 1)) + if (params.device_ids): + params.device_ids = [int(x) for x in params.device_ids.split(",")] + else: + params.device_ids = None + params.distributed_parameters = {} + if is_torchrun: + params.distributed_parameters['rank'] = int(os.environ["LOCAL_RANK"]) + params.distributed_parameters['world_size'] = int(os.environ["WORLD_SIZE"]) + params.distributed_parameters['dist_backend'] = "nccl" + params.distributed_parameters['dist_url'] = 'tcp://' + os.environ["MASTER_ADDR"] + ":" + os.environ["MASTER_PORT"] + else: + params.distributed_parameters['rank'] = params.rank + params.distributed_parameters['world_size'] = params.world_size + params.distributed_parameters['dist_backend'] = params.dist_backend + params.distributed_parameters['dist_url'] = params.dist_url + + # Some arguments are required for distributed_dataparallel + if params.distributed_dataparallel: + assert params.distributed_parameters['rank'] is not None and \ + params.distributed_parameters['world_size'] is not None and \ + params.distributed_parameters['dist_backend'] is not None and \ + params.distributed_parameters['dist_url'] is not None, "rank, world-size, dist-backend and dist-url are required arguments for 
distributed_dataparallel" + + if is_torchrun: + params.ngpus = params.distributed_parameters['world_size'] + elif params.distributed_dataparallel: + params.ngpus = len(params.device_ids) if params.device_ids else torch.cuda.device_count() + else: + params.ngpus = 1 + + if is_torchrun: + run_benchmarking(params.distributed_parameters['rank'], params) + elif params.distributed_dataparallel: + # Assumption below that each process launched with --distributed_dataparallel has the same number of devices visible/specified + params.distributed_parameters['world_size'] = params.ngpus * params.distributed_parameters['world_size'] + params.distributed_parameters['rank'] = params.ngpus * params.distributed_parameters['rank'] + mp.spawn(run_benchmarking, nprocs=params.ngpus, args=(params,)) + else: + run_benchmarking(0, params) + +def run_benchmarking(local_rank, params): + device_ids = params.device_ids + ngpus = params.ngpus + net = params.network + run_fp16 = params.fp16 + amp_opt_level = params.amp_opt_level + distributed_dataparallel = params.distributed_dataparallel + distributed_parameters = params.distributed_parameters + batch_size = params.batch_size + kineto = params.kineto + iterations = params.iterations + autograd_profiler = params.autograd_profiler + flops_prof_step = params.flops_prof_step + + if is_torchrun: + torch.cuda.set_device("cuda:%d" % local_rank) + elif device_ids: + assert ngpus == len(device_ids) + torch.cuda.set_device("cuda:%d" % device_ids[local_rank]) + else: + torch.cuda.set_device("cuda:0") + + network = get_network(net) + if "shufflenet" == net: + network.apply(weight_init) + + if (run_fp16): + network = network_to_half(network) + + if params.compile: + compile_ctx = {"mode": None, + "dynamic": False, + "fullgraph": False, + "backend": "inductor", + "options": None, + "disable": False} + options = None # needed for internal pytorch checks + if params.compileContext: + compile_ctx.update(ast.literal_eval(params.compileContext)) + if 
compile_ctx["mode"] is not None and compile_ctx["options"] is not None: + raise RuntimeError("Cannot specify mode and options simultaneously") + if compile_ctx["options"] is not None: + options = {} # needed to save multiple options + for compiler_pass in compile_ctx["options"].keys(): + options.update({compiler_pass: bool(compile_ctx["options"][compiler_pass])}) + if IS_PT2: + network = torch.compile(network, + mode=compile_ctx["mode"], + dynamic=bool(compile_ctx["dynamic"]), + fullgraph=bool(compile_ctx["fullgraph"]), + backend=compile_ctx["backend"], + options=options, + disable=compile_ctx["disable"]) + else: + print ("ERROR: requested torch.compile but this isn't pytorch 2.x") + sys.exit(1) + + param_copy = network.parameters() + if (run_fp16): + param_copy = get_param_copy(network) + optimizer = torch.optim.SGD(param_copy, lr = 0.01, momentum = 0.9) + + if (amp_opt_level): + network, optimizer = apex.amp.initialize(network, optimizer, opt_level="O%d"%amp_opt_level) + + if is_torchrun: + rendezvous(distributed_parameters) + devices_to_run_on = [local_rank] + print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on))) + network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) + batch_size = int(batch_size / ngpus) + elif (distributed_dataparallel): + distributed_parameters['rank'] += local_rank + rendezvous(distributed_parameters) + devices_to_run_on = [(device_ids[local_rank] if device_ids else local_rank)] + print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on))) + network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) + batch_size = int(batch_size / ngpus) + + if net in waveform_models: + inp = torch.randn(batch_size, 16000, device="cuda") + # number of classes is 500 for hubert + target = torch.randint(0, 500, (batch_size,29), 
device="cuda") + elif net in segmentation_models: + # number of classes is 21 for segmentation + target = torch.randint(0, 21, (batch_size,), device="cuda") + + if (run_fp16): + inp = inp.half() + + ## warmup. + print ("INFO: running forward and backward for warmup.") + forwardbackward(inp, optimizer, network, target, amp_opt_level) + forwardbackward(inp, optimizer, network, target, amp_opt_level) + + time.sleep(1) + torch.cuda.synchronize() + + ## benchmark. + print ("INFO: running the benchmark..") + if kineto: + from torch.profiler import schedule, profile, ProfilerActivity, record_function + profiler_schedule = schedule( + skip_first = 0, + wait = 1, + warmup = 2, + active = 2, + repeat = 1, + ) + + def trace_ready_callback(prof): + print("----------- Trace Ready -----------") + prof.export_chrome_trace(f"trace{prof.step_num}.json") + + tm = time.time() + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + schedule=profiler_schedule, + on_trace_ready=trace_ready_callback) as prof: + for i in range(iterations): + with record_function(f"iteration {i}"): + forwardbackward(inp, optimizer, network, target, amp_opt_level) + prof.step() + torch.cuda.synchronize() + print(prof.key_averages().table(sort_by="cuda_time_total")) + else: + tm = time.time() + with torch.autograd.profiler.emit_nvtx(enabled=autograd_profiler): + for i in range(iterations): + if i == flops_prof_step: + forwardbackward(inp, optimizer, network, target, amp_opt_level, i) + else: + forwardbackward(inp, optimizer, network, target, amp_opt_level) + torch.cuda.synchronize() + + tm2 = time.time() + time_per_batch = (tm2 - tm) / iterations + + if run_fp16: + dtype = 'FP16' + elif amp_opt_level == 1: + dtype = 'AMP-O1: Insert automatic FP16 casts around safe Pytorch functions and Tensor methods.' + elif amp_opt_level == 2: + dtype = 'AMP-O2: FP16 training with FP32 batchnorm and FP32 master weights.' + elif amp_opt_level == 3: + dtype = 'AMP-O3: Pure FP16 training.' 
+ elif amp_opt_level == 4: + dtype = 'AMP-O4: Insert automatic BFLOAT16 casts around safe Pytorch functions and Tensor methods.' + elif amp_opt_level == 5: + dtype = 'AMP-O5: BFLOAT16 training with FP32 batchnorm and FP32 master weights.' + else: + dtype = 'FP32' + + print ("OK: finished running benchmark..") + print ("--------------------SUMMARY--------------------------") + print ("Microbenchmark for network : {}".format(net)) + if distributed_dataparallel or is_torchrun: + print ("--------This process: rank " + str(distributed_parameters['rank']) + "--------"); + print ("Num devices: 1") + else: + print ("Num devices: {}".format(ngpus)) + print ("Dtype: {}".format(dtype)) + print ("Mini batch size [img] : {}".format(batch_size)) + print ("Time per mini-batch : {}".format(time_per_batch)) + print ("Throughput [img/sec] : {}".format(batch_size/time_per_batch)) + if (distributed_dataparallel or is_torchrun) and distributed_parameters['rank'] == 0: + print ("") + print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------") + world_size = distributed_parameters['world_size'] + print ("Num devices: {}".format(world_size)) + print ("Dtype: {}".format(dtype)) + print ("Mini batch size [img] : {}".format(batch_size*world_size)) + print ("Time per mini-batch : {}".format(time_per_batch)) + print ("Throughput [img/sec] : {}".format(batch_size*world_size/time_per_batch)) + +def main(): + run_benchmarking_wrapper(copy.deepcopy(args)) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--network", type=str, choices=get_network_names(), required=True, help="Network to run.") + parser.add_argument("--batch-size" , type=int, required=False, default=64, help="Batch size (will be split among devices used by this invocation)") + parser.add_argument("--iterations", type=int, required=False, default=20, help="Iterations") + parser.add_argument("--flops-prof-step", type=int, required=False, default=0, help="The 
flops profiling step") + parser.add_argument("--kineto", action='store_true', required=False, help="Turn kineto profiling on") + parser.add_argument("--autograd_profiler", action='store_true', required=False, help="Use PyTorch autograd (old) profiler") + parser.add_argument("--fp16", type=int, required=False, default=0,help="FP16 mixed precision benchmarking") + parser.add_argument("--amp-opt-level", type=int, required=False, default=0,help="apex.amp mixed precision benchmarking opt level") + parser.add_argument("--distributed_dataparallel", action='store_true', required=False, help="Use torch.nn.parallel.DistributedDataParallel api to run on multiple processes/nodes. The multiple processes need to be launched manually, this script will only launch ONE process per invocation. Either use --distributed_dataparallel and manually launch multiple processes or launch this script with `torchrun`") + parser.add_argument("--device_ids", type=str, required=False, default=None, help="Comma-separated list (no spaces) to specify which HIP devices (0-indexed) to run distributedDataParallel api on. Might need to use HIP_VISIBLE_DEVICES to limit visiblity of devices to different processes.") + parser.add_argument("--rank", type=int, required=False, default=None, help="Rank of this process. Required for --distributed_dataparallel") + parser.add_argument("--world-size", type=int, required=False, default=None, help="Total number of ranks/processes. Required for --distributed_dataparallel") + parser.add_argument("--dist-backend", type=str, required=False, default=None, help="Backend used for distributed training. Can be one of 'nccl' or 'gloo'. Required for --distributed_dataparallel") + parser.add_argument("--dist-url", type=str, required=False, default=None, help="url used for rendezvous of processes in distributed training. Needs to contain IP and open port of master rank0 eg. 'tcp://172.23.2.1:54321'. 
Required for --distributed_dataparallel") + parser.add_argument("--compile", action='store_true', required=False, help="use pytorch 2.0") + parser.add_argument("--compileContext", default={}, required=False, help="additional compile options") + + args = parser.parse_args() + + if args.flops_prof_step: + try: + from deepspeed.profiling.flops_profiler import FlopsProfiler + except: + print("ERROR: You must install (or copy) deepspeed.profiling to use --flops-prof-step") + sys.exit(1) + + if args.fp16 and args.amp_opt_level: + print ("ERROR: Cannot use both --fp16 and --amp-opt-level") + sys.exit(1) + if args.amp_opt_level and not HAVE_APEX: + print ("ERROR: You must install apex to use --amp-opt-level") + sys.exit(1) + + main() \ No newline at end of file From 674bd6b29a7d62027e22641e01e99e1d44cd8fc0 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 9 Oct 2025 11:23:07 +0000 Subject: [PATCH 03/22] update the DistributedDataParallel from torch to apex --- micro_benchmarking_apex.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/micro_benchmarking_apex.py b/micro_benchmarking_apex.py index d94052a..ae1a422 100644 --- a/micro_benchmarking_apex.py +++ b/micro_benchmarking_apex.py @@ -19,7 +19,6 @@ from shufflenet import shufflenet from shufflenet_v2 import shufflenet as shufflenet_v2 from xception import xception -from apex.parallel import DistributedDataParallel as DDP try: import torch._dynamo @@ -301,14 +300,14 @@ def run_benchmarking(local_rank, params): rendezvous(distributed_parameters) devices_to_run_on = [local_rank] print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on))) - network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) + network = apex.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) batch_size = int(batch_size / ngpus) elif (distributed_dataparallel): distributed_parameters['rank'] += 
local_rank rendezvous(distributed_parameters) devices_to_run_on = [(device_ids[local_rank] if device_ids else local_rank)] print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on))) - network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) + network = apex.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) batch_size = int(batch_size / ngpus) if (net == "inception_v3"): @@ -422,7 +421,7 @@ def main(): parser.add_argument("--autograd_profiler", action='store_true', required=False, help="Use PyTorch autograd (old) profiler") parser.add_argument("--fp16", type=int, required=False, default=0,help="FP16 mixed precision benchmarking") parser.add_argument("--amp-opt-level", type=int, required=False, default=0,help="apex.amp mixed precision benchmarking opt level") - parser.add_argument("--distributed_dataparallel", action='store_true', required=False, help="Use torch.nn.parallel.DistributedDataParallel api to run on multiple processes/nodes. The multiple processes need to be launched manually, this script will only launch ONE process per invocation. Either use --distributed_dataparallel and manually launch multiple processes or launch this script with `torchrun`") + parser.add_argument("--distributed_dataparallel", action='store_true', required=False, help="Use apex.parallel.DistributedDataParallel api to run on multiple processes/nodes. The multiple processes need to be launched manually, this script will only launch ONE process per invocation. Either use --distributed_dataparallel and manually launch multiple processes or launch this script with `torchrun`") parser.add_argument("--device_ids", type=str, required=False, default=None, help="Comma-separated list (no spaces) to specify which HIP devices (0-indexed) to run distributedDataParallel api on. 
Might need to use HIP_VISIBLE_DEVICES to limit visiblity of devices to different processes.") parser.add_argument("--rank", type=int, required=False, default=None, help="Rank of this process. Required for --distributed_dataparallel") parser.add_argument("--world-size", type=int, required=False, default=None, help="Total number of ranks/processes. Required for --distributed_dataparallel") From 782bf64f5087bbfa4bcefb09686f3d466ebeba24 Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 14 Oct 2025 09:59:28 +0000 Subject: [PATCH 04/22] Fixed the errors in conformer and wavernn models. also added units for the wavernn models --- micro_benchmarking_audio.py | 242 ++++++++++++++++++++++++++++++------ 1 file changed, 207 insertions(+), 35 deletions(-) diff --git a/micro_benchmarking_audio.py b/micro_benchmarking_audio.py index 52c6035..238acbc 100644 --- a/micro_benchmarking_audio.py +++ b/micro_benchmarking_audio.py @@ -12,6 +12,7 @@ import torch.multiprocessing as mp from fp16util import network_to_half, get_param_copy import torch.nn.functional as F +from torchaudio.models.decoder._ctc_decoder import ctc_decoder try: import torch._dynamo @@ -33,6 +34,12 @@ except: HAVE_APEX = False +ACOUSTIC_FEATURES_SIZE = 32 +FRAME_COUNT = 128 +HOP_LENGTH = 36 +N_FREQ = 128 + + def weight_init(m): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels @@ -43,38 +50,212 @@ def weight_init(m): m.weight.data.fill_(1) m.bias.data.zero_() -#models that take waveforms as input -waveform_models = { - "wav2vec2_base" : torchaudio.models.hubert_base, +#different audio tasks related models +wav2vec_models = { + "wav2vec2_base" : torchaudio.models.wav2vec2_base, + "wav2vec2_large" : torchaudio.models.wav2vec2_large, + "wav2vec2_large_lv60k" : torchaudio.models.wav2vec2_large_lv60k, + "wav2vec2_xlsr_300m" : torchaudio.models.wav2vec2_xlsr_300m, + "wav2vec2_xlsr_1b" : torchaudio.models.wav2vec2_xlsr_1b, + "wav2vec2_xlsr_2b" : torchaudio.models.wav2vec2_xlsr_2b, + 
"hubert_base" : torchaudio.models.hubert_base, + "hubert_large" : torchaudio.models.hubert_large, + "hubert_xlarge" : torchaudio.models.hubert_xlarge, + "wavlm_model" : torchaudio.models.wavlm_model, + "wavlm_base" : torchaudio.models.wavlm_base, + "wavlm_large" : torchaudio.models.wavlm_large, +} + +speech_recognition_models = { + "conformer" : torchaudio.models.Conformer, + "deepspeech" : torchaudio.models.DeepSpeech, + "emformer" : torchaudio.models.Emformer, + "wav2letter" : torchaudio.models.Wav2Letter } +source_separation_models = { + "conv_tasnet_base" : torchaudio.models.conv_tasnet_base, + "hdemucs_low" : torchaudio.models.hdemucs_low, + "hdemucs_medium" : torchaudio.models.hdemucs_medium, + "hdemucs_high" : torchaudio.models.hdemucs_high, +} +speech_quality_models = { + "squim_objective_base" : torchaudio.models.squim_objective_base, + "squim_subjective_base" : torchaudio.models.squim_subjective_base +} -def get_network_names(): - return sorted(list(waveform_models.keys())) +speech_synthesis_models = { + "tacotron2" : torchaudio.models.Tacotron2, + "wavernn" : torchaudio.models.WaveRNN +} -def get_network(net): - if net in waveform_models: - return waveform_models[net](aux_num_out=29).to(device="cuda") +decoder_models = { + "emformer_rnnt_base" : torchaudio.models.emformer_rnnt_base +} + + +def get_network_names(): + return sorted(list(wav2vec_models.keys()) + + list(speech_recognition_models.keys()) + + list(source_separation_models.keys()) + + list(speech_quality_models.keys()) + + list(speech_synthesis_models.keys()) + + list(decoder_models.keys())) + +def get_input_type(network_name): + if network_name in wav2vec_models or network_name in source_separation_models or network_name in speech_quality_models: + return "waveform" + elif network_name in speech_recognition_models: + return "acoustic features" + elif network_name in speech_synthesis_models: + if "wavernn" in network_name: + return "waveform" + else: + return "tokens" + elif network_name in 
decoder_models: + return "" + +def get_input(network_name, network, batch_size): + if network_name in wav2vec_models: + inp = torch.randn(batch_size, FRAME_COUNT, device="cuda") + elif network_name in source_separation_models: + if "hdemucs" in network_name: + inp = torch.randn(batch_size, 2, FRAME_COUNT, device="cuda") + else: + inp = torch.randn(batch_size, 1, FRAME_COUNT, device="cuda") + elif network_name in speech_recognition_models: + if "deepspeech" in network_name: + #number of channels must be specified for deepspeech + inp = torch.randn(batch_size, 1, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda") + elif "wav2letter" in network_name: + inp = torch.randn(batch_size, ACOUSTIC_FEATURES_SIZE, FRAME_COUNT, device="cuda") + elif "emformer" in network_name: + inp = (torch.randn(batch_size, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda"), + torch.randint(1, FRAME_COUNT, (batch_size,)).to(device="cuda")) + elif "conformer" in network_name: + lengths = torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda") + inp = (torch.rand(batch_size, int(lengths.max()), 80, device="cuda"), + lengths) + else: + lengths = torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda") + inp = (lengths, + torch.rand(batch_size, int(lengths.max()), network.input_dim, device="cuda")) + elif network_name in speech_quality_models: + if "subjective" in network_name: + inp = (torch.randn(batch_size, FRAME_COUNT, device="cuda"), + torch.randn(batch_size, FRAME_COUNT, device="cuda")) + else: + inp = torch.randn(batch_size, FRAME_COUNT, device="cuda") + elif network_name in speech_synthesis_models: + if "wavernn" in network_name: + spec_frames = 64 + waveform_length = HOP_LENGTH * (spec_frames - 4) + + inp = (torch.rand(batch_size, 1, waveform_length, device="cuda"), + torch.rand(batch_size, 1, N_FREQ, spec_frames, device="cuda")) + else: + n_mels = 80 + max_mel_specgram_length = 300 + max_text_length = 100 + inp = (torch.randint(0, 148, (batch_size, max_text_length), 
dtype=torch.int32, device="cuda"), + max_text_length * torch.ones((batch_size,), device="cuda"), + torch.rand( + batch_size, + n_mels, + max_mel_specgram_length, + device="cuda", + ), + max_mel_specgram_length * torch.ones((batch_size,), dtype=torch.int32, device="cuda")) + elif network_name in decoder_models: + right_context_length = 4 + max_input_length = 61 + max_target_length = 23 + + inp = (torch.rand(batch_size, max_input_length + right_context_length, 80, device="cuda"), + torch.randint(1, max_input_length + 1, (batch_size,), device="cuda"), + torch.randint(0, 256, (batch_size, max_target_length), device="cuda"), + torch.randint(1, max_target_length + 1, (batch_size,), device="cuda"), + None) + return inp + +def get_network(network_name): + if network_name in wav2vec_models: + return wav2vec_models[network_name](aux_num_out=29).to(device="cuda") + elif network_name in source_separation_models: + if "hdemucs" in network_name: + return source_separation_models[network_name](sources = ["vocals"]).to(device="cuda") + else: + return source_separation_models[network_name]().to(device="cuda") + elif network_name in speech_recognition_models: + if "deepspeech" in network_name: + return speech_recognition_models[network_name](n_feature = ACOUSTIC_FEATURES_SIZE).to(device="cuda") + elif "wav2letter" in network_name: + return speech_recognition_models[network_name](num_features = ACOUSTIC_FEATURES_SIZE).to(device="cuda") + elif "emformer" in network_name: + return speech_recognition_models[network_name](input_dim = ACOUSTIC_FEATURES_SIZE, + num_heads=8, + ffn_dim=1024, + num_layers=20, + segment_length=4).to(device="cuda") + elif "conformer" in network_name: + return speech_recognition_models[network_name](input_dim = 80, + num_heads=4, + ffn_dim=128, + num_layers=4, + depthwise_conv_kernel_size=31).to(device="cuda") + elif network_name in speech_quality_models: + return speech_quality_models[network_name]().to(device="cuda") + elif network_name in 
speech_synthesis_models: + if "wavernn" in network_name: + return speech_synthesis_models[network_name](upsample_scales = [3, 3, 4], n_classes = 10, + hop_length = HOP_LENGTH, n_freq = 128).to(device="cuda") + else: + return speech_synthesis_models[network_name]().to(device="cuda") + elif network_name in decoder_models: + return decoder_models[network_name](num_symbols = 256).to(device="cuda") else: - print ("ERROR: not a supported model '%s'" % net) + print ("ERROR: not a supported model '%s'" % network_name) sys.exit(1) -def forwardbackward(inp, optimizer, network, target, amp_opt_level, flops_prof_step=0): + +def get_output_selection(network_name): + if network_name in wav2vec_models: + return 0 + elif "conformer" in network_name or "emformer" in network_name: + return 0 + elif "objective" in network_name: + return 0 + elif "tacotron2" in network_name: + return 1 + return None + + +def forwardbackward(inp, optimizer, network, amp_opt_level, network_name, batch_size, flops_prof_step=0): optimizer.zero_grad() if flops_prof_step: prof = FlopsProfiler(network) prof.start_profile() - logits, _ = network(inp) - out = F.log_softmax(logits, dim=-1) + + out = network(*inp) + output_index = get_output_selection(network_name) + if output_index is not None: + out = out[0] + + if network_name in wav2vec_models: + out = F.log_softmax(out, dim=-1) + elif network_name in speech_recognition_models: + out = F.log_softmax(out, dim=-1) + + + target = torch.randn_like(out) - print ("inp", inp.shape) + #print ("inp", inp.shape) print ("out", out.shape) - print ("target", target.shape) - - # WIP: googlenet, deeplabv3_*, fcn_* missing log_softmax for this to work + #if network_name in wav2vec_models: loss = torch.nn.functional.mse_loss(out, target) + # End profiler here if only to profile forward pass if amp_opt_level: @@ -84,9 +265,6 @@ def forwardbackward(inp, optimizer, network, target, amp_opt_level, flops_prof_s loss.backward() if flops_prof_step: - # End profiler here to profile 
both fwd and bwd passes - # flops = prof.get_total_flops(as_string=True) - # params = prof.get_total_params(as_string=True) prof.print_model_profile(profile_step=flops_prof_step) prof.end_profile() @@ -218,21 +396,15 @@ def run_benchmarking(local_rank, params): network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) batch_size = int(batch_size / ngpus) - if net in waveform_models: - inp = torch.randn(batch_size, 16000, device="cuda") - # number of classes is 500 for hubert - target = torch.randint(0, 500, (batch_size,29), device="cuda") - elif net in segmentation_models: - # number of classes is 21 for segmentation - target = torch.randint(0, 21, (batch_size,), device="cuda") + inp = get_input(net, network, batch_size) if (run_fp16): inp = inp.half() ## warmup. print ("INFO: running forward and backward for warmup.") - forwardbackward(inp, optimizer, network, target, amp_opt_level) - forwardbackward(inp, optimizer, network, target, amp_opt_level) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size) time.sleep(1) torch.cuda.synchronize() @@ -260,7 +432,7 @@ def trace_ready_callback(prof): on_trace_ready=trace_ready_callback) as prof: for i in range(iterations): with record_function(f"iteration {i}"): - forwardbackward(inp, optimizer, network, target, amp_opt_level) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size) prof.step() torch.cuda.synchronize() print(prof.key_averages().table(sort_by="cuda_time_total")) @@ -269,9 +441,9 @@ def trace_ready_callback(prof): with torch.autograd.profiler.emit_nvtx(enabled=autograd_profiler): for i in range(iterations): if i == flops_prof_step: - forwardbackward(inp, optimizer, network, target, amp_opt_level, i) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, i) else: - forwardbackward(inp, optimizer, network, target, amp_opt_level) + 
forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size) torch.cuda.synchronize() tm2 = time.time() @@ -301,18 +473,18 @@ def trace_ready_callback(prof): else: print ("Num devices: {}".format(ngpus)) print ("Dtype: {}".format(dtype)) - print ("Mini batch size [img] : {}".format(batch_size)) + print ("Mini batch size [", get_input_type(net), "] : {}".format(batch_size)) print ("Time per mini-batch : {}".format(time_per_batch)) - print ("Throughput [img/sec] : {}".format(batch_size/time_per_batch)) + print ("Throughput [", get_input_type(net), "/sec] : {}".format(batch_size/time_per_batch)) if (distributed_dataparallel or is_torchrun) and distributed_parameters['rank'] == 0: print ("") print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------") world_size = distributed_parameters['world_size'] print ("Num devices: {}".format(world_size)) print ("Dtype: {}".format(dtype)) - print ("Mini batch size [img] : {}".format(batch_size*world_size)) + print ("Mini batch size [", get_input_type(net), "] : {}".format(batch_size*world_size)) print ("Time per mini-batch : {}".format(time_per_batch)) - print ("Throughput [img/sec] : {}".format(batch_size*world_size/time_per_batch)) + print ("Throughput [", get_input_type(net), "/sec] : {}".format(batch_size*world_size/time_per_batch)) def main(): run_benchmarking_wrapper(copy.deepcopy(args)) From e28c8e56adb25c9b3a5fd54eb9b788487dfaf1ca Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 16 Oct 2025 15:28:15 +0000 Subject: [PATCH 05/22] refactor the code, move the models, input, output selection to audio_models.py file --- audio_model.py | 181 +++++++++++++++++++++++++++++ micro_benchmarking_audio.py | 219 ++---------------------------------- 2 files changed, 192 insertions(+), 208 deletions(-) create mode 100644 audio_model.py diff --git a/audio_model.py b/audio_model.py new file mode 100644 index 0000000..6f1f9d2 --- /dev/null +++ b/audio_model.py @@ -0,0 +1,181 @@ +import torch 
+import torchaudio +import sys + +ACOUSTIC_FEATURES_SIZE = 32 +FRAME_COUNT = 1024 +HOP_LENGTH = 36 +N_FREQ = 128 + +#different audio tasks related models +wav2vec_models = { + "wav2vec2_base" : torchaudio.models.wav2vec2_base, + "wav2vec2_large" : torchaudio.models.wav2vec2_large, + "wav2vec2_large_lv60k" : torchaudio.models.wav2vec2_large_lv60k, + "wav2vec2_xlsr_300m" : torchaudio.models.wav2vec2_xlsr_300m, + "wav2vec2_xlsr_1b" : torchaudio.models.wav2vec2_xlsr_1b, + "wav2vec2_xlsr_2b" : torchaudio.models.wav2vec2_xlsr_2b, + "hubert_base" : torchaudio.models.hubert_base, + "hubert_large" : torchaudio.models.hubert_large, + "hubert_xlarge" : torchaudio.models.hubert_xlarge, + "wavlm_base" : torchaudio.models.wavlm_base, + "wavlm_large" : torchaudio.models.wavlm_large, +} + +speech_recognition_models = { + "conformer" : torchaudio.models.Conformer, + "deepspeech" : torchaudio.models.DeepSpeech, + "emformer" : torchaudio.models.Emformer, + "wav2letter" : torchaudio.models.Wav2Letter +} + +source_separation_models = { + "conv_tasnet_base" : torchaudio.models.conv_tasnet_base, + "hdemucs_low" : torchaudio.models.hdemucs_low, + "hdemucs_medium" : torchaudio.models.hdemucs_medium, + "hdemucs_high" : torchaudio.models.hdemucs_high, +} + +speech_quality_models = { + "squim_objective_base" : torchaudio.models.squim_objective_base, + "squim_subjective_base" : torchaudio.models.squim_subjective_base +} + +speech_synthesis_models = { + "tacotron2" : torchaudio.models.Tacotron2, + "wavernn" : torchaudio.models.WaveRNN +} + +decoder_models = { + "emformer_rnnt_base" : torchaudio.models.emformer_rnnt_base +} + + +def get_network_names(): + return sorted(list(wav2vec_models.keys()) + + list(speech_recognition_models.keys()) + + list(source_separation_models.keys()) + + list(speech_quality_models.keys()) + + list(speech_synthesis_models.keys()) + + list(decoder_models.keys())) + + +def get_network(network_name): + if network_name in wav2vec_models: + return 
wav2vec_models[network_name](aux_num_out=29).to(device="cuda") + elif network_name in source_separation_models: + if "hdemucs" in network_name: + return source_separation_models[network_name](sources = ["vocals"]).to(device="cuda") + else: + return source_separation_models[network_name]().to(device="cuda") + elif network_name in speech_recognition_models: + if "deepspeech" in network_name: + return speech_recognition_models[network_name](n_feature = ACOUSTIC_FEATURES_SIZE).to(device="cuda") + elif "wav2letter" in network_name: + return speech_recognition_models[network_name](num_features = ACOUSTIC_FEATURES_SIZE).to(device="cuda") + elif "emformer" in network_name: + return speech_recognition_models[network_name](input_dim = ACOUSTIC_FEATURES_SIZE, + num_heads=8, + ffn_dim=1024, + num_layers=20, + segment_length=4).to(device="cuda") + elif "conformer" in network_name: + return speech_recognition_models[network_name](input_dim = 80, + num_heads=4, + ffn_dim=128, + num_layers=4, + depthwise_conv_kernel_size=31).to(device="cuda") + elif network_name in speech_quality_models: + return speech_quality_models[network_name]().to(device="cuda") + elif network_name in speech_synthesis_models: + if "wavernn" in network_name: + return speech_synthesis_models[network_name](upsample_scales = [3, 3, 4], n_classes = 10, + hop_length = HOP_LENGTH, n_freq = 128).to(device="cuda") + else: + return speech_synthesis_models[network_name]().to(device="cuda") + elif network_name in decoder_models: + return decoder_models[network_name](num_symbols = 256).to(device="cuda") + else: + print ("ERROR: not a supported model '%s'" % network_name) + sys.exit(1) + + +def get_input_type(network_name): + if network_name in wav2vec_models or network_name in source_separation_models or network_name in speech_quality_models: + return "waveform" + elif network_name in speech_recognition_models: + return "acoustic features" + elif network_name in speech_synthesis_models: + if "wavernn" in network_name: + 
return "waveform" + else: + return "tokens" + elif network_name in decoder_models: + return "" + + +def get_input(network_name, network, batch_size): + if network_name in wav2vec_models: + inp = {"waveforms": torch.randn(batch_size, FRAME_COUNT, device="cuda")} + elif network_name in source_separation_models: + if "hdemucs" in network_name: + inp = {"input" : torch.randn(batch_size, 2, FRAME_COUNT, device="cuda")} + else: + inp = {"input" : torch.randn(batch_size, 1, FRAME_COUNT, device="cuda")} + elif network_name in speech_recognition_models: + if "deepspeech" in network_name: + #number of channels must be specified for deepspeech + inp = {"x" : torch.randn(batch_size, 1, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda")} + elif "wav2letter" in network_name: + inp = {"x" : torch.randn(batch_size, ACOUSTIC_FEATURES_SIZE, FRAME_COUNT, device="cuda")} + elif "emformer" in network_name: + inp = {"input" : torch.randn(batch_size, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda"), + "lengths" : torch.randint(1, FRAME_COUNT, (batch_size,)).to(device="cuda")} + elif "conformer" in network_name: + lengths = torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda") + inp = {"input" : torch.rand(batch_size, int(lengths.max()), 80, device="cuda"), + "lengths" : lengths} + elif network_name in speech_quality_models: + if "subjective" in network_name: + inp = {"waveform" : torch.randn(batch_size, FRAME_COUNT, device="cuda"), + "reference" : torch.randn(batch_size, FRAME_COUNT, device="cuda")} + else: + inp = {"x" : torch.randn(batch_size, FRAME_COUNT, device="cuda")} + elif network_name in speech_synthesis_models: + if "wavernn" in network_name: + spec_frames = 64 + waveform_length = HOP_LENGTH * (spec_frames - 4) + + inp = {"waveform" : torch.rand(batch_size, 1, waveform_length, device="cuda"), + "specgram": torch.rand(batch_size, 1, N_FREQ, spec_frames, device="cuda")} + elif "tacotron2" in network_name: + n_mels = 80 + max_mel_specgram_length = 300 + max_text_length = 
100 + inp = {"tokens" : torch.randint(0, 148, (batch_size, max_text_length), dtype=torch.int32, device="cuda"), + "token_lengths" : max_text_length * torch.ones((batch_size,), device="cuda"), + "mel_specgram": torch.rand(batch_size, n_mels, max_mel_specgram_length, device="cuda"), + "mel_specgram_lengths" : max_mel_specgram_length * torch.ones((batch_size,), dtype=torch.int32, device="cuda")} + elif network_name in decoder_models: + right_context_length = 4 + max_input_length = 61 + max_target_length = 23 + + inp = {"sources" : torch.rand(batch_size, max_input_length + right_context_length, 80, device="cuda"), + "source_lengths" : torch.randint(1, max_input_length + 1, (batch_size,), device="cuda"), + "targets" : torch.randint(0, 256, (batch_size, max_target_length), device="cuda"), + "target_lengths" : torch.randint(1, max_target_length + 1, (batch_size,), device="cuda"), + "predictor_state" : None} + return inp + + +def get_output_selection(network_name): + if network_name in wav2vec_models: + return 0 + elif "conformer" in network_name or "emformer" in network_name: + return 0 + elif "objective" in network_name: + return 0 + elif "tacotron2" in network_name: + return 1 + return None \ No newline at end of file diff --git a/micro_benchmarking_audio.py b/micro_benchmarking_audio.py index 238acbc..5dbbd54 100644 --- a/micro_benchmarking_audio.py +++ b/micro_benchmarking_audio.py @@ -1,6 +1,4 @@ import torch -import torchaudio -import random import time import argparse import os @@ -12,7 +10,8 @@ import torch.multiprocessing as mp from fp16util import network_to_half, get_param_copy import torch.nn.functional as F -from torchaudio.models.decoder._ctc_decoder import ctc_decoder +from audio_model import get_network_names, get_network, get_input_type, get_input, get_output_selection +from audio_loss import get_criterion, calculate_loss try: import torch._dynamo @@ -34,11 +33,6 @@ except: HAVE_APEX = False -ACOUSTIC_FEATURES_SIZE = 32 -FRAME_COUNT = 128 -HOP_LENGTH = 36 
-N_FREQ = 128 - def weight_init(m): if isinstance(m, nn.Conv2d): @@ -50,211 +44,19 @@ def weight_init(m): m.weight.data.fill_(1) m.bias.data.zero_() -#different audio tasks related models -wav2vec_models = { - "wav2vec2_base" : torchaudio.models.wav2vec2_base, - "wav2vec2_large" : torchaudio.models.wav2vec2_large, - "wav2vec2_large_lv60k" : torchaudio.models.wav2vec2_large_lv60k, - "wav2vec2_xlsr_300m" : torchaudio.models.wav2vec2_xlsr_300m, - "wav2vec2_xlsr_1b" : torchaudio.models.wav2vec2_xlsr_1b, - "wav2vec2_xlsr_2b" : torchaudio.models.wav2vec2_xlsr_2b, - "hubert_base" : torchaudio.models.hubert_base, - "hubert_large" : torchaudio.models.hubert_large, - "hubert_xlarge" : torchaudio.models.hubert_xlarge, - "wavlm_model" : torchaudio.models.wavlm_model, - "wavlm_base" : torchaudio.models.wavlm_base, - "wavlm_large" : torchaudio.models.wavlm_large, -} - -speech_recognition_models = { - "conformer" : torchaudio.models.Conformer, - "deepspeech" : torchaudio.models.DeepSpeech, - "emformer" : torchaudio.models.Emformer, - "wav2letter" : torchaudio.models.Wav2Letter -} - -source_separation_models = { - "conv_tasnet_base" : torchaudio.models.conv_tasnet_base, - "hdemucs_low" : torchaudio.models.hdemucs_low, - "hdemucs_medium" : torchaudio.models.hdemucs_medium, - "hdemucs_high" : torchaudio.models.hdemucs_high, -} - -speech_quality_models = { - "squim_objective_base" : torchaudio.models.squim_objective_base, - "squim_subjective_base" : torchaudio.models.squim_subjective_base -} - -speech_synthesis_models = { - "tacotron2" : torchaudio.models.Tacotron2, - "wavernn" : torchaudio.models.WaveRNN -} - -decoder_models = { - "emformer_rnnt_base" : torchaudio.models.emformer_rnnt_base -} - - -def get_network_names(): - return sorted(list(wav2vec_models.keys()) + - list(speech_recognition_models.keys()) + - list(source_separation_models.keys()) + - list(speech_quality_models.keys()) + - list(speech_synthesis_models.keys()) + - list(decoder_models.keys())) - -def 
get_input_type(network_name): - if network_name in wav2vec_models or network_name in source_separation_models or network_name in speech_quality_models: - return "waveform" - elif network_name in speech_recognition_models: - return "acoustic features" - elif network_name in speech_synthesis_models: - if "wavernn" in network_name: - return "waveform" - else: - return "tokens" - elif network_name in decoder_models: - return "" - -def get_input(network_name, network, batch_size): - if network_name in wav2vec_models: - inp = torch.randn(batch_size, FRAME_COUNT, device="cuda") - elif network_name in source_separation_models: - if "hdemucs" in network_name: - inp = torch.randn(batch_size, 2, FRAME_COUNT, device="cuda") - else: - inp = torch.randn(batch_size, 1, FRAME_COUNT, device="cuda") - elif network_name in speech_recognition_models: - if "deepspeech" in network_name: - #number of channels must be specified for deepspeech - inp = torch.randn(batch_size, 1, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda") - elif "wav2letter" in network_name: - inp = torch.randn(batch_size, ACOUSTIC_FEATURES_SIZE, FRAME_COUNT, device="cuda") - elif "emformer" in network_name: - inp = (torch.randn(batch_size, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda"), - torch.randint(1, FRAME_COUNT, (batch_size,)).to(device="cuda")) - elif "conformer" in network_name: - lengths = torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda") - inp = (torch.rand(batch_size, int(lengths.max()), 80, device="cuda"), - lengths) - else: - lengths = torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda") - inp = (lengths, - torch.rand(batch_size, int(lengths.max()), network.input_dim, device="cuda")) - elif network_name in speech_quality_models: - if "subjective" in network_name: - inp = (torch.randn(batch_size, FRAME_COUNT, device="cuda"), - torch.randn(batch_size, FRAME_COUNT, device="cuda")) - else: - inp = torch.randn(batch_size, FRAME_COUNT, device="cuda") - elif network_name in 
speech_synthesis_models: - if "wavernn" in network_name: - spec_frames = 64 - waveform_length = HOP_LENGTH * (spec_frames - 4) - - inp = (torch.rand(batch_size, 1, waveform_length, device="cuda"), - torch.rand(batch_size, 1, N_FREQ, spec_frames, device="cuda")) - else: - n_mels = 80 - max_mel_specgram_length = 300 - max_text_length = 100 - inp = (torch.randint(0, 148, (batch_size, max_text_length), dtype=torch.int32, device="cuda"), - max_text_length * torch.ones((batch_size,), device="cuda"), - torch.rand( - batch_size, - n_mels, - max_mel_specgram_length, - device="cuda", - ), - max_mel_specgram_length * torch.ones((batch_size,), dtype=torch.int32, device="cuda")) - elif network_name in decoder_models: - right_context_length = 4 - max_input_length = 61 - max_target_length = 23 - - inp = (torch.rand(batch_size, max_input_length + right_context_length, 80, device="cuda"), - torch.randint(1, max_input_length + 1, (batch_size,), device="cuda"), - torch.randint(0, 256, (batch_size, max_target_length), device="cuda"), - torch.randint(1, max_target_length + 1, (batch_size,), device="cuda"), - None) - return inp - -def get_network(network_name): - if network_name in wav2vec_models: - return wav2vec_models[network_name](aux_num_out=29).to(device="cuda") - elif network_name in source_separation_models: - if "hdemucs" in network_name: - return source_separation_models[network_name](sources = ["vocals"]).to(device="cuda") - else: - return source_separation_models[network_name]().to(device="cuda") - elif network_name in speech_recognition_models: - if "deepspeech" in network_name: - return speech_recognition_models[network_name](n_feature = ACOUSTIC_FEATURES_SIZE).to(device="cuda") - elif "wav2letter" in network_name: - return speech_recognition_models[network_name](num_features = ACOUSTIC_FEATURES_SIZE).to(device="cuda") - elif "emformer" in network_name: - return speech_recognition_models[network_name](input_dim = ACOUSTIC_FEATURES_SIZE, - num_heads=8, - ffn_dim=1024, - 
num_layers=20, - segment_length=4).to(device="cuda") - elif "conformer" in network_name: - return speech_recognition_models[network_name](input_dim = 80, - num_heads=4, - ffn_dim=128, - num_layers=4, - depthwise_conv_kernel_size=31).to(device="cuda") - elif network_name in speech_quality_models: - return speech_quality_models[network_name]().to(device="cuda") - elif network_name in speech_synthesis_models: - if "wavernn" in network_name: - return speech_synthesis_models[network_name](upsample_scales = [3, 3, 4], n_classes = 10, - hop_length = HOP_LENGTH, n_freq = 128).to(device="cuda") - else: - return speech_synthesis_models[network_name]().to(device="cuda") - elif network_name in decoder_models: - return decoder_models[network_name](num_symbols = 256).to(device="cuda") - else: - print ("ERROR: not a supported model '%s'" % network_name) - sys.exit(1) - - -def get_output_selection(network_name): - if network_name in wav2vec_models: - return 0 - elif "conformer" in network_name or "emformer" in network_name: - return 0 - elif "objective" in network_name: - return 0 - elif "tacotron2" in network_name: - return 1 - return None - -def forwardbackward(inp, optimizer, network, amp_opt_level, network_name, batch_size, flops_prof_step=0): +def forwardbackward(inp, optimizer, network, amp_opt_level, network_name, batch_size, criterion, flops_prof_step=0): optimizer.zero_grad() if flops_prof_step: prof = FlopsProfiler(network) prof.start_profile() - out = network(*inp) + out = network(**inp) output_index = get_output_selection(network_name) if output_index is not None: out = out[0] - - if network_name in wav2vec_models: - out = F.log_softmax(out, dim=-1) - elif network_name in speech_recognition_models: - out = F.log_softmax(out, dim=-1) - - - target = torch.randn_like(out) - #print ("inp", inp.shape) - print ("out", out.shape) - - #if network_name in wav2vec_models: - loss = torch.nn.functional.mse_loss(out, target) + loss = calculate_loss(network_name, criterion, out) # 
End profiler here if only to profile forward pass @@ -340,6 +142,7 @@ def run_benchmarking(local_rank, params): torch.cuda.set_device("cuda:0") network = get_network(net) + criterion = get_criterion(net) if "shufflenet" == net: network.apply(weight_init) @@ -403,8 +206,8 @@ def run_benchmarking(local_rank, params): ## warmup. print ("INFO: running forward and backward for warmup.") - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size) - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion) time.sleep(1) torch.cuda.synchronize() @@ -432,7 +235,7 @@ def trace_ready_callback(prof): on_trace_ready=trace_ready_callback) as prof: for i in range(iterations): with record_function(f"iteration {i}"): - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion) prof.step() torch.cuda.synchronize() print(prof.key_averages().table(sort_by="cuda_time_total")) @@ -441,9 +244,9 @@ def trace_ready_callback(prof): with torch.autograd.profiler.emit_nvtx(enabled=autograd_profiler): for i in range(iterations): if i == flops_prof_step: - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, i) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, i) else: - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion) torch.cuda.synchronize() tm2 = time.time() From fbdd71a7cbb6a2f012ecbdb0050a9d48e98b168a Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 28 Oct 2025 08:35:00 +0000 Subject: [PATCH 06/22] refactored the audio benchmarking code so that most of the lower level code is inside the audio folder, added backward pass code for 
def get_criterion(network_name):
    """Return the training criterion (loss callable) for *network_name*.

    Returns None for networks with no dedicated loss; callers then fall
    back to a generic MSE against a random target (see calculate_loss).
    """
    criterion = None
    if network_name in ["wav2letter", "conformer", "deepspeech"] or "wav2vec2" in network_name:
        # CTC loss over the character vocabulary used by the ASR models;
        # the blank symbol's index comes from the LanguageModel mapping.
        char_blank = "*"
        char_space = " "
        char_apostrophe = "'"
        labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase
        language_model = LanguageModel(labels, char_blank, char_space)
        criterion = torch.nn.CTCLoss(blank=language_model.mapping[char_blank], zero_infinity=False)
    elif "hubert_pretrain" in network_name:
        criterion = hubert_loss
    elif "wavernn" in network_name:
        criterion = nn.CrossEntropyLoss()
    elif "conv_tasnet" in network_name:
        criterion = si_sdr_loss
    elif "tacotron2" in network_name:
        criterion = nn.MSELoss()
    return criterion


def calculate_loss(network_name, criterion, output):
    """Compute a benchmark loss for *output* of network *network_name*.

    Targets are randomly generated with the shapes each criterion expects;
    the loss value is meaningless — it only needs to be differentiable so
    the backward pass can be benchmarked.
    """
    if criterion is None:
        # Generic fallback: MSE against a random target of the same shape.
        target = torch.randn_like(output)
        return torch.nn.functional.mse_loss(output, target)
    if network_name in ["wav2letter", "conformer", "deepspeech"] or "wav2vec2" in network_name:
        # CTCLoss expects (T, N, C) log-probs plus per-sample lengths.
        output = output.transpose(-1, -2).transpose(0, 1)
        T, N, C = output.shape
        target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.long)
        target = torch.randint(
            low=1,
            high=C,
            size=(sum(target_lengths),),
            dtype=torch.long,
        )
        tensors_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long)
        loss = criterion(output, target, tensors_lengths, target_lengths)
    elif "hubert_pretrain" in network_name:
        # (debug print of the output shapes removed — it ran every
        # iteration and skewed the benchmark timings)
        logit_m, logit_u, feature_penalty = output
        loss = criterion(logit_m, logit_u, feature_penalty)
    elif "wavernn" in network_name:
        target = torch.randn_like(output)
        output, target = output.squeeze(1), target.squeeze(1)
        output = output.transpose(1, 2)
        target = target.transpose(1, 2)
        loss = criterion(output, target)
    elif "conv_tasnet" in network_name:
        batch, _, time = output.shape
        # high=2 so the binary mask actually contains ones; the old
        # high=1 produced an all-zero mask, and si_sdr_loss divides by
        # mask.sum(), i.e. by zero.
        mask = torch.randint(low=0, high=2, size=(batch, 1, time), dtype=torch.long).cuda()
        target = torch.randn_like(output)
        loss = criterion(output, target, mask)
    elif "tacotron2" in network_name:
        target = torch.randn_like(output)
        loss = criterion(output, target)
    return loss
# Model registries for the remaining audio tasks (the wav2vec-style
# acoustic models are registered in wav2vec_models above).
speech_recognition_models = {
    "conformer" : torchaudio.models.Conformer,
    "deepspeech" : torchaudio.models.DeepSpeech,
    "emformer" : torchaudio.models.Emformer,
    "wav2letter" : torchaudio.models.Wav2Letter
}

source_separation_models = {
    "conv_tasnet_base" : torchaudio.models.conv_tasnet_base,
    "hdemucs_low" : torchaudio.models.hdemucs_low,
    "hdemucs_medium" : torchaudio.models.hdemucs_medium,
    "hdemucs_high" : torchaudio.models.hdemucs_high,
}

speech_quality_models = {
    "squim_objective_base" : torchaudio.models.squim_objective_base,
    "squim_subjective_base" : torchaudio.models.squim_subjective_base
}

speech_synthesis_models = {
    "tacotron2" : torchaudio.models.Tacotron2,
    "wavernn" : torchaudio.models.WaveRNN
}

hubert_pretrain_models = {
    "hubert_pretrain_base" : torchaudio.models.hubert_pretrain_base,
}


def get_network_names():
    """Return the sorted list of all benchmarkable network names."""
    return sorted(list(wav2vec_models.keys()) +
                  list(speech_recognition_models.keys()) +
                  list(source_separation_models.keys()) +
                  list(speech_quality_models.keys()) +
                  list(speech_synthesis_models.keys()) +
                  list(hubert_pretrain_models.keys()))


def get_network(network_name):
    """Instantiate *network_name* on the CUDA device.

    Constructor arguments are fixed benchmark defaults. Exits the process
    with an error message for unknown names.
    """
    if network_name in wav2vec_models:
        return wav2vec_models[network_name](aux_num_out=CLASSES_COUNT).to(device="cuda")
    elif network_name in source_separation_models:
        if "hdemucs" in network_name:
            return source_separation_models[network_name](sources = ["vocals"]).to(device="cuda")
        else:
            return source_separation_models[network_name]().to(device="cuda")
    elif network_name in speech_recognition_models:
        if "deepspeech" in network_name:
            return speech_recognition_models[network_name](n_feature = ACOUSTIC_FEATURES_SIZE).to(device="cuda")
        elif "wav2letter" in network_name:
            return speech_recognition_models[network_name](num_features = ACOUSTIC_FEATURES_SIZE).to(device="cuda")
        elif "emformer" in network_name:
            return speech_recognition_models[network_name](input_dim = ACOUSTIC_FEATURES_SIZE,
                                                           num_heads=8,
                                                           ffn_dim=1024,
                                                           num_layers=20,
                                                           segment_length=4).to(device="cuda")
        elif "conformer" in network_name:
            return speech_recognition_models[network_name](input_dim = 80,
                                                           num_heads=4,
                                                           ffn_dim=128,
                                                           num_layers=4,
                                                           depthwise_conv_kernel_size=31).to(device="cuda")
    elif network_name in speech_quality_models:
        return speech_quality_models[network_name]().to(device="cuda")
    elif network_name in speech_synthesis_models:
        if "wavernn" in network_name:
            # NOTE(review): n_freq=128 is hard-coded here while get_input
            # uses the N_FREQ constant — keep the two in sync.
            return speech_synthesis_models[network_name](upsample_scales = [3, 3, 4], n_classes = 10,
                                                         hop_length = HOP_LENGTH, n_freq = 128).to(device="cuda")
        else:
            return speech_synthesis_models[network_name]().to(device="cuda")
    elif network_name in hubert_pretrain_models:
        return hubert_pretrain_models[network_name]().to(device="cuda")
    else:
        print ("ERROR: not a supported model '%s'" % network_name)
        sys.exit(1)


def get_input_type(network_name):
    """Return a short human-readable description of the network's input."""
    if network_name in wav2vec_models or network_name in source_separation_models or network_name in speech_quality_models:
        return "waveform"
    elif network_name in speech_recognition_models:
        return "acoustic features"
    elif network_name in speech_synthesis_models:
        if "wavernn" in network_name:
            return "waveform"
        else:
            return "tokens"
    elif network_name in hubert_pretrain_models:
        # Previously fell through and returned None; hubert pretraining
        # consumes raw waveforms (see get_input below).
        return "waveform"


def get_input(network_name, network, batch_size):
    """Build a kwargs dict of random CUDA tensors for network(**inp)."""
    if network_name in wav2vec_models:
        inp = {"waveforms": torch.randn(batch_size, FRAME_COUNT, device="cuda")}
    elif network_name in source_separation_models:
        if "hdemucs" in network_name:
            inp = {"input" : torch.randn(batch_size, 2, FRAME_COUNT, device="cuda")}
        else:
            inp = {"input" : torch.randn(batch_size, 1, FRAME_COUNT, device="cuda")}
    elif network_name in speech_recognition_models:
        if "deepspeech" in network_name:
            # number of channels must be specified for deepspeech
            inp = {"x" : torch.randn(batch_size, 1, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda")}
        elif "wav2letter" in network_name:
            inp = {"x" : torch.randn(batch_size, ACOUSTIC_FEATURES_SIZE, FRAME_COUNT, device="cuda")}
        elif "emformer" in network_name:
            inp = {"input" : torch.randn(batch_size, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda"),
                   "lengths" : torch.randint(1, FRAME_COUNT, (batch_size,)).to(device="cuda")}
        elif "conformer" in network_name:
            lengths = torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda")
            inp = {"input" : torch.rand(batch_size, int(lengths.max()), 80, device="cuda"),
                   "lengths" : lengths}
    elif network_name in speech_quality_models:
        if "subjective" in network_name:
            inp = {"waveform" : torch.randn(batch_size, FRAME_COUNT, device="cuda"),
                   "reference" : torch.randn(batch_size, FRAME_COUNT, device="cuda")}
        else:
            inp = {"x" : torch.randn(batch_size, FRAME_COUNT, device="cuda")}
    elif network_name in speech_synthesis_models:
        if "wavernn" in network_name:
            spec_frames = 64
            waveform_length = HOP_LENGTH * (spec_frames - 4)
            inp = {"waveform" : torch.rand(batch_size, 1, waveform_length, device="cuda"),
                   "specgram": torch.rand(batch_size, 1, N_FREQ, spec_frames, device="cuda")}
        elif "tacotron2" in network_name:
            n_mels = 80
            max_mel_specgram_length = 300
            max_text_length = 100
            # NOTE(review): token_lengths is a float tensor while
            # mel_specgram_lengths is int32 — confirm Tacotron2 accepts this.
            inp = {"tokens" : torch.randint(0, 148, (batch_size, max_text_length), dtype=torch.int32, device="cuda"),
                   "token_lengths" : max_text_length * torch.ones((batch_size,), device="cuda"),
                   "mel_specgram": torch.rand(batch_size, n_mels, max_mel_specgram_length, device="cuda"),
                   "mel_specgram_lengths" : max_mel_specgram_length * torch.ones((batch_size,), dtype=torch.int32, device="cuda")}
    elif network_name in hubert_pretrain_models:
        inp = {"waveforms" : torch.rand(batch_size, FRAME_COUNT, device="cuda"),
               "labels" : torch.randint(0, 100, (batch_size, FRAME_COUNT), dtype=torch.int32, device="cuda"),
               "audio_lengths" : torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda")}
    return inp


def get_output_selection(network_name):
    """Index of the trainable tensor in the model's output tuple, or None
    when the model already returns a plain tensor."""
    if "wav2vec2" in network_name:
        return 0
    elif "conformer" in network_name or "emformer" in network_name:
        return 0
    elif "objective" in network_name:
        return 0
    elif "tacotron2" in network_name:
        return 1
    return None
class LanguageModel:
    """Bidirectional mapping between characters and integer indices.

    A single dict, ``mapping``, holds both directions (char -> index and
    index -> char), so it serves encoding and CTC-style decoding alike.
    """

    def __init__(self, labels, char_blank, char_space):
        self.char_space = char_space
        self.char_blank = char_blank

        labels = list(labels)
        self.length = len(labels)
        index_to_char = dict(enumerate(labels))
        char_to_index = {ch: idx for idx, ch in index_to_char.items()}
        # Merged two-way lookup table.
        self.mapping = {**index_to_char, **char_to_index}

    def encode(self, iterable):
        """Map characters to indices; recurses into nested lists."""
        if isinstance(iterable, list):
            return [self.encode(item) for item in iterable]
        # NOTE(review): adding the blank's index looks like a no-op when
        # the blank is label 0 — confirm against the training pipeline.
        blank_index = self.mapping[self.char_blank]
        return [self.mapping[ch] + blank_index for ch in iterable]

    def decode(self, tensor):
        """Collapse repeats and strip blanks (CTC-style).

        Not idempotent: decoding an already-clean string collapses its
        legitimate repeated characters too.
        """
        if len(tensor) > 0 and isinstance(tensor[0], list):
            return [self.decode(item) for item in tensor]
        chars = (self.mapping[i] for i in tensor)
        collapsed = "".join(ch for ch, _ in itertools.groupby(chars))
        return collapsed.replace(self.char_blank, "")

    def __len__(self):
        return self.length
def sdr(
    estimate: torch.Tensor, reference: torch.Tensor, mask: Optional[torch.Tensor] = None, epsilon: float = 1e-8
) -> torch.Tensor:
    """Computes source-to-distortion ratio.

    1. scale the reference signal with power(s_est * s_ref) / power(s_ref * s_ref)
    2. compute SNR between adjusted estimate and reference.

    Args:
        estimate (torch.Tensor): Estimated signal.
            Shape: [batch, speakers (can be 1), time frame]
        reference (torch.Tensor): Reference signal.
            Shape: [batch, speakers, time frame]
        mask (torch.Tensor or None, optional): Binary mask to indicate padded value (0) or valid value (1).
            Shape: [batch, 1, time frame]
        epsilon (float, optional): constant value used to stabilize division.

    Returns:
        torch.Tensor: source-to-distortion ratio.
            Shape: [batch, speaker]

    References:
        - Single-channel multi-speaker separation using deep clustering
          Y. Isik, J. Le Roux, Z. Chen, S. Watanabe, and J. R. Hershey
        - Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation
          Luo, Yi and Mesgarani, Nima
          https://arxiv.org/abs/1809.07454

    Notes:
        This function is tested to produce the exact same result as
        https://github.com/naplab/Conv-TasNet/blob/e66d82a8f956a69749ec8a4ae382217faa097c5c/utility/sdr.py#L34-L56
    """
    reference_pow = reference.pow(2).mean(axis=2, keepdim=True)
    mix_pow = (estimate * reference).mean(axis=2, keepdim=True)
    scale = mix_pow / (reference_pow + epsilon)

    reference = scale * reference
    error = estimate - reference

    reference_pow = reference.pow(2)
    error_pow = error.pow(2)

    if mask is None:
        reference_pow = reference_pow.mean(axis=2)
        error_pow = error_pow.mean(axis=2)
    else:
        # Average only over the valid (mask == 1) time frames.
        denom = mask.sum(axis=2)
        reference_pow = (mask * reference_pow).sum(axis=2) / denom
        error_pow = (mask * error_pow).sum(axis=2) / denom

    return 10 * torch.log10(reference_pow) - 10 * torch.log10(error_pow)


class PIT(torch.nn.Module):
    """Applies utterance-level speaker permutation.

    Computes the maximum possible value of the given utility function
    over the permutations of the speakers.

    Args:
        utility_func (function):
            Function that computes the utility (opposite of loss) with signature of
            (estimate: torch.Tensor, reference: torch.Tensor) -> torch.Tensor
            where input Tensors are shape of [batch, speakers, frame] and
            the output Tensor is shape of [batch, speakers].

    References:
        - Multi-talker Speech Separation with Utterance-level Permutation Invariant Training of
          Deep Recurrent Neural Networks
          Morten Kolbæk, Dong Yu, Zheng-Hua Tan and Jesper Jensen
          https://arxiv.org/abs/1703.06284
    """

    def __init__(self, utility_func):
        super().__init__()
        self.utility_func = utility_func

    def forward(
        self,
        estimate: torch.Tensor,
        reference: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        epsilon: float = 1e-8,
    ) -> torch.Tensor:
        """Compute utterance-level PIT loss.

        Args:
            estimate (torch.Tensor): Estimated source signals.
                Shape: [batch, speakers, time frame]
            reference (torch.Tensor): Reference (original) source signals.
                Shape: [batch, speakers, time frame]
            mask (torch.Tensor or None, optional): Binary mask to indicate padded value (0) or valid value (1).
                Shape: [batch, 1, time frame]
            epsilon (float, optional): constant value used to stabilize division.

        Returns:
            torch.Tensor: Maximum criterion over the speaker permutations.
                Shape: [batch, ]
        """
        assert estimate.shape == reference.shape

        batch_size, num_speakers = reference.shape[:2]
        num_permute = math.factorial(num_speakers)

        util_mat = torch.zeros(batch_size, num_permute, dtype=estimate.dtype, device=estimate.device)
        for i, idx in enumerate(permutations(range(num_speakers))):
            util = self.utility_func(estimate, reference[:, idx, :], mask=mask, epsilon=epsilon)
            util_mat[:, i] = util.mean(dim=1)  # take the average over speaker dimension
        return util_mat.max(dim=1).values


_sdr_pit = PIT(utility_func=sdr)


def sdr_pit(
    estimate: torch.Tensor, reference: torch.Tensor, mask: Optional[torch.Tensor] = None, epsilon: float = 1e-8
):
    """Computes scale-invariant source-to-distortion ratio with PIT.

    1. adjust both estimate and reference to have 0-mean
    2. scale the reference signal with power(s_est * s_ref) / power(s_ref * s_ref)
    3. compute SNR between adjusted estimate and reference.

    Args:
        estimate (torch.Tensor): Estimated signal.
            Shape: [batch, speakers (can be 1), time frame]
        reference (torch.Tensor): Reference signal.
            Shape: [batch, speakers, time frame]
        mask (torch.Tensor or None, optional): Binary mask to indicate padded value (0) or valid value (1).
            Shape: [batch, 1, time frame]
        epsilon (float, optional): constant value used to stabilize division.

    Returns:
        torch.Tensor: scale-invariant source-to-distortion ratio.
            Shape: [batch, ]

    Notes:
        This function is tested to produce the exact same result as the reference implementation,
        *when the inputs have 0-mean*
        https://github.com/naplab/Conv-TasNet/blob/e66d82a8f956a69749ec8a4ae382217faa097c5c/utility/sdr.py#L107-L153
    """
    return _sdr_pit(estimate, reference, mask, epsilon)


def sdri(
    estimate: torch.Tensor,
    reference: torch.Tensor,
    mix: torch.Tensor,
    mask: Optional[torch.Tensor] = None,
    epsilon: float = 1e-8,
) -> torch.Tensor:
    """Compute the improvement of SDR (SDRi).

    This function computes how much SDR is improved if the estimation is changed from
    the original mixture signal to the actual estimated source signals. That is,
    ``SDR(estimate, reference) - SDR(mix, reference)``.

    For computing ``SDR(estimate, reference)``, PIT (permutation invariant training) is applied,
    so that the best combination of sources between the reference signals and the estimate signals
    is picked.

    Args:
        estimate (torch.Tensor): Estimated source signals.
            Shape: [batch, speakers, time frame]
        reference (torch.Tensor): Reference (original) source signals.
            Shape: [batch, speakers, time frame]
        mix (torch.Tensor): Mixed source signals, from which the estimated signals were generated.
            Shape: [batch, speakers == 1, time frame]
        mask (torch.Tensor or None, optional): Binary mask to indicate padded value (0) or valid value (1).
            Shape: [batch, 1, time frame]
        epsilon (float, optional): constant value used to stabilize division.

    Returns:
        torch.Tensor: Improved SDR.
            Shape: [batch, ]

    References:
        - Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation
          Luo, Yi and Mesgarani, Nima
          https://arxiv.org/abs/1809.07454
    """
    sdr_ = sdr_pit(estimate, reference, mask=mask, epsilon=epsilon)  # [batch, ]
    base_sdr = sdr(mix, reference, mask=mask, epsilon=epsilon)  # [batch, speaker]
    return sdr_ - base_sdr.mean(dim=1)


def si_sdr_loss(estimate: torch.Tensor, reference: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    """Compute the Si-SDR loss.

    Args:
        estimate (torch.Tensor): Estimated source signals.
            Tensor of dimension (batch, speakers, time)
        reference (torch.Tensor): Reference (original) source signals.
            Tensor of dimension (batch, speakers, time)
        mask (torch.Tensor): Mask to indicate padded value (0) or valid value (1).
            Tensor of dimension (batch, 1, time)

    Returns:
        torch.Tensor: Si-SDR loss. Scalar (negative mean Si-SDR over the batch).
    """
    estimate = estimate - estimate.mean(axis=2, keepdim=True)
    reference = reference - reference.mean(axis=2, keepdim=True)

    # BUG FIX: the original called ``sdr.sdr_pit`` — but ``sdr`` is a
    # function in this module, not a module object, so that raised
    # AttributeError at runtime. Call the module-level sdr_pit directly.
    si_sdri = sdr_pit(estimate, reference, mask=mask)
    return -si_sdri.mean()
import torch
import torchaudio
import sys

# Benchmark input-shape defaults shared by get_network / get_input.
ACOUSTIC_FEATURES_SIZE = 32
FRAME_COUNT = 1024
HOP_LENGTH = 36
N_FREQ = 128

# Model registries, grouped by audio task.
wav2vec_models = {
    "wav2vec2_base" : torchaudio.models.wav2vec2_base,
    "wav2vec2_large" : torchaudio.models.wav2vec2_large,
    "wav2vec2_large_lv60k" : torchaudio.models.wav2vec2_large_lv60k,
    "wav2vec2_xlsr_300m" : torchaudio.models.wav2vec2_xlsr_300m,
    "wav2vec2_xlsr_1b" : torchaudio.models.wav2vec2_xlsr_1b,
    "wav2vec2_xlsr_2b" : torchaudio.models.wav2vec2_xlsr_2b,
    "hubert_base" : torchaudio.models.hubert_base,
    "hubert_large" : torchaudio.models.hubert_large,
    "hubert_xlarge" : torchaudio.models.hubert_xlarge,
    "wavlm_base" : torchaudio.models.wavlm_base,
    "wavlm_large" : torchaudio.models.wavlm_large,
}

speech_recognition_models = {
    "conformer" : torchaudio.models.Conformer,
    "deepspeech" : torchaudio.models.DeepSpeech,
    "emformer" : torchaudio.models.Emformer,
    "wav2letter" : torchaudio.models.Wav2Letter
}

source_separation_models = {
    "conv_tasnet_base" : torchaudio.models.conv_tasnet_base,
    "hdemucs_low" : torchaudio.models.hdemucs_low,
    "hdemucs_medium" : torchaudio.models.hdemucs_medium,
    "hdemucs_high" : torchaudio.models.hdemucs_high,
}

speech_quality_models = {
    "squim_objective_base" : torchaudio.models.squim_objective_base,
    "squim_subjective_base" : torchaudio.models.squim_subjective_base
}

speech_synthesis_models = {
    "tacotron2" : torchaudio.models.Tacotron2,
    "wavernn" : torchaudio.models.WaveRNN
}

decoder_models = {
    "emformer_rnnt_base" : torchaudio.models.emformer_rnnt_base
}


def get_network_names():
    """Return the sorted union of every registry's model names."""
    names = []
    for registry in (wav2vec_models, speech_recognition_models,
                     source_separation_models, speech_quality_models,
                     speech_synthesis_models, decoder_models):
        names.extend(registry.keys())
    return sorted(names)


def get_network(network_name):
    """Construct *network_name* with benchmark defaults on CUDA; exit on unknown names."""
    if network_name in wav2vec_models:
        return wav2vec_models[network_name](aux_num_out=29).to(device="cuda")
    elif network_name in source_separation_models:
        if "hdemucs" in network_name:
            return source_separation_models[network_name](sources = ["vocals"]).to(device="cuda")
        else:
            return source_separation_models[network_name]().to(device="cuda")
    elif network_name in speech_recognition_models:
        if "deepspeech" in network_name:
            return speech_recognition_models[network_name](n_feature = ACOUSTIC_FEATURES_SIZE).to(device="cuda")
        elif "wav2letter" in network_name:
            return speech_recognition_models[network_name](num_features = ACOUSTIC_FEATURES_SIZE).to(device="cuda")
        elif "emformer" in network_name:
            return speech_recognition_models[network_name](input_dim = ACOUSTIC_FEATURES_SIZE,
                                                           num_heads=8,
                                                           ffn_dim=1024,
                                                           num_layers=20,
                                                           segment_length=4).to(device="cuda")
        elif "conformer" in network_name:
            return speech_recognition_models[network_name](input_dim = 80,
                                                           num_heads=4,
                                                           ffn_dim=128,
                                                           num_layers=4,
                                                           depthwise_conv_kernel_size=31).to(device="cuda")
    elif network_name in speech_quality_models:
        return speech_quality_models[network_name]().to(device="cuda")
    elif network_name in speech_synthesis_models:
        if "wavernn" in network_name:
            return speech_synthesis_models[network_name](upsample_scales = [3, 3, 4], n_classes = 10,
                                                         hop_length = HOP_LENGTH, n_freq = 128).to(device="cuda")
        else:
            return speech_synthesis_models[network_name]().to(device="cuda")
    elif network_name in decoder_models:
        return decoder_models[network_name](num_symbols = 256).to(device="cuda")
    else:
        print ("ERROR: not a supported model '%s'" % network_name)
        sys.exit(1)


def get_input_type(network_name):
    """Short human-readable description of the network's input modality."""
    if network_name in wav2vec_models or network_name in source_separation_models or network_name in speech_quality_models:
        return "waveform"
    elif network_name in speech_recognition_models:
        return "acoustic features"
    elif network_name in speech_synthesis_models:
        return "waveform" if "wavernn" in network_name else "tokens"
    elif network_name in decoder_models:
        return ""


def get_input(network_name, network, batch_size):
    """Build a kwargs dict of random CUDA tensors for network(**inp)."""
    if network_name in wav2vec_models:
        inp = {"waveforms": torch.randn(batch_size, FRAME_COUNT, device="cuda")}
    elif network_name in source_separation_models:
        channels = 2 if "hdemucs" in network_name else 1
        inp = {"input" : torch.randn(batch_size, channels, FRAME_COUNT, device="cuda")}
    elif network_name in speech_recognition_models:
        if "deepspeech" in network_name:
            # number of channels must be specified for deepspeech
            inp = {"x" : torch.randn(batch_size, 1, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda")}
        elif "wav2letter" in network_name:
            inp = {"x" : torch.randn(batch_size, ACOUSTIC_FEATURES_SIZE, FRAME_COUNT, device="cuda")}
        elif "emformer" in network_name:
            inp = {"input" : torch.randn(batch_size, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda"),
                   "lengths" : torch.randint(1, FRAME_COUNT, (batch_size,)).to(device="cuda")}
        elif "conformer" in network_name:
            lengths = torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda")
            inp = {"input" : torch.rand(batch_size, int(lengths.max()), 80, device="cuda"),
                   "lengths" : lengths}
    elif network_name in speech_quality_models:
        if "subjective" in network_name:
            inp = {"waveform" : torch.randn(batch_size, FRAME_COUNT, device="cuda"),
                   "reference" : torch.randn(batch_size, FRAME_COUNT, device="cuda")}
        else:
            inp = {"x" : torch.randn(batch_size, FRAME_COUNT, device="cuda")}
    elif network_name in speech_synthesis_models:
        if "wavernn" in network_name:
            spec_frames = 64
            waveform_length = HOP_LENGTH * (spec_frames - 4)
            inp = {"waveform" : torch.rand(batch_size, 1, waveform_length, device="cuda"),
                   "specgram": torch.rand(batch_size, 1, N_FREQ, spec_frames, device="cuda")}
        elif "tacotron2" in network_name:
            n_mels = 80
            max_mel_specgram_length = 300
            max_text_length = 100
            inp = {"tokens" : torch.randint(0, 148, (batch_size, max_text_length), dtype=torch.int32, device="cuda"),
                   "token_lengths" : max_text_length * torch.ones((batch_size,), device="cuda"),
                   "mel_specgram": torch.rand(batch_size, n_mels, max_mel_specgram_length, device="cuda"),
                   "mel_specgram_lengths" : max_mel_specgram_length * torch.ones((batch_size,), dtype=torch.int32, device="cuda")}
    elif network_name in decoder_models:
        right_context_length = 4
        max_input_length = 61
        max_target_length = 23
        inp = {"sources" : torch.rand(batch_size, max_input_length + right_context_length, 80, device="cuda"),
               "source_lengths" : torch.randint(1, max_input_length + 1, (batch_size,), device="cuda"),
               "targets" : torch.randint(0, 256, (batch_size, max_target_length), device="cuda"),
               "target_lengths" : torch.randint(1, max_target_length + 1, (batch_size,), device="cuda"),
               "predictor_state" : None}
    return inp


def get_output_selection(network_name):
    """Index of the trainable tensor in the output tuple, or None for plain tensors."""
    if network_name in wav2vec_models:
        return 0
    elif "conformer" in network_name or "emformer" in network_name:
        return 0
    elif "objective" in network_name:
        return 0
    elif "tacotron2" in network_name:
        return 1
    return None
+0000 Subject: [PATCH 08/22] add loss functions for more audio models, refactor the input and output code for the audio benchmarking code --- audio/audio_input.py | 64 ++++++++++++++++++++++++++++ audio/audio_loss.py | 22 ++++++---- audio/audio_model.py | 84 +++---------------------------------- audio/audio_output.py | 13 ++++++ micro_benchmarking_audio.py | 5 ++- 5 files changed, 100 insertions(+), 88 deletions(-) create mode 100644 audio/audio_input.py create mode 100644 audio/audio_output.py diff --git a/audio/audio_input.py b/audio/audio_input.py new file mode 100644 index 0000000..cd4b99a --- /dev/null +++ b/audio/audio_input.py @@ -0,0 +1,64 @@ +import torch +from audio.audio_model import * + + +def get_input_type(network_name): + if network_name in acoustic_models or network_name in source_separation_models or network_name in speech_quality_models: + return "waveform" + elif network_name in speech_recognition_models: + return "acoustic features" + elif network_name in speech_synthesis_models: + if "wavernn" in network_name: + return "waveform" + else: + return "tokens" + + +def get_input(network_name, network, batch_size): + if network_name in acoustic_models: + inp = {"waveforms": torch.randn(batch_size, FRAME_COUNT, device="cuda")} + elif network_name in source_separation_models: + if "hdemucs" in network_name: + inp = {"input" : torch.randn(batch_size, 2, FRAME_COUNT, device="cuda")} + else: + inp = {"input" : torch.randn(batch_size, 1, FRAME_COUNT, device="cuda")} + elif network_name in speech_recognition_models: + if "deepspeech" in network_name: + #number of channels must be specified for deepspeech + inp = {"x" : torch.randn(batch_size, 1, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda")} + elif "wav2letter" in network_name: + inp = {"x" : torch.randn(batch_size, ACOUSTIC_FEATURES_SIZE, FRAME_COUNT, device="cuda")} + elif "emformer" in network_name: + inp = {"input" : torch.randn(batch_size, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda"), + 
"lengths" : torch.randint(1, FRAME_COUNT, (batch_size,)).to(device="cuda")} + elif "conformer" in network_name: + lengths = torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda") + inp = {"input" : torch.rand(batch_size, int(lengths.max()), 80, device="cuda"), + "lengths" : lengths} + elif network_name in speech_quality_models: + if "subjective" in network_name: + inp = {"waveform" : torch.randn(batch_size, FRAME_COUNT, device="cuda"), + "reference" : torch.randn(batch_size, FRAME_COUNT, device="cuda")} + else: + inp = {"x" : torch.randn(batch_size, FRAME_COUNT, device="cuda")} + elif network_name in speech_synthesis_models: + if "wavernn" in network_name: + spec_frames = 64 + waveform_length = HOP_LENGTH * (spec_frames - 4) + + inp = {"waveform" : torch.rand(batch_size, 1, waveform_length, device="cuda"), + "specgram": torch.rand(batch_size, 1, N_FREQ, spec_frames, device="cuda")} + elif "tacotron2" in network_name: + n_mels = 80 + max_mel_specgram_length = 300 + max_text_length = 100 + inp = {"tokens" : torch.randint(0, 148, (batch_size, max_text_length), dtype=torch.int32, device="cuda"), + "token_lengths" : max_text_length * torch.ones((batch_size,), device="cuda"), + "mel_specgram": torch.rand(batch_size, n_mels, max_mel_specgram_length, device="cuda"), + "mel_specgram_lengths" : max_mel_specgram_length * torch.ones((batch_size,), dtype=torch.int32, device="cuda")} + elif network_name in hubert_pretrain_models: + + inp = {"waveforms" : torch.rand(batch_size, FRAME_COUNT, device="cuda"), + "labels" : torch.randint(0, 100, (batch_size, FRAME_COUNT), dtype=torch.int32, device="cuda"), + "audio_lengths" : torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda")} + return inp diff --git a/audio/audio_loss.py b/audio/audio_loss.py index 735dbb4..468e41e 100644 --- a/audio/audio_loss.py +++ b/audio/audio_loss.py @@ -4,25 +4,28 @@ from audio.hubert_loss import hubert_loss from audio.sdr import si_sdr_loss from torch import nn +from audio.audio_model import * 
def get_criterion(network_name): criterion = None - if network_name in ["wav2letter", "conformer", "deepspeech"] or "wav2vec2" in network_name: + if "hubert_pretrain" in network_name: + criterion = hubert_loss + elif network_name in speech_recognition_models or network_name in acoustic_models: char_blank = "*" char_space = " " char_apostrophe = "'" labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase language_model = LanguageModel(labels, char_blank, char_space) criterion = torch.nn.CTCLoss(blank=language_model.mapping[char_blank], zero_infinity=False) - elif "hubert_pretrain" in network_name: - criterion = hubert_loss elif "wavernn" in network_name: criterion = nn.CrossEntropyLoss() elif "conv_tasnet" in network_name: criterion = si_sdr_loss elif "tacotron2" in network_name: criterion = nn.MSELoss() + elif "hdemucs" in network_name or "squim" in network_name: + criterion = nn.L1Loss() return criterion @@ -30,7 +33,11 @@ def calculate_loss(network_name, criterion, output): if criterion is None: target = torch.randn_like(output) return torch.nn.functional.mse_loss(output, target) - if network_name in ["wav2letter", "conformer", "deepspeech"] or "wav2vec2" in network_name: + if "hubert_pretrain" in network_name: + print ("hubert", len(output), output[0].shape, output[1]) + logit_m, logit_u, feature_penalty = output + loss = criterion(logit_m, logit_u, feature_penalty) + elif network_name in speech_recognition_models or network_name in acoustic_models: output = output.transpose(-1, -2).transpose(0, 1) T, N, C = output.shape target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.long) @@ -42,10 +49,6 @@ def calculate_loss(network_name, criterion, output): ) tensors_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long) loss = criterion(output, target, tensors_lengths, target_lengths) - elif "hubert_pretrain" in network_name: - print ("hubert", len(output), output[0].shape, output[1]) - logit_m, logit_u, feature_penalty = 
output - loss = criterion(logit_m, logit_u, feature_penalty) elif "wavernn" in network_name: target = torch.randn_like(output) output, target = output.squeeze(1), target.squeeze(1) @@ -60,4 +63,7 @@ def calculate_loss(network_name, criterion, output): elif "tacotron2" in network_name: target = torch.randn_like(output) loss = criterion(output, target) + elif "hdemucs" in network_name or "squim" in network_name: + target = torch.randn_like(output) + loss = criterion(output, target) return loss \ No newline at end of file diff --git a/audio/audio_model.py b/audio/audio_model.py index f0de1ed..39b05f9 100644 --- a/audio/audio_model.py +++ b/audio/audio_model.py @@ -9,7 +9,7 @@ CLASSES_COUNT = 29 #different audio tasks related models -wav2vec_models = { +acoustic_models = { "wav2vec2_base" : torchaudio.models.wav2vec2_base, "wav2vec2_large" : torchaudio.models.wav2vec2_large, "wav2vec2_large_lv60k" : torchaudio.models.wav2vec2_large_lv60k, @@ -53,7 +53,7 @@ } def get_network_names(): - return sorted(list(wav2vec_models.keys()) + + return sorted(list(acoustic_models.keys()) + list(speech_recognition_models.keys()) + list(source_separation_models.keys()) + list(speech_quality_models.keys()) + @@ -62,8 +62,8 @@ def get_network_names(): def get_network(network_name): - if network_name in wav2vec_models: - return wav2vec_models[network_name](aux_num_out=CLASSES_COUNT).to(device="cuda") + if network_name in acoustic_models: + return acoustic_models[network_name](aux_num_out=CLASSES_COUNT).to(device="cuda") elif network_name in source_separation_models: if "hdemucs" in network_name: return source_separation_models[network_name](sources = ["vocals"]).to(device="cuda") @@ -98,78 +98,4 @@ def get_network(network_name): return hubert_pretrain_models[network_name]().to(device="cuda") else: print ("ERROR: not a supported model '%s'" % network_name) - sys.exit(1) - - -def get_input_type(network_name): - if network_name in wav2vec_models or network_name in source_separation_models or 
network_name in speech_quality_models: - return "waveform" - elif network_name in speech_recognition_models: - return "acoustic features" - elif network_name in speech_synthesis_models: - if "wavernn" in network_name: - return "waveform" - else: - return "tokens" - - -def get_input(network_name, network, batch_size): - if network_name in wav2vec_models: - inp = {"waveforms": torch.randn(batch_size, FRAME_COUNT, device="cuda")} - elif network_name in source_separation_models: - if "hdemucs" in network_name: - inp = {"input" : torch.randn(batch_size, 2, FRAME_COUNT, device="cuda")} - else: - inp = {"input" : torch.randn(batch_size, 1, FRAME_COUNT, device="cuda")} - elif network_name in speech_recognition_models: - if "deepspeech" in network_name: - #number of channels must be specified for deepspeech - inp = {"x" : torch.randn(batch_size, 1, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda")} - elif "wav2letter" in network_name: - inp = {"x" : torch.randn(batch_size, ACOUSTIC_FEATURES_SIZE, FRAME_COUNT, device="cuda")} - elif "emformer" in network_name: - inp = {"input" : torch.randn(batch_size, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda"), - "lengths" : torch.randint(1, FRAME_COUNT, (batch_size,)).to(device="cuda")} - elif "conformer" in network_name: - lengths = torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda") - inp = {"input" : torch.rand(batch_size, int(lengths.max()), 80, device="cuda"), - "lengths" : lengths} - elif network_name in speech_quality_models: - if "subjective" in network_name: - inp = {"waveform" : torch.randn(batch_size, FRAME_COUNT, device="cuda"), - "reference" : torch.randn(batch_size, FRAME_COUNT, device="cuda")} - else: - inp = {"x" : torch.randn(batch_size, FRAME_COUNT, device="cuda")} - elif network_name in speech_synthesis_models: - if "wavernn" in network_name: - spec_frames = 64 - waveform_length = HOP_LENGTH * (spec_frames - 4) - - inp = {"waveform" : torch.rand(batch_size, 1, waveform_length, device="cuda"), - 
"specgram": torch.rand(batch_size, 1, N_FREQ, spec_frames, device="cuda")} - elif "tacotron2" in network_name: - n_mels = 80 - max_mel_specgram_length = 300 - max_text_length = 100 - inp = {"tokens" : torch.randint(0, 148, (batch_size, max_text_length), dtype=torch.int32, device="cuda"), - "token_lengths" : max_text_length * torch.ones((batch_size,), device="cuda"), - "mel_specgram": torch.rand(batch_size, n_mels, max_mel_specgram_length, device="cuda"), - "mel_specgram_lengths" : max_mel_specgram_length * torch.ones((batch_size,), dtype=torch.int32, device="cuda")} - elif network_name in hubert_pretrain_models: - - inp = {"waveforms" : torch.rand(batch_size, FRAME_COUNT, device="cuda"), - "labels" : torch.randint(0, 100, (batch_size, FRAME_COUNT), dtype=torch.int32, device="cuda"), - "audio_lengths" : torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda")} - return inp - - -def get_output_selection(network_name): - if "wav2vec2" in network_name: - return 0 - elif "conformer" in network_name or "emformer" in network_name: - return 0 - elif "objective" in network_name: - return 0 - elif "tacotron2" in network_name: - return 1 - return None \ No newline at end of file + sys.exit(1) \ No newline at end of file diff --git a/audio/audio_output.py b/audio/audio_output.py new file mode 100644 index 0000000..90c08d3 --- /dev/null +++ b/audio/audio_output.py @@ -0,0 +1,13 @@ +from audio.audio_model import * + + +def get_output_selection(network_name): + if network_name in acoustic_models: + return 0 + elif "conformer" in network_name or "emformer" in network_name: + return 0 + elif "objective" in network_name: + return 0 + elif "tacotron2" in network_name: + return 1 + return None \ No newline at end of file diff --git a/micro_benchmarking_audio.py b/micro_benchmarking_audio.py index 7422144..ac07b95 100644 --- a/micro_benchmarking_audio.py +++ b/micro_benchmarking_audio.py @@ -10,8 +10,11 @@ import torch.multiprocessing as mp from fp16util import network_to_half, 
get_param_copy import torch.nn.functional as F -from audio.audio_model import get_network_names, get_network, get_input_type, get_input, get_output_selection +from audio.audio_model import get_network_names, get_network from audio.audio_loss import get_criterion, calculate_loss +from audio.audio_input import get_input_type, get_input +from audio.audio_output import get_output_selection + try: import torch._dynamo From 14050367a6ce570ec3d168e0ad3540d4ece20802 Mon Sep 17 00:00:00 2001 From: skishore Date: Wed, 29 Oct 2025 14:44:01 +0000 Subject: [PATCH 09/22] add hubert loss for hubert pretrained model --- audio/audio_input.py | 8 +++----- audio/audio_loss.py | 14 ++++++++++---- audio/audio_model.py | 10 ++++++---- audio/audio_output.py | 2 -- audio/hubert_loss.py | 36 ++++++++++++++++++++++++++++++++++++ 5 files changed, 55 insertions(+), 15 deletions(-) create mode 100644 audio/hubert_loss.py diff --git a/audio/audio_input.py b/audio/audio_input.py index cd4b99a..d7a58af 100644 --- a/audio/audio_input.py +++ b/audio/audio_input.py @@ -56,9 +56,7 @@ def get_input(network_name, network, batch_size): "token_lengths" : max_text_length * torch.ones((batch_size,), device="cuda"), "mel_specgram": torch.rand(batch_size, n_mels, max_mel_specgram_length, device="cuda"), "mel_specgram_lengths" : max_mel_specgram_length * torch.ones((batch_size,), dtype=torch.int32, device="cuda")} - elif network_name in hubert_pretrain_models: - + elif network_name in speech_representation_models: inp = {"waveforms" : torch.rand(batch_size, FRAME_COUNT, device="cuda"), - "labels" : torch.randint(0, 100, (batch_size, FRAME_COUNT), dtype=torch.int32, device="cuda"), - "audio_lengths" : torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda")} - return inp + "labels" : torch.randint(0, 100, (batch_size, 2), dtype=torch.int32, device="cuda")} + return inp \ No newline at end of file diff --git a/audio/audio_loss.py b/audio/audio_loss.py index 468e41e..57e0b5c 100644 --- a/audio/audio_loss.py 
+++ b/audio/audio_loss.py @@ -9,7 +9,7 @@ def get_criterion(network_name): criterion = None - if "hubert_pretrain" in network_name: + if network_name in speech_representation_models: criterion = hubert_loss elif network_name in speech_recognition_models or network_name in acoustic_models: char_blank = "*" @@ -33,8 +33,7 @@ def calculate_loss(network_name, criterion, output): if criterion is None: target = torch.randn_like(output) return torch.nn.functional.mse_loss(output, target) - if "hubert_pretrain" in network_name: - print ("hubert", len(output), output[0].shape, output[1]) + if network_name in speech_representation_models: logit_m, logit_u, feature_penalty = output loss = criterion(logit_m, logit_u, feature_penalty) elif network_name in speech_recognition_models or network_name in acoustic_models: @@ -63,7 +62,14 @@ def calculate_loss(network_name, criterion, output): elif "tacotron2" in network_name: target = torch.randn_like(output) loss = criterion(output, target) - elif "hdemucs" in network_name or "squim" in network_name: + elif "hdemucs" in network_name or "subjective" in network_name: target = torch.randn_like(output) loss = criterion(output, target) + elif "objective" in network_name: + for index in range(len(output)): + target = torch.randn_like(output[index]) + if index == 0: + loss = criterion(output[index], target) + else: + loss += criterion(output[index], target) return loss \ No newline at end of file diff --git a/audio/audio_model.py b/audio/audio_model.py index 39b05f9..e13787b 100644 --- a/audio/audio_model.py +++ b/audio/audio_model.py @@ -48,8 +48,10 @@ } -hubert_pretrain_models = { +speech_representation_models = { "hubert_pretrain_base" : torchaudio.models.hubert_pretrain_base, + "hubert_pretrain_large" : torchaudio.models.hubert_pretrain_large, + "hubert_pretrain_xlarge" : torchaudio.models.hubert_pretrain_xlarge } def get_network_names(): @@ -58,7 +60,7 @@ def get_network_names(): list(source_separation_models.keys()) + 
list(speech_quality_models.keys()) + list(speech_synthesis_models.keys()) + - list(hubert_pretrain_models.keys())) + list(speech_representation_models.keys())) def get_network(network_name): @@ -94,8 +96,8 @@ def get_network(network_name): hop_length = HOP_LENGTH, n_freq = 128).to(device="cuda") else: return speech_synthesis_models[network_name]().to(device="cuda") - elif network_name in hubert_pretrain_models: - return hubert_pretrain_models[network_name]().to(device="cuda") + elif network_name in speech_representation_models: + return speech_representation_models[network_name]().to(device="cuda") else: print ("ERROR: not a supported model '%s'" % network_name) sys.exit(1) \ No newline at end of file diff --git a/audio/audio_output.py b/audio/audio_output.py index 90c08d3..86c22bf 100644 --- a/audio/audio_output.py +++ b/audio/audio_output.py @@ -6,8 +6,6 @@ def get_output_selection(network_name): return 0 elif "conformer" in network_name or "emformer" in network_name: return 0 - elif "objective" in network_name: - return 0 elif "tacotron2" in network_name: return 1 return None \ No newline at end of file diff --git a/audio/hubert_loss.py b/audio/hubert_loss.py new file mode 100644 index 0000000..1edaebd --- /dev/null +++ b/audio/hubert_loss.py @@ -0,0 +1,36 @@ +from typing import Optional + +import torch +import torch.nn.functional as F +from torch import Tensor + + +def hubert_loss( + logit_m: Optional[Tensor], + logit_u: Optional[Tensor], + feature_penalty: Tensor, + masked_weight: float = 1.0, + unmasked_weight: float = 0.0, + feature_weight: float = 10.0, + reduction: str = "sum", +) -> Tensor: + """Compute the cross-entropy loss on HuBERT masked and non-masked logits. + Args: + logit_m (Tensor or None): The masked logit Tensor of dimension `(masked_frames, final_dim)`. + logit_u (Tensor or None): The non-masked logit Tensor of dimension `(unmasked_frames, final_dim)`. + feature_penalty (Tensor): The feature mean value for additional penalty loss. 
+ masked_weight (float, optional): The weight for masked cross-entropy loss (Default: ``1.0``). + unmasked_weight (float, optional): The weight for non-masked cross-entropy loss (Default: ``0.0``). + feature_weight (float, optional): The weight for feature penalty loss (Default: ``10.0``). + reduction (str, optional): The reduction method for cross-entropy loss (Default: ``"sum"``). + """ + loss = feature_penalty * feature_weight * logit_m.shape[0] + if logit_m is not None: + target_m = torch.zeros(logit_m.shape[0], dtype=torch.long, device=logit_m.device) + loss_m = F.cross_entropy(logit_m, target_m, reduction=reduction) + loss += loss_m * masked_weight + if logit_u is not None: + target_u = torch.zeros(logit_u.shape[0], dtype=torch.long, device=logit_m.device) + loss_u = F.cross_entropy(logit_u, target_u, reduction=reduction) + loss += loss_u * unmasked_weight + return loss \ No newline at end of file From b60ceaa9977fc86713d56e0d0179e6641ac7c70a Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 3 Feb 2026 18:17:26 +0000 Subject: [PATCH 10/22] add recent changes in pytorch microbenchmarking to apex microbenchmarking --- micro_benchmarking_apex.py | 170 ++++++++++++++++++++++++++++++------- 1 file changed, 141 insertions(+), 29 deletions(-) diff --git a/micro_benchmarking_apex.py b/micro_benchmarking_apex.py index ae1a422..8f5f729 100644 --- a/micro_benchmarking_apex.py +++ b/micro_benchmarking_apex.py @@ -19,6 +19,8 @@ from shufflenet import shufflenet from shufflenet_v2 import shufflenet as shufflenet_v2 from xception import xception +import csv +import json try: import torch._dynamo @@ -34,7 +36,10 @@ # this indicates we're using torchrun is_torchrun = True - +def xform(m: nn.Module) -> nn.Module: + m = m.cuda() + m.to(memory_format=torch.channels_last) + return m def weight_init(m): if isinstance(m, nn.Conv2d): @@ -144,29 +149,42 @@ def weight_init(m): def get_network_names(): return sorted(list(models.keys()) + list(segmentation_models.keys())) -def 
get_network(net): +def get_network(net, params): # aux_logits=False only used by inception_v3 if "inception_v3" == net: + if params.nhwc: + return xform(models[net](aux_logits=False)) return models[net](aux_logits=False).to(device="cuda") elif net in models: + if params.nhwc: + return xform(models[net]()) return models[net]().to(device="cuda") elif net in segmentation_models: + if params.nhwc: + return xform(segmentation_models[net]()) return segmentation_models[net]().to(device="cuda") else: print ("ERROR: not a supported model '%s'" % net) sys.exit(1) -def forwardbackward(inp, optimizer, network, target, amp_opt_level, flops_prof_step=0): - optimizer.zero_grad() +def forwardbackward(inp, optimizer, network, params, target, step=0, opt_step=1, flops_prof_step=0): + if step % opt_step == 0: + optimizer.zero_grad() if flops_prof_step: prof = FlopsProfiler(network) prof.start_profile() out = network(inp) - # WIP: googlenet, deeplabv3_*, fcn_* missing log_softmax for this to work - loss = torch.nn.functional.cross_entropy(out, target) - # End profiler here if only to profile forward pass + # If using HuggingFace model outputs logits, we need to extract them + if hasattr(out, 'logits'): + logits = out.logits + else: + logits = out + loss_fn = torch.nn.CrossEntropyLoss().to(device="cuda") + if params.nhwc: + loss_fn = loss_fn.to(memory_format=torch.channels_last) + loss = loss_fn(logits, target) - if amp_opt_level: + if params.amp_opt_level: with apex.amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: @@ -179,7 +197,29 @@ def forwardbackward(inp, optimizer, network, target, amp_opt_level, flops_prof_s prof.print_model_profile(profile_step=flops_prof_step) prof.end_profile() - optimizer.step() + if (step + 1) % opt_step == 0: + optimizer.step() + +def forward(inp, optimizer, network, params, target, step=0, opt_step=1, flops_prof_step=0): + + if flops_prof_step: + prof = FlopsProfiler(network) + prof.start_profile() + out = network(inp) + # If 
using HuggingFace model outputs logits, we need to extract them + if hasattr(out, 'logits'): + logits = out.logits + else: + logits = out + + if flops_prof_step: + # End profiler here to profile both fwd and bwd passes + # flops = prof.get_total_flops(as_string=True) + # params = prof.get_total_params(as_string=True) + prof.print_model_profile(profile_step=flops_prof_step) + prof.end_profile() + + return logits def rendezvous(distributed_parameters): print("Initializing process group...") @@ -250,7 +290,7 @@ def run_benchmarking(local_rank, params): else: torch.cuda.set_device("cuda:0") - network = get_network(net) + network = get_network(net, params) if "shufflenet" == net: network.apply(weight_init) @@ -282,6 +322,17 @@ def run_benchmarking(local_rank, params): print ("ERROR: requested torch.compile but this isn't pytorch 2.x") sys.exit(1) + ## MLPerf Setting + sgd_opt_base_learning_rate = 0.01 + sgd_opt_end_learning_rate = 1e-4 + sgd_opt_learning_rate_decay_poly_power = 2 + sgd_opt_weight_decay = 0.0001 + sgd_opt_momentum = 0.9 + opt_learning_rate_warmup_epochs = 5 + + total_epochs = params.iterations + optimizer = torch.optim.SGD(param_copy, lr = sgd_opt_base_learning_rate, momentum = sgd_opt_momentum, weight_decay=sgd_opt_weight_decay) + if (run_fp16): network = FP16Model(network) @@ -316,6 +367,8 @@ def run_benchmarking(local_rank, params): inp = torch.randn(batch_size, 3, 224, 224, device="cuda") if (run_fp16): inp = inp.half() + if params.nhwc: + inp = inp.to(memory_format=torch.channels_last) if net in models: # number of classes is 1000 for imagenet target = torch.randint(0, 1000, (batch_size,), device="cuda") @@ -323,10 +376,17 @@ def run_benchmarking(local_rank, params): # number of classes is 21 for segmentation target = torch.randint(0, 21, (batch_size,), device="cuda") + if params.mode == "training": + forward_fn = forwardbackward + network.train() + else: + forward_fn = forward + network.eval() + ## warmup. 
print ("INFO: running forward and backward for warmup.") - forwardbackward(inp, optimizer, network, target, amp_opt_level) - forwardbackward(inp, optimizer, network, target, amp_opt_level) + for i in range(2): + forward_fn(inp, optimizer, network, params, target, step=0, opt_step=params.opt_step) time.sleep(1) torch.cuda.synchronize() @@ -339,13 +399,19 @@ def run_benchmarking(local_rank, params): skip_first = 0, wait = 1, warmup = 2, - active = 2, + active = 5, repeat = 1, ) def trace_ready_callback(prof): - print("----------- Trace Ready -----------") - prof.export_chrome_trace(f"trace{prof.step_num}.json") + rank = 0 + if torch.distributed.is_available() and torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + if rank == 0: + print("----------- Trace Ready -----------") + prof.export_chrome_trace(f"{params.profiler_output}.json") + # print(f"----------- Rank {rank} Trace Ready -----------") + # prof.export_chrome_trace(f"{params.profiler_output}_rank{rank}.json") tm = time.time() with profile( @@ -354,7 +420,7 @@ def trace_ready_callback(prof): on_trace_ready=trace_ready_callback) as prof: for i in range(iterations): with record_function(f"iteration {i}"): - forwardbackward(inp, optimizer, network, target, amp_opt_level) + forward_fn(inp, optimizer, network, params, target, step=i, opt_step=params.opt_step) prof.step() torch.cuda.synchronize() print(prof.key_averages().table(sort_by="cuda_time_total")) @@ -363,9 +429,10 @@ def trace_ready_callback(prof): with torch.autograd.profiler.emit_nvtx(enabled=autograd_profiler): for i in range(iterations): if i == flops_prof_step: - forwardbackward(inp, optimizer, network, target, amp_opt_level, i) + forward_fn(inp, optimizer, network, params, target, step=i, opt_step=params.opt_step, + flops_prof_step = i) else: - forwardbackward(inp, optimizer, network, target, amp_opt_level) + forward_fn(inp, optimizer, network, params, target, step=i, opt_step=params.opt_step) torch.cuda.synchronize() tm2 = 
time.time() @@ -386,27 +453,64 @@ def trace_ready_callback(prof): else: dtype = 'FP32' + result = None + if not params.output_dir: + params.output_dir = "." + print ("OK: finished running benchmark..") print ("--------------------SUMMARY--------------------------") print ("Microbenchmark for network : {}".format(net)) if distributed_dataparallel or is_torchrun: - print ("--------This process: rank " + str(distributed_parameters['rank']) + "--------"); - print ("Num devices: 1") + print ("--------This process: rank " + str(distributed_parameters['rank']) + "--------"); + print ("Num devices: 1") else: - print ("Num devices: {}".format(ngpus)) + print ("Num devices: {}".format(ngpus)) + result = { + "Name": params.output_file, + "GPUs": 1, + "Mini batch size [img]": batch_size, + "Mini batch size [img/gpu]": batch_size, + "Throughput [img/sec]": batch_size / time_per_batch, + "Time per mini-batch": time_per_batch + } + with open(f"{params.output_dir}/{params.output_file}.json", "w") as f: + json.dump(result, f, indent=2) + print ("Dtype: {}".format(dtype)) print ("Mini batch size [img] : {}".format(batch_size)) print ("Time per mini-batch : {}".format(time_per_batch)) print ("Throughput [img/sec] : {}".format(batch_size/time_per_batch)) if (distributed_dataparallel or is_torchrun) and distributed_parameters['rank'] == 0: - print ("") - print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------") - world_size = distributed_parameters['world_size'] - print ("Num devices: {}".format(world_size)) - print ("Dtype: {}".format(dtype)) - print ("Mini batch size [img] : {}".format(batch_size*world_size)) - print ("Time per mini-batch : {}".format(time_per_batch)) - print ("Throughput [img/sec] : {}".format(batch_size*world_size/time_per_batch)) + print ("") + print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------") + world_size = distributed_parameters['world_size'] + print ("Num devices: 
{}".format(world_size)) + print ("Dtype: {}".format(dtype)) + print ("Mini batch size [img] : {}".format(batch_size*world_size)) + print ("Time per mini-batch : {}".format(time_per_batch)) + print ("Throughput [img/sec] : {}".format(batch_size*world_size/time_per_batch)) + result = { + "Name": params.output_file, + "GPUs": distributed_parameters['world_size'], + "Mini batch size [img]": batch_size * distributed_parameters['world_size'], + "Mini batch size [img/gpu]": batch_size, + "Throughput [img/sec]": batch_size * distributed_parameters['world_size'] / time_per_batch, + "Time per mini-batch": time_per_batch + } + with open(f"{params.output_dir}/{params.output_file}.json", "w") as f: + json.dump(result, f, indent=2) + + csv_filename = f"{params.output_dir}/benchmark_summary.csv" + if params.csv_file: + csv_filename = params.csv_file + file_exists = os.path.isfile(csv_filename) + if result: + with open(csv_filename, "a", newline='') as csvfile: + writer = csv.writer(csvfile) + if not file_exists: + writer.writerow(result.keys()) + writer.writerow(result.values()) + print(f"Benchmark result saved to {csv_filename}") def main(): run_benchmarking_wrapper(copy.deepcopy(args)) @@ -432,6 +536,14 @@ def main(): parser.add_argument('--sync_bn', action='store_true', help='enabling apex sync BN.') parser.add_argument('--keep-batchnorm-fp32', type=str, default=None) parser.add_argument('--loss-scale', type=str, default=None) + parser.add_argument("--csv-file", type=str, default=None, required=False, help="assign output csv file name.") + parser.add_argument("--mode", type=str, choices=['training', 'inference'], default="training", help="Select mode: training or inference") + parser.add_argument("--nhwc", action='store_true', default=False, help="Use nhwc format") + parser.add_argument("--opt-step", type=int, required=False, default=1, help="Optimizer update step") + parser.add_argument("--output-dir", type=str, default="", help="assign output directory name.") + 
parser.add_argument("--output-file", type=str, default="", help="assign output file name.") + parser.add_argument("--profiler-output", type=str, default="", help="assign profiler output name.") + args = parser.parse_args() From cc26b2f0a4c57485053459ba2e90f0e322e8b726 Mon Sep 17 00:00:00 2001 From: skishore Date: Sun, 8 Feb 2026 21:34:50 +0000 Subject: [PATCH 11/22] add methods to calculate the target --- audio/audio_loss.py | 26 +++++++---------------- audio/audio_output.py | 41 ++++++++++++++++++++++++++++++++++++- micro_benchmarking_audio.py | 18 ++++++++-------- 3 files changed, 57 insertions(+), 28 deletions(-) diff --git a/audio/audio_loss.py b/audio/audio_loss.py index 57e0b5c..3a0d857 100644 --- a/audio/audio_loss.py +++ b/audio/audio_loss.py @@ -29,7 +29,7 @@ def get_criterion(network_name): return criterion -def calculate_loss(network_name, criterion, output): +def calculate_loss(network_name, criterion, output, target, batch_size): if criterion is None: target = torch.randn_like(output) return torch.nn.functional.mse_loss(output, target) @@ -39,37 +39,25 @@ def calculate_loss(network_name, criterion, output): elif network_name in speech_recognition_models or network_name in acoustic_models: output = output.transpose(-1, -2).transpose(0, 1) T, N, C = output.shape - target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.long) - target = torch.randint( - low=1, - high=C, - size=(sum(target_lengths),), - dtype=torch.long, - ) + target, target_lengths = target tensors_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long) loss = criterion(output, target, tensors_lengths, target_lengths) elif "wavernn" in network_name: - target = torch.randn_like(output) - output, target = output.squeeze(1), target.squeeze(1) + output = output.squeeze(1) output = output.transpose(1, 2) - target = target.transpose(1, 2) loss = criterion(output, target) elif "conv_tasnet" in network_name: - batch, _, time = output.shape - mask = torch.randint(low=0, 
high=1, size=(batch,1,time), dtype=torch.long).cuda() - target = torch.randn_like(output) + target, mask = target loss = criterion(output, target, mask) elif "tacotron2" in network_name: - target = torch.randn_like(output) loss = criterion(output, target) elif "hdemucs" in network_name or "subjective" in network_name: - target = torch.randn_like(output) loss = criterion(output, target) elif "objective" in network_name: + loss = 0 for index in range(len(output)): - target = torch.randn_like(output[index]) if index == 0: - loss = criterion(output[index], target) + loss = criterion(output[index], target[index]) else: - loss += criterion(output[index], target) + loss += criterion(output[index], target[index]) return loss \ No newline at end of file diff --git a/audio/audio_output.py b/audio/audio_output.py index 86c22bf..6110bcf 100644 --- a/audio/audio_output.py +++ b/audio/audio_output.py @@ -8,4 +8,43 @@ def get_output_selection(network_name): return 0 elif "tacotron2" in network_name: return 1 - return None \ No newline at end of file + return None + +def create_target(network_name, network, input, batch_size): + + #get output + output = network(**input) + output_index = get_output_selection(network_name) + if output_index is not None: + output = output[output_index] + + target = None + if network_name in speech_recognition_models or network_name in acoustic_models: + output = output.transpose(-1, -2).transpose(0, 1) + T, N, C = output.shape + target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.long) + target = torch.randint( + low=1, + high=C, + size=(sum(target_lengths),), + dtype=torch.long, + ) + target = [target, target_lengths] + elif "wavernn" in network_name: + target = torch.randn_like(output) + target = target.squeeze(1) + target = target.transpose(1, 2) + elif "conv_tasnet" in network_name: + batch, _, time = output.shape + mask = torch.randint(low=0, high=1, size=(batch,1,time), dtype=torch.long).cuda() + target = 
torch.randn_like(output) + target = [target, mask] + elif "tacotron2" in network_name: + target = torch.randn_like(output) + elif "hdemucs" in network_name or "subjective" in network_name: + target = torch.randn_like(output) + elif "objective" in network_name: + target = [] + for index in range(len(output)): + target.append(torch.randn_like(output[index])) + return target \ No newline at end of file diff --git a/micro_benchmarking_audio.py b/micro_benchmarking_audio.py index ac07b95..ba58bc1 100644 --- a/micro_benchmarking_audio.py +++ b/micro_benchmarking_audio.py @@ -13,7 +13,7 @@ from audio.audio_model import get_network_names, get_network from audio.audio_loss import get_criterion, calculate_loss from audio.audio_input import get_input_type, get_input -from audio.audio_output import get_output_selection +from audio.audio_output import get_output_selection, create_target try: @@ -48,7 +48,7 @@ def weight_init(m): m.bias.data.zero_() -def forwardbackward(inp, optimizer, network, amp_opt_level, network_name, batch_size, criterion, flops_prof_step=0): +def forwardbackward(inp, optimizer, network, amp_opt_level, network_name, batch_size, criterion, target, flops_prof_step=0): optimizer.zero_grad() if flops_prof_step: prof = FlopsProfiler(network) @@ -59,7 +59,7 @@ def forwardbackward(inp, optimizer, network, amp_opt_level, network_name, batch_ if output_index is not None: out = out[output_index] - loss = calculate_loss(network_name, criterion, out) + loss = calculate_loss(network_name, criterion, out, target, batch_size) # End profiler here if only to profile forward pass @@ -206,11 +206,13 @@ def run_benchmarking(local_rank, params): if (run_fp16): inp = inp.half() + + target = create_target(net, network, inp, batch_size) ## warmup. 
print ("INFO: running forward and backward for warmup.") - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion) - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion) + for i in range(2): + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target) time.sleep(1) torch.cuda.synchronize() @@ -238,7 +240,7 @@ def trace_ready_callback(prof): on_trace_ready=trace_ready_callback) as prof: for i in range(iterations): with record_function(f"iteration {i}"): - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target) prof.step() torch.cuda.synchronize() print(prof.key_averages().table(sort_by="cuda_time_total")) @@ -247,9 +249,9 @@ def trace_ready_callback(prof): with torch.autograd.profiler.emit_nvtx(enabled=autograd_profiler): for i in range(iterations): if i == flops_prof_step: - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, i) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target, i) else: - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target) torch.cuda.synchronize() tm2 = time.time() From 3724039242fcb65e5d908beb33c953b3add3b724 Mon Sep 17 00:00:00 2001 From: skishore Date: Sun, 8 Feb 2026 22:31:31 +0000 Subject: [PATCH 12/22] add loss function for squim objective --- audio/audio_loss.py | 4 +++- audio/audio_output.py | 1 + micro_benchmarking_audio.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/audio/audio_loss.py b/audio/audio_loss.py index 3a0d857..6464544 100644 --- a/audio/audio_loss.py +++ b/audio/audio_loss.py @@ -29,7 +29,7 @@ def get_criterion(network_name): return criterion -def calculate_loss(network_name, criterion, output, 
target, batch_size): +def calculate_loss(network_name, criterion, output, target, batch_size, input): if criterion is None: target = torch.randn_like(output) return torch.nn.functional.mse_loss(output, target) @@ -55,9 +55,11 @@ def calculate_loss(network_name, criterion, output, target, batch_size): loss = criterion(output, target) elif "objective" in network_name: loss = 0 + weights = [1, 2, 0.5, 2] for index in range(len(output)): if index == 0: loss = criterion(output[index], target[index]) else: loss += criterion(output[index], target[index]) + loss += criterion(input["x"], target[3]) return loss \ No newline at end of file diff --git a/audio/audio_output.py b/audio/audio_output.py index 6110bcf..cf9c03b 100644 --- a/audio/audio_output.py +++ b/audio/audio_output.py @@ -47,4 +47,5 @@ def create_target(network_name, network, input, batch_size): target = [] for index in range(len(output)): target.append(torch.randn_like(output[index])) + target.append(torch.randn_like(input["x"])) return target \ No newline at end of file diff --git a/micro_benchmarking_audio.py b/micro_benchmarking_audio.py index ba58bc1..ed7509f 100644 --- a/micro_benchmarking_audio.py +++ b/micro_benchmarking_audio.py @@ -59,7 +59,7 @@ def forwardbackward(inp, optimizer, network, amp_opt_level, network_name, batch_ if output_index is not None: out = out[output_index] - loss = calculate_loss(network_name, criterion, out, target, batch_size) + loss = calculate_loss(network_name, criterion, out, target, batch_size, inp) # End profiler here if only to profile forward pass From 714a1421a4c5cb3f62bd456b654e98cfb8bd2a9e Mon Sep 17 00:00:00 2001 From: skishore Date: Mon, 9 Feb 2026 08:36:28 +0000 Subject: [PATCH 13/22] correct usage of squim subjective model --- audio/audio_loss.py | 2 ++ audio/audio_output.py | 1 + 2 files changed, 3 insertions(+) diff --git a/audio/audio_loss.py b/audio/audio_loss.py index 6464544..fdc3ce4 100644 --- a/audio/audio_loss.py +++ b/audio/audio_loss.py @@ -26,6 +26,8 @@ 
def get_criterion(network_name): criterion = nn.MSELoss() elif "hdemucs" in network_name or "squim" in network_name: criterion = nn.L1Loss() + elif "subjective" in network_name: + criterion = nn.L1Loss() return criterion diff --git a/audio/audio_output.py b/audio/audio_output.py index cf9c03b..a4fcf8d 100644 --- a/audio/audio_output.py +++ b/audio/audio_output.py @@ -15,6 +15,7 @@ def create_target(network_name, network, input, batch_size): #get output output = network(**input) output_index = get_output_selection(network_name) + print("output", output.shape) if output_index is not None: output = output[output_index] From ad68179d4983092ba44f1bf597419e2a876ee6e4 Mon Sep 17 00:00:00 2001 From: skishore Date: Mon, 9 Feb 2026 10:08:59 +0000 Subject: [PATCH 14/22] refactor audio loss code to combine conditions --- audio/audio_loss.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/audio/audio_loss.py b/audio/audio_loss.py index fdc3ce4..9b3fdf6 100644 --- a/audio/audio_loss.py +++ b/audio/audio_loss.py @@ -24,17 +24,12 @@ def get_criterion(network_name): criterion = si_sdr_loss elif "tacotron2" in network_name: criterion = nn.MSELoss() - elif "hdemucs" in network_name or "squim" in network_name: - criterion = nn.L1Loss() - elif "subjective" in network_name: + elif "hdemucs" in network_name or "squim" in network_name or "subjective" in network_name: criterion = nn.L1Loss() return criterion def calculate_loss(network_name, criterion, output, target, batch_size, input): - if criterion is None: - target = torch.randn_like(output) - return torch.nn.functional.mse_loss(output, target) if network_name in speech_representation_models: logit_m, logit_u, feature_penalty = output loss = criterion(logit_m, logit_u, feature_penalty) @@ -51,9 +46,7 @@ def calculate_loss(network_name, criterion, output, target, batch_size, input): elif "conv_tasnet" in network_name: target, mask = target loss = criterion(output, target, mask) - elif "tacotron2" in 
network_name: - loss = criterion(output, target) - elif "hdemucs" in network_name or "subjective" in network_name: + elif "tacotron2" in network_name or "hdemucs" in network_name or "subjective" in network_name: loss = criterion(output, target) elif "objective" in network_name: loss = 0 From d5b520cb17d99e1caa88a2a03a38935a4169656c Mon Sep 17 00:00:00 2001 From: skishore Date: Mon, 9 Feb 2026 10:21:35 +0000 Subject: [PATCH 15/22] refactor audio loss code to combine conditions --- audio/audio_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/audio/audio_loss.py b/audio/audio_loss.py index 9b3fdf6..4d0cd89 100644 --- a/audio/audio_loss.py +++ b/audio/audio_loss.py @@ -24,7 +24,7 @@ def get_criterion(network_name): criterion = si_sdr_loss elif "tacotron2" in network_name: criterion = nn.MSELoss() - elif "hdemucs" in network_name or "squim" in network_name or "subjective" in network_name: + elif "hdemucs" in network_name or "squim" in network_name: criterion = nn.L1Loss() return criterion From 870711b8b8d1293d9bd51430c25b81c691d4c192 Mon Sep 17 00:00:00 2001 From: skishore Date: Mon, 9 Feb 2026 10:26:23 +0000 Subject: [PATCH 16/22] add error messages for undefined input, target, model, loss function for undefined network names --- audio/audio_input.py | 3 +++ audio/audio_loss.py | 7 +++++++ audio/audio_output.py | 3 +++ 3 files changed, 13 insertions(+) diff --git a/audio/audio_input.py b/audio/audio_input.py index d7a58af..22e1288 100644 --- a/audio/audio_input.py +++ b/audio/audio_input.py @@ -59,4 +59,7 @@ def get_input(network_name, network, batch_size): elif network_name in speech_representation_models: inp = {"waveforms" : torch.rand(batch_size, FRAME_COUNT, device="cuda"), "labels" : torch.randint(0, 100, (batch_size, 2), dtype=torch.int32, device="cuda")} + else: + print (f"Input for {network_name} not defined") + sys.exit(1) return inp \ No newline at end of file diff --git a/audio/audio_loss.py b/audio/audio_loss.py index 
4d0cd89..a70a151 100644 --- a/audio/audio_loss.py +++ b/audio/audio_loss.py @@ -26,10 +26,14 @@ def get_criterion(network_name): criterion = nn.MSELoss() elif "hdemucs" in network_name or "squim" in network_name: criterion = nn.L1Loss() + else: + print (f"Criterion for network name {network_name} not defined") + sys.exit(1) return criterion def calculate_loss(network_name, criterion, output, target, batch_size, input): + loss = 0 if network_name in speech_representation_models: logit_m, logit_u, feature_penalty = output loss = criterion(logit_m, logit_u, feature_penalty) @@ -57,4 +61,7 @@ def calculate_loss(network_name, criterion, output, target, batch_size, input): else: loss += criterion(output[index], target[index]) loss += criterion(input["x"], target[3]) + else: + print (f"Loss function for {network_name} not defined") + sys.exit(1) return loss \ No newline at end of file diff --git a/audio/audio_output.py b/audio/audio_output.py index a4fcf8d..fffd546 100644 --- a/audio/audio_output.py +++ b/audio/audio_output.py @@ -49,4 +49,7 @@ def create_target(network_name, network, input, batch_size): for index in range(len(output)): target.append(torch.randn_like(output[index])) target.append(torch.randn_like(input["x"])) + else: + print (f"Target for {network_name} not defined") + sys.exit(1) return target \ No newline at end of file From 007e0cfd30770d99d017d11d1ba962873bdb47bd Mon Sep 17 00:00:00 2001 From: Sriram Kumar Date: Mon, 9 Feb 2026 04:41:26 -0600 Subject: [PATCH 17/22] fix error related to sdr loss --- audio/audio_output.py | 2 +- audio/sdr.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/audio/audio_output.py b/audio/audio_output.py index fffd546..db29faa 100644 --- a/audio/audio_output.py +++ b/audio/audio_output.py @@ -15,7 +15,6 @@ def create_target(network_name, network, input, batch_size): #get output output = network(**input) output_index = get_output_selection(network_name) - print("output", output.shape) if output_index is 
not None: output = output[output_index] @@ -43,6 +42,7 @@ def create_target(network_name, network, input, batch_size): elif "tacotron2" in network_name: target = torch.randn_like(output) elif "hdemucs" in network_name or "subjective" in network_name: + print("output", output.shape) target = torch.randn_like(output) elif "objective" in network_name: target = [] diff --git a/audio/sdr.py b/audio/sdr.py index 011bb56..4053b6c 100644 --- a/audio/sdr.py +++ b/audio/sdr.py @@ -214,5 +214,5 @@ def si_sdr_loss(estimate: torch.Tensor, reference: torch.Tensor, mask: torch.Ten estimate = estimate - estimate.mean(axis=2, keepdim=True) reference = reference - reference.mean(axis=2, keepdim=True) - si_sdri = sdr.sdr_pit(estimate, reference, mask=mask) + si_sdri = sdr_pit(estimate, reference, mask=mask) return -si_sdri.mean() \ No newline at end of file From 768bf9c704170d311c4622317582397564064122 Mon Sep 17 00:00:00 2001 From: Sriram Kumar Date: Mon, 9 Feb 2026 07:07:30 -0600 Subject: [PATCH 18/22] fix error related to hdmucs loss --- audio/audio_loss.py | 10 ++++++++-- audio/audio_output.py | 1 - 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/audio/audio_loss.py b/audio/audio_loss.py index a70a151..817437a 100644 --- a/audio/audio_loss.py +++ b/audio/audio_loss.py @@ -24,7 +24,9 @@ def get_criterion(network_name): criterion = si_sdr_loss elif "tacotron2" in network_name: criterion = nn.MSELoss() - elif "hdemucs" in network_name or "squim" in network_name: + elif "hdemucs" in network_name: + criterion = nn.L1Loss(reduction='none') + elif "squim" in network_name: criterion = nn.L1Loss() else: print (f"Criterion for network name {network_name} not defined") @@ -50,7 +52,7 @@ def calculate_loss(network_name, criterion, output, target, batch_size, input): elif "conv_tasnet" in network_name: target, mask = target loss = criterion(output, target, mask) - elif "tacotron2" in network_name or "hdemucs" in network_name or "subjective" in network_name: + elif "tacotron2" in 
network_name or "subjective" in network_name: loss = criterion(output, target) elif "objective" in network_name: loss = 0 @@ -61,6 +63,10 @@ def calculate_loss(network_name, criterion, output, target, batch_size, input): else: loss += criterion(output[index], target[index]) loss += criterion(input["x"], target[3]) + elif "hdemucs" in network_name: + dims = tuple(range(2, target.dim())) + loss = criterion(output, target) + loss = loss.mean(dims).mean(0) else: print (f"Loss function for {network_name} not defined") sys.exit(1) diff --git a/audio/audio_output.py b/audio/audio_output.py index db29faa..c5c5cb6 100644 --- a/audio/audio_output.py +++ b/audio/audio_output.py @@ -42,7 +42,6 @@ def create_target(network_name, network, input, batch_size): elif "tacotron2" in network_name: target = torch.randn_like(output) elif "hdemucs" in network_name or "subjective" in network_name: - print("output", output.shape) target = torch.randn_like(output) elif "objective" in network_name: target = [] From 8696de9c75aa0b3c3d30e635d446a6346f56218a Mon Sep 17 00:00:00 2001 From: Sriram Kumar Date: Mon, 9 Feb 2026 08:21:09 -0600 Subject: [PATCH 19/22] apply some recent pytorch changes to torchaudio benchmark --- micro_benchmarking_audio.py | 114 ++++++++++++++++++++++++++++-------- 1 file changed, 91 insertions(+), 23 deletions(-) diff --git a/micro_benchmarking_audio.py b/micro_benchmarking_audio.py index ed7509f..fc74d3c 100644 --- a/micro_benchmarking_audio.py +++ b/micro_benchmarking_audio.py @@ -14,7 +14,8 @@ from audio.audio_loss import get_criterion, calculate_loss from audio.audio_input import get_input_type, get_input from audio.audio_output import get_output_selection, create_target - +import csv +import json try: import torch._dynamo @@ -48,8 +49,9 @@ def weight_init(m): m.bias.data.zero_() -def forwardbackward(inp, optimizer, network, amp_opt_level, network_name, batch_size, criterion, target, flops_prof_step=0): - optimizer.zero_grad() +def forwardbackward(inp, optimizer, 
network, amp_opt_level, network_name, batch_size, criterion, target, step=0, opt_step=1, flops_prof_step=0): + if step % opt_step == 0: + optimizer.zero_grad() if flops_prof_step: prof = FlopsProfiler(network) prof.start_profile() @@ -73,7 +75,8 @@ def forwardbackward(inp, optimizer, network, amp_opt_level, network_name, batch_ prof.print_model_profile(profile_step=flops_prof_step) prof.end_profile() - optimizer.step() + if (step + 1) % opt_step == 0: + optimizer.step() def rendezvous(distributed_parameters): print("Initializing process group...") @@ -183,7 +186,17 @@ def run_benchmarking(local_rank, params): param_copy = network.parameters() if (run_fp16): param_copy = get_param_copy(network) - optimizer = torch.optim.SGD(param_copy, lr = 0.01, momentum = 0.9) + + ## MLPerf Setting + sgd_opt_base_learning_rate = 0.01 + sgd_opt_end_learning_rate = 1e-4 + sgd_opt_learning_rate_decay_poly_power = 2 + sgd_opt_weight_decay = 0.0001 + sgd_opt_momentum = 0.9 + opt_learning_rate_warmup_epochs = 5 + + total_epochs = params.iterations + optimizer = torch.optim.SGD(param_copy, lr = sgd_opt_base_learning_rate, momentum = sgd_opt_momentum, weight_decay=sgd_opt_weight_decay) if (amp_opt_level): network, optimizer = apex.amp.initialize(network, optimizer, opt_level="O%d"%amp_opt_level) @@ -208,11 +221,18 @@ def run_benchmarking(local_rank, params): inp = inp.half() target = create_target(net, network, inp, batch_size) + + if params.mode == "training": + forward_fn = forwardbackward + network.train() + else: + forward_fn = forward + network.eval() ## warmup. 
print ("INFO: running forward and backward for warmup.") for i in range(2): - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target) + forward_fn(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target, step=0, opt_step=params.opt_step) time.sleep(1) torch.cuda.synchronize() @@ -225,13 +245,19 @@ def run_benchmarking(local_rank, params): skip_first = 0, wait = 1, warmup = 2, - active = 2, + active = 5, repeat = 1, ) def trace_ready_callback(prof): - print("----------- Trace Ready -----------") - prof.export_chrome_trace(f"trace{prof.step_num}.json") + rank = 0 + if torch.distributed.is_available() and torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + if rank == 0: + print("----------- Trace Ready -----------") + prof.export_chrome_trace(f"{params.profiler_output}.json") + # print(f"----------- Rank {rank} Trace Ready -----------") + # prof.export_chrome_trace(f"{params.profiler_output}_rank{rank}.json") tm = time.time() with profile( @@ -240,7 +266,7 @@ def trace_ready_callback(prof): on_trace_ready=trace_ready_callback) as prof: for i in range(iterations): with record_function(f"iteration {i}"): - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target) + forward_fn(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target, step=i, opt_step=params.opt_step) prof.step() torch.cuda.synchronize() print(prof.key_averages().table(sort_by="cuda_time_total")) @@ -249,9 +275,9 @@ def trace_ready_callback(prof): with torch.autograd.profiler.emit_nvtx(enabled=autograd_profiler): for i in range(iterations): if i == flops_prof_step: - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target, i) + forward_fn(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target, step=i, opt_step=params.opt_step, flops_prof_step=i) else: - forwardbackward(inp, optimizer, network, amp_opt_level, net, 
batch_size, criterion, target) + forward_fn(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target, step=i, opt_step=params.opt_step) torch.cuda.synchronize() tm2 = time.time() @@ -272,27 +298,63 @@ def trace_ready_callback(prof): else: dtype = 'FP32' + result = None + if not params.output_dir: + params.output_dir = "." + print ("OK: finished running benchmark..") print ("--------------------SUMMARY--------------------------") print ("Microbenchmark for network : {}".format(net)) if distributed_dataparallel or is_torchrun: - print ("--------This process: rank " + str(distributed_parameters['rank']) + "--------"); - print ("Num devices: 1") + print ("--------This process: rank " + str(distributed_parameters['rank']) + "--------"); + print ("Num devices: 1") else: - print ("Num devices: {}".format(ngpus)) + print ("Num devices: {}".format(ngpus)) + result = { + "Name": params.output_file, + "GPUs": 1, + "Mini batch size [img]": batch_size, + "Mini batch size [img/gpu]": batch_size, + "Throughput [img/sec]": batch_size / time_per_batch, + "Time per mini-batch": time_per_batch + } + with open(f"{params.output_dir}/{params.output_file}.json", "w") as f: + json.dump(result, f, indent=2) print ("Dtype: {}".format(dtype)) print ("Mini batch size [", get_input_type(net), "] : {}".format(batch_size)) print ("Time per mini-batch : {}".format(time_per_batch)) print ("Throughput [", get_input_type(net), "/sec] : {}".format(batch_size/time_per_batch)) if (distributed_dataparallel or is_torchrun) and distributed_parameters['rank'] == 0: - print ("") - print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------") - world_size = distributed_parameters['world_size'] - print ("Num devices: {}".format(world_size)) - print ("Dtype: {}".format(dtype)) - print ("Mini batch size [", get_input_type(net), "] : {}".format(batch_size*world_size)) - print ("Time per mini-batch : {}".format(time_per_batch)) - print ("Throughput [", 
get_input_type(net), "/sec] : {}".format(batch_size*world_size/time_per_batch)) + print ("") + print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------") + world_size = distributed_parameters['world_size'] + print ("Num devices: {}".format(world_size)) + print ("Dtype: {}".format(dtype)) + print ("Mini batch size [", get_input_type(net), "] : {}".format(batch_size*world_size)) + print ("Time per mini-batch : {}".format(time_per_batch)) + print ("Throughput [", get_input_type(net), "/sec] : {}".format(batch_size*world_size/time_per_batch)) + result = { + "Name": params.output_file, + "GPUs": distributed_parameters['world_size'], + "Mini batch size [img]": batch_size * distributed_parameters['world_size'], + "Mini batch size [img/gpu]": batch_size, + "Throughput [img/sec]": batch_size * distributed_parameters['world_size'] / time_per_batch, + "Time per mini-batch": time_per_batch + } + with open(f"{params.output_dir}/{params.output_file}.json", "w") as f: + json.dump(result, f, indent=2) + + csv_filename = f"{params.output_dir}/benchmark_summary.csv" + if params.csv_file: + csv_filename = params.csv_file + file_exists = os.path.isfile(csv_filename) + if result: + with open(csv_filename, "a", newline='') as csvfile: + writer = csv.writer(csvfile) + if not file_exists: + writer.writerow(result.keys()) + writer.writerow(result.values()) + print(f"Benchmark result saved to {csv_filename}") def main(): run_benchmarking_wrapper(copy.deepcopy(args)) @@ -315,6 +377,12 @@ def main(): parser.add_argument("--dist-url", type=str, required=False, default=None, help="url used for rendezvous of processes in distributed training. Needs to contain IP and open port of master rank0 eg. 'tcp://172.23.2.1:54321'. 
Required for --distributed_dataparallel") parser.add_argument("--compile", action='store_true', required=False, help="use pytorch 2.0") parser.add_argument("--compileContext", default={}, required=False, help="additional compile options") + parser.add_argument("--csv-file", type=str, default=None, required=False, help="assign output csv file name.") + parser.add_argument("--mode", type=str, choices=['training', 'inference'], default="training", help="Select mode: training or inference") + parser.add_argument("--opt-step", type=int, required=False, default=1, help="Optimizer update step") + parser.add_argument("--output-dir", type=str, default="", help="assign output directory name.") + parser.add_argument("--output-file", type=str, default="", help="assign output file name.") + parser.add_argument("--profiler-output", type=str, default="", help="assign profiler output name.") args = parser.parse_args() From 2f02eb4818d45d5d8697910cb207cea523e6d1c7 Mon Sep 17 00:00:00 2001 From: Sriram Kumar Date: Mon, 9 Feb 2026 08:32:53 -0600 Subject: [PATCH 20/22] replace apex amp with torch amp --- micro_benchmarking_audio.py | 101 ++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 45 deletions(-) diff --git a/micro_benchmarking_audio.py b/micro_benchmarking_audio.py index fc74d3c..efb5915 100644 --- a/micro_benchmarking_audio.py +++ b/micro_benchmarking_audio.py @@ -16,6 +16,7 @@ from audio.audio_output import get_output_selection, create_target import csv import json +from torch.amp import autocast, GradScaler try: import torch._dynamo @@ -31,12 +32,6 @@ # this indicates we're using torchrun is_torchrun = True -try: - import apex - HAVE_APEX = True -except: - HAVE_APEX = False - def weight_init(m): if isinstance(m, nn.Conv2d): @@ -49,34 +44,68 @@ def weight_init(m): m.bias.data.zero_() -def forwardbackward(inp, optimizer, network, amp_opt_level, network_name, batch_size, criterion, target, step=0, opt_step=1, flops_prof_step=0): +def forwardbackward(inp, 
optimizer, network, params, network_name, batch_size, criterion, target, step=0, opt_step=1, flops_prof_step=0): if step % opt_step == 0: optimizer.zero_grad() if flops_prof_step: prof = FlopsProfiler(network) prof.start_profile() - out = network(**inp) - output_index = get_output_selection(network_name) - if output_index is not None: - out = out[output_index] - - loss = calculate_loss(network_name, criterion, out, target, batch_size, inp) + if params.amp: + with autocast('cuda'): + out = network(**inp) + output_index = get_output_selection(network_name) + if output_index is not None: + out = out[output_index] + + loss = calculate_loss(network_name, criterion, out, target, batch_size, inp) + scaler.scale(loss).backward() + if (step + 1) % opt_step == 0: + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad() + else: + out = network(**inp) + output_index = get_output_selection(network_name) + if output_index is not None: + out = out[output_index] + loss = calculate_loss(network_name, criterion, out, target, batch_size, inp) + loss.backward() + if (step + 1) % opt_step == 0: + optimizer.step() + optimizer.zero_grad() # End profiler here if only to profile forward pass - if amp_opt_level: - with apex.amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() + if flops_prof_step: + prof.print_model_profile(profile_step=flops_prof_step) + prof.end_profile() + +def forward(inp, optimizer, network, params, network_name, batch_size, criterion, target, step=0, opt_step=1, flops_prof_step=0): + if step % opt_step == 0: + optimizer.zero_grad() + if flops_prof_step: + prof = FlopsProfiler(network) + prof.start_profile() + + if params.amp: + with autocast('cuda'): + out = network(**inp) + output_index = get_output_selection(network_name) + if output_index is not None: + out = out[output_index] else: - loss.backward() + out = network(**inp) + output_index = get_output_selection(network_name) + if output_index is not None: + out = out[output_index] + + # 
End profiler here if only to profile forward pass if flops_prof_step: prof.print_model_profile(profile_step=flops_prof_step) prof.end_profile() - if (step + 1) % opt_step == 0: - optimizer.step() def rendezvous(distributed_parameters): print("Initializing process group...") @@ -130,7 +159,7 @@ def run_benchmarking(local_rank, params): ngpus = params.ngpus net = params.network run_fp16 = params.fp16 - amp_opt_level = params.amp_opt_level + run_amp = params.amp distributed_dataparallel = params.distributed_dataparallel distributed_parameters = params.distributed_parameters batch_size = params.batch_size @@ -198,9 +227,6 @@ def run_benchmarking(local_rank, params): total_epochs = params.iterations optimizer = torch.optim.SGD(param_copy, lr = sgd_opt_base_learning_rate, momentum = sgd_opt_momentum, weight_decay=sgd_opt_weight_decay) - if (amp_opt_level): - network, optimizer = apex.amp.initialize(network, optimizer, opt_level="O%d"%amp_opt_level) - if is_torchrun: rendezvous(distributed_parameters) devices_to_run_on = [local_rank] @@ -232,7 +258,7 @@ def run_benchmarking(local_rank, params): ## warmup. 
print ("INFO: running forward and backward for warmup.") for i in range(2): - forward_fn(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target, step=0, opt_step=params.opt_step) + forward_fn(inp, optimizer, network, params, net, batch_size, criterion, target, step=0, opt_step=params.opt_step) time.sleep(1) torch.cuda.synchronize() @@ -266,7 +292,7 @@ def trace_ready_callback(prof): on_trace_ready=trace_ready_callback) as prof: for i in range(iterations): with record_function(f"iteration {i}"): - forward_fn(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target, step=i, opt_step=params.opt_step) + forward_fn(inp, optimizer, network, params, net, batch_size, criterion, target, step=i, opt_step=params.opt_step) prof.step() torch.cuda.synchronize() print(prof.key_averages().table(sort_by="cuda_time_total")) @@ -275,9 +301,9 @@ def trace_ready_callback(prof): with torch.autograd.profiler.emit_nvtx(enabled=autograd_profiler): for i in range(iterations): if i == flops_prof_step: - forward_fn(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target, step=i, opt_step=params.opt_step, flops_prof_step=i) + forward_fn(inp, optimizer, network, params, net, batch_size, criterion, target, step=i, opt_step=params.opt_step, flops_prof_step=i) else: - forward_fn(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target, step=i, opt_step=params.opt_step) + forward_fn(inp, optimizer, network, params, net, batch_size, criterion, target, step=i, opt_step=params.opt_step) torch.cuda.synchronize() tm2 = time.time() @@ -285,16 +311,8 @@ def trace_ready_callback(prof): if run_fp16: dtype = 'FP16' - elif amp_opt_level == 1: - dtype = 'AMP-O1: Insert automatic FP16 casts around safe Pytorch functions and Tensor methods.' - elif amp_opt_level == 2: - dtype = 'AMP-O2: FP16 training with FP32 batchnorm and FP32 master weights.' - elif amp_opt_level == 3: - dtype = 'AMP-O3: Pure FP16 training.' 
- elif amp_opt_level == 4: - dtype = 'AMP-O4: Insert automatic BFLOAT16 casts around safe Pytorch functions and Tensor methods.' - elif amp_opt_level == 5: - dtype = 'AMP-O5: BFLOAT16 training with FP32 batchnorm and FP32 master weights.' + elif run_amp: + dtype = 'AMP: PyTorch Native Automatic Mixed Precision' else: dtype = 'FP32' @@ -368,7 +386,6 @@ def main(): parser.add_argument("--kineto", action='store_true', required=False, help="Turn kineto profiling on") parser.add_argument("--autograd_profiler", action='store_true', required=False, help="Use PyTorch autograd (old) profiler") parser.add_argument("--fp16", type=int, required=False, default=0,help="FP16 mixed precision benchmarking") - parser.add_argument("--amp-opt-level", type=int, required=False, default=0,help="apex.amp mixed precision benchmarking opt level") parser.add_argument("--distributed_dataparallel", action='store_true', required=False, help="Use torch.nn.parallel.DistributedDataParallel api to run on multiple processes/nodes. The multiple processes need to be launched manually, this script will only launch ONE process per invocation. Either use --distributed_dataparallel and manually launch multiple processes or launch this script with `torchrun`") parser.add_argument("--device_ids", type=str, required=False, default=None, help="Comma-separated list (no spaces) to specify which HIP devices (0-indexed) to run distributedDataParallel api on. Might need to use HIP_VISIBLE_DEVICES to limit visiblity of devices to different processes.") parser.add_argument("--rank", type=int, required=False, default=None, help="Rank of this process. Required for --distributed_dataparallel") @@ -377,6 +394,7 @@ def main(): parser.add_argument("--dist-url", type=str, required=False, default=None, help="url used for rendezvous of processes in distributed training. Needs to contain IP and open port of master rank0 eg. 'tcp://172.23.2.1:54321'. 
Required for --distributed_dataparallel") parser.add_argument("--compile", action='store_true', required=False, help="use pytorch 2.0") parser.add_argument("--compileContext", default={}, required=False, help="additional compile options") + parser.add_argument("--amp", action='store_true', default=False, required=False, help="Automatic mixed precision benchmarking") parser.add_argument("--csv-file", type=str, default=None, required=False, help="assign output csv file name.") parser.add_argument("--mode", type=str, choices=['training', 'inference'], default="training", help="Select mode: training or inference") parser.add_argument("--opt-step", type=int, required=False, default=1, help="Optimizer update step") @@ -393,11 +411,4 @@ def main(): print("ERROR: You must install (or copy) deepspeed.profiling to use --flops-prof-step") sys.exit(1) - if args.fp16 and args.amp_opt_level: - print ("ERROR: Cannot use both --fp16 and --amp-opt-level") - sys.exit(1) - if args.amp_opt_level and not HAVE_APEX: - print ("ERROR: You must install apex to use --amp-opt-level") - sys.exit(1) - main() \ No newline at end of file From a8ee74deac418b5cfbdf26395c407ede7330c1a5 Mon Sep 17 00:00:00 2001 From: Sriram Kumar Date: Wed, 11 Feb 2026 09:09:31 -0600 Subject: [PATCH 21/22] change the location of optimizer step before profiling --- micro_benchmarking_apex.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/micro_benchmarking_apex.py b/micro_benchmarking_apex.py index 8f5f729..e137ecc 100644 --- a/micro_benchmarking_apex.py +++ b/micro_benchmarking_apex.py @@ -190,6 +190,10 @@ def forwardbackward(inp, optimizer, network, params, target, step=0, opt_step=1, else: loss.backward() + if (step + 1) % opt_step == 0: + optimizer.step() + optimizer.zero_grad() + if flops_prof_step: # End profiler here to profile both fwd and bwd passes # flops = prof.get_total_flops(as_string=True) @@ -197,9 +201,6 @@ def forwardbackward(inp, optimizer, network, params, target, step=0, 
opt_step=1, prof.print_model_profile(profile_step=flops_prof_step) prof.end_profile() - if (step + 1) % opt_step == 0: - optimizer.step() - def forward(inp, optimizer, network, params, target, step=0, opt_step=1, flops_prof_step=0): if flops_prof_step: From 55c55175b68cccd8122098e07466277e138029b5 Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 17 Feb 2026 12:45:17 +0000 Subject: [PATCH 22/22] add readme sections for apex and audio --- README.md | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fa52322..8528753 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,9 @@ # pytorch-micro-benchmarking + +This repo provides microbenchmarking script for training and inferencing models in pytorch, apex and torchaudio libraries on ROCm. + +## Pytorch + We supply a small microbenchmarking script for PyTorch training on ROCm. To execute: @@ -37,10 +42,10 @@ python3 micro_benchmarking_pytorch.py --device_ids=1 --network resnet50 --distri To run FlopsProfiler (with deepspeed.profiling.flops_profiler imported): `python micro_benchmarking_pytorch.py --network resnet50 --amp-opt-level=2 --batch-size=256 --iterations=20 --flops-prof-step 10` -## Performance tuning +### Performance tuning If performance on a specific card and/or model is found to be lacking, typically some gains can be made by tuning MIOpen. For this, `export MIOPEN_FIND_ENFORCE=3` prior to running the model. This will take some time if untuned configurations are encountered and write to a local performance database. More information on this can be found in the [MIOpen documentation](https://rocm.github.io/MIOpen/doc/html/perfdatabase.html). -## PyTorch 2.0 +### PyTorch 2.0 Added the `--compile` option opens up PyTorch 2.0 capabilities, which comes with several options. Here are some notes from upstream: ``` Optimizes given model/function using TorchDynamo and specified backend. 
@@ -75,3 +80,26 @@ python micro_benchmarking_pytorch.py --network resnet50 --compile --compileConte python micro_benchmarking_pytorch.py --network resnet50 --compile --compileContext "{'options': {'static-memory': 'True', 'matmul-padding': 'True'}}" ``` Note: you cannot pass the `mode` and `options` options together. + +## TorchAudio + +The script and parameters for torchaudio are similar to pytorch. + +To execute: +`python micro_benchmarking_audio.py --network [--batch-size ] [--iterations ] [--fp16 <0 or 1> ] [--distributed_dataparallel] [--device_ids ] ` + +Possible network names are: `wav2vec2_base`, `deepspeech`, `hdemucs_low`, `tacotron2`, `wavernn`, `wav2letter`, `hubert_base` etc. + +## Apex + +The script and parameters for apex are similar to pytorch. + +To execute: +`python micro_benchmarking_apex.py --network [--batch-size ] [--iterations ] [--fp16 <0 or 1> ] [--distributed_dataparallel] [--device_ids ] [--sync_bn] [--keep-batchnorm-fp32 ] [--loss-scale ]` + +There are three additional parameters. +1. `--sync_bn`: Use apex synchronized batch normalization across GPUs (useful for multi-GPU training). +2. `--keep-batchnorm-fp32`: Keep batch norm layers in FP32 when using AMP (e.g. `--keep-batchnorm-fp32 true`). Omit with opt_level O1. +3. `--loss-scale`: Loss scale for mixed precision. It is a number (e.g. `1024`) for static scaling, or `dynamic` for adaptive scaling. + +Instead of an `--amp` (true/false) flag, the apex script uses an `--amp-opt-level` option to select the level of AMP optimization. \ No newline at end of file