From fc751d9dc63378837aa18430419e8e97a618d48b Mon Sep 17 00:00:00 2001 From: skishore Date: Sat, 2 Aug 2025 07:42:22 +0000 Subject: [PATCH 01/22] replace pytorch calls to apex calls in microbenchmarking code --- micro_benchmarking_apex.py | 451 +++++++++++++++++++++++++++++++++++++ 1 file changed, 451 insertions(+) create mode 100644 micro_benchmarking_apex.py diff --git a/micro_benchmarking_apex.py b/micro_benchmarking_apex.py new file mode 100644 index 0000000..4c7e513 --- /dev/null +++ b/micro_benchmarking_apex.py @@ -0,0 +1,451 @@ +import torch +import torchvision +import random +import time +import argparse +import os +import sys +import ast +import copy +import math +import torch.nn as nn +import torch.multiprocessing as mp +try: + import apex +except: + print ("ERROR: You must install apex to run apex microbenchmarking") + sys.exit(1) +from apex.fp16_utils import FP16Model +from shufflenet import shufflenet +from shufflenet_v2 import shufflenet as shufflenet_v2 +from xception import xception +from apex.parallel import DistributedDataParallel as DDP + +try: + import torch._dynamo + torch._dynamo.config.verbose=True + HAVE_DYNAMO = True +except: + HAVE_DYNAMO = False + +IS_PT2 = hasattr(torch, "compile") + +is_torchrun = False +if "LOCAL_RANK" in os.environ: + # this indicates we're using torchrun + is_torchrun = True + + + +def weight_init(m): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. 
/ n)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + +# num_classes=1000 +models = { + "alexnet" : torchvision.models.alexnet, + "densenet121" : torchvision.models.densenet121, + "densenet161" : torchvision.models.densenet161, + "densenet169" : torchvision.models.densenet169, + "densenet201" : torchvision.models.densenet201, + "googlenet" : torchvision.models.googlenet, + "inception_v3" : torchvision.models.inception_v3, + "mnasnet0_5" : torchvision.models.mnasnet0_5, + "mnasnet0_75" : torchvision.models.mnasnet0_75, + "mnasnet1_0" : torchvision.models.mnasnet1_0, + "mnasnet1_3" : torchvision.models.mnasnet1_3, + "mobilenet_v2" : torchvision.models.mobilenet_v2, + "resnet18" : torchvision.models.resnet18, + "resnet34" : torchvision.models.resnet34, + "resnet50" : torchvision.models.resnet50, + "resnet101" : torchvision.models.resnet101, + "resnet152" : torchvision.models.resnet152, + "resnext50" : torchvision.models.resnext50_32x4d, + "resnext50_32x4d" : torchvision.models.resnext50_32x4d, + "resnext101" : torchvision.models.resnext101_32x8d, + "resnext101_32x8d" : torchvision.models.resnext101_32x8d, + "shufflenet" : shufflenet, + "shufflenet_v2" : shufflenet_v2, + "shufflenet_v2_x05" : torchvision.models.shufflenet_v2_x0_5, + "shufflenet_v2_x10" : torchvision.models.shufflenet_v2_x1_0, + "shufflenet_v2_x15" : torchvision.models.shufflenet_v2_x1_5, + "shufflenet_v2_x20" : torchvision.models.shufflenet_v2_x2_0, + "shufflenet_v2_x0_5" : torchvision.models.shufflenet_v2_x0_5, + "shufflenet_v2_x1_0" : torchvision.models.shufflenet_v2_x1_0, + "shufflenet_v2_x1_5" : torchvision.models.shufflenet_v2_x1_5, + "shufflenet_v2_x2_0" : torchvision.models.shufflenet_v2_x2_0, + "SqueezeNet" : torchvision.models.squeezenet1_0, + "squeezenet1_0" : torchvision.models.squeezenet1_0, + "SqueezeNet1.1" : torchvision.models.squeezenet1_1, + "squeezenet1_1" : torchvision.models.squeezenet1_1, + 
"vgg11" : torchvision.models.vgg11, + "vgg13" : torchvision.models.vgg13, + "vgg16" : torchvision.models.vgg16, + "vgg19" : torchvision.models.vgg19, + "vgg11_bn" : torchvision.models.vgg11_bn, + "vgg13_bn" : torchvision.models.vgg13_bn, + "vgg16_bn" : torchvision.models.vgg16_bn, + "vgg19_bn" : torchvision.models.vgg19_bn, + "wide_resnet50_2" : torchvision.models.wide_resnet50_2, + "wide_resnet101_2" : torchvision.models.wide_resnet101_2, + "xception" : xception, +} + +# newer torchvision models, for backwards compat +try: + models["swin_t"] = torchvision.models.swin_t + models["swin_s"] = torchvision.models.swin_s + models["swin_b"] = torchvision.models.swin_b + models["swin_v2_t"] = torchvision.models.swin_v2_t + models["swin_v2_s"] = torchvision.models.swin_v2_s + models["swin_v2_b"] = torchvision.models.swin_v2_b + models["vit_b_16"] = torchvision.models.vit_b_16 + models["vit_b_32"] = torchvision.models.vit_b_32 + models["vit_l_16"] = torchvision.models.vit_l_16 + models["vit_l_32"] = torchvision.models.vit_l_32 + models["vit_h_14"] = torchvision.models.vit_h_14 + models["efficientnet_b0"] = torchvision.models.efficientnet_b0 + models["efficientnet_b1"] = torchvision.models.efficientnet_b1 + models["efficientnet_b2"] = torchvision.models.efficientnet_b2 + models["efficientnet_b3"] = torchvision.models.efficientnet_b3 + models["efficientnet_b4"] = torchvision.models.efficientnet_b4 + models["efficientnet_b5"] = torchvision.models.efficientnet_b5 + models["efficientnet_b6"] = torchvision.models.efficientnet_b6 + models["efficientnet_b7"] = torchvision.models.efficientnet_b7 + models["maxvit_t"] = torchvision.models.maxvit_t +except AttributeError: + pass + +try: + models["mobilenet_v3_large"] = torchvision.models.mobilenet_v3_large + models["mobilenet_v3_small"] = torchvision.models.mobilenet_v3_small +except AttributeError: + pass +# segmentation models, num_classes=21 +segmentation_models = { + "fcn_resnet50" : torchvision.models.segmentation.fcn_resnet50, + 
"fcn_resnet101" : torchvision.models.segmentation.fcn_resnet101, + "deeplabv3_resnet50" : torchvision.models.segmentation.deeplabv3_resnet50, + "deeplabv3_resnet101" : torchvision.models.segmentation.deeplabv3_resnet101, +} + +# newer torchvision segmentation models, for backwards compat +try: + segmentation_models["deeplabv3_mobilenet_v3_large"] = torchvision.models.segmentation.deeplabv3_mobilenet_v3_large + segmentation_models["lraspp_mobilenet_v3_large"] = torchvision.models.segmentation.lraspp_mobilenet_v3_large, +except AttributeError: + pass + +def get_network_names(): + return sorted(list(models.keys()) + list(segmentation_models.keys())) + +def get_network(net): + # aux_logits=False only used by inception_v3 + if "inception_v3" == net: + return models[net](aux_logits=False).to(device="cuda") + elif net in models: + return models[net]().to(device="cuda") + elif net in segmentation_models: + return segmentation_models[net]().to(device="cuda") + else: + print ("ERROR: not a supported model '%s'" % net) + sys.exit(1) + +def forwardbackward(inp, optimizer, network, target, amp_opt_level, flops_prof_step=0): + optimizer.zero_grad() + if flops_prof_step: + prof = FlopsProfiler(network) + prof.start_profile() + out = network(inp) + # WIP: googlenet, deeplabv3_*, fcn_* missing log_softmax for this to work + loss = torch.nn.functional.cross_entropy(out, target) + # End profiler here if only to profile forward pass + + if amp_opt_level: + with apex.amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + + if flops_prof_step: + # End profiler here to profile both fwd and bwd passes + # flops = prof.get_total_flops(as_string=True) + # params = prof.get_total_params(as_string=True) + prof.print_model_profile(profile_step=flops_prof_step) + prof.end_profile() + + optimizer.step() + +def rendezvous(distributed_parameters): + print("Initializing process group...") + 
torch.distributed.init_process_group(backend=distributed_parameters['dist_backend'], init_method=distributed_parameters['dist_url'], rank=distributed_parameters['rank'], world_size=distributed_parameters['world_size']) + print("Rendezvous complete. Created process group...") + +def run_benchmarking_wrapper(params): + params.flops_prof_step = max(0, min(params.flops_prof_step, params.iterations - 1)) + if (params.device_ids): + params.device_ids = [int(x) for x in params.device_ids.split(",")] + else: + params.device_ids = None + params.distributed_parameters = {} + if is_torchrun: + params.distributed_parameters['rank'] = int(os.environ["LOCAL_RANK"]) + params.distributed_parameters['world_size'] = int(os.environ["WORLD_SIZE"]) + params.distributed_parameters['dist_backend'] = "nccl" + params.distributed_parameters['dist_url'] = 'tcp://' + os.environ["MASTER_ADDR"] + ":" + os.environ["MASTER_PORT"] + else: + params.distributed_parameters['rank'] = params.rank + params.distributed_parameters['world_size'] = params.world_size + params.distributed_parameters['dist_backend'] = params.dist_backend + params.distributed_parameters['dist_url'] = params.dist_url + + # Some arguments are required for distributed_dataparallel + if params.distributed_dataparallel: + assert params.distributed_parameters['rank'] is not None and \ + params.distributed_parameters['world_size'] is not None and \ + params.distributed_parameters['dist_backend'] is not None and \ + params.distributed_parameters['dist_url'] is not None, "rank, world-size, dist-backend and dist-url are required arguments for distributed_dataparallel" + + if is_torchrun: + params.ngpus = params.distributed_parameters['world_size'] + elif params.distributed_dataparallel: + params.ngpus = len(params.device_ids) if params.device_ids else torch.cuda.device_count() + else: + params.ngpus = 1 + + if is_torchrun: + run_benchmarking(params.distributed_parameters['rank'], params) + elif params.distributed_dataparallel: + # 
Assumption below that each process launched with --distributed_dataparallel has the same number of devices visible/specified + params.distributed_parameters['world_size'] = params.ngpus * params.distributed_parameters['world_size'] + params.distributed_parameters['rank'] = params.ngpus * params.distributed_parameters['rank'] + mp.spawn(run_benchmarking, nprocs=params.ngpus, args=(params,)) + else: + run_benchmarking(0, params) + +def run_benchmarking(local_rank, params): + device_ids = params.device_ids + ngpus = params.ngpus + net = params.network + run_fp16 = params.fp16 + amp_opt_level = params.amp_opt_level + distributed_dataparallel = params.distributed_dataparallel + distributed_parameters = params.distributed_parameters + batch_size = params.batch_size + kineto = params.kineto + iterations = params.iterations + autograd_profiler = params.autograd_profiler + flops_prof_step = params.flops_prof_step + + if is_torchrun: + torch.cuda.set_device("cuda:%d" % local_rank) + elif device_ids: + assert ngpus == len(device_ids) + torch.cuda.set_device("cuda:%d" % device_ids[local_rank]) + else: + torch.cuda.set_device("cuda:0") + + network = get_network(net) + if "shufflenet" == net: + network.apply(weight_init) + + if params.compile: + compile_ctx = {"mode": None, + "dynamic": False, + "fullgraph": False, + "backend": "inductor", + "options": None, + "disable": False} + options = None # needed for internal pytorch checks + if params.compileContext: + compile_ctx.update(ast.literal_eval(params.compileContext)) + if compile_ctx["mode"] is not None and compile_ctx["options"] is not None: + raise RuntimeError("Cannot specify mode and options simultaneously") + if compile_ctx["options"] is not None: + options = {} # needed to save multiple options + for compiler_pass in compile_ctx["options"].keys(): + options.update({compiler_pass: bool(compile_ctx["options"][compiler_pass])}) + if IS_PT2: + network = torch.compile(network, + mode=compile_ctx["mode"], + 
dynamic=bool(compile_ctx["dynamic"]), + fullgraph=bool(compile_ctx["fullgraph"]), + backend=compile_ctx["backend"], + options=options, + disable=compile_ctx["disable"]) + else: + print ("ERROR: requested torch.compile but this isn't pytorch 2.x") + sys.exit(1) + + if (run_fp16): + network = FP16Model(network) + + #use apex syncbn + if args.sync_bn: + network = apex.parallel.convert_syncbn_model(network) + + optimizer = torch.optim.SGD(network.parameters(), lr = 0.01, momentum = 0.9) + + if (amp_opt_level): + network, optimizer = apex.amp.initialize(network, optimizer, opt_level="O%d"%amp_opt_level, + keep_batchnorm_fp32=args.keep_batchnorm_fp32, + loss_scale=args.loss_scale) + + if is_torchrun: + rendezvous(distributed_parameters) + devices_to_run_on = [local_rank] + print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on))) + network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) + batch_size = int(batch_size / ngpus) + elif (distributed_dataparallel): + distributed_parameters['rank'] += local_rank + rendezvous(distributed_parameters) + devices_to_run_on = [(device_ids[local_rank] if device_ids else local_rank)] + print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on))) + network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) + batch_size = int(batch_size / ngpus) + + if (net == "inception_v3"): + inp = torch.randn(batch_size, 3, 299, 299, device="cuda") + else: + inp = torch.randn(batch_size, 3, 224, 224, device="cuda") + if (run_fp16): + inp = inp.half() + if net in models: + # number of classes is 1000 for imagenet + target = torch.randint(0, 1000, (batch_size,), device="cuda") + elif net in segmentation_models: + # number of classes is 21 for segmentation + target = torch.randint(0, 21, (batch_size,), device="cuda") + + ## warmup. 
+ print ("INFO: running forward and backward for warmup.") + forwardbackward(inp, optimizer, network, target, amp_opt_level) + forwardbackward(inp, optimizer, network, target, amp_opt_level) + + time.sleep(1) + torch.cuda.synchronize() + + ## benchmark. + print ("INFO: running the benchmark..") + if kineto: + from torch.profiler import schedule, profile, ProfilerActivity, record_function + profiler_schedule = schedule( + skip_first = 0, + wait = 1, + warmup = 2, + active = 2, + repeat = 1, + ) + + def trace_ready_callback(prof): + print("----------- Trace Ready -----------") + prof.export_chrome_trace(f"trace{prof.step_num}.json") + + tm = time.time() + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + schedule=profiler_schedule, + on_trace_ready=trace_ready_callback) as prof: + for i in range(iterations): + with record_function(f"iteration {i}"): + forwardbackward(inp, optimizer, network, target, amp_opt_level) + prof.step() + torch.cuda.synchronize() + print(prof.key_averages().table(sort_by="cuda_time_total")) + else: + tm = time.time() + with torch.autograd.profiler.emit_nvtx(enabled=autograd_profiler): + for i in range(iterations): + if i == flops_prof_step: + forwardbackward(inp, optimizer, network, target, amp_opt_level, i) + else: + forwardbackward(inp, optimizer, network, target, amp_opt_level) + torch.cuda.synchronize() + + tm2 = time.time() + time_per_batch = (tm2 - tm) / iterations + + if run_fp16: + dtype = 'FP16' + elif amp_opt_level == 1: + dtype = 'AMP-O1: Insert automatic FP16 casts around safe Pytorch functions and Tensor methods.' + elif amp_opt_level == 2: + dtype = 'AMP-O2: FP16 training with FP32 batchnorm and FP32 master weights.' + elif amp_opt_level == 3: + dtype = 'AMP-O3: Pure FP16 training.' + elif amp_opt_level == 4: + dtype = 'AMP-O4: Insert automatic BFLOAT16 casts around safe Pytorch functions and Tensor methods.' 
+ elif amp_opt_level == 5: + dtype = 'AMP-O5: BFLOAT16 training with FP32 batchnorm and FP32 master weights.' + else: + dtype = 'FP32' + + print ("OK: finished running benchmark..") + print ("--------------------SUMMARY--------------------------") + print ("Microbenchmark for network : {}".format(net)) + if distributed_dataparallel or is_torchrun: + print ("--------This process: rank " + str(distributed_parameters['rank']) + "--------"); + print ("Num devices: 1") + else: + print ("Num devices: {}".format(ngpus)) + print ("Dtype: {}".format(dtype)) + print ("Mini batch size [img] : {}".format(batch_size)) + print ("Time per mini-batch : {}".format(time_per_batch)) + print ("Throughput [img/sec] : {}".format(batch_size/time_per_batch)) + if (distributed_dataparallel or is_torchrun) and distributed_parameters['rank'] == 0: + print ("") + print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------") + world_size = distributed_parameters['world_size'] + print ("Num devices: {}".format(world_size)) + print ("Dtype: {}".format(dtype)) + print ("Mini batch size [img] : {}".format(batch_size*world_size)) + print ("Time per mini-batch : {}".format(time_per_batch)) + print ("Throughput [img/sec] : {}".format(batch_size*world_size/time_per_batch)) + +def main(): + run_benchmarking_wrapper(copy.deepcopy(args)) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--network", type=str, choices=get_network_names(), required=True, help="Network to run.") + parser.add_argument("--batch-size" , type=int, required=False, default=64, help="Batch size (will be split among devices used by this invocation)") + parser.add_argument("--iterations", type=int, required=False, default=20, help="Iterations") + parser.add_argument("--flops-prof-step", type=int, required=False, default=0, help="The flops profiling step") + parser.add_argument("--kineto", action='store_true', required=False, help="Turn kineto profiling on") + 
parser.add_argument("--autograd_profiler", action='store_true', required=False, help="Use PyTorch autograd (old) profiler") + parser.add_argument("--fp16", type=int, required=False, default=0,help="FP16 mixed precision benchmarking") + parser.add_argument("--amp-opt-level", type=int, required=False, default=0,help="apex.amp mixed precision benchmarking opt level") + parser.add_argument("--distributed_dataparallel", action='store_true', required=False, help="Use torch.nn.parallel.DistributedDataParallel api to run on multiple processes/nodes. The multiple processes need to be launched manually, this script will only launch ONE process per invocation. Either use --distributed_dataparallel and manually launch multiple processes or launch this script with `torchrun`") + parser.add_argument("--device_ids", type=str, required=False, default=None, help="Comma-separated list (no spaces) to specify which HIP devices (0-indexed) to run distributedDataParallel api on. Might need to use HIP_VISIBLE_DEVICES to limit visiblity of devices to different processes.") + parser.add_argument("--rank", type=int, required=False, default=None, help="Rank of this process. Required for --distributed_dataparallel") + parser.add_argument("--world-size", type=int, required=False, default=None, help="Total number of ranks/processes. Required for --distributed_dataparallel") + parser.add_argument("--dist-backend", type=str, required=False, default=None, help="Backend used for distributed training. Can be one of 'nccl' or 'gloo'. Required for --distributed_dataparallel") + parser.add_argument("--dist-url", type=str, required=False, default=None, help="url used for rendezvous of processes in distributed training. Needs to contain IP and open port of master rank0 eg. 'tcp://172.23.2.1:54321'. 
Required for --distributed_dataparallel") + parser.add_argument("--compile", action='store_true', required=False, help="use pytorch 2.0") + parser.add_argument("--compileContext", default={}, required=False, help="additional compile options") + parser.add_argument('--sync_bn', action='store_true', help='enabling apex sync BN.') + parser.add_argument('--keep-batchnorm-fp32', type=str, default=None) + parser.add_argument('--loss-scale', type=str, default=None) + + args = parser.parse_args() + + if args.flops_prof_step: + try: + from deepspeed.profiling.flops_profiler import FlopsProfiler + except: + print("ERROR: You must install (or copy) deepspeed.profiling to use --flops-prof-step") + sys.exit(1) + + if args.fp16 and args.amp_opt_level: + print ("ERROR: Cannot use both --fp16 and --amp-opt-level") + sys.exit(1) + + + main() From 80b38cd5d1c4c5c349b8d94e75237829cbf1ea25 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 9 Oct 2025 11:18:18 +0000 Subject: [PATCH 02/22] update the optimizer from torch to apex fused sgd optimizer --- micro_benchmarking_apex.py | 2 +- micro_benchmarking_audio.py | 355 ++++++++++++++++++++++++++++++++++++ 2 files changed, 356 insertions(+), 1 deletion(-) create mode 100644 micro_benchmarking_audio.py diff --git a/micro_benchmarking_apex.py b/micro_benchmarking_apex.py index 4c7e513..d94052a 100644 --- a/micro_benchmarking_apex.py +++ b/micro_benchmarking_apex.py @@ -290,7 +290,7 @@ def run_benchmarking(local_rank, params): if args.sync_bn: network = apex.parallel.convert_syncbn_model(network) - optimizer = torch.optim.SGD(network.parameters(), lr = 0.01, momentum = 0.9) + optimizer = apex.optimizers.FusedSGD(network.parameters(), lr = 0.01, momentum = 0.9) if (amp_opt_level): network, optimizer = apex.amp.initialize(network, optimizer, opt_level="O%d"%amp_opt_level, diff --git a/micro_benchmarking_audio.py b/micro_benchmarking_audio.py new file mode 100644 index 0000000..52c6035 --- /dev/null +++ b/micro_benchmarking_audio.py @@ -0,0 
+1,355 @@ +import torch +import torchaudio +import random +import time +import argparse +import os +import sys +import ast +import copy +import math +import torch.nn as nn +import torch.multiprocessing as mp +from fp16util import network_to_half, get_param_copy +import torch.nn.functional as F + +try: + import torch._dynamo + torch._dynamo.config.verbose=True + HAVE_DYNAMO = True +except: + HAVE_DYNAMO = False + +IS_PT2 = hasattr(torch, "compile") + +is_torchrun = False +if "LOCAL_RANK" in os.environ: + # this indicates we're using torchrun + is_torchrun = True + +try: + import apex + HAVE_APEX = True +except: + HAVE_APEX = False + +def weight_init(m): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + +#models that take waveforms as input +waveform_models = { + "wav2vec2_base" : torchaudio.models.hubert_base, +} + + + +def get_network_names(): + return sorted(list(waveform_models.keys())) + +def get_network(net): + if net in waveform_models: + return waveform_models[net](aux_num_out=29).to(device="cuda") + else: + print ("ERROR: not a supported model '%s'" % net) + sys.exit(1) + +def forwardbackward(inp, optimizer, network, target, amp_opt_level, flops_prof_step=0): + optimizer.zero_grad() + if flops_prof_step: + prof = FlopsProfiler(network) + prof.start_profile() + logits, _ = network(inp) + out = F.log_softmax(logits, dim=-1) + target = torch.randn_like(out) + print ("inp", inp.shape) + print ("out", out.shape) + print ("target", target.shape) + + + # WIP: googlenet, deeplabv3_*, fcn_* missing log_softmax for this to work + loss = torch.nn.functional.mse_loss(out, target) + # End profiler here if only to profile forward pass + + if amp_opt_level: + with apex.amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + 
loss.backward() + + if flops_prof_step: + # End profiler here to profile both fwd and bwd passes + # flops = prof.get_total_flops(as_string=True) + # params = prof.get_total_params(as_string=True) + prof.print_model_profile(profile_step=flops_prof_step) + prof.end_profile() + + optimizer.step() + +def rendezvous(distributed_parameters): + print("Initializing process group...") + torch.distributed.init_process_group(backend=distributed_parameters['dist_backend'], init_method=distributed_parameters['dist_url'], rank=distributed_parameters['rank'], world_size=distributed_parameters['world_size']) + print("Rendezvous complete. Created process group...") + +def run_benchmarking_wrapper(params): + params.flops_prof_step = max(0, min(params.flops_prof_step, params.iterations - 1)) + if (params.device_ids): + params.device_ids = [int(x) for x in params.device_ids.split(",")] + else: + params.device_ids = None + params.distributed_parameters = {} + if is_torchrun: + params.distributed_parameters['rank'] = int(os.environ["LOCAL_RANK"]) + params.distributed_parameters['world_size'] = int(os.environ["WORLD_SIZE"]) + params.distributed_parameters['dist_backend'] = "nccl" + params.distributed_parameters['dist_url'] = 'tcp://' + os.environ["MASTER_ADDR"] + ":" + os.environ["MASTER_PORT"] + else: + params.distributed_parameters['rank'] = params.rank + params.distributed_parameters['world_size'] = params.world_size + params.distributed_parameters['dist_backend'] = params.dist_backend + params.distributed_parameters['dist_url'] = params.dist_url + + # Some arguments are required for distributed_dataparallel + if params.distributed_dataparallel: + assert params.distributed_parameters['rank'] is not None and \ + params.distributed_parameters['world_size'] is not None and \ + params.distributed_parameters['dist_backend'] is not None and \ + params.distributed_parameters['dist_url'] is not None, "rank, world-size, dist-backend and dist-url are required arguments for 
distributed_dataparallel" + + if is_torchrun: + params.ngpus = params.distributed_parameters['world_size'] + elif params.distributed_dataparallel: + params.ngpus = len(params.device_ids) if params.device_ids else torch.cuda.device_count() + else: + params.ngpus = 1 + + if is_torchrun: + run_benchmarking(params.distributed_parameters['rank'], params) + elif params.distributed_dataparallel: + # Assumption below that each process launched with --distributed_dataparallel has the same number of devices visible/specified + params.distributed_parameters['world_size'] = params.ngpus * params.distributed_parameters['world_size'] + params.distributed_parameters['rank'] = params.ngpus * params.distributed_parameters['rank'] + mp.spawn(run_benchmarking, nprocs=params.ngpus, args=(params,)) + else: + run_benchmarking(0, params) + +def run_benchmarking(local_rank, params): + device_ids = params.device_ids + ngpus = params.ngpus + net = params.network + run_fp16 = params.fp16 + amp_opt_level = params.amp_opt_level + distributed_dataparallel = params.distributed_dataparallel + distributed_parameters = params.distributed_parameters + batch_size = params.batch_size + kineto = params.kineto + iterations = params.iterations + autograd_profiler = params.autograd_profiler + flops_prof_step = params.flops_prof_step + + if is_torchrun: + torch.cuda.set_device("cuda:%d" % local_rank) + elif device_ids: + assert ngpus == len(device_ids) + torch.cuda.set_device("cuda:%d" % device_ids[local_rank]) + else: + torch.cuda.set_device("cuda:0") + + network = get_network(net) + if "shufflenet" == net: + network.apply(weight_init) + + if (run_fp16): + network = network_to_half(network) + + if params.compile: + compile_ctx = {"mode": None, + "dynamic": False, + "fullgraph": False, + "backend": "inductor", + "options": None, + "disable": False} + options = None # needed for internal pytorch checks + if params.compileContext: + compile_ctx.update(ast.literal_eval(params.compileContext)) + if 
compile_ctx["mode"] is not None and compile_ctx["options"] is not None: + raise RuntimeError("Cannot specify mode and options simultaneously") + if compile_ctx["options"] is not None: + options = {} # needed to save multiple options + for compiler_pass in compile_ctx["options"].keys(): + options.update({compiler_pass: bool(compile_ctx["options"][compiler_pass])}) + if IS_PT2: + network = torch.compile(network, + mode=compile_ctx["mode"], + dynamic=bool(compile_ctx["dynamic"]), + fullgraph=bool(compile_ctx["fullgraph"]), + backend=compile_ctx["backend"], + options=options, + disable=compile_ctx["disable"]) + else: + print ("ERROR: requested torch.compile but this isn't pytorch 2.x") + sys.exit(1) + + param_copy = network.parameters() + if (run_fp16): + param_copy = get_param_copy(network) + optimizer = torch.optim.SGD(param_copy, lr = 0.01, momentum = 0.9) + + if (amp_opt_level): + network, optimizer = apex.amp.initialize(network, optimizer, opt_level="O%d"%amp_opt_level) + + if is_torchrun: + rendezvous(distributed_parameters) + devices_to_run_on = [local_rank] + print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on))) + network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) + batch_size = int(batch_size / ngpus) + elif (distributed_dataparallel): + distributed_parameters['rank'] += local_rank + rendezvous(distributed_parameters) + devices_to_run_on = [(device_ids[local_rank] if device_ids else local_rank)] + print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on))) + network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) + batch_size = int(batch_size / ngpus) + + if net in waveform_models: + inp = torch.randn(batch_size, 16000, device="cuda") + # number of classes is 500 for hubert + target = torch.randint(0, 500, (batch_size,29), 
device="cuda") + elif net in segmentation_models: + # number of classes is 21 for segmentation + target = torch.randint(0, 21, (batch_size,), device="cuda") + + if (run_fp16): + inp = inp.half() + + ## warmup. + print ("INFO: running forward and backward for warmup.") + forwardbackward(inp, optimizer, network, target, amp_opt_level) + forwardbackward(inp, optimizer, network, target, amp_opt_level) + + time.sleep(1) + torch.cuda.synchronize() + + ## benchmark. + print ("INFO: running the benchmark..") + if kineto: + from torch.profiler import schedule, profile, ProfilerActivity, record_function + profiler_schedule = schedule( + skip_first = 0, + wait = 1, + warmup = 2, + active = 2, + repeat = 1, + ) + + def trace_ready_callback(prof): + print("----------- Trace Ready -----------") + prof.export_chrome_trace(f"trace{prof.step_num}.json") + + tm = time.time() + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + schedule=profiler_schedule, + on_trace_ready=trace_ready_callback) as prof: + for i in range(iterations): + with record_function(f"iteration {i}"): + forwardbackward(inp, optimizer, network, target, amp_opt_level) + prof.step() + torch.cuda.synchronize() + print(prof.key_averages().table(sort_by="cuda_time_total")) + else: + tm = time.time() + with torch.autograd.profiler.emit_nvtx(enabled=autograd_profiler): + for i in range(iterations): + if i == flops_prof_step: + forwardbackward(inp, optimizer, network, target, amp_opt_level, i) + else: + forwardbackward(inp, optimizer, network, target, amp_opt_level) + torch.cuda.synchronize() + + tm2 = time.time() + time_per_batch = (tm2 - tm) / iterations + + if run_fp16: + dtype = 'FP16' + elif amp_opt_level == 1: + dtype = 'AMP-O1: Insert automatic FP16 casts around safe Pytorch functions and Tensor methods.' + elif amp_opt_level == 2: + dtype = 'AMP-O2: FP16 training with FP32 batchnorm and FP32 master weights.' + elif amp_opt_level == 3: + dtype = 'AMP-O3: Pure FP16 training.' 
+ elif amp_opt_level == 4: + dtype = 'AMP-O4: Insert automatic BFLOAT16 casts around safe Pytorch functions and Tensor methods.' + elif amp_opt_level == 5: + dtype = 'AMP-O5: BFLOAT16 training with FP32 batchnorm and FP32 master weights.' + else: + dtype = 'FP32' + + print ("OK: finished running benchmark..") + print ("--------------------SUMMARY--------------------------") + print ("Microbenchmark for network : {}".format(net)) + if distributed_dataparallel or is_torchrun: + print ("--------This process: rank " + str(distributed_parameters['rank']) + "--------"); + print ("Num devices: 1") + else: + print ("Num devices: {}".format(ngpus)) + print ("Dtype: {}".format(dtype)) + print ("Mini batch size [img] : {}".format(batch_size)) + print ("Time per mini-batch : {}".format(time_per_batch)) + print ("Throughput [img/sec] : {}".format(batch_size/time_per_batch)) + if (distributed_dataparallel or is_torchrun) and distributed_parameters['rank'] == 0: + print ("") + print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------") + world_size = distributed_parameters['world_size'] + print ("Num devices: {}".format(world_size)) + print ("Dtype: {}".format(dtype)) + print ("Mini batch size [img] : {}".format(batch_size*world_size)) + print ("Time per mini-batch : {}".format(time_per_batch)) + print ("Throughput [img/sec] : {}".format(batch_size*world_size/time_per_batch)) + +def main(): + run_benchmarking_wrapper(copy.deepcopy(args)) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--network", type=str, choices=get_network_names(), required=True, help="Network to run.") + parser.add_argument("--batch-size" , type=int, required=False, default=64, help="Batch size (will be split among devices used by this invocation)") + parser.add_argument("--iterations", type=int, required=False, default=20, help="Iterations") + parser.add_argument("--flops-prof-step", type=int, required=False, default=0, help="The 
flops profiling step") + parser.add_argument("--kineto", action='store_true', required=False, help="Turn kineto profiling on") + parser.add_argument("--autograd_profiler", action='store_true', required=False, help="Use PyTorch autograd (old) profiler") + parser.add_argument("--fp16", type=int, required=False, default=0,help="FP16 mixed precision benchmarking") + parser.add_argument("--amp-opt-level", type=int, required=False, default=0,help="apex.amp mixed precision benchmarking opt level") + parser.add_argument("--distributed_dataparallel", action='store_true', required=False, help="Use torch.nn.parallel.DistributedDataParallel api to run on multiple processes/nodes. The multiple processes need to be launched manually, this script will only launch ONE process per invocation. Either use --distributed_dataparallel and manually launch multiple processes or launch this script with `torchrun`") + parser.add_argument("--device_ids", type=str, required=False, default=None, help="Comma-separated list (no spaces) to specify which HIP devices (0-indexed) to run distributedDataParallel api on. Might need to use HIP_VISIBLE_DEVICES to limit visiblity of devices to different processes.") + parser.add_argument("--rank", type=int, required=False, default=None, help="Rank of this process. Required for --distributed_dataparallel") + parser.add_argument("--world-size", type=int, required=False, default=None, help="Total number of ranks/processes. Required for --distributed_dataparallel") + parser.add_argument("--dist-backend", type=str, required=False, default=None, help="Backend used for distributed training. Can be one of 'nccl' or 'gloo'. Required for --distributed_dataparallel") + parser.add_argument("--dist-url", type=str, required=False, default=None, help="url used for rendezvous of processes in distributed training. Needs to contain IP and open port of master rank0 eg. 'tcp://172.23.2.1:54321'. 
Required for --distributed_dataparallel") + parser.add_argument("--compile", action='store_true', required=False, help="use pytorch 2.0") + parser.add_argument("--compileContext", default={}, required=False, help="additional compile options") + + args = parser.parse_args() + + if args.flops_prof_step: + try: + from deepspeed.profiling.flops_profiler import FlopsProfiler + except: + print("ERROR: You must install (or copy) deepspeed.profiling to use --flops-prof-step") + sys.exit(1) + + if args.fp16 and args.amp_opt_level: + print ("ERROR: Cannot use both --fp16 and --amp-opt-level") + sys.exit(1) + if args.amp_opt_level and not HAVE_APEX: + print ("ERROR: You must install apex to use --amp-opt-level") + sys.exit(1) + + main() \ No newline at end of file From 674bd6b29a7d62027e22641e01e99e1d44cd8fc0 Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 9 Oct 2025 11:23:07 +0000 Subject: [PATCH 03/22] update the DistributedDataParallel from torch to apex --- micro_benchmarking_apex.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/micro_benchmarking_apex.py b/micro_benchmarking_apex.py index d94052a..ae1a422 100644 --- a/micro_benchmarking_apex.py +++ b/micro_benchmarking_apex.py @@ -19,7 +19,6 @@ from shufflenet import shufflenet from shufflenet_v2 import shufflenet as shufflenet_v2 from xception import xception -from apex.parallel import DistributedDataParallel as DDP try: import torch._dynamo @@ -301,14 +300,14 @@ def run_benchmarking(local_rank, params): rendezvous(distributed_parameters) devices_to_run_on = [local_rank] print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on))) - network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) + network = apex.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) batch_size = int(batch_size / ngpus) elif (distributed_dataparallel): distributed_parameters['rank'] += 
local_rank rendezvous(distributed_parameters) devices_to_run_on = [(device_ids[local_rank] if device_ids else local_rank)] print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on))) - network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) + network = apex.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) batch_size = int(batch_size / ngpus) if (net == "inception_v3"): @@ -422,7 +421,7 @@ def main(): parser.add_argument("--autograd_profiler", action='store_true', required=False, help="Use PyTorch autograd (old) profiler") parser.add_argument("--fp16", type=int, required=False, default=0,help="FP16 mixed precision benchmarking") parser.add_argument("--amp-opt-level", type=int, required=False, default=0,help="apex.amp mixed precision benchmarking opt level") - parser.add_argument("--distributed_dataparallel", action='store_true', required=False, help="Use torch.nn.parallel.DistributedDataParallel api to run on multiple processes/nodes. The multiple processes need to be launched manually, this script will only launch ONE process per invocation. Either use --distributed_dataparallel and manually launch multiple processes or launch this script with `torchrun`") + parser.add_argument("--distributed_dataparallel", action='store_true', required=False, help="Use apex.parallel.DistributedDataParallel api to run on multiple processes/nodes. The multiple processes need to be launched manually, this script will only launch ONE process per invocation. Either use --distributed_dataparallel and manually launch multiple processes or launch this script with `torchrun`") parser.add_argument("--device_ids", type=str, required=False, default=None, help="Comma-separated list (no spaces) to specify which HIP devices (0-indexed) to run distributedDataParallel api on. 
Might need to use HIP_VISIBLE_DEVICES to limit visiblity of devices to different processes.") parser.add_argument("--rank", type=int, required=False, default=None, help="Rank of this process. Required for --distributed_dataparallel") parser.add_argument("--world-size", type=int, required=False, default=None, help="Total number of ranks/processes. Required for --distributed_dataparallel") From 782bf64f5087bbfa4bcefb09686f3d466ebeba24 Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 14 Oct 2025 09:59:28 +0000 Subject: [PATCH 04/22] Fixed the errors in conformer and wavernn models. also added units for the wavernn models --- micro_benchmarking_audio.py | 242 ++++++++++++++++++++++++++++++------ 1 file changed, 207 insertions(+), 35 deletions(-) diff --git a/micro_benchmarking_audio.py b/micro_benchmarking_audio.py index 52c6035..238acbc 100644 --- a/micro_benchmarking_audio.py +++ b/micro_benchmarking_audio.py @@ -12,6 +12,7 @@ import torch.multiprocessing as mp from fp16util import network_to_half, get_param_copy import torch.nn.functional as F +from torchaudio.models.decoder._ctc_decoder import ctc_decoder try: import torch._dynamo @@ -33,6 +34,12 @@ except: HAVE_APEX = False +ACOUSTIC_FEATURES_SIZE = 32 +FRAME_COUNT = 128 +HOP_LENGTH = 36 +N_FREQ = 128 + + def weight_init(m): if isinstance(m, nn.Conv2d): n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels @@ -43,38 +50,212 @@ def weight_init(m): m.weight.data.fill_(1) m.bias.data.zero_() -#models that take waveforms as input -waveform_models = { - "wav2vec2_base" : torchaudio.models.hubert_base, +#different audio tasks related models +wav2vec_models = { + "wav2vec2_base" : torchaudio.models.wav2vec2_base, + "wav2vec2_large" : torchaudio.models.wav2vec2_large, + "wav2vec2_large_lv60k" : torchaudio.models.wav2vec2_large_lv60k, + "wav2vec2_xlsr_300m" : torchaudio.models.wav2vec2_xlsr_300m, + "wav2vec2_xlsr_1b" : torchaudio.models.wav2vec2_xlsr_1b, + "wav2vec2_xlsr_2b" : torchaudio.models.wav2vec2_xlsr_2b, + 
"hubert_base" : torchaudio.models.hubert_base, + "hubert_large" : torchaudio.models.hubert_large, + "hubert_xlarge" : torchaudio.models.hubert_xlarge, + "wavlm_model" : torchaudio.models.wavlm_model, + "wavlm_base" : torchaudio.models.wavlm_base, + "wavlm_large" : torchaudio.models.wavlm_large, +} + +speech_recognition_models = { + "conformer" : torchaudio.models.Conformer, + "deepspeech" : torchaudio.models.DeepSpeech, + "emformer" : torchaudio.models.Emformer, + "wav2letter" : torchaudio.models.Wav2Letter } +source_separation_models = { + "conv_tasnet_base" : torchaudio.models.conv_tasnet_base, + "hdemucs_low" : torchaudio.models.hdemucs_low, + "hdemucs_medium" : torchaudio.models.hdemucs_medium, + "hdemucs_high" : torchaudio.models.hdemucs_high, +} +speech_quality_models = { + "squim_objective_base" : torchaudio.models.squim_objective_base, + "squim_subjective_base" : torchaudio.models.squim_subjective_base +} -def get_network_names(): - return sorted(list(waveform_models.keys())) +speech_synthesis_models = { + "tacotron2" : torchaudio.models.Tacotron2, + "wavernn" : torchaudio.models.WaveRNN +} -def get_network(net): - if net in waveform_models: - return waveform_models[net](aux_num_out=29).to(device="cuda") +decoder_models = { + "emformer_rnnt_base" : torchaudio.models.emformer_rnnt_base +} + + +def get_network_names(): + return sorted(list(wav2vec_models.keys()) + + list(speech_recognition_models.keys()) + + list(source_separation_models.keys()) + + list(speech_quality_models.keys()) + + list(speech_synthesis_models.keys()) + + list(decoder_models.keys())) + +def get_input_type(network_name): + if network_name in wav2vec_models or network_name in source_separation_models or network_name in speech_quality_models: + return "waveform" + elif network_name in speech_recognition_models: + return "acoustic features" + elif network_name in speech_synthesis_models: + if "wavernn" in network_name: + return "waveform" + else: + return "tokens" + elif network_name in 
decoder_models: + return "" + +def get_input(network_name, network, batch_size): + if network_name in wav2vec_models: + inp = torch.randn(batch_size, FRAME_COUNT, device="cuda") + elif network_name in source_separation_models: + if "hdemucs" in network_name: + inp = torch.randn(batch_size, 2, FRAME_COUNT, device="cuda") + else: + inp = torch.randn(batch_size, 1, FRAME_COUNT, device="cuda") + elif network_name in speech_recognition_models: + if "deepspeech" in network_name: + #number of channels must be specified for deepspeech + inp = torch.randn(batch_size, 1, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda") + elif "wav2letter" in network_name: + inp = torch.randn(batch_size, ACOUSTIC_FEATURES_SIZE, FRAME_COUNT, device="cuda") + elif "emformer" in network_name: + inp = (torch.randn(batch_size, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda"), + torch.randint(1, FRAME_COUNT, (batch_size,)).to(device="cuda")) + elif "conformer" in network_name: + lengths = torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda") + inp = (torch.rand(batch_size, int(lengths.max()), 80, device="cuda"), + lengths) + else: + lengths = torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda") + inp = (lengths, + torch.rand(batch_size, int(lengths.max()), network.input_dim, device="cuda")) + elif network_name in speech_quality_models: + if "subjective" in network_name: + inp = (torch.randn(batch_size, FRAME_COUNT, device="cuda"), + torch.randn(batch_size, FRAME_COUNT, device="cuda")) + else: + inp = torch.randn(batch_size, FRAME_COUNT, device="cuda") + elif network_name in speech_synthesis_models: + if "wavernn" in network_name: + spec_frames = 64 + waveform_length = HOP_LENGTH * (spec_frames - 4) + + inp = (torch.rand(batch_size, 1, waveform_length, device="cuda"), + torch.rand(batch_size, 1, N_FREQ, spec_frames, device="cuda")) + else: + n_mels = 80 + max_mel_specgram_length = 300 + max_text_length = 100 + inp = (torch.randint(0, 148, (batch_size, max_text_length), 
dtype=torch.int32, device="cuda"), + max_text_length * torch.ones((batch_size,), device="cuda"), + torch.rand( + batch_size, + n_mels, + max_mel_specgram_length, + device="cuda", + ), + max_mel_specgram_length * torch.ones((batch_size,), dtype=torch.int32, device="cuda")) + elif network_name in decoder_models: + right_context_length = 4 + max_input_length = 61 + max_target_length = 23 + + inp = (torch.rand(batch_size, max_input_length + right_context_length, 80, device="cuda"), + torch.randint(1, max_input_length + 1, (batch_size,), device="cuda"), + torch.randint(0, 256, (batch_size, max_target_length), device="cuda"), + torch.randint(1, max_target_length + 1, (batch_size,), device="cuda"), + None) + return inp + +def get_network(network_name): + if network_name in wav2vec_models: + return wav2vec_models[network_name](aux_num_out=29).to(device="cuda") + elif network_name in source_separation_models: + if "hdemucs" in network_name: + return source_separation_models[network_name](sources = ["vocals"]).to(device="cuda") + else: + return source_separation_models[network_name]().to(device="cuda") + elif network_name in speech_recognition_models: + if "deepspeech" in network_name: + return speech_recognition_models[network_name](n_feature = ACOUSTIC_FEATURES_SIZE).to(device="cuda") + elif "wav2letter" in network_name: + return speech_recognition_models[network_name](num_features = ACOUSTIC_FEATURES_SIZE).to(device="cuda") + elif "emformer" in network_name: + return speech_recognition_models[network_name](input_dim = ACOUSTIC_FEATURES_SIZE, + num_heads=8, + ffn_dim=1024, + num_layers=20, + segment_length=4).to(device="cuda") + elif "conformer" in network_name: + return speech_recognition_models[network_name](input_dim = 80, + num_heads=4, + ffn_dim=128, + num_layers=4, + depthwise_conv_kernel_size=31).to(device="cuda") + elif network_name in speech_quality_models: + return speech_quality_models[network_name]().to(device="cuda") + elif network_name in 
speech_synthesis_models: + if "wavernn" in network_name: + return speech_synthesis_models[network_name](upsample_scales = [3, 3, 4], n_classes = 10, + hop_length = HOP_LENGTH, n_freq = 128).to(device="cuda") + else: + return speech_synthesis_models[network_name]().to(device="cuda") + elif network_name in decoder_models: + return decoder_models[network_name](num_symbols = 256).to(device="cuda") else: - print ("ERROR: not a supported model '%s'" % net) + print ("ERROR: not a supported model '%s'" % network_name) sys.exit(1) -def forwardbackward(inp, optimizer, network, target, amp_opt_level, flops_prof_step=0): + +def get_output_selection(network_name): + if network_name in wav2vec_models: + return 0 + elif "conformer" in network_name or "emformer" in network_name: + return 0 + elif "objective" in network_name: + return 0 + elif "tacotron2" in network_name: + return 1 + return None + + +def forwardbackward(inp, optimizer, network, amp_opt_level, network_name, batch_size, flops_prof_step=0): optimizer.zero_grad() if flops_prof_step: prof = FlopsProfiler(network) prof.start_profile() - logits, _ = network(inp) - out = F.log_softmax(logits, dim=-1) + + out = network(*inp) + output_index = get_output_selection(network_name) + if output_index is not None: + out = out[0] + + if network_name in wav2vec_models: + out = F.log_softmax(out, dim=-1) + elif network_name in speech_recognition_models: + out = F.log_softmax(out, dim=-1) + + + target = torch.randn_like(out) - print ("inp", inp.shape) + #print ("inp", inp.shape) print ("out", out.shape) - print ("target", target.shape) - - # WIP: googlenet, deeplabv3_*, fcn_* missing log_softmax for this to work + #if network_name in wav2vec_models: loss = torch.nn.functional.mse_loss(out, target) + # End profiler here if only to profile forward pass if amp_opt_level: @@ -84,9 +265,6 @@ def forwardbackward(inp, optimizer, network, target, amp_opt_level, flops_prof_s loss.backward() if flops_prof_step: - # End profiler here to profile 
both fwd and bwd passes - # flops = prof.get_total_flops(as_string=True) - # params = prof.get_total_params(as_string=True) prof.print_model_profile(profile_step=flops_prof_step) prof.end_profile() @@ -218,21 +396,15 @@ def run_benchmarking(local_rank, params): network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) batch_size = int(batch_size / ngpus) - if net in waveform_models: - inp = torch.randn(batch_size, 16000, device="cuda") - # number of classes is 500 for hubert - target = torch.randint(0, 500, (batch_size,29), device="cuda") - elif net in segmentation_models: - # number of classes is 21 for segmentation - target = torch.randint(0, 21, (batch_size,), device="cuda") + inp = get_input(net, network, batch_size) if (run_fp16): inp = inp.half() ## warmup. print ("INFO: running forward and backward for warmup.") - forwardbackward(inp, optimizer, network, target, amp_opt_level) - forwardbackward(inp, optimizer, network, target, amp_opt_level) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size) time.sleep(1) torch.cuda.synchronize() @@ -260,7 +432,7 @@ def trace_ready_callback(prof): on_trace_ready=trace_ready_callback) as prof: for i in range(iterations): with record_function(f"iteration {i}"): - forwardbackward(inp, optimizer, network, target, amp_opt_level) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size) prof.step() torch.cuda.synchronize() print(prof.key_averages().table(sort_by="cuda_time_total")) @@ -269,9 +441,9 @@ def trace_ready_callback(prof): with torch.autograd.profiler.emit_nvtx(enabled=autograd_profiler): for i in range(iterations): if i == flops_prof_step: - forwardbackward(inp, optimizer, network, target, amp_opt_level, i) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, i) else: - forwardbackward(inp, optimizer, network, target, amp_opt_level) + 
forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size) torch.cuda.synchronize() tm2 = time.time() @@ -301,18 +473,18 @@ def trace_ready_callback(prof): else: print ("Num devices: {}".format(ngpus)) print ("Dtype: {}".format(dtype)) - print ("Mini batch size [img] : {}".format(batch_size)) + print ("Mini batch size [", get_input_type(net), "] : {}".format(batch_size)) print ("Time per mini-batch : {}".format(time_per_batch)) - print ("Throughput [img/sec] : {}".format(batch_size/time_per_batch)) + print ("Throughput [", get_input_type(net), "/sec] : {}".format(batch_size/time_per_batch)) if (distributed_dataparallel or is_torchrun) and distributed_parameters['rank'] == 0: print ("") print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------") world_size = distributed_parameters['world_size'] print ("Num devices: {}".format(world_size)) print ("Dtype: {}".format(dtype)) - print ("Mini batch size [img] : {}".format(batch_size*world_size)) + print ("Mini batch size [", get_input_type(net), "] : {}".format(batch_size*world_size)) print ("Time per mini-batch : {}".format(time_per_batch)) - print ("Throughput [img/sec] : {}".format(batch_size*world_size/time_per_batch)) + print ("Throughput [", get_input_type(net), "/sec] : {}".format(batch_size*world_size/time_per_batch)) def main(): run_benchmarking_wrapper(copy.deepcopy(args)) From e28c8e56adb25c9b3a5fd54eb9b788487dfaf1ca Mon Sep 17 00:00:00 2001 From: skishore Date: Thu, 16 Oct 2025 15:28:15 +0000 Subject: [PATCH 05/22] refactor the code, move the models, input, output selection to audio_models.py file --- audio_model.py | 181 +++++++++++++++++++++++++++++ micro_benchmarking_audio.py | 219 ++---------------------------------- 2 files changed, 192 insertions(+), 208 deletions(-) create mode 100644 audio_model.py diff --git a/audio_model.py b/audio_model.py new file mode 100644 index 0000000..6f1f9d2 --- /dev/null +++ b/audio_model.py @@ -0,0 +1,181 @@ +import torch 
+import torchaudio +import sys + +ACOUSTIC_FEATURES_SIZE = 32 +FRAME_COUNT = 1024 +HOP_LENGTH = 36 +N_FREQ = 128 + +#different audio tasks related models +wav2vec_models = { + "wav2vec2_base" : torchaudio.models.wav2vec2_base, + "wav2vec2_large" : torchaudio.models.wav2vec2_large, + "wav2vec2_large_lv60k" : torchaudio.models.wav2vec2_large_lv60k, + "wav2vec2_xlsr_300m" : torchaudio.models.wav2vec2_xlsr_300m, + "wav2vec2_xlsr_1b" : torchaudio.models.wav2vec2_xlsr_1b, + "wav2vec2_xlsr_2b" : torchaudio.models.wav2vec2_xlsr_2b, + "hubert_base" : torchaudio.models.hubert_base, + "hubert_large" : torchaudio.models.hubert_large, + "hubert_xlarge" : torchaudio.models.hubert_xlarge, + "wavlm_base" : torchaudio.models.wavlm_base, + "wavlm_large" : torchaudio.models.wavlm_large, +} + +speech_recognition_models = { + "conformer" : torchaudio.models.Conformer, + "deepspeech" : torchaudio.models.DeepSpeech, + "emformer" : torchaudio.models.Emformer, + "wav2letter" : torchaudio.models.Wav2Letter +} + +source_separation_models = { + "conv_tasnet_base" : torchaudio.models.conv_tasnet_base, + "hdemucs_low" : torchaudio.models.hdemucs_low, + "hdemucs_medium" : torchaudio.models.hdemucs_medium, + "hdemucs_high" : torchaudio.models.hdemucs_high, +} + +speech_quality_models = { + "squim_objective_base" : torchaudio.models.squim_objective_base, + "squim_subjective_base" : torchaudio.models.squim_subjective_base +} + +speech_synthesis_models = { + "tacotron2" : torchaudio.models.Tacotron2, + "wavernn" : torchaudio.models.WaveRNN +} + +decoder_models = { + "emformer_rnnt_base" : torchaudio.models.emformer_rnnt_base +} + + +def get_network_names(): + return sorted(list(wav2vec_models.keys()) + + list(speech_recognition_models.keys()) + + list(source_separation_models.keys()) + + list(speech_quality_models.keys()) + + list(speech_synthesis_models.keys()) + + list(decoder_models.keys())) + + +def get_network(network_name): + if network_name in wav2vec_models: + return 
wav2vec_models[network_name](aux_num_out=29).to(device="cuda") + elif network_name in source_separation_models: + if "hdemucs" in network_name: + return source_separation_models[network_name](sources = ["vocals"]).to(device="cuda") + else: + return source_separation_models[network_name]().to(device="cuda") + elif network_name in speech_recognition_models: + if "deepspeech" in network_name: + return speech_recognition_models[network_name](n_feature = ACOUSTIC_FEATURES_SIZE).to(device="cuda") + elif "wav2letter" in network_name: + return speech_recognition_models[network_name](num_features = ACOUSTIC_FEATURES_SIZE).to(device="cuda") + elif "emformer" in network_name: + return speech_recognition_models[network_name](input_dim = ACOUSTIC_FEATURES_SIZE, + num_heads=8, + ffn_dim=1024, + num_layers=20, + segment_length=4).to(device="cuda") + elif "conformer" in network_name: + return speech_recognition_models[network_name](input_dim = 80, + num_heads=4, + ffn_dim=128, + num_layers=4, + depthwise_conv_kernel_size=31).to(device="cuda") + elif network_name in speech_quality_models: + return speech_quality_models[network_name]().to(device="cuda") + elif network_name in speech_synthesis_models: + if "wavernn" in network_name: + return speech_synthesis_models[network_name](upsample_scales = [3, 3, 4], n_classes = 10, + hop_length = HOP_LENGTH, n_freq = 128).to(device="cuda") + else: + return speech_synthesis_models[network_name]().to(device="cuda") + elif network_name in decoder_models: + return decoder_models[network_name](num_symbols = 256).to(device="cuda") + else: + print ("ERROR: not a supported model '%s'" % network_name) + sys.exit(1) + + +def get_input_type(network_name): + if network_name in wav2vec_models or network_name in source_separation_models or network_name in speech_quality_models: + return "waveform" + elif network_name in speech_recognition_models: + return "acoustic features" + elif network_name in speech_synthesis_models: + if "wavernn" in network_name: + 
return "waveform" + else: + return "tokens" + elif network_name in decoder_models: + return "" + + +def get_input(network_name, network, batch_size): + if network_name in wav2vec_models: + inp = {"waveforms": torch.randn(batch_size, FRAME_COUNT, device="cuda")} + elif network_name in source_separation_models: + if "hdemucs" in network_name: + inp = {"input" : torch.randn(batch_size, 2, FRAME_COUNT, device="cuda")} + else: + inp = {"input" : torch.randn(batch_size, 1, FRAME_COUNT, device="cuda")} + elif network_name in speech_recognition_models: + if "deepspeech" in network_name: + #number of channels must be specified for deepspeech + inp = {"x" : torch.randn(batch_size, 1, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda")} + elif "wav2letter" in network_name: + inp = {"x" : torch.randn(batch_size, ACOUSTIC_FEATURES_SIZE, FRAME_COUNT, device="cuda")} + elif "emformer" in network_name: + inp = {"input" : torch.randn(batch_size, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda"), + "lengths" : torch.randint(1, FRAME_COUNT, (batch_size,)).to(device="cuda")} + elif "conformer" in network_name: + lengths = torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda") + inp = {"input" : torch.rand(batch_size, int(lengths.max()), 80, device="cuda"), + "lengths" : lengths} + elif network_name in speech_quality_models: + if "subjective" in network_name: + inp = {"waveform" : torch.randn(batch_size, FRAME_COUNT, device="cuda"), + "reference" : torch.randn(batch_size, FRAME_COUNT, device="cuda")} + else: + inp = {"x" : torch.randn(batch_size, FRAME_COUNT, device="cuda")} + elif network_name in speech_synthesis_models: + if "wavernn" in network_name: + spec_frames = 64 + waveform_length = HOP_LENGTH * (spec_frames - 4) + + inp = {"waveform" : torch.rand(batch_size, 1, waveform_length, device="cuda"), + "specgram": torch.rand(batch_size, 1, N_FREQ, spec_frames, device="cuda")} + elif "tacotron2" in network_name: + n_mels = 80 + max_mel_specgram_length = 300 + max_text_length = 
100 + inp = {"tokens" : torch.randint(0, 148, (batch_size, max_text_length), dtype=torch.int32, device="cuda"), + "token_lengths" : max_text_length * torch.ones((batch_size,), device="cuda"), + "mel_specgram": torch.rand(batch_size, n_mels, max_mel_specgram_length, device="cuda"), + "mel_specgram_lengths" : max_mel_specgram_length * torch.ones((batch_size,), dtype=torch.int32, device="cuda")} + elif network_name in decoder_models: + right_context_length = 4 + max_input_length = 61 + max_target_length = 23 + + inp = {"sources" : torch.rand(batch_size, max_input_length + right_context_length, 80, device="cuda"), + "source_lengths" : torch.randint(1, max_input_length + 1, (batch_size,), device="cuda"), + "targets" : torch.randint(0, 256, (batch_size, max_target_length), device="cuda"), + "target_lengths" : torch.randint(1, max_target_length + 1, (batch_size,), device="cuda"), + "predictor_state" : None} + return inp + + +def get_output_selection(network_name): + if network_name in wav2vec_models: + return 0 + elif "conformer" in network_name or "emformer" in network_name: + return 0 + elif "objective" in network_name: + return 0 + elif "tacotron2" in network_name: + return 1 + return None \ No newline at end of file diff --git a/micro_benchmarking_audio.py b/micro_benchmarking_audio.py index 238acbc..5dbbd54 100644 --- a/micro_benchmarking_audio.py +++ b/micro_benchmarking_audio.py @@ -1,6 +1,4 @@ import torch -import torchaudio -import random import time import argparse import os @@ -12,7 +10,8 @@ import torch.multiprocessing as mp from fp16util import network_to_half, get_param_copy import torch.nn.functional as F -from torchaudio.models.decoder._ctc_decoder import ctc_decoder +from audio_model import get_network_names, get_network, get_input_type, get_input, get_output_selection +from audio_loss import get_criterion, calculate_loss try: import torch._dynamo @@ -34,11 +33,6 @@ except: HAVE_APEX = False -ACOUSTIC_FEATURES_SIZE = 32 -FRAME_COUNT = 128 -HOP_LENGTH = 36 
-N_FREQ = 128 - def weight_init(m): if isinstance(m, nn.Conv2d): @@ -50,211 +44,19 @@ def weight_init(m): m.weight.data.fill_(1) m.bias.data.zero_() -#different audio tasks related models -wav2vec_models = { - "wav2vec2_base" : torchaudio.models.wav2vec2_base, - "wav2vec2_large" : torchaudio.models.wav2vec2_large, - "wav2vec2_large_lv60k" : torchaudio.models.wav2vec2_large_lv60k, - "wav2vec2_xlsr_300m" : torchaudio.models.wav2vec2_xlsr_300m, - "wav2vec2_xlsr_1b" : torchaudio.models.wav2vec2_xlsr_1b, - "wav2vec2_xlsr_2b" : torchaudio.models.wav2vec2_xlsr_2b, - "hubert_base" : torchaudio.models.hubert_base, - "hubert_large" : torchaudio.models.hubert_large, - "hubert_xlarge" : torchaudio.models.hubert_xlarge, - "wavlm_model" : torchaudio.models.wavlm_model, - "wavlm_base" : torchaudio.models.wavlm_base, - "wavlm_large" : torchaudio.models.wavlm_large, -} - -speech_recognition_models = { - "conformer" : torchaudio.models.Conformer, - "deepspeech" : torchaudio.models.DeepSpeech, - "emformer" : torchaudio.models.Emformer, - "wav2letter" : torchaudio.models.Wav2Letter -} - -source_separation_models = { - "conv_tasnet_base" : torchaudio.models.conv_tasnet_base, - "hdemucs_low" : torchaudio.models.hdemucs_low, - "hdemucs_medium" : torchaudio.models.hdemucs_medium, - "hdemucs_high" : torchaudio.models.hdemucs_high, -} - -speech_quality_models = { - "squim_objective_base" : torchaudio.models.squim_objective_base, - "squim_subjective_base" : torchaudio.models.squim_subjective_base -} - -speech_synthesis_models = { - "tacotron2" : torchaudio.models.Tacotron2, - "wavernn" : torchaudio.models.WaveRNN -} - -decoder_models = { - "emformer_rnnt_base" : torchaudio.models.emformer_rnnt_base -} - - -def get_network_names(): - return sorted(list(wav2vec_models.keys()) + - list(speech_recognition_models.keys()) + - list(source_separation_models.keys()) + - list(speech_quality_models.keys()) + - list(speech_synthesis_models.keys()) + - list(decoder_models.keys())) - -def 
get_input_type(network_name): - if network_name in wav2vec_models or network_name in source_separation_models or network_name in speech_quality_models: - return "waveform" - elif network_name in speech_recognition_models: - return "acoustic features" - elif network_name in speech_synthesis_models: - if "wavernn" in network_name: - return "waveform" - else: - return "tokens" - elif network_name in decoder_models: - return "" - -def get_input(network_name, network, batch_size): - if network_name in wav2vec_models: - inp = torch.randn(batch_size, FRAME_COUNT, device="cuda") - elif network_name in source_separation_models: - if "hdemucs" in network_name: - inp = torch.randn(batch_size, 2, FRAME_COUNT, device="cuda") - else: - inp = torch.randn(batch_size, 1, FRAME_COUNT, device="cuda") - elif network_name in speech_recognition_models: - if "deepspeech" in network_name: - #number of channels must be specified for deepspeech - inp = torch.randn(batch_size, 1, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda") - elif "wav2letter" in network_name: - inp = torch.randn(batch_size, ACOUSTIC_FEATURES_SIZE, FRAME_COUNT, device="cuda") - elif "emformer" in network_name: - inp = (torch.randn(batch_size, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda"), - torch.randint(1, FRAME_COUNT, (batch_size,)).to(device="cuda")) - elif "conformer" in network_name: - lengths = torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda") - inp = (torch.rand(batch_size, int(lengths.max()), 80, device="cuda"), - lengths) - else: - lengths = torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda") - inp = (lengths, - torch.rand(batch_size, int(lengths.max()), network.input_dim, device="cuda")) - elif network_name in speech_quality_models: - if "subjective" in network_name: - inp = (torch.randn(batch_size, FRAME_COUNT, device="cuda"), - torch.randn(batch_size, FRAME_COUNT, device="cuda")) - else: - inp = torch.randn(batch_size, FRAME_COUNT, device="cuda") - elif network_name in 
speech_synthesis_models: - if "wavernn" in network_name: - spec_frames = 64 - waveform_length = HOP_LENGTH * (spec_frames - 4) - - inp = (torch.rand(batch_size, 1, waveform_length, device="cuda"), - torch.rand(batch_size, 1, N_FREQ, spec_frames, device="cuda")) - else: - n_mels = 80 - max_mel_specgram_length = 300 - max_text_length = 100 - inp = (torch.randint(0, 148, (batch_size, max_text_length), dtype=torch.int32, device="cuda"), - max_text_length * torch.ones((batch_size,), device="cuda"), - torch.rand( - batch_size, - n_mels, - max_mel_specgram_length, - device="cuda", - ), - max_mel_specgram_length * torch.ones((batch_size,), dtype=torch.int32, device="cuda")) - elif network_name in decoder_models: - right_context_length = 4 - max_input_length = 61 - max_target_length = 23 - - inp = (torch.rand(batch_size, max_input_length + right_context_length, 80, device="cuda"), - torch.randint(1, max_input_length + 1, (batch_size,), device="cuda"), - torch.randint(0, 256, (batch_size, max_target_length), device="cuda"), - torch.randint(1, max_target_length + 1, (batch_size,), device="cuda"), - None) - return inp - -def get_network(network_name): - if network_name in wav2vec_models: - return wav2vec_models[network_name](aux_num_out=29).to(device="cuda") - elif network_name in source_separation_models: - if "hdemucs" in network_name: - return source_separation_models[network_name](sources = ["vocals"]).to(device="cuda") - else: - return source_separation_models[network_name]().to(device="cuda") - elif network_name in speech_recognition_models: - if "deepspeech" in network_name: - return speech_recognition_models[network_name](n_feature = ACOUSTIC_FEATURES_SIZE).to(device="cuda") - elif "wav2letter" in network_name: - return speech_recognition_models[network_name](num_features = ACOUSTIC_FEATURES_SIZE).to(device="cuda") - elif "emformer" in network_name: - return speech_recognition_models[network_name](input_dim = ACOUSTIC_FEATURES_SIZE, - num_heads=8, - ffn_dim=1024, - 
num_layers=20, - segment_length=4).to(device="cuda") - elif "conformer" in network_name: - return speech_recognition_models[network_name](input_dim = 80, - num_heads=4, - ffn_dim=128, - num_layers=4, - depthwise_conv_kernel_size=31).to(device="cuda") - elif network_name in speech_quality_models: - return speech_quality_models[network_name]().to(device="cuda") - elif network_name in speech_synthesis_models: - if "wavernn" in network_name: - return speech_synthesis_models[network_name](upsample_scales = [3, 3, 4], n_classes = 10, - hop_length = HOP_LENGTH, n_freq = 128).to(device="cuda") - else: - return speech_synthesis_models[network_name]().to(device="cuda") - elif network_name in decoder_models: - return decoder_models[network_name](num_symbols = 256).to(device="cuda") - else: - print ("ERROR: not a supported model '%s'" % network_name) - sys.exit(1) - - -def get_output_selection(network_name): - if network_name in wav2vec_models: - return 0 - elif "conformer" in network_name or "emformer" in network_name: - return 0 - elif "objective" in network_name: - return 0 - elif "tacotron2" in network_name: - return 1 - return None - -def forwardbackward(inp, optimizer, network, amp_opt_level, network_name, batch_size, flops_prof_step=0): +def forwardbackward(inp, optimizer, network, amp_opt_level, network_name, batch_size, criterion, flops_prof_step=0): optimizer.zero_grad() if flops_prof_step: prof = FlopsProfiler(network) prof.start_profile() - out = network(*inp) + out = network(**inp) output_index = get_output_selection(network_name) if output_index is not None: out = out[0] - - if network_name in wav2vec_models: - out = F.log_softmax(out, dim=-1) - elif network_name in speech_recognition_models: - out = F.log_softmax(out, dim=-1) - - - target = torch.randn_like(out) - #print ("inp", inp.shape) - print ("out", out.shape) - - #if network_name in wav2vec_models: - loss = torch.nn.functional.mse_loss(out, target) + loss = calculate_loss(network_name, criterion, out) # 
End profiler here if only to profile forward pass @@ -340,6 +142,7 @@ def run_benchmarking(local_rank, params): torch.cuda.set_device("cuda:0") network = get_network(net) + criterion = get_criterion(net) if "shufflenet" == net: network.apply(weight_init) @@ -403,8 +206,8 @@ def run_benchmarking(local_rank, params): ## warmup. print ("INFO: running forward and backward for warmup.") - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size) - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion) time.sleep(1) torch.cuda.synchronize() @@ -432,7 +235,7 @@ def trace_ready_callback(prof): on_trace_ready=trace_ready_callback) as prof: for i in range(iterations): with record_function(f"iteration {i}"): - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion) prof.step() torch.cuda.synchronize() print(prof.key_averages().table(sort_by="cuda_time_total")) @@ -441,9 +244,9 @@ def trace_ready_callback(prof): with torch.autograd.profiler.emit_nvtx(enabled=autograd_profiler): for i in range(iterations): if i == flops_prof_step: - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, i) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, i) else: - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion) torch.cuda.synchronize() tm2 = time.time() From fbdd71a7cbb6a2f012ecbdb0050a9d48e98b168a Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 28 Oct 2025 08:35:00 +0000 Subject: [PATCH 06/22] refactored the audio benchmarking code so that most of the lower level code is inside the audio folder, added backward pass code for 
def get_criterion(network_name):
    """Return the training criterion (loss callable) for *network_name*.

    Returns None for networks with no dedicated loss; callers then fall
    back to a generic MSE against a random target (see calculate_loss).
    """
    criterion = None
    if network_name in ["wav2letter", "conformer", "deepspeech"] or "wav2vec2" in network_name:
        # CTC loss over the character vocabulary used by the ASR models;
        # the blank symbol's index comes from the LanguageModel mapping.
        char_blank = "*"
        char_space = " "
        char_apostrophe = "'"
        labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase
        language_model = LanguageModel(labels, char_blank, char_space)
        criterion = torch.nn.CTCLoss(blank=language_model.mapping[char_blank], zero_infinity=False)
    elif "hubert_pretrain" in network_name:
        criterion = hubert_loss
    elif "wavernn" in network_name:
        criterion = nn.CrossEntropyLoss()
    elif "conv_tasnet" in network_name:
        criterion = si_sdr_loss
    elif "tacotron2" in network_name:
        criterion = nn.MSELoss()
    return criterion


def calculate_loss(network_name, criterion, output):
    """Compute a benchmark loss for *output* of network *network_name*.

    Targets are randomly generated with the shapes each criterion expects;
    the loss value is meaningless — it only needs to be differentiable so
    the backward pass can be benchmarked.
    """
    if criterion is None:
        # Generic fallback: MSE against a random target of the same shape.
        target = torch.randn_like(output)
        return torch.nn.functional.mse_loss(output, target)
    if network_name in ["wav2letter", "conformer", "deepspeech"] or "wav2vec2" in network_name:
        # CTCLoss expects (T, N, C) log-probs plus per-sample lengths.
        output = output.transpose(-1, -2).transpose(0, 1)
        T, N, C = output.shape
        target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.long)
        target = torch.randint(
            low=1,
            high=C,
            size=(sum(target_lengths),),
            dtype=torch.long,
        )
        tensors_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long)
        loss = criterion(output, target, tensors_lengths, target_lengths)
    elif "hubert_pretrain" in network_name:
        # (debug print of the output shapes removed — it ran every
        # iteration and skewed the benchmark timings)
        logit_m, logit_u, feature_penalty = output
        loss = criterion(logit_m, logit_u, feature_penalty)
    elif "wavernn" in network_name:
        target = torch.randn_like(output)
        output, target = output.squeeze(1), target.squeeze(1)
        output = output.transpose(1, 2)
        target = target.transpose(1, 2)
        loss = criterion(output, target)
    elif "conv_tasnet" in network_name:
        batch, _, time = output.shape
        # high=2 so the binary mask actually contains ones; the old
        # high=1 produced an all-zero mask, and si_sdr_loss divides by
        # mask.sum(), i.e. by zero.
        mask = torch.randint(low=0, high=2, size=(batch, 1, time), dtype=torch.long).cuda()
        target = torch.randn_like(output)
        loss = criterion(output, target, mask)
    elif "tacotron2" in network_name:
        target = torch.randn_like(output)
        loss = criterion(output, target)
    return loss
# Model registries for the remaining audio tasks (the wav2vec-style
# acoustic models are registered in wav2vec_models above).
speech_recognition_models = {
    "conformer" : torchaudio.models.Conformer,
    "deepspeech" : torchaudio.models.DeepSpeech,
    "emformer" : torchaudio.models.Emformer,
    "wav2letter" : torchaudio.models.Wav2Letter
}

source_separation_models = {
    "conv_tasnet_base" : torchaudio.models.conv_tasnet_base,
    "hdemucs_low" : torchaudio.models.hdemucs_low,
    "hdemucs_medium" : torchaudio.models.hdemucs_medium,
    "hdemucs_high" : torchaudio.models.hdemucs_high,
}

speech_quality_models = {
    "squim_objective_base" : torchaudio.models.squim_objective_base,
    "squim_subjective_base" : torchaudio.models.squim_subjective_base
}

speech_synthesis_models = {
    "tacotron2" : torchaudio.models.Tacotron2,
    "wavernn" : torchaudio.models.WaveRNN
}

hubert_pretrain_models = {
    "hubert_pretrain_base" : torchaudio.models.hubert_pretrain_base,
}


def get_network_names():
    """Return the sorted list of all benchmarkable network names."""
    return sorted(list(wav2vec_models.keys()) +
                  list(speech_recognition_models.keys()) +
                  list(source_separation_models.keys()) +
                  list(speech_quality_models.keys()) +
                  list(speech_synthesis_models.keys()) +
                  list(hubert_pretrain_models.keys()))


def get_network(network_name):
    """Instantiate *network_name* on the CUDA device.

    Constructor arguments are fixed benchmark defaults. Exits the process
    with an error message for unknown names.
    """
    if network_name in wav2vec_models:
        return wav2vec_models[network_name](aux_num_out=CLASSES_COUNT).to(device="cuda")
    elif network_name in source_separation_models:
        if "hdemucs" in network_name:
            return source_separation_models[network_name](sources = ["vocals"]).to(device="cuda")
        else:
            return source_separation_models[network_name]().to(device="cuda")
    elif network_name in speech_recognition_models:
        if "deepspeech" in network_name:
            return speech_recognition_models[network_name](n_feature = ACOUSTIC_FEATURES_SIZE).to(device="cuda")
        elif "wav2letter" in network_name:
            return speech_recognition_models[network_name](num_features = ACOUSTIC_FEATURES_SIZE).to(device="cuda")
        elif "emformer" in network_name:
            return speech_recognition_models[network_name](input_dim = ACOUSTIC_FEATURES_SIZE,
                                                           num_heads=8,
                                                           ffn_dim=1024,
                                                           num_layers=20,
                                                           segment_length=4).to(device="cuda")
        elif "conformer" in network_name:
            return speech_recognition_models[network_name](input_dim = 80,
                                                           num_heads=4,
                                                           ffn_dim=128,
                                                           num_layers=4,
                                                           depthwise_conv_kernel_size=31).to(device="cuda")
    elif network_name in speech_quality_models:
        return speech_quality_models[network_name]().to(device="cuda")
    elif network_name in speech_synthesis_models:
        if "wavernn" in network_name:
            # NOTE(review): n_freq=128 is hard-coded here while get_input
            # uses the N_FREQ constant — keep the two in sync.
            return speech_synthesis_models[network_name](upsample_scales = [3, 3, 4], n_classes = 10,
                                                         hop_length = HOP_LENGTH, n_freq = 128).to(device="cuda")
        else:
            return speech_synthesis_models[network_name]().to(device="cuda")
    elif network_name in hubert_pretrain_models:
        return hubert_pretrain_models[network_name]().to(device="cuda")
    else:
        print ("ERROR: not a supported model '%s'" % network_name)
        sys.exit(1)


def get_input_type(network_name):
    """Return a short human-readable description of the network's input."""
    if network_name in wav2vec_models or network_name in source_separation_models or network_name in speech_quality_models:
        return "waveform"
    elif network_name in speech_recognition_models:
        return "acoustic features"
    elif network_name in speech_synthesis_models:
        if "wavernn" in network_name:
            return "waveform"
        else:
            return "tokens"
    elif network_name in hubert_pretrain_models:
        # Previously fell through and returned None; hubert pretraining
        # consumes raw waveforms (see get_input below).
        return "waveform"


def get_input(network_name, network, batch_size):
    """Build a kwargs dict of random CUDA tensors for network(**inp)."""
    if network_name in wav2vec_models:
        inp = {"waveforms": torch.randn(batch_size, FRAME_COUNT, device="cuda")}
    elif network_name in source_separation_models:
        if "hdemucs" in network_name:
            inp = {"input" : torch.randn(batch_size, 2, FRAME_COUNT, device="cuda")}
        else:
            inp = {"input" : torch.randn(batch_size, 1, FRAME_COUNT, device="cuda")}
    elif network_name in speech_recognition_models:
        if "deepspeech" in network_name:
            # number of channels must be specified for deepspeech
            inp = {"x" : torch.randn(batch_size, 1, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda")}
        elif "wav2letter" in network_name:
            inp = {"x" : torch.randn(batch_size, ACOUSTIC_FEATURES_SIZE, FRAME_COUNT, device="cuda")}
        elif "emformer" in network_name:
            inp = {"input" : torch.randn(batch_size, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda"),
                   "lengths" : torch.randint(1, FRAME_COUNT, (batch_size,)).to(device="cuda")}
        elif "conformer" in network_name:
            lengths = torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda")
            inp = {"input" : torch.rand(batch_size, int(lengths.max()), 80, device="cuda"),
                   "lengths" : lengths}
    elif network_name in speech_quality_models:
        if "subjective" in network_name:
            inp = {"waveform" : torch.randn(batch_size, FRAME_COUNT, device="cuda"),
                   "reference" : torch.randn(batch_size, FRAME_COUNT, device="cuda")}
        else:
            inp = {"x" : torch.randn(batch_size, FRAME_COUNT, device="cuda")}
    elif network_name in speech_synthesis_models:
        if "wavernn" in network_name:
            spec_frames = 64
            waveform_length = HOP_LENGTH * (spec_frames - 4)
            inp = {"waveform" : torch.rand(batch_size, 1, waveform_length, device="cuda"),
                   "specgram": torch.rand(batch_size, 1, N_FREQ, spec_frames, device="cuda")}
        elif "tacotron2" in network_name:
            n_mels = 80
            max_mel_specgram_length = 300
            max_text_length = 100
            # NOTE(review): token_lengths is a float tensor while
            # mel_specgram_lengths is int32 — confirm Tacotron2 accepts this.
            inp = {"tokens" : torch.randint(0, 148, (batch_size, max_text_length), dtype=torch.int32, device="cuda"),
                   "token_lengths" : max_text_length * torch.ones((batch_size,), device="cuda"),
                   "mel_specgram": torch.rand(batch_size, n_mels, max_mel_specgram_length, device="cuda"),
                   "mel_specgram_lengths" : max_mel_specgram_length * torch.ones((batch_size,), dtype=torch.int32, device="cuda")}
    elif network_name in hubert_pretrain_models:
        inp = {"waveforms" : torch.rand(batch_size, FRAME_COUNT, device="cuda"),
               "labels" : torch.randint(0, 100, (batch_size, FRAME_COUNT), dtype=torch.int32, device="cuda"),
               "audio_lengths" : torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda")}
    return inp


def get_output_selection(network_name):
    """Index of the trainable tensor in the model's output tuple, or None
    when the model already returns a plain tensor."""
    if "wav2vec2" in network_name:
        return 0
    elif "conformer" in network_name or "emformer" in network_name:
        return 0
    elif "objective" in network_name:
        return 0
    elif "tacotron2" in network_name:
        return 1
    return None
class LanguageModel:
    """Bidirectional mapping between characters and integer indices.

    A single dict, ``mapping``, holds both directions (char -> index and
    index -> char), so it serves encoding and CTC-style decoding alike.
    """

    def __init__(self, labels, char_blank, char_space):
        self.char_space = char_space
        self.char_blank = char_blank

        labels = list(labels)
        self.length = len(labels)
        index_to_char = dict(enumerate(labels))
        char_to_index = {ch: idx for idx, ch in index_to_char.items()}
        # Merged two-way lookup table.
        self.mapping = {**index_to_char, **char_to_index}

    def encode(self, iterable):
        """Map characters to indices; recurses into nested lists."""
        if isinstance(iterable, list):
            return [self.encode(item) for item in iterable]
        # NOTE(review): adding the blank's index looks like a no-op when
        # the blank is label 0 — confirm against the training pipeline.
        blank_index = self.mapping[self.char_blank]
        return [self.mapping[ch] + blank_index for ch in iterable]

    def decode(self, tensor):
        """Collapse repeats and strip blanks (CTC-style).

        Not idempotent: decoding an already-clean string collapses its
        legitimate repeated characters too.
        """
        if len(tensor) > 0 and isinstance(tensor[0], list):
            return [self.decode(item) for item in tensor]
        chars = (self.mapping[i] for i in tensor)
        collapsed = "".join(ch for ch, _ in itertools.groupby(chars))
        return collapsed.replace(self.char_blank, "")

    def __len__(self):
        return self.length
def sdr(
    estimate: torch.Tensor, reference: torch.Tensor, mask: Optional[torch.Tensor] = None, epsilon: float = 1e-8
) -> torch.Tensor:
    """Computes source-to-distortion ratio.

    1. scale the reference signal with power(s_est * s_ref) / power(s_ref * s_ref)
    2. compute SNR between adjusted estimate and reference.

    Args:
        estimate (torch.Tensor): Estimated signal.
            Shape: [batch, speakers (can be 1), time frame]
        reference (torch.Tensor): Reference signal.
            Shape: [batch, speakers, time frame]
        mask (torch.Tensor or None, optional): Binary mask to indicate padded value (0) or valid value (1).
            Shape: [batch, 1, time frame]
        epsilon (float, optional): constant value used to stabilize division.

    Returns:
        torch.Tensor: source-to-distortion ratio.
            Shape: [batch, speaker]

    References:
        - Single-channel multi-speaker separation using deep clustering
          Y. Isik, J. Le Roux, Z. Chen, S. Watanabe, and J. R. Hershey
        - Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation
          Luo, Yi and Mesgarani, Nima
          https://arxiv.org/abs/1809.07454

    Notes:
        This function is tested to produce the exact same result as
        https://github.com/naplab/Conv-TasNet/blob/e66d82a8f956a69749ec8a4ae382217faa097c5c/utility/sdr.py#L34-L56
    """
    reference_pow = reference.pow(2).mean(axis=2, keepdim=True)
    mix_pow = (estimate * reference).mean(axis=2, keepdim=True)
    scale = mix_pow / (reference_pow + epsilon)

    reference = scale * reference
    error = estimate - reference

    reference_pow = reference.pow(2)
    error_pow = error.pow(2)

    if mask is None:
        reference_pow = reference_pow.mean(axis=2)
        error_pow = error_pow.mean(axis=2)
    else:
        # Average only over the valid (mask == 1) time frames.
        denom = mask.sum(axis=2)
        reference_pow = (mask * reference_pow).sum(axis=2) / denom
        error_pow = (mask * error_pow).sum(axis=2) / denom

    return 10 * torch.log10(reference_pow) - 10 * torch.log10(error_pow)


class PIT(torch.nn.Module):
    """Applies utterance-level speaker permutation.

    Computes the maximum possible value of the given utility function
    over the permutations of the speakers.

    Args:
        utility_func (function):
            Function that computes the utility (opposite of loss) with signature of
            (estimate: torch.Tensor, reference: torch.Tensor) -> torch.Tensor
            where input Tensors are shape of [batch, speakers, frame] and
            the output Tensor is shape of [batch, speakers].

    References:
        - Multi-talker Speech Separation with Utterance-level Permutation Invariant Training of
          Deep Recurrent Neural Networks
          Morten Kolbæk, Dong Yu, Zheng-Hua Tan and Jesper Jensen
          https://arxiv.org/abs/1703.06284
    """

    def __init__(self, utility_func):
        super().__init__()
        self.utility_func = utility_func

    def forward(
        self,
        estimate: torch.Tensor,
        reference: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        epsilon: float = 1e-8,
    ) -> torch.Tensor:
        """Compute utterance-level PIT loss.

        Args:
            estimate (torch.Tensor): Estimated source signals.
                Shape: [batch, speakers, time frame]
            reference (torch.Tensor): Reference (original) source signals.
                Shape: [batch, speakers, time frame]
            mask (torch.Tensor or None, optional): Binary mask to indicate padded value (0) or valid value (1).
                Shape: [batch, 1, time frame]
            epsilon (float, optional): constant value used to stabilize division.

        Returns:
            torch.Tensor: Maximum criterion over the speaker permutations.
                Shape: [batch, ]
        """
        assert estimate.shape == reference.shape

        batch_size, num_speakers = reference.shape[:2]
        num_permute = math.factorial(num_speakers)

        util_mat = torch.zeros(batch_size, num_permute, dtype=estimate.dtype, device=estimate.device)
        for i, idx in enumerate(permutations(range(num_speakers))):
            util = self.utility_func(estimate, reference[:, idx, :], mask=mask, epsilon=epsilon)
            util_mat[:, i] = util.mean(dim=1)  # take the average over speaker dimension
        return util_mat.max(dim=1).values


_sdr_pit = PIT(utility_func=sdr)


def sdr_pit(
    estimate: torch.Tensor, reference: torch.Tensor, mask: Optional[torch.Tensor] = None, epsilon: float = 1e-8
):
    """Computes scale-invariant source-to-distortion ratio with PIT.

    1. adjust both estimate and reference to have 0-mean
    2. scale the reference signal with power(s_est * s_ref) / power(s_ref * s_ref)
    3. compute SNR between adjusted estimate and reference.

    Args:
        estimate (torch.Tensor): Estimated signal.
            Shape: [batch, speakers (can be 1), time frame]
        reference (torch.Tensor): Reference signal.
            Shape: [batch, speakers, time frame]
        mask (torch.Tensor or None, optional): Binary mask to indicate padded value (0) or valid value (1).
            Shape: [batch, 1, time frame]
        epsilon (float, optional): constant value used to stabilize division.

    Returns:
        torch.Tensor: scale-invariant source-to-distortion ratio.
            Shape: [batch, ]

    Notes:
        This function is tested to produce the exact same result as the reference implementation,
        *when the inputs have 0-mean*
        https://github.com/naplab/Conv-TasNet/blob/e66d82a8f956a69749ec8a4ae382217faa097c5c/utility/sdr.py#L107-L153
    """
    return _sdr_pit(estimate, reference, mask, epsilon)


def sdri(
    estimate: torch.Tensor,
    reference: torch.Tensor,
    mix: torch.Tensor,
    mask: Optional[torch.Tensor] = None,
    epsilon: float = 1e-8,
) -> torch.Tensor:
    """Compute the improvement of SDR (SDRi).

    This function computes how much SDR is improved if the estimation is changed from
    the original mixture signal to the actual estimated source signals. That is,
    ``SDR(estimate, reference) - SDR(mix, reference)``.

    For computing ``SDR(estimate, reference)``, PIT (permutation invariant training) is applied,
    so that the best combination of sources between the reference signals and the estimate signals
    is picked.

    Args:
        estimate (torch.Tensor): Estimated source signals.
            Shape: [batch, speakers, time frame]
        reference (torch.Tensor): Reference (original) source signals.
            Shape: [batch, speakers, time frame]
        mix (torch.Tensor): Mixed source signals, from which the estimated signals were generated.
            Shape: [batch, speakers == 1, time frame]
        mask (torch.Tensor or None, optional): Binary mask to indicate padded value (0) or valid value (1).
            Shape: [batch, 1, time frame]
        epsilon (float, optional): constant value used to stabilize division.

    Returns:
        torch.Tensor: Improved SDR.
            Shape: [batch, ]

    References:
        - Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation
          Luo, Yi and Mesgarani, Nima
          https://arxiv.org/abs/1809.07454
    """
    sdr_ = sdr_pit(estimate, reference, mask=mask, epsilon=epsilon)  # [batch, ]
    base_sdr = sdr(mix, reference, mask=mask, epsilon=epsilon)  # [batch, speaker]
    return sdr_ - base_sdr.mean(dim=1)


def si_sdr_loss(estimate: torch.Tensor, reference: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    """Compute the Si-SDR loss.

    Args:
        estimate (torch.Tensor): Estimated source signals.
            Tensor of dimension (batch, speakers, time)
        reference (torch.Tensor): Reference (original) source signals.
            Tensor of dimension (batch, speakers, time)
        mask (torch.Tensor): Mask to indicate padded value (0) or valid value (1).
            Tensor of dimension (batch, 1, time)

    Returns:
        torch.Tensor: Si-SDR loss. Scalar (negative mean Si-SDR over the batch).
    """
    estimate = estimate - estimate.mean(axis=2, keepdim=True)
    reference = reference - reference.mean(axis=2, keepdim=True)

    # BUG FIX: the original called ``sdr.sdr_pit`` — but ``sdr`` is a
    # function in this module, not a module object, so that raised
    # AttributeError at runtime. Call the module-level sdr_pit directly.
    si_sdri = sdr_pit(estimate, reference, mask=mask)
    return -si_sdri.mean()
import torch
import torchaudio
import sys

# Benchmark input-shape defaults shared by get_network / get_input.
ACOUSTIC_FEATURES_SIZE = 32
FRAME_COUNT = 1024
HOP_LENGTH = 36
N_FREQ = 128

# Model registries, grouped by audio task.
wav2vec_models = {
    "wav2vec2_base" : torchaudio.models.wav2vec2_base,
    "wav2vec2_large" : torchaudio.models.wav2vec2_large,
    "wav2vec2_large_lv60k" : torchaudio.models.wav2vec2_large_lv60k,
    "wav2vec2_xlsr_300m" : torchaudio.models.wav2vec2_xlsr_300m,
    "wav2vec2_xlsr_1b" : torchaudio.models.wav2vec2_xlsr_1b,
    "wav2vec2_xlsr_2b" : torchaudio.models.wav2vec2_xlsr_2b,
    "hubert_base" : torchaudio.models.hubert_base,
    "hubert_large" : torchaudio.models.hubert_large,
    "hubert_xlarge" : torchaudio.models.hubert_xlarge,
    "wavlm_base" : torchaudio.models.wavlm_base,
    "wavlm_large" : torchaudio.models.wavlm_large,
}

speech_recognition_models = {
    "conformer" : torchaudio.models.Conformer,
    "deepspeech" : torchaudio.models.DeepSpeech,
    "emformer" : torchaudio.models.Emformer,
    "wav2letter" : torchaudio.models.Wav2Letter
}

source_separation_models = {
    "conv_tasnet_base" : torchaudio.models.conv_tasnet_base,
    "hdemucs_low" : torchaudio.models.hdemucs_low,
    "hdemucs_medium" : torchaudio.models.hdemucs_medium,
    "hdemucs_high" : torchaudio.models.hdemucs_high,
}

speech_quality_models = {
    "squim_objective_base" : torchaudio.models.squim_objective_base,
    "squim_subjective_base" : torchaudio.models.squim_subjective_base
}

speech_synthesis_models = {
    "tacotron2" : torchaudio.models.Tacotron2,
    "wavernn" : torchaudio.models.WaveRNN
}

decoder_models = {
    "emformer_rnnt_base" : torchaudio.models.emformer_rnnt_base
}


def get_network_names():
    """Return the sorted union of every registry's model names."""
    names = []
    for registry in (wav2vec_models, speech_recognition_models,
                     source_separation_models, speech_quality_models,
                     speech_synthesis_models, decoder_models):
        names.extend(registry.keys())
    return sorted(names)


def get_network(network_name):
    """Construct *network_name* with benchmark defaults on CUDA; exit on unknown names."""
    if network_name in wav2vec_models:
        return wav2vec_models[network_name](aux_num_out=29).to(device="cuda")
    elif network_name in source_separation_models:
        if "hdemucs" in network_name:
            return source_separation_models[network_name](sources = ["vocals"]).to(device="cuda")
        else:
            return source_separation_models[network_name]().to(device="cuda")
    elif network_name in speech_recognition_models:
        if "deepspeech" in network_name:
            return speech_recognition_models[network_name](n_feature = ACOUSTIC_FEATURES_SIZE).to(device="cuda")
        elif "wav2letter" in network_name:
            return speech_recognition_models[network_name](num_features = ACOUSTIC_FEATURES_SIZE).to(device="cuda")
        elif "emformer" in network_name:
            return speech_recognition_models[network_name](input_dim = ACOUSTIC_FEATURES_SIZE,
                                                           num_heads=8,
                                                           ffn_dim=1024,
                                                           num_layers=20,
                                                           segment_length=4).to(device="cuda")
        elif "conformer" in network_name:
            return speech_recognition_models[network_name](input_dim = 80,
                                                           num_heads=4,
                                                           ffn_dim=128,
                                                           num_layers=4,
                                                           depthwise_conv_kernel_size=31).to(device="cuda")
    elif network_name in speech_quality_models:
        return speech_quality_models[network_name]().to(device="cuda")
    elif network_name in speech_synthesis_models:
        if "wavernn" in network_name:
            return speech_synthesis_models[network_name](upsample_scales = [3, 3, 4], n_classes = 10,
                                                         hop_length = HOP_LENGTH, n_freq = 128).to(device="cuda")
        else:
            return speech_synthesis_models[network_name]().to(device="cuda")
    elif network_name in decoder_models:
        return decoder_models[network_name](num_symbols = 256).to(device="cuda")
    else:
        print ("ERROR: not a supported model '%s'" % network_name)
        sys.exit(1)


def get_input_type(network_name):
    """Short human-readable description of the network's input modality."""
    if network_name in wav2vec_models or network_name in source_separation_models or network_name in speech_quality_models:
        return "waveform"
    elif network_name in speech_recognition_models:
        return "acoustic features"
    elif network_name in speech_synthesis_models:
        return "waveform" if "wavernn" in network_name else "tokens"
    elif network_name in decoder_models:
        return ""


def get_input(network_name, network, batch_size):
    """Build a kwargs dict of random CUDA tensors for network(**inp)."""
    if network_name in wav2vec_models:
        inp = {"waveforms": torch.randn(batch_size, FRAME_COUNT, device="cuda")}
    elif network_name in source_separation_models:
        channels = 2 if "hdemucs" in network_name else 1
        inp = {"input" : torch.randn(batch_size, channels, FRAME_COUNT, device="cuda")}
    elif network_name in speech_recognition_models:
        if "deepspeech" in network_name:
            # number of channels must be specified for deepspeech
            inp = {"x" : torch.randn(batch_size, 1, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda")}
        elif "wav2letter" in network_name:
            inp = {"x" : torch.randn(batch_size, ACOUSTIC_FEATURES_SIZE, FRAME_COUNT, device="cuda")}
        elif "emformer" in network_name:
            inp = {"input" : torch.randn(batch_size, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda"),
                   "lengths" : torch.randint(1, FRAME_COUNT, (batch_size,)).to(device="cuda")}
        elif "conformer" in network_name:
            lengths = torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda")
            inp = {"input" : torch.rand(batch_size, int(lengths.max()), 80, device="cuda"),
                   "lengths" : lengths}
    elif network_name in speech_quality_models:
        if "subjective" in network_name:
            inp = {"waveform" : torch.randn(batch_size, FRAME_COUNT, device="cuda"),
                   "reference" : torch.randn(batch_size, FRAME_COUNT, device="cuda")}
        else:
            inp = {"x" : torch.randn(batch_size, FRAME_COUNT, device="cuda")}
    elif network_name in speech_synthesis_models:
        if "wavernn" in network_name:
            spec_frames = 64
            waveform_length = HOP_LENGTH * (spec_frames - 4)
            inp = {"waveform" : torch.rand(batch_size, 1, waveform_length, device="cuda"),
                   "specgram": torch.rand(batch_size, 1, N_FREQ, spec_frames, device="cuda")}
        elif "tacotron2" in network_name:
            n_mels = 80
            max_mel_specgram_length = 300
            max_text_length = 100
            inp = {"tokens" : torch.randint(0, 148, (batch_size, max_text_length), dtype=torch.int32, device="cuda"),
                   "token_lengths" : max_text_length * torch.ones((batch_size,), device="cuda"),
                   "mel_specgram": torch.rand(batch_size, n_mels, max_mel_specgram_length, device="cuda"),
                   "mel_specgram_lengths" : max_mel_specgram_length * torch.ones((batch_size,), dtype=torch.int32, device="cuda")}
    elif network_name in decoder_models:
        right_context_length = 4
        max_input_length = 61
        max_target_length = 23
        inp = {"sources" : torch.rand(batch_size, max_input_length + right_context_length, 80, device="cuda"),
               "source_lengths" : torch.randint(1, max_input_length + 1, (batch_size,), device="cuda"),
               "targets" : torch.randint(0, 256, (batch_size, max_target_length), device="cuda"),
               "target_lengths" : torch.randint(1, max_target_length + 1, (batch_size,), device="cuda"),
               "predictor_state" : None}
    return inp


def get_output_selection(network_name):
    """Index of the trainable tensor in the output tuple, or None for plain tensors."""
    if network_name in wav2vec_models:
        return 0
    elif "conformer" in network_name or "emformer" in network_name:
        return 0
    elif "objective" in network_name:
        return 0
    elif "tacotron2" in network_name:
        return 1
    return None
+0000 Subject: [PATCH 08/22] add loss functions for more audio models, refactor the input and output code for the audio benchmarking code --- audio/audio_input.py | 64 ++++++++++++++++++++++++++++ audio/audio_loss.py | 22 ++++++---- audio/audio_model.py | 84 +++---------------------------------- audio/audio_output.py | 13 ++++++ micro_benchmarking_audio.py | 5 ++- 5 files changed, 100 insertions(+), 88 deletions(-) create mode 100644 audio/audio_input.py create mode 100644 audio/audio_output.py diff --git a/audio/audio_input.py b/audio/audio_input.py new file mode 100644 index 0000000..cd4b99a --- /dev/null +++ b/audio/audio_input.py @@ -0,0 +1,64 @@ +import torch +from audio.audio_model import * + + +def get_input_type(network_name): + if network_name in acoustic_models or network_name in source_separation_models or network_name in speech_quality_models: + return "waveform" + elif network_name in speech_recognition_models: + return "acoustic features" + elif network_name in speech_synthesis_models: + if "wavernn" in network_name: + return "waveform" + else: + return "tokens" + + +def get_input(network_name, network, batch_size): + if network_name in acoustic_models: + inp = {"waveforms": torch.randn(batch_size, FRAME_COUNT, device="cuda")} + elif network_name in source_separation_models: + if "hdemucs" in network_name: + inp = {"input" : torch.randn(batch_size, 2, FRAME_COUNT, device="cuda")} + else: + inp = {"input" : torch.randn(batch_size, 1, FRAME_COUNT, device="cuda")} + elif network_name in speech_recognition_models: + if "deepspeech" in network_name: + #number of channels must be specified for deepspeech + inp = {"x" : torch.randn(batch_size, 1, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda")} + elif "wav2letter" in network_name: + inp = {"x" : torch.randn(batch_size, ACOUSTIC_FEATURES_SIZE, FRAME_COUNT, device="cuda")} + elif "emformer" in network_name: + inp = {"input" : torch.randn(batch_size, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda"), + 
"lengths" : torch.randint(1, FRAME_COUNT, (batch_size,)).to(device="cuda")} + elif "conformer" in network_name: + lengths = torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda") + inp = {"input" : torch.rand(batch_size, int(lengths.max()), 80, device="cuda"), + "lengths" : lengths} + elif network_name in speech_quality_models: + if "subjective" in network_name: + inp = {"waveform" : torch.randn(batch_size, FRAME_COUNT, device="cuda"), + "reference" : torch.randn(batch_size, FRAME_COUNT, device="cuda")} + else: + inp = {"x" : torch.randn(batch_size, FRAME_COUNT, device="cuda")} + elif network_name in speech_synthesis_models: + if "wavernn" in network_name: + spec_frames = 64 + waveform_length = HOP_LENGTH * (spec_frames - 4) + + inp = {"waveform" : torch.rand(batch_size, 1, waveform_length, device="cuda"), + "specgram": torch.rand(batch_size, 1, N_FREQ, spec_frames, device="cuda")} + elif "tacotron2" in network_name: + n_mels = 80 + max_mel_specgram_length = 300 + max_text_length = 100 + inp = {"tokens" : torch.randint(0, 148, (batch_size, max_text_length), dtype=torch.int32, device="cuda"), + "token_lengths" : max_text_length * torch.ones((batch_size,), device="cuda"), + "mel_specgram": torch.rand(batch_size, n_mels, max_mel_specgram_length, device="cuda"), + "mel_specgram_lengths" : max_mel_specgram_length * torch.ones((batch_size,), dtype=torch.int32, device="cuda")} + elif network_name in hubert_pretrain_models: + + inp = {"waveforms" : torch.rand(batch_size, FRAME_COUNT, device="cuda"), + "labels" : torch.randint(0, 100, (batch_size, FRAME_COUNT), dtype=torch.int32, device="cuda"), + "audio_lengths" : torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda")} + return inp diff --git a/audio/audio_loss.py b/audio/audio_loss.py index 735dbb4..468e41e 100644 --- a/audio/audio_loss.py +++ b/audio/audio_loss.py @@ -4,25 +4,28 @@ from audio.hubert_loss import hubert_loss from audio.sdr import si_sdr_loss from torch import nn +from audio.audio_model import * 
def get_criterion(network_name): criterion = None - if network_name in ["wav2letter", "conformer", "deepspeech"] or "wav2vec2" in network_name: + if "hubert_pretrain" in network_name: + criterion = hubert_loss + elif network_name in speech_recognition_models or network_name in acoustic_models: char_blank = "*" char_space = " " char_apostrophe = "'" labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase language_model = LanguageModel(labels, char_blank, char_space) criterion = torch.nn.CTCLoss(blank=language_model.mapping[char_blank], zero_infinity=False) - elif "hubert_pretrain" in network_name: - criterion = hubert_loss elif "wavernn" in network_name: criterion = nn.CrossEntropyLoss() elif "conv_tasnet" in network_name: criterion = si_sdr_loss elif "tacotron2" in network_name: criterion = nn.MSELoss() + elif "hdemucs" in network_name or "squim" in network_name: + criterion = nn.L1Loss() return criterion @@ -30,7 +33,11 @@ def calculate_loss(network_name, criterion, output): if criterion is None: target = torch.randn_like(output) return torch.nn.functional.mse_loss(output, target) - if network_name in ["wav2letter", "conformer", "deepspeech"] or "wav2vec2" in network_name: + if "hubert_pretrain" in network_name: + print ("hubert", len(output), output[0].shape, output[1]) + logit_m, logit_u, feature_penalty = output + loss = criterion(logit_m, logit_u, feature_penalty) + elif network_name in speech_recognition_models or network_name in acoustic_models: output = output.transpose(-1, -2).transpose(0, 1) T, N, C = output.shape target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.long) @@ -42,10 +49,6 @@ def calculate_loss(network_name, criterion, output): ) tensors_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long) loss = criterion(output, target, tensors_lengths, target_lengths) - elif "hubert_pretrain" in network_name: - print ("hubert", len(output), output[0].shape, output[1]) - logit_m, logit_u, feature_penalty = 
output - loss = criterion(logit_m, logit_u, feature_penalty) elif "wavernn" in network_name: target = torch.randn_like(output) output, target = output.squeeze(1), target.squeeze(1) @@ -60,4 +63,7 @@ def calculate_loss(network_name, criterion, output): elif "tacotron2" in network_name: target = torch.randn_like(output) loss = criterion(output, target) + elif "hdemucs" in network_name or "squim" in network_name: + target = torch.randn_like(output) + loss = criterion(output, target) return loss \ No newline at end of file diff --git a/audio/audio_model.py b/audio/audio_model.py index f0de1ed..39b05f9 100644 --- a/audio/audio_model.py +++ b/audio/audio_model.py @@ -9,7 +9,7 @@ CLASSES_COUNT = 29 #different audio tasks related models -wav2vec_models = { +acoustic_models = { "wav2vec2_base" : torchaudio.models.wav2vec2_base, "wav2vec2_large" : torchaudio.models.wav2vec2_large, "wav2vec2_large_lv60k" : torchaudio.models.wav2vec2_large_lv60k, @@ -53,7 +53,7 @@ } def get_network_names(): - return sorted(list(wav2vec_models.keys()) + + return sorted(list(acoustic_models.keys()) + list(speech_recognition_models.keys()) + list(source_separation_models.keys()) + list(speech_quality_models.keys()) + @@ -62,8 +62,8 @@ def get_network_names(): def get_network(network_name): - if network_name in wav2vec_models: - return wav2vec_models[network_name](aux_num_out=CLASSES_COUNT).to(device="cuda") + if network_name in acoustic_models: + return acoustic_models[network_name](aux_num_out=CLASSES_COUNT).to(device="cuda") elif network_name in source_separation_models: if "hdemucs" in network_name: return source_separation_models[network_name](sources = ["vocals"]).to(device="cuda") @@ -98,78 +98,4 @@ def get_network(network_name): return hubert_pretrain_models[network_name]().to(device="cuda") else: print ("ERROR: not a supported model '%s'" % network_name) - sys.exit(1) - - -def get_input_type(network_name): - if network_name in wav2vec_models or network_name in source_separation_models or 
network_name in speech_quality_models: - return "waveform" - elif network_name in speech_recognition_models: - return "acoustic features" - elif network_name in speech_synthesis_models: - if "wavernn" in network_name: - return "waveform" - else: - return "tokens" - - -def get_input(network_name, network, batch_size): - if network_name in wav2vec_models: - inp = {"waveforms": torch.randn(batch_size, FRAME_COUNT, device="cuda")} - elif network_name in source_separation_models: - if "hdemucs" in network_name: - inp = {"input" : torch.randn(batch_size, 2, FRAME_COUNT, device="cuda")} - else: - inp = {"input" : torch.randn(batch_size, 1, FRAME_COUNT, device="cuda")} - elif network_name in speech_recognition_models: - if "deepspeech" in network_name: - #number of channels must be specified for deepspeech - inp = {"x" : torch.randn(batch_size, 1, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda")} - elif "wav2letter" in network_name: - inp = {"x" : torch.randn(batch_size, ACOUSTIC_FEATURES_SIZE, FRAME_COUNT, device="cuda")} - elif "emformer" in network_name: - inp = {"input" : torch.randn(batch_size, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda"), - "lengths" : torch.randint(1, FRAME_COUNT, (batch_size,)).to(device="cuda")} - elif "conformer" in network_name: - lengths = torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda") - inp = {"input" : torch.rand(batch_size, int(lengths.max()), 80, device="cuda"), - "lengths" : lengths} - elif network_name in speech_quality_models: - if "subjective" in network_name: - inp = {"waveform" : torch.randn(batch_size, FRAME_COUNT, device="cuda"), - "reference" : torch.randn(batch_size, FRAME_COUNT, device="cuda")} - else: - inp = {"x" : torch.randn(batch_size, FRAME_COUNT, device="cuda")} - elif network_name in speech_synthesis_models: - if "wavernn" in network_name: - spec_frames = 64 - waveform_length = HOP_LENGTH * (spec_frames - 4) - - inp = {"waveform" : torch.rand(batch_size, 1, waveform_length, device="cuda"), - 
"specgram": torch.rand(batch_size, 1, N_FREQ, spec_frames, device="cuda")} - elif "tacotron2" in network_name: - n_mels = 80 - max_mel_specgram_length = 300 - max_text_length = 100 - inp = {"tokens" : torch.randint(0, 148, (batch_size, max_text_length), dtype=torch.int32, device="cuda"), - "token_lengths" : max_text_length * torch.ones((batch_size,), device="cuda"), - "mel_specgram": torch.rand(batch_size, n_mels, max_mel_specgram_length, device="cuda"), - "mel_specgram_lengths" : max_mel_specgram_length * torch.ones((batch_size,), dtype=torch.int32, device="cuda")} - elif network_name in hubert_pretrain_models: - - inp = {"waveforms" : torch.rand(batch_size, FRAME_COUNT, device="cuda"), - "labels" : torch.randint(0, 100, (batch_size, FRAME_COUNT), dtype=torch.int32, device="cuda"), - "audio_lengths" : torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda")} - return inp - - -def get_output_selection(network_name): - if "wav2vec2" in network_name: - return 0 - elif "conformer" in network_name or "emformer" in network_name: - return 0 - elif "objective" in network_name: - return 0 - elif "tacotron2" in network_name: - return 1 - return None \ No newline at end of file + sys.exit(1) \ No newline at end of file diff --git a/audio/audio_output.py b/audio/audio_output.py new file mode 100644 index 0000000..90c08d3 --- /dev/null +++ b/audio/audio_output.py @@ -0,0 +1,13 @@ +from audio.audio_model import * + + +def get_output_selection(network_name): + if network_name in acoustic_models: + return 0 + elif "conformer" in network_name or "emformer" in network_name: + return 0 + elif "objective" in network_name: + return 0 + elif "tacotron2" in network_name: + return 1 + return None \ No newline at end of file diff --git a/micro_benchmarking_audio.py b/micro_benchmarking_audio.py index 7422144..ac07b95 100644 --- a/micro_benchmarking_audio.py +++ b/micro_benchmarking_audio.py @@ -10,8 +10,11 @@ import torch.multiprocessing as mp from fp16util import network_to_half, 
get_param_copy import torch.nn.functional as F -from audio.audio_model import get_network_names, get_network, get_input_type, get_input, get_output_selection +from audio.audio_model import get_network_names, get_network from audio.audio_loss import get_criterion, calculate_loss +from audio.audio_input import get_input_type, get_input +from audio.audio_output import get_output_selection + try: import torch._dynamo From 14050367a6ce570ec3d168e0ad3540d4ece20802 Mon Sep 17 00:00:00 2001 From: skishore Date: Wed, 29 Oct 2025 14:44:01 +0000 Subject: [PATCH 09/22] add hubert loss for hubert pretrained model --- audio/audio_input.py | 8 +++----- audio/audio_loss.py | 14 ++++++++++---- audio/audio_model.py | 10 ++++++---- audio/audio_output.py | 2 -- audio/hubert_loss.py | 36 ++++++++++++++++++++++++++++++++++++ 5 files changed, 55 insertions(+), 15 deletions(-) create mode 100644 audio/hubert_loss.py diff --git a/audio/audio_input.py b/audio/audio_input.py index cd4b99a..d7a58af 100644 --- a/audio/audio_input.py +++ b/audio/audio_input.py @@ -56,9 +56,7 @@ def get_input(network_name, network, batch_size): "token_lengths" : max_text_length * torch.ones((batch_size,), device="cuda"), "mel_specgram": torch.rand(batch_size, n_mels, max_mel_specgram_length, device="cuda"), "mel_specgram_lengths" : max_mel_specgram_length * torch.ones((batch_size,), dtype=torch.int32, device="cuda")} - elif network_name in hubert_pretrain_models: - + elif network_name in speech_representation_models: inp = {"waveforms" : torch.rand(batch_size, FRAME_COUNT, device="cuda"), - "labels" : torch.randint(0, 100, (batch_size, FRAME_COUNT), dtype=torch.int32, device="cuda"), - "audio_lengths" : torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda")} - return inp + "labels" : torch.randint(0, 100, (batch_size, 2), dtype=torch.int32, device="cuda")} + return inp \ No newline at end of file diff --git a/audio/audio_loss.py b/audio/audio_loss.py index 468e41e..57e0b5c 100644 --- a/audio/audio_loss.py 
+++ b/audio/audio_loss.py @@ -9,7 +9,7 @@ def get_criterion(network_name): criterion = None - if "hubert_pretrain" in network_name: + if network_name in speech_representation_models: criterion = hubert_loss elif network_name in speech_recognition_models or network_name in acoustic_models: char_blank = "*" @@ -33,8 +33,7 @@ def calculate_loss(network_name, criterion, output): if criterion is None: target = torch.randn_like(output) return torch.nn.functional.mse_loss(output, target) - if "hubert_pretrain" in network_name: - print ("hubert", len(output), output[0].shape, output[1]) + if network_name in speech_representation_models: logit_m, logit_u, feature_penalty = output loss = criterion(logit_m, logit_u, feature_penalty) elif network_name in speech_recognition_models or network_name in acoustic_models: @@ -63,7 +62,14 @@ def calculate_loss(network_name, criterion, output): elif "tacotron2" in network_name: target = torch.randn_like(output) loss = criterion(output, target) - elif "hdemucs" in network_name or "squim" in network_name: + elif "hdemucs" in network_name or "subjective" in network_name: target = torch.randn_like(output) loss = criterion(output, target) + elif "objective" in network_name: + for index in range(len(output)): + target = torch.randn_like(output[index]) + if index == 0: + loss = criterion(output[index], target) + else: + loss += criterion(output[index], target) return loss \ No newline at end of file diff --git a/audio/audio_model.py b/audio/audio_model.py index 39b05f9..e13787b 100644 --- a/audio/audio_model.py +++ b/audio/audio_model.py @@ -48,8 +48,10 @@ } -hubert_pretrain_models = { +speech_representation_models = { "hubert_pretrain_base" : torchaudio.models.hubert_pretrain_base, + "hubert_pretrain_large" : torchaudio.models.hubert_pretrain_large, + "hubert_pretrain_xlarge" : torchaudio.models.hubert_pretrain_xlarge } def get_network_names(): @@ -58,7 +60,7 @@ def get_network_names(): list(source_separation_models.keys()) + 
list(speech_quality_models.keys()) + list(speech_synthesis_models.keys()) + - list(hubert_pretrain_models.keys())) + list(speech_representation_models.keys())) def get_network(network_name): @@ -94,8 +96,8 @@ def get_network(network_name): hop_length = HOP_LENGTH, n_freq = 128).to(device="cuda") else: return speech_synthesis_models[network_name]().to(device="cuda") - elif network_name in hubert_pretrain_models: - return hubert_pretrain_models[network_name]().to(device="cuda") + elif network_name in speech_representation_models: + return speech_representation_models[network_name]().to(device="cuda") else: print ("ERROR: not a supported model '%s'" % network_name) sys.exit(1) \ No newline at end of file diff --git a/audio/audio_output.py b/audio/audio_output.py index 90c08d3..86c22bf 100644 --- a/audio/audio_output.py +++ b/audio/audio_output.py @@ -6,8 +6,6 @@ def get_output_selection(network_name): return 0 elif "conformer" in network_name or "emformer" in network_name: return 0 - elif "objective" in network_name: - return 0 elif "tacotron2" in network_name: return 1 return None \ No newline at end of file diff --git a/audio/hubert_loss.py b/audio/hubert_loss.py new file mode 100644 index 0000000..1edaebd --- /dev/null +++ b/audio/hubert_loss.py @@ -0,0 +1,36 @@ +from typing import Optional + +import torch +import torch.nn.functional as F +from torch import Tensor + + +def hubert_loss( + logit_m: Optional[Tensor], + logit_u: Optional[Tensor], + feature_penalty: Tensor, + masked_weight: float = 1.0, + unmasked_weight: float = 0.0, + feature_weight: float = 10.0, + reduction: str = "sum", +) -> Tensor: + """Compute the cross-entropy loss on HuBERT masked and non-masked logits. + Args: + logit_m (Tensor or None): The masked logit Tensor of dimension `(masked_frames, final_dim)`. + logit_u (Tensor or None): The non-masked logit Tensor of dimension `(unmasked_frames, final_dim)`. + feature_penalty (Tensor): The feature mean value for additional penalty loss. 
+ masked_weight (float, optional): The weight for masked cross-entropy loss (Default: ``1.0``). + unmasked_weight (float, optional): The weight for non-masked cross-entropy loss (Default: ``0.0``). + feature_weight (float, optional): The weight for feature penalty loss (Default: ``10.0``). + reduction (str, optional): The reduction method for cross-entropy loss (Default: ``"sum"``). + """ + loss = feature_penalty * feature_weight * logit_m.shape[0] + if logit_m is not None: + target_m = torch.zeros(logit_m.shape[0], dtype=torch.long, device=logit_m.device) + loss_m = F.cross_entropy(logit_m, target_m, reduction=reduction) + loss += loss_m * masked_weight + if logit_u is not None: + target_u = torch.zeros(logit_u.shape[0], dtype=torch.long, device=logit_m.device) + loss_u = F.cross_entropy(logit_u, target_u, reduction=reduction) + loss += loss_u * unmasked_weight + return loss \ No newline at end of file From b60ceaa9977fc86713d56e0d0179e6641ac7c70a Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 3 Feb 2026 18:17:26 +0000 Subject: [PATCH 10/22] add recent changes in pytorch microbenchmarking to apex microbenchmarking --- micro_benchmarking_apex.py | 170 ++++++++++++++++++++++++++++++------- 1 file changed, 141 insertions(+), 29 deletions(-) diff --git a/micro_benchmarking_apex.py b/micro_benchmarking_apex.py index ae1a422..8f5f729 100644 --- a/micro_benchmarking_apex.py +++ b/micro_benchmarking_apex.py @@ -19,6 +19,8 @@ from shufflenet import shufflenet from shufflenet_v2 import shufflenet as shufflenet_v2 from xception import xception +import csv +import json try: import torch._dynamo @@ -34,7 +36,10 @@ # this indicates we're using torchrun is_torchrun = True - +def xform(m: nn.Module) -> nn.Module: + m = m.cuda() + m.to(memory_format=torch.channels_last) + return m def weight_init(m): if isinstance(m, nn.Conv2d): @@ -144,29 +149,42 @@ def weight_init(m): def get_network_names(): return sorted(list(models.keys()) + list(segmentation_models.keys())) -def 
get_network(net): +def get_network(net, params): # aux_logits=False only used by inception_v3 if "inception_v3" == net: + if params.nhwc: + return xform(models[net](aux_logits=False)) return models[net](aux_logits=False).to(device="cuda") elif net in models: + if params.nhwc: + return xform(models[net]()) return models[net]().to(device="cuda") elif net in segmentation_models: + if params.nhwc: + return xform(segmentation_models[net]()) return segmentation_models[net]().to(device="cuda") else: print ("ERROR: not a supported model '%s'" % net) sys.exit(1) -def forwardbackward(inp, optimizer, network, target, amp_opt_level, flops_prof_step=0): - optimizer.zero_grad() +def forwardbackward(inp, optimizer, network, params, target, step=0, opt_step=1, flops_prof_step=0): + if step % opt_step == 0: + optimizer.zero_grad() if flops_prof_step: prof = FlopsProfiler(network) prof.start_profile() out = network(inp) - # WIP: googlenet, deeplabv3_*, fcn_* missing log_softmax for this to work - loss = torch.nn.functional.cross_entropy(out, target) - # End profiler here if only to profile forward pass + # If using HuggingFace model outputs logits, we need to extract them + if hasattr(out, 'logits'): + logits = out.logits + else: + logits = out + loss_fn = torch.nn.CrossEntropyLoss().to(device="cuda") + if params.nhwc: + loss_fn = loss_fn.to(memory_format=torch.channels_last) + loss = loss_fn(logits, target) - if amp_opt_level: + if params.amp_opt_level: with apex.amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: @@ -179,7 +197,29 @@ def forwardbackward(inp, optimizer, network, target, amp_opt_level, flops_prof_s prof.print_model_profile(profile_step=flops_prof_step) prof.end_profile() - optimizer.step() + if (step + 1) % opt_step == 0: + optimizer.step() + +def forward(inp, optimizer, network, params, target, step=0, opt_step=1, flops_prof_step=0): + + if flops_prof_step: + prof = FlopsProfiler(network) + prof.start_profile() + out = network(inp) + # If 
using HuggingFace model outputs logits, we need to extract them + if hasattr(out, 'logits'): + logits = out.logits + else: + logits = out + + if flops_prof_step: + # End profiler here to profile both fwd and bwd passes + # flops = prof.get_total_flops(as_string=True) + # params = prof.get_total_params(as_string=True) + prof.print_model_profile(profile_step=flops_prof_step) + prof.end_profile() + + return logits def rendezvous(distributed_parameters): print("Initializing process group...") @@ -250,7 +290,7 @@ def run_benchmarking(local_rank, params): else: torch.cuda.set_device("cuda:0") - network = get_network(net) + network = get_network(net, params) if "shufflenet" == net: network.apply(weight_init) @@ -282,6 +322,17 @@ def run_benchmarking(local_rank, params): print ("ERROR: requested torch.compile but this isn't pytorch 2.x") sys.exit(1) + ## MLPerf Setting + sgd_opt_base_learning_rate = 0.01 + sgd_opt_end_learning_rate = 1e-4 + sgd_opt_learning_rate_decay_poly_power = 2 + sgd_opt_weight_decay = 0.0001 + sgd_opt_momentum = 0.9 + opt_learning_rate_warmup_epochs = 5 + + total_epochs = params.iterations + optimizer = torch.optim.SGD(param_copy, lr = sgd_opt_base_learning_rate, momentum = sgd_opt_momentum, weight_decay=sgd_opt_weight_decay) + if (run_fp16): network = FP16Model(network) @@ -316,6 +367,8 @@ def run_benchmarking(local_rank, params): inp = torch.randn(batch_size, 3, 224, 224, device="cuda") if (run_fp16): inp = inp.half() + if params.nhwc: + inp = inp.to(memory_format=torch.channels_last) if net in models: # number of classes is 1000 for imagenet target = torch.randint(0, 1000, (batch_size,), device="cuda") @@ -323,10 +376,17 @@ def run_benchmarking(local_rank, params): # number of classes is 21 for segmentation target = torch.randint(0, 21, (batch_size,), device="cuda") + if params.mode == "training": + forward_fn = forwardbackward + network.train() + else: + forward_fn = forward + network.eval() + ## warmup. 
print ("INFO: running forward and backward for warmup.") - forwardbackward(inp, optimizer, network, target, amp_opt_level) - forwardbackward(inp, optimizer, network, target, amp_opt_level) + for i in range(2): + forward_fn(inp, optimizer, network, params, target, step=0, opt_step=params.opt_step) time.sleep(1) torch.cuda.synchronize() @@ -339,13 +399,19 @@ def run_benchmarking(local_rank, params): skip_first = 0, wait = 1, warmup = 2, - active = 2, + active = 5, repeat = 1, ) def trace_ready_callback(prof): - print("----------- Trace Ready -----------") - prof.export_chrome_trace(f"trace{prof.step_num}.json") + rank = 0 + if torch.distributed.is_available() and torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + if rank == 0: + print("----------- Trace Ready -----------") + prof.export_chrome_trace(f"{params.profiler_output}.json") + # print(f"----------- Rank {rank} Trace Ready -----------") + # prof.export_chrome_trace(f"{params.profiler_output}_rank{rank}.json") tm = time.time() with profile( @@ -354,7 +420,7 @@ def trace_ready_callback(prof): on_trace_ready=trace_ready_callback) as prof: for i in range(iterations): with record_function(f"iteration {i}"): - forwardbackward(inp, optimizer, network, target, amp_opt_level) + forward_fn(inp, optimizer, network, params, target, step=i, opt_step=params.opt_step) prof.step() torch.cuda.synchronize() print(prof.key_averages().table(sort_by="cuda_time_total")) @@ -363,9 +429,10 @@ def trace_ready_callback(prof): with torch.autograd.profiler.emit_nvtx(enabled=autograd_profiler): for i in range(iterations): if i == flops_prof_step: - forwardbackward(inp, optimizer, network, target, amp_opt_level, i) + forward_fn(inp, optimizer, network, params, target, step=i, opt_step=params.opt_step, + flops_prof_step = i) else: - forwardbackward(inp, optimizer, network, target, amp_opt_level) + forward_fn(inp, optimizer, network, params, target, step=i, opt_step=params.opt_step) torch.cuda.synchronize() tm2 = 
time.time() @@ -386,27 +453,64 @@ def trace_ready_callback(prof): else: dtype = 'FP32' + result = None + if not params.output_dir: + params.output_dir = "." + print ("OK: finished running benchmark..") print ("--------------------SUMMARY--------------------------") print ("Microbenchmark for network : {}".format(net)) if distributed_dataparallel or is_torchrun: - print ("--------This process: rank " + str(distributed_parameters['rank']) + "--------"); - print ("Num devices: 1") + print ("--------This process: rank " + str(distributed_parameters['rank']) + "--------"); + print ("Num devices: 1") else: - print ("Num devices: {}".format(ngpus)) + print ("Num devices: {}".format(ngpus)) + result = { + "Name": params.output_file, + "GPUs": 1, + "Mini batch size [img]": batch_size, + "Mini batch size [img/gpu]": batch_size, + "Throughput [img/sec]": batch_size / time_per_batch, + "Time per mini-batch": time_per_batch + } + with open(f"{params.output_dir}/{params.output_file}.json", "w") as f: + json.dump(result, f, indent=2) + print ("Dtype: {}".format(dtype)) print ("Mini batch size [img] : {}".format(batch_size)) print ("Time per mini-batch : {}".format(time_per_batch)) print ("Throughput [img/sec] : {}".format(batch_size/time_per_batch)) if (distributed_dataparallel or is_torchrun) and distributed_parameters['rank'] == 0: - print ("") - print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------") - world_size = distributed_parameters['world_size'] - print ("Num devices: {}".format(world_size)) - print ("Dtype: {}".format(dtype)) - print ("Mini batch size [img] : {}".format(batch_size*world_size)) - print ("Time per mini-batch : {}".format(time_per_batch)) - print ("Throughput [img/sec] : {}".format(batch_size*world_size/time_per_batch)) + print ("") + print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------") + world_size = distributed_parameters['world_size'] + print ("Num devices: 
{}".format(world_size)) + print ("Dtype: {}".format(dtype)) + print ("Mini batch size [img] : {}".format(batch_size*world_size)) + print ("Time per mini-batch : {}".format(time_per_batch)) + print ("Throughput [img/sec] : {}".format(batch_size*world_size/time_per_batch)) + result = { + "Name": params.output_file, + "GPUs": distributed_parameters['world_size'], + "Mini batch size [img]": batch_size * distributed_parameters['world_size'], + "Mini batch size [img/gpu]": batch_size, + "Throughput [img/sec]": batch_size * distributed_parameters['world_size'] / time_per_batch, + "Time per mini-batch": time_per_batch + } + with open(f"{params.output_dir}/{params.output_file}.json", "w") as f: + json.dump(result, f, indent=2) + + csv_filename = f"{params.output_dir}/benchmark_summary.csv" + if params.csv_file: + csv_filename = params.csv_file + file_exists = os.path.isfile(csv_filename) + if result: + with open(csv_filename, "a", newline='') as csvfile: + writer = csv.writer(csvfile) + if not file_exists: + writer.writerow(result.keys()) + writer.writerow(result.values()) + print(f"Benchmark result saved to {csv_filename}") def main(): run_benchmarking_wrapper(copy.deepcopy(args)) @@ -432,6 +536,14 @@ def main(): parser.add_argument('--sync_bn', action='store_true', help='enabling apex sync BN.') parser.add_argument('--keep-batchnorm-fp32', type=str, default=None) parser.add_argument('--loss-scale', type=str, default=None) + parser.add_argument("--csv-file", type=str, default=None, required=False, help="assign output csv file name.") + parser.add_argument("--mode", type=str, choices=['training', 'inference'], default="training", help="Select mode: training or inference") + parser.add_argument("--nhwc", action='store_true', default=False, help="Use nhwc format") + parser.add_argument("--opt-step", type=int, required=False, default=1, help="Optimizer update step") + parser.add_argument("--output-dir", type=str, default="", help="assign output directory name.") + 
parser.add_argument("--output-file", type=str, default="", help="assign output file name.") + parser.add_argument("--profiler-output", type=str, default="", help="assign profiler output name.") + args = parser.parse_args() From cc26b2f0a4c57485053459ba2e90f0e322e8b726 Mon Sep 17 00:00:00 2001 From: skishore Date: Sun, 8 Feb 2026 21:34:50 +0000 Subject: [PATCH 11/22] add methods to calculate the target --- audio/audio_loss.py | 26 +++++++---------------- audio/audio_output.py | 41 ++++++++++++++++++++++++++++++++++++- micro_benchmarking_audio.py | 18 ++++++++-------- 3 files changed, 57 insertions(+), 28 deletions(-) diff --git a/audio/audio_loss.py b/audio/audio_loss.py index 57e0b5c..3a0d857 100644 --- a/audio/audio_loss.py +++ b/audio/audio_loss.py @@ -29,7 +29,7 @@ def get_criterion(network_name): return criterion -def calculate_loss(network_name, criterion, output): +def calculate_loss(network_name, criterion, output, target, batch_size): if criterion is None: target = torch.randn_like(output) return torch.nn.functional.mse_loss(output, target) @@ -39,37 +39,25 @@ def calculate_loss(network_name, criterion, output): elif network_name in speech_recognition_models or network_name in acoustic_models: output = output.transpose(-1, -2).transpose(0, 1) T, N, C = output.shape - target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.long) - target = torch.randint( - low=1, - high=C, - size=(sum(target_lengths),), - dtype=torch.long, - ) + target, target_lengths = target tensors_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long) loss = criterion(output, target, tensors_lengths, target_lengths) elif "wavernn" in network_name: - target = torch.randn_like(output) - output, target = output.squeeze(1), target.squeeze(1) + output = output.squeeze(1) output = output.transpose(1, 2) - target = target.transpose(1, 2) loss = criterion(output, target) elif "conv_tasnet" in network_name: - batch, _, time = output.shape - mask = torch.randint(low=0, 
high=1, size=(batch,1,time), dtype=torch.long).cuda() - target = torch.randn_like(output) + target, mask = target loss = criterion(output, target, mask) elif "tacotron2" in network_name: - target = torch.randn_like(output) loss = criterion(output, target) elif "hdemucs" in network_name or "subjective" in network_name: - target = torch.randn_like(output) loss = criterion(output, target) elif "objective" in network_name: + loss = 0 for index in range(len(output)): - target = torch.randn_like(output[index]) if index == 0: - loss = criterion(output[index], target) + loss = criterion(output[index], target[index]) else: - loss += criterion(output[index], target) + loss += criterion(output[index], target[index]) return loss \ No newline at end of file diff --git a/audio/audio_output.py b/audio/audio_output.py index 86c22bf..6110bcf 100644 --- a/audio/audio_output.py +++ b/audio/audio_output.py @@ -8,4 +8,43 @@ def get_output_selection(network_name): return 0 elif "tacotron2" in network_name: return 1 - return None \ No newline at end of file + return None + +def create_target(network_name, network, input, batch_size): + + #get output + output = network(**input) + output_index = get_output_selection(network_name) + if output_index is not None: + output = output[output_index] + + target = None + if network_name in speech_recognition_models or network_name in acoustic_models: + output = output.transpose(-1, -2).transpose(0, 1) + T, N, C = output.shape + target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.long) + target = torch.randint( + low=1, + high=C, + size=(sum(target_lengths),), + dtype=torch.long, + ) + target = [target, target_lengths] + elif "wavernn" in network_name: + target = torch.randn_like(output) + target = target.squeeze(1) + target = target.transpose(1, 2) + elif "conv_tasnet" in network_name: + batch, _, time = output.shape + mask = torch.randint(low=0, high=1, size=(batch,1,time), dtype=torch.long).cuda() + target = 
torch.randn_like(output) + target = [target, mask] + elif "tacotron2" in network_name: + target = torch.randn_like(output) + elif "hdemucs" in network_name or "subjective" in network_name: + target = torch.randn_like(output) + elif "objective" in network_name: + target = [] + for index in range(len(output)): + target.append(torch.randn_like(output[index])) + return target \ No newline at end of file diff --git a/micro_benchmarking_audio.py b/micro_benchmarking_audio.py index ac07b95..ba58bc1 100644 --- a/micro_benchmarking_audio.py +++ b/micro_benchmarking_audio.py @@ -13,7 +13,7 @@ from audio.audio_model import get_network_names, get_network from audio.audio_loss import get_criterion, calculate_loss from audio.audio_input import get_input_type, get_input -from audio.audio_output import get_output_selection +from audio.audio_output import get_output_selection, create_target try: @@ -48,7 +48,7 @@ def weight_init(m): m.bias.data.zero_() -def forwardbackward(inp, optimizer, network, amp_opt_level, network_name, batch_size, criterion, flops_prof_step=0): +def forwardbackward(inp, optimizer, network, amp_opt_level, network_name, batch_size, criterion, target, flops_prof_step=0): optimizer.zero_grad() if flops_prof_step: prof = FlopsProfiler(network) @@ -59,7 +59,7 @@ def forwardbackward(inp, optimizer, network, amp_opt_level, network_name, batch_ if output_index is not None: out = out[output_index] - loss = calculate_loss(network_name, criterion, out) + loss = calculate_loss(network_name, criterion, out, target, batch_size) # End profiler here if only to profile forward pass @@ -206,11 +206,13 @@ def run_benchmarking(local_rank, params): if (run_fp16): inp = inp.half() + + target = create_target(net, network, inp, batch_size) ## warmup. 
print ("INFO: running forward and backward for warmup.") - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion) - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion) + for i in range(2): + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target) time.sleep(1) torch.cuda.synchronize() @@ -238,7 +240,7 @@ def trace_ready_callback(prof): on_trace_ready=trace_ready_callback) as prof: for i in range(iterations): with record_function(f"iteration {i}"): - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target) prof.step() torch.cuda.synchronize() print(prof.key_averages().table(sort_by="cuda_time_total")) @@ -247,9 +249,9 @@ def trace_ready_callback(prof): with torch.autograd.profiler.emit_nvtx(enabled=autograd_profiler): for i in range(iterations): if i == flops_prof_step: - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, i) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target, i) else: - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion) + forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target) torch.cuda.synchronize() tm2 = time.time() From 3724039242fcb65e5d908beb33c953b3add3b724 Mon Sep 17 00:00:00 2001 From: skishore Date: Sun, 8 Feb 2026 22:31:31 +0000 Subject: [PATCH 12/22] add loss function for squim objective --- audio/audio_loss.py | 4 +++- audio/audio_output.py | 1 + micro_benchmarking_audio.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/audio/audio_loss.py b/audio/audio_loss.py index 3a0d857..6464544 100644 --- a/audio/audio_loss.py +++ b/audio/audio_loss.py @@ -29,7 +29,7 @@ def get_criterion(network_name): return criterion -def calculate_loss(network_name, criterion, output, 
target, batch_size): +def calculate_loss(network_name, criterion, output, target, batch_size, input): if criterion is None: target = torch.randn_like(output) return torch.nn.functional.mse_loss(output, target) @@ -55,9 +55,11 @@ def calculate_loss(network_name, criterion, output, target, batch_size): loss = criterion(output, target) elif "objective" in network_name: loss = 0 + weights = [1, 2, 0.5, 2] for index in range(len(output)): if index == 0: loss = criterion(output[index], target[index]) else: loss += criterion(output[index], target[index]) + loss += criterion(input["x"], target[3]) return loss \ No newline at end of file diff --git a/audio/audio_output.py b/audio/audio_output.py index 6110bcf..cf9c03b 100644 --- a/audio/audio_output.py +++ b/audio/audio_output.py @@ -47,4 +47,5 @@ def create_target(network_name, network, input, batch_size): target = [] for index in range(len(output)): target.append(torch.randn_like(output[index])) + target.append(torch.randn_like(input["x"])) return target \ No newline at end of file diff --git a/micro_benchmarking_audio.py b/micro_benchmarking_audio.py index ba58bc1..ed7509f 100644 --- a/micro_benchmarking_audio.py +++ b/micro_benchmarking_audio.py @@ -59,7 +59,7 @@ def forwardbackward(inp, optimizer, network, amp_opt_level, network_name, batch_ if output_index is not None: out = out[output_index] - loss = calculate_loss(network_name, criterion, out, target, batch_size) + loss = calculate_loss(network_name, criterion, out, target, batch_size, inp) # End profiler here if only to profile forward pass From 714a1421a4c5cb3f62bd456b654e98cfb8bd2a9e Mon Sep 17 00:00:00 2001 From: skishore Date: Mon, 9 Feb 2026 08:36:28 +0000 Subject: [PATCH 13/22] correct usage of squim subjective model --- audio/audio_loss.py | 2 ++ audio/audio_output.py | 1 + 2 files changed, 3 insertions(+) diff --git a/audio/audio_loss.py b/audio/audio_loss.py index 6464544..fdc3ce4 100644 --- a/audio/audio_loss.py +++ b/audio/audio_loss.py @@ -26,6 +26,8 @@ 
def get_criterion(network_name): criterion = nn.MSELoss() elif "hdemucs" in network_name or "squim" in network_name: criterion = nn.L1Loss() + elif "subjective" in network_name: + criterion = nn.L1Loss() return criterion diff --git a/audio/audio_output.py b/audio/audio_output.py index cf9c03b..a4fcf8d 100644 --- a/audio/audio_output.py +++ b/audio/audio_output.py @@ -15,6 +15,7 @@ def create_target(network_name, network, input, batch_size): #get output output = network(**input) output_index = get_output_selection(network_name) + print("output", output.shape) if output_index is not None: output = output[output_index] From ad68179d4983092ba44f1bf597419e2a876ee6e4 Mon Sep 17 00:00:00 2001 From: skishore Date: Mon, 9 Feb 2026 10:08:59 +0000 Subject: [PATCH 14/22] refactor audio loss code to combine conditions --- audio/audio_loss.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/audio/audio_loss.py b/audio/audio_loss.py index fdc3ce4..9b3fdf6 100644 --- a/audio/audio_loss.py +++ b/audio/audio_loss.py @@ -24,17 +24,12 @@ def get_criterion(network_name): criterion = si_sdr_loss elif "tacotron2" in network_name: criterion = nn.MSELoss() - elif "hdemucs" in network_name or "squim" in network_name: - criterion = nn.L1Loss() - elif "subjective" in network_name: + elif "hdemucs" in network_name or "squim" in network_name or "subjective" in network_name: criterion = nn.L1Loss() return criterion def calculate_loss(network_name, criterion, output, target, batch_size, input): - if criterion is None: - target = torch.randn_like(output) - return torch.nn.functional.mse_loss(output, target) if network_name in speech_representation_models: logit_m, logit_u, feature_penalty = output loss = criterion(logit_m, logit_u, feature_penalty) @@ -51,9 +46,7 @@ def calculate_loss(network_name, criterion, output, target, batch_size, input): elif "conv_tasnet" in network_name: target, mask = target loss = criterion(output, target, mask) - elif "tacotron2" in 
network_name: - loss = criterion(output, target) - elif "hdemucs" in network_name or "subjective" in network_name: + elif "tacotron2" in network_name or "hdemucs" in network_name or "subjective" in network_name: loss = criterion(output, target) elif "objective" in network_name: loss = 0 From d5b520cb17d99e1caa88a2a03a38935a4169656c Mon Sep 17 00:00:00 2001 From: skishore Date: Mon, 9 Feb 2026 10:21:35 +0000 Subject: [PATCH 15/22] refactor audio loss code to combine conditions --- audio/audio_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/audio/audio_loss.py b/audio/audio_loss.py index 9b3fdf6..4d0cd89 100644 --- a/audio/audio_loss.py +++ b/audio/audio_loss.py @@ -24,7 +24,7 @@ def get_criterion(network_name): criterion = si_sdr_loss elif "tacotron2" in network_name: criterion = nn.MSELoss() - elif "hdemucs" in network_name or "squim" in network_name or "subjective" in network_name: + elif "hdemucs" in network_name or "squim" in network_name: criterion = nn.L1Loss() return criterion From 870711b8b8d1293d9bd51430c25b81c691d4c192 Mon Sep 17 00:00:00 2001 From: skishore Date: Mon, 9 Feb 2026 10:26:23 +0000 Subject: [PATCH 16/22] add error messages for undefined input, target, model, loss function for undefined network names --- audio/audio_input.py | 3 +++ audio/audio_loss.py | 7 +++++++ audio/audio_output.py | 3 +++ 3 files changed, 13 insertions(+) diff --git a/audio/audio_input.py b/audio/audio_input.py index d7a58af..22e1288 100644 --- a/audio/audio_input.py +++ b/audio/audio_input.py @@ -59,4 +59,7 @@ def get_input(network_name, network, batch_size): elif network_name in speech_representation_models: inp = {"waveforms" : torch.rand(batch_size, FRAME_COUNT, device="cuda"), "labels" : torch.randint(0, 100, (batch_size, 2), dtype=torch.int32, device="cuda")} + else: + print (f"Input for {network_name} not defined") + sys.exit(1) return inp \ No newline at end of file diff --git a/audio/audio_loss.py b/audio/audio_loss.py index 
4d0cd89..a70a151 100644 --- a/audio/audio_loss.py +++ b/audio/audio_loss.py @@ -26,10 +26,14 @@ def get_criterion(network_name): criterion = nn.MSELoss() elif "hdemucs" in network_name or "squim" in network_name: criterion = nn.L1Loss() + else: + print (f"Criterion for network name {network_name} not defined") + sys.exit(1) return criterion def calculate_loss(network_name, criterion, output, target, batch_size, input): + loss = 0 if network_name in speech_representation_models: logit_m, logit_u, feature_penalty = output loss = criterion(logit_m, logit_u, feature_penalty) @@ -57,4 +61,7 @@ def calculate_loss(network_name, criterion, output, target, batch_size, input): else: loss += criterion(output[index], target[index]) loss += criterion(input["x"], target[3]) + else: + print (f"Loss function for {network_name} not defined") + sys.exit(1) return loss \ No newline at end of file diff --git a/audio/audio_output.py b/audio/audio_output.py index a4fcf8d..fffd546 100644 --- a/audio/audio_output.py +++ b/audio/audio_output.py @@ -49,4 +49,7 @@ def create_target(network_name, network, input, batch_size): for index in range(len(output)): target.append(torch.randn_like(output[index])) target.append(torch.randn_like(input["x"])) + else: + print (f"Target for {network_name} not defined") + sys.exit(1) return target \ No newline at end of file From 007e0cfd30770d99d017d11d1ba962873bdb47bd Mon Sep 17 00:00:00 2001 From: Sriram Kumar Date: Mon, 9 Feb 2026 04:41:26 -0600 Subject: [PATCH 17/22] fix error related to sdr loss --- audio/audio_output.py | 2 +- audio/sdr.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/audio/audio_output.py b/audio/audio_output.py index fffd546..db29faa 100644 --- a/audio/audio_output.py +++ b/audio/audio_output.py @@ -15,7 +15,6 @@ def create_target(network_name, network, input, batch_size): #get output output = network(**input) output_index = get_output_selection(network_name) - print("output", output.shape) if output_index is 
not None: output = output[output_index] @@ -43,6 +42,7 @@ def create_target(network_name, network, input, batch_size): elif "tacotron2" in network_name: target = torch.randn_like(output) elif "hdemucs" in network_name or "subjective" in network_name: + print("output", output.shape) target = torch.randn_like(output) elif "objective" in network_name: target = [] diff --git a/audio/sdr.py b/audio/sdr.py index 011bb56..4053b6c 100644 --- a/audio/sdr.py +++ b/audio/sdr.py @@ -214,5 +214,5 @@ def si_sdr_loss(estimate: torch.Tensor, reference: torch.Tensor, mask: torch.Ten estimate = estimate - estimate.mean(axis=2, keepdim=True) reference = reference - reference.mean(axis=2, keepdim=True) - si_sdri = sdr.sdr_pit(estimate, reference, mask=mask) + si_sdri = sdr_pit(estimate, reference, mask=mask) return -si_sdri.mean() \ No newline at end of file From 768bf9c704170d311c4622317582397564064122 Mon Sep 17 00:00:00 2001 From: Sriram Kumar Date: Mon, 9 Feb 2026 07:07:30 -0600 Subject: [PATCH 18/22] fix error related to hdmucs loss --- audio/audio_loss.py | 10 ++++++++-- audio/audio_output.py | 1 - 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/audio/audio_loss.py b/audio/audio_loss.py index a70a151..817437a 100644 --- a/audio/audio_loss.py +++ b/audio/audio_loss.py @@ -24,7 +24,9 @@ def get_criterion(network_name): criterion = si_sdr_loss elif "tacotron2" in network_name: criterion = nn.MSELoss() - elif "hdemucs" in network_name or "squim" in network_name: + elif "hdemucs" in network_name: + criterion = nn.L1Loss(reduction='none') + elif "squim" in network_name: criterion = nn.L1Loss() else: print (f"Criterion for network name {network_name} not defined") @@ -50,7 +52,7 @@ def calculate_loss(network_name, criterion, output, target, batch_size, input): elif "conv_tasnet" in network_name: target, mask = target loss = criterion(output, target, mask) - elif "tacotron2" in network_name or "hdemucs" in network_name or "subjective" in network_name: + elif "tacotron2" in 
network_name or "subjective" in network_name: loss = criterion(output, target) elif "objective" in network_name: loss = 0 @@ -61,6 +63,10 @@ def calculate_loss(network_name, criterion, output, target, batch_size, input): else: loss += criterion(output[index], target[index]) loss += criterion(input["x"], target[3]) + elif "hdemucs" in network_name: + dims = tuple(range(2, target.dim())) + loss = criterion(output, target) + loss = loss.mean(dims).mean(0) else: print (f"Loss function for {network_name} not defined") sys.exit(1) diff --git a/audio/audio_output.py b/audio/audio_output.py index db29faa..c5c5cb6 100644 --- a/audio/audio_output.py +++ b/audio/audio_output.py @@ -42,7 +42,6 @@ def create_target(network_name, network, input, batch_size): elif "tacotron2" in network_name: target = torch.randn_like(output) elif "hdemucs" in network_name or "subjective" in network_name: - print("output", output.shape) target = torch.randn_like(output) elif "objective" in network_name: target = [] From 8696de9c75aa0b3c3d30e635d446a6346f56218a Mon Sep 17 00:00:00 2001 From: Sriram Kumar Date: Mon, 9 Feb 2026 08:21:09 -0600 Subject: [PATCH 19/22] apply some recent pytorch changes to torchaudio benchmark --- micro_benchmarking_audio.py | 114 ++++++++++++++++++++++++++++-------- 1 file changed, 91 insertions(+), 23 deletions(-) diff --git a/micro_benchmarking_audio.py b/micro_benchmarking_audio.py index ed7509f..fc74d3c 100644 --- a/micro_benchmarking_audio.py +++ b/micro_benchmarking_audio.py @@ -14,7 +14,8 @@ from audio.audio_loss import get_criterion, calculate_loss from audio.audio_input import get_input_type, get_input from audio.audio_output import get_output_selection, create_target - +import csv +import json try: import torch._dynamo @@ -48,8 +49,9 @@ def weight_init(m): m.bias.data.zero_() -def forwardbackward(inp, optimizer, network, amp_opt_level, network_name, batch_size, criterion, target, flops_prof_step=0): - optimizer.zero_grad() +def forwardbackward(inp, optimizer, 
network, amp_opt_level, network_name, batch_size, criterion, target, step=0, opt_step=1, flops_prof_step=0): + if step % opt_step == 0: + optimizer.zero_grad() if flops_prof_step: prof = FlopsProfiler(network) prof.start_profile() @@ -73,7 +75,8 @@ def forwardbackward(inp, optimizer, network, amp_opt_level, network_name, batch_ prof.print_model_profile(profile_step=flops_prof_step) prof.end_profile() - optimizer.step() + if (step + 1) % opt_step == 0: + optimizer.step() def rendezvous(distributed_parameters): print("Initializing process group...") @@ -183,7 +186,17 @@ def run_benchmarking(local_rank, params): param_copy = network.parameters() if (run_fp16): param_copy = get_param_copy(network) - optimizer = torch.optim.SGD(param_copy, lr = 0.01, momentum = 0.9) + + ## MLPerf Setting + sgd_opt_base_learning_rate = 0.01 + sgd_opt_end_learning_rate = 1e-4 + sgd_opt_learning_rate_decay_poly_power = 2 + sgd_opt_weight_decay = 0.0001 + sgd_opt_momentum = 0.9 + opt_learning_rate_warmup_epochs = 5 + + total_epochs = params.iterations + optimizer = torch.optim.SGD(param_copy, lr = sgd_opt_base_learning_rate, momentum = sgd_opt_momentum, weight_decay=sgd_opt_weight_decay) if (amp_opt_level): network, optimizer = apex.amp.initialize(network, optimizer, opt_level="O%d"%amp_opt_level) @@ -208,11 +221,18 @@ def run_benchmarking(local_rank, params): inp = inp.half() target = create_target(net, network, inp, batch_size) + + if params.mode == "training": + forward_fn = forwardbackward + network.train() + else: + forward_fn = forward + network.eval() ## warmup. 
print ("INFO: running forward and backward for warmup.") for i in range(2): - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target) + forward_fn(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target, step=0, opt_step=params.opt_step) time.sleep(1) torch.cuda.synchronize() @@ -225,13 +245,19 @@ def run_benchmarking(local_rank, params): skip_first = 0, wait = 1, warmup = 2, - active = 2, + active = 5, repeat = 1, ) def trace_ready_callback(prof): - print("----------- Trace Ready -----------") - prof.export_chrome_trace(f"trace{prof.step_num}.json") + rank = 0 + if torch.distributed.is_available() and torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + if rank == 0: + print("----------- Trace Ready -----------") + prof.export_chrome_trace(f"{params.profiler_output}.json") + # print(f"----------- Rank {rank} Trace Ready -----------") + # prof.export_chrome_trace(f"{params.profiler_output}_rank{rank}.json") tm = time.time() with profile( @@ -240,7 +266,7 @@ def trace_ready_callback(prof): on_trace_ready=trace_ready_callback) as prof: for i in range(iterations): with record_function(f"iteration {i}"): - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target) + forward_fn(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target, step=i, opt_step=params.opt_step) prof.step() torch.cuda.synchronize() print(prof.key_averages().table(sort_by="cuda_time_total")) @@ -249,9 +275,9 @@ def trace_ready_callback(prof): with torch.autograd.profiler.emit_nvtx(enabled=autograd_profiler): for i in range(iterations): if i == flops_prof_step: - forwardbackward(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target, i) + forward_fn(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target, step=i, opt_step=params.opt_step, flops_prof_step=i) else: - forwardbackward(inp, optimizer, network, amp_opt_level, net, 
batch_size, criterion, target) + forward_fn(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target, step=i, opt_step=params.opt_step) torch.cuda.synchronize() tm2 = time.time() @@ -272,27 +298,63 @@ def trace_ready_callback(prof): else: dtype = 'FP32' + result = None + if not params.output_dir: + params.output_dir = "." + print ("OK: finished running benchmark..") print ("--------------------SUMMARY--------------------------") print ("Microbenchmark for network : {}".format(net)) if distributed_dataparallel or is_torchrun: - print ("--------This process: rank " + str(distributed_parameters['rank']) + "--------"); - print ("Num devices: 1") + print ("--------This process: rank " + str(distributed_parameters['rank']) + "--------"); + print ("Num devices: 1") else: - print ("Num devices: {}".format(ngpus)) + print ("Num devices: {}".format(ngpus)) + result = { + "Name": params.output_file, + "GPUs": 1, + "Mini batch size [img]": batch_size, + "Mini batch size [img/gpu]": batch_size, + "Throughput [img/sec]": batch_size / time_per_batch, + "Time per mini-batch": time_per_batch + } + with open(f"{params.output_dir}/{params.output_file}.json", "w") as f: + json.dump(result, f, indent=2) print ("Dtype: {}".format(dtype)) print ("Mini batch size [", get_input_type(net), "] : {}".format(batch_size)) print ("Time per mini-batch : {}".format(time_per_batch)) print ("Throughput [", get_input_type(net), "/sec] : {}".format(batch_size/time_per_batch)) if (distributed_dataparallel or is_torchrun) and distributed_parameters['rank'] == 0: - print ("") - print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------") - world_size = distributed_parameters['world_size'] - print ("Num devices: {}".format(world_size)) - print ("Dtype: {}".format(dtype)) - print ("Mini batch size [", get_input_type(net), "] : {}".format(batch_size*world_size)) - print ("Time per mini-batch : {}".format(time_per_batch)) - print ("Throughput [", 
get_input_type(net), "/sec] : {}".format(batch_size*world_size/time_per_batch)) + print ("") + print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------") + world_size = distributed_parameters['world_size'] + print ("Num devices: {}".format(world_size)) + print ("Dtype: {}".format(dtype)) + print ("Mini batch size [", get_input_type(net), "] : {}".format(batch_size*world_size)) + print ("Time per mini-batch : {}".format(time_per_batch)) + print ("Throughput [", get_input_type(net), "/sec] : {}".format(batch_size*world_size/time_per_batch)) + result = { + "Name": params.output_file, + "GPUs": distributed_parameters['world_size'], + "Mini batch size [img]": batch_size * distributed_parameters['world_size'], + "Mini batch size [img/gpu]": batch_size, + "Throughput [img/sec]": batch_size * distributed_parameters['world_size'] / time_per_batch, + "Time per mini-batch": time_per_batch + } + with open(f"{params.output_dir}/{params.output_file}.json", "w") as f: + json.dump(result, f, indent=2) + + csv_filename = f"{params.output_dir}/benchmark_summary.csv" + if params.csv_file: + csv_filename = params.csv_file + file_exists = os.path.isfile(csv_filename) + if result: + with open(csv_filename, "a", newline='') as csvfile: + writer = csv.writer(csvfile) + if not file_exists: + writer.writerow(result.keys()) + writer.writerow(result.values()) + print(f"Benchmark result saved to {csv_filename}") def main(): run_benchmarking_wrapper(copy.deepcopy(args)) @@ -315,6 +377,12 @@ def main(): parser.add_argument("--dist-url", type=str, required=False, default=None, help="url used for rendezvous of processes in distributed training. Needs to contain IP and open port of master rank0 eg. 'tcp://172.23.2.1:54321'. 
Required for --distributed_dataparallel") parser.add_argument("--compile", action='store_true', required=False, help="use pytorch 2.0") parser.add_argument("--compileContext", default={}, required=False, help="additional compile options") + parser.add_argument("--csv-file", type=str, default=None, required=False, help="assign output csv file name.") + parser.add_argument("--mode", type=str, choices=['training', 'inference'], default="training", help="Select mode: training or inference") + parser.add_argument("--opt-step", type=int, required=False, default=1, help="Optimizer update step") + parser.add_argument("--output-dir", type=str, default="", help="assign output directory name.") + parser.add_argument("--output-file", type=str, default="", help="assign output file name.") + parser.add_argument("--profiler-output", type=str, default="", help="assign profiler output name.") args = parser.parse_args() From 2f02eb4818d45d5d8697910cb207cea523e6d1c7 Mon Sep 17 00:00:00 2001 From: Sriram Kumar Date: Mon, 9 Feb 2026 08:32:53 -0600 Subject: [PATCH 20/22] replace apex amp with torch amp --- micro_benchmarking_audio.py | 101 ++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 45 deletions(-) diff --git a/micro_benchmarking_audio.py b/micro_benchmarking_audio.py index fc74d3c..efb5915 100644 --- a/micro_benchmarking_audio.py +++ b/micro_benchmarking_audio.py @@ -16,6 +16,7 @@ from audio.audio_output import get_output_selection, create_target import csv import json +from torch.amp import autocast, GradScaler try: import torch._dynamo @@ -31,12 +32,6 @@ # this indicates we're using torchrun is_torchrun = True -try: - import apex - HAVE_APEX = True -except: - HAVE_APEX = False - def weight_init(m): if isinstance(m, nn.Conv2d): @@ -49,34 +44,68 @@ def weight_init(m): m.bias.data.zero_() -def forwardbackward(inp, optimizer, network, amp_opt_level, network_name, batch_size, criterion, target, step=0, opt_step=1, flops_prof_step=0): +def forwardbackward(inp, 
optimizer, network, params, network_name, batch_size, criterion, target, step=0, opt_step=1, flops_prof_step=0): if step % opt_step == 0: optimizer.zero_grad() if flops_prof_step: prof = FlopsProfiler(network) prof.start_profile() - out = network(**inp) - output_index = get_output_selection(network_name) - if output_index is not None: - out = out[output_index] - - loss = calculate_loss(network_name, criterion, out, target, batch_size, inp) + if params.amp: + with autocast('cuda'): + out = network(**inp) + output_index = get_output_selection(network_name) + if output_index is not None: + out = out[output_index] + + loss = calculate_loss(network_name, criterion, out, target, batch_size, inp) + scaler.scale(loss).backward() + if (step + 1) % opt_step == 0: + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad() + else: + out = network(**inp) + output_index = get_output_selection(network_name) + if output_index is not None: + out = out[output_index] + loss = calculate_loss(network_name, criterion, out, target, batch_size, inp) + loss.backward() + if (step + 1) % opt_step == 0: + optimizer.step() + optimizer.zero_grad() # End profiler here if only to profile forward pass - if amp_opt_level: - with apex.amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() + if flops_prof_step: + prof.print_model_profile(profile_step=flops_prof_step) + prof.end_profile() + +def forward(inp, optimizer, network, params, network_name, batch_size, criterion, target, step=0, opt_step=1, flops_prof_step=0): + if step % opt_step == 0: + optimizer.zero_grad() + if flops_prof_step: + prof = FlopsProfiler(network) + prof.start_profile() + + if params.amp: + with autocast('cuda'): + out = network(**inp) + output_index = get_output_selection(network_name) + if output_index is not None: + out = out[output_index] else: - loss.backward() + out = network(**inp) + output_index = get_output_selection(network_name) + if output_index is not None: + out = out[output_index] + + # 
End profiler here if only to profile forward pass if flops_prof_step: prof.print_model_profile(profile_step=flops_prof_step) prof.end_profile() - if (step + 1) % opt_step == 0: - optimizer.step() def rendezvous(distributed_parameters): print("Initializing process group...") @@ -130,7 +159,7 @@ def run_benchmarking(local_rank, params): ngpus = params.ngpus net = params.network run_fp16 = params.fp16 - amp_opt_level = params.amp_opt_level + run_amp = params.amp distributed_dataparallel = params.distributed_dataparallel distributed_parameters = params.distributed_parameters batch_size = params.batch_size @@ -198,9 +227,6 @@ def run_benchmarking(local_rank, params): total_epochs = params.iterations optimizer = torch.optim.SGD(param_copy, lr = sgd_opt_base_learning_rate, momentum = sgd_opt_momentum, weight_decay=sgd_opt_weight_decay) - if (amp_opt_level): - network, optimizer = apex.amp.initialize(network, optimizer, opt_level="O%d"%amp_opt_level) - if is_torchrun: rendezvous(distributed_parameters) devices_to_run_on = [local_rank] @@ -232,7 +258,7 @@ def run_benchmarking(local_rank, params): ## warmup. 
print ("INFO: running forward and backward for warmup.") for i in range(2): - forward_fn(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target, step=0, opt_step=params.opt_step) + forward_fn(inp, optimizer, network, params, net, batch_size, criterion, target, step=0, opt_step=params.opt_step) time.sleep(1) torch.cuda.synchronize() @@ -266,7 +292,7 @@ def trace_ready_callback(prof): on_trace_ready=trace_ready_callback) as prof: for i in range(iterations): with record_function(f"iteration {i}"): - forward_fn(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target, step=i, opt_step=params.opt_step) + forward_fn(inp, optimizer, network, params, net, batch_size, criterion, target, step=i, opt_step=params.opt_step) prof.step() torch.cuda.synchronize() print(prof.key_averages().table(sort_by="cuda_time_total")) @@ -275,9 +301,9 @@ def trace_ready_callback(prof): with torch.autograd.profiler.emit_nvtx(enabled=autograd_profiler): for i in range(iterations): if i == flops_prof_step: - forward_fn(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target, step=i, opt_step=params.opt_step, flops_prof_step=i) + forward_fn(inp, optimizer, network, params, net, batch_size, criterion, target, step=i, opt_step=params.opt_step, flops_prof_step=i) else: - forward_fn(inp, optimizer, network, amp_opt_level, net, batch_size, criterion, target, step=i, opt_step=params.opt_step) + forward_fn(inp, optimizer, network, params, net, batch_size, criterion, target, step=i, opt_step=params.opt_step) torch.cuda.synchronize() tm2 = time.time() @@ -285,16 +311,8 @@ def trace_ready_callback(prof): if run_fp16: dtype = 'FP16' - elif amp_opt_level == 1: - dtype = 'AMP-O1: Insert automatic FP16 casts around safe Pytorch functions and Tensor methods.' - elif amp_opt_level == 2: - dtype = 'AMP-O2: FP16 training with FP32 batchnorm and FP32 master weights.' - elif amp_opt_level == 3: - dtype = 'AMP-O3: Pure FP16 training.' 
- elif amp_opt_level == 4: - dtype = 'AMP-O4: Insert automatic BFLOAT16 casts around safe Pytorch functions and Tensor methods.' - elif amp_opt_level == 5: - dtype = 'AMP-O5: BFLOAT16 training with FP32 batchnorm and FP32 master weights.' + elif run_amp: + dtype = 'AMP: PyTorch Native Automatic Mixed Precision' else: dtype = 'FP32' @@ -368,7 +386,6 @@ def main(): parser.add_argument("--kineto", action='store_true', required=False, help="Turn kineto profiling on") parser.add_argument("--autograd_profiler", action='store_true', required=False, help="Use PyTorch autograd (old) profiler") parser.add_argument("--fp16", type=int, required=False, default=0,help="FP16 mixed precision benchmarking") - parser.add_argument("--amp-opt-level", type=int, required=False, default=0,help="apex.amp mixed precision benchmarking opt level") parser.add_argument("--distributed_dataparallel", action='store_true', required=False, help="Use torch.nn.parallel.DistributedDataParallel api to run on multiple processes/nodes. The multiple processes need to be launched manually, this script will only launch ONE process per invocation. Either use --distributed_dataparallel and manually launch multiple processes or launch this script with `torchrun`") parser.add_argument("--device_ids", type=str, required=False, default=None, help="Comma-separated list (no spaces) to specify which HIP devices (0-indexed) to run distributedDataParallel api on. Might need to use HIP_VISIBLE_DEVICES to limit visiblity of devices to different processes.") parser.add_argument("--rank", type=int, required=False, default=None, help="Rank of this process. Required for --distributed_dataparallel") @@ -377,6 +394,7 @@ def main(): parser.add_argument("--dist-url", type=str, required=False, default=None, help="url used for rendezvous of processes in distributed training. Needs to contain IP and open port of master rank0 eg. 'tcp://172.23.2.1:54321'. 
Required for --distributed_dataparallel") parser.add_argument("--compile", action='store_true', required=False, help="use pytorch 2.0") parser.add_argument("--compileContext", default={}, required=False, help="additional compile options") + parser.add_argument("--amp", action='store_true', default=False, required=False, help="Automatic mixed precision benchmarking") parser.add_argument("--csv-file", type=str, default=None, required=False, help="assign output csv file name.") parser.add_argument("--mode", type=str, choices=['training', 'inference'], default="training", help="Select mode: training or inference") parser.add_argument("--opt-step", type=int, required=False, default=1, help="Optimizer update step") @@ -393,11 +411,4 @@ def main(): print("ERROR: You must install (or copy) deepspeed.profiling to use --flops-prof-step") sys.exit(1) - if args.fp16 and args.amp_opt_level: - print ("ERROR: Cannot use both --fp16 and --amp-opt-level") - sys.exit(1) - if args.amp_opt_level and not HAVE_APEX: - print ("ERROR: You must install apex to use --amp-opt-level") - sys.exit(1) - main() \ No newline at end of file From a8ee74deac418b5cfbdf26395c407ede7330c1a5 Mon Sep 17 00:00:00 2001 From: Sriram Kumar Date: Wed, 11 Feb 2026 09:09:31 -0600 Subject: [PATCH 21/22] change the location of optimizer step before profiling --- micro_benchmarking_apex.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/micro_benchmarking_apex.py b/micro_benchmarking_apex.py index 8f5f729..e137ecc 100644 --- a/micro_benchmarking_apex.py +++ b/micro_benchmarking_apex.py @@ -190,6 +190,10 @@ def forwardbackward(inp, optimizer, network, params, target, step=0, opt_step=1, else: loss.backward() + if (step + 1) % opt_step == 0: + optimizer.step() + optimizer.zero_grad() + if flops_prof_step: # End profiler here to profile both fwd and bwd passes # flops = prof.get_total_flops(as_string=True) @@ -197,9 +201,6 @@ def forwardbackward(inp, optimizer, network, params, target, step=0, 
opt_step=1, prof.print_model_profile(profile_step=flops_prof_step) prof.end_profile() - if (step + 1) % opt_step == 0: - optimizer.step() - def forward(inp, optimizer, network, params, target, step=0, opt_step=1, flops_prof_step=0): if flops_prof_step: From 55c55175b68cccd8122098e07466277e138029b5 Mon Sep 17 00:00:00 2001 From: skishore Date: Tue, 17 Feb 2026 12:45:17 +0000 Subject: [PATCH 22/22] add readme sections for apex and audio --- README.md | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fa52322..8528753 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,9 @@ # pytorch-micro-benchmarking + +This repo provides microbenchmarking script for training and inferencing models in pytorch, apex and torchaudio libraries on ROCm. + +## Pytorch + We supply a small microbenchmarking script for PyTorch training on ROCm. To execute: @@ -37,10 +42,10 @@ python3 micro_benchmarking_pytorch.py --device_ids=1 --network resnet50 --distri To run FlopsProfiler (with deepspeed.profiling.flops_profiler imported): `python micro_benchmarking_pytorch.py --network resnet50 --amp-opt-level=2 --batch-size=256 --iterations=20 --flops-prof-step 10` -## Performance tuning +### Performance tuning If performance on a specific card and/or model is found to be lacking, typically some gains can be made by tuning MIOpen. For this, `export MIOPEN_FIND_ENFORCE=3` prior to running the model. This will take some time if untuned configurations are encountered and write to a local performance database. More information on this can be found in the [MIOpen documentation](https://rocm.github.io/MIOpen/doc/html/perfdatabase.html). -## PyTorch 2.0 +### PyTorch 2.0 Added the `--compile` option opens up PyTorch 2.0 capabilities, which comes with several options. Here are some notes from upstream: ``` Optimizes given model/function using TorchDynamo and specified backend. 
@@ -75,3 +80,26 @@ python micro_benchmarking_pytorch.py --network resnet50 --compile --compileConte python micro_benchmarking_pytorch.py --network resnet50 --compile --compileContext "{'options': {'static-memory': 'True', 'matmul-padding': 'True'}}" ``` Note: you cannot pass the `mode` and `options` options together. + +## TorchAudio + +The script and parameters for torchaudio are similar to pytorch. + +To execute: +`python micro_benchmarking_audio.py --network [--batch-size ] [--iterations ] [--fp16 <0 or 1> ] [--distributed_dataparallel] [--device_ids ] ` + +Possible network names are: `wav2vec2_base`, `deepspeech`, `hdemucs_low`, `tacotron2`, `wavernn`, `wav2letter`, `hubert_base` etc. + +## Apex + +The script and parameters for apex are similar to pytorch. + +To execute: +`python micro_benchmarking_apex.py --network [--batch-size ] [--iterations ] [--fp16 <0 or 1> ] [--distributed_dataparallel] [--device_ids ] [--sync_bn] [--keep-batchnorm-fp32 ] [--loss-scale ]` + +There are three additional parameters. +1. `--sync_bn`: Use apex synchronized batch normalization across GPUs (useful for multi-GPU training). +2. `--keep-batchnorm-fp32`: Keep batch norm layers in FP32 when using AMP (e.g. `--keep-batchnorm-fp32 true`). Omit with opt_level O1. +3. `--loss-scale`: Loss scale for mixed precision. It is a number (e.g. `1024`) for static scaling, or `dynamic` for adaptive scaling. + +Instead of an `--amp` (true/false) flag, the apex script uses an `--amp-opt-level` option to select the level of AMP optimization. \ No newline at end of file