diff --git a/README.md b/README.md index fa52322..8528753 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,9 @@ # pytorch-micro-benchmarking + +This repo provides microbenchmarking script for training and inferencing models in pytorch, apex and torchaudio libraries on ROCm. + +## Pytorch + We supply a small microbenchmarking script for PyTorch training on ROCm. To execute: @@ -37,10 +42,10 @@ python3 micro_benchmarking_pytorch.py --device_ids=1 --network resnet50 --distri To run FlopsProfiler (with deepspeed.profiling.flops_profiler imported): `python micro_benchmarking_pytorch.py --network resnet50 --amp-opt-level=2 --batch-size=256 --iterations=20 --flops-prof-step 10` -## Performance tuning +### Performance tuning If performance on a specific card and/or model is found to be lacking, typically some gains can be made by tuning MIOpen. For this, `export MIOPEN_FIND_ENFORCE=3` prior to running the model. This will take some time if untuned configurations are encountered and write to a local performance database. More information on this can be found in the [MIOpen documentation](https://rocm.github.io/MIOpen/doc/html/perfdatabase.html). -## PyTorch 2.0 +### PyTorch 2.0 Added the `--compile` option opens up PyTorch 2.0 capabilities, which comes with several options. Here are some notes from upstream: ``` Optimizes given model/function using TorchDynamo and specified backend. @@ -75,3 +80,26 @@ python micro_benchmarking_pytorch.py --network resnet50 --compile --compileConte python micro_benchmarking_pytorch.py --network resnet50 --compile --compileContext "{'options': {'static-memory': 'True', 'matmul-padding': 'True'}}" ``` Note: you cannot pass the `mode` and `options` options together. + +## TorchAudio + +The script and parameters for torchaudio are similar to pytorch. + +To execute: +`python micro_benchmarking_audio.py --network [--batch-size ] [--iterations ] [--fp16 <0 or 1> ] [--distributed_dataparallel] [--device_ids ] ` + +Possible network names are: `wav2vec2_base`, `deepspeech`, `hdemucs_low`, `tacotron2`, `wavernn`, `wav2letter`, `hubert_base` etc. + +## Apex + +The script and parameters for torchaudio are similar to pytorch. + +To execute: +`python micro_benchmarking_apex.py --network [--batch-size ] [--iterations ] [--fp16 <0 or 1> ] [--distributed_dataparallel] [--device_ids ] [--sync_bn] [--keep-batchnorm-fp32 ] [--loss-scale ]` + +There are three additional parameters. +1. `--sync_bn`: Use apex synchronized batch normalization across GPUs (useful for multi-GPU training). +2. `--keep-batchnorm-fp32`: Keep batch norm layers in FP32 when using AMP (e.g. `--keep-batchnorm-fp32 true`). Omit with opt_level O1. +3. `--loss-scale`: Loss scale for mixed precision. It is a number (e.g. `1024`) for static scaling, or `dynamic` for adaptive scaling. + +Instead of amp flag (true/false), there is a level of amp optimization used in apex. \ No newline at end of file diff --git a/audio/audio_input.py b/audio/audio_input.py new file mode 100644 index 0000000..22e1288 --- /dev/null +++ b/audio/audio_input.py @@ -0,0 +1,65 @@ +import torch +from audio.audio_model import * + + +def get_input_type(network_name): + if network_name in acoustic_models or network_name in source_separation_models or network_name in speech_quality_models: + return "waveform" + elif network_name in speech_recognition_models: + return "acoustic features" + elif network_name in speech_synthesis_models: + if "wavernn" in network_name: + return "waveform" + else: + return "tokens" + + +def get_input(network_name, network, batch_size): + if network_name in acoustic_models: + inp = {"waveforms": torch.randn(batch_size, FRAME_COUNT, device="cuda")} + elif network_name in source_separation_models: + if "hdemucs" in network_name: + inp = {"input" : torch.randn(batch_size, 2, FRAME_COUNT, device="cuda")} + else: + inp = {"input" : torch.randn(batch_size, 1, FRAME_COUNT, device="cuda")} + elif network_name in speech_recognition_models: + if "deepspeech" in network_name: + #number of channels must be specified for deepspeech + inp = {"x" : torch.randn(batch_size, 1, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda")} + elif "wav2letter" in network_name: + inp = {"x" : torch.randn(batch_size, ACOUSTIC_FEATURES_SIZE, FRAME_COUNT, device="cuda")} + elif "emformer" in network_name: + inp = {"input" : torch.randn(batch_size, FRAME_COUNT, ACOUSTIC_FEATURES_SIZE, device="cuda"), + "lengths" : torch.randint(1, FRAME_COUNT, (batch_size,)).to(device="cuda")} + elif "conformer" in network_name: + lengths = torch.randint(1, FRAME_COUNT, (batch_size,), device="cuda") + inp = {"input" : torch.rand(batch_size, int(lengths.max()), 80, device="cuda"), + "lengths" : lengths} + elif network_name in speech_quality_models: + if "subjective" in network_name: + inp = {"waveform" : torch.randn(batch_size, FRAME_COUNT, device="cuda"), + "reference" : torch.randn(batch_size, FRAME_COUNT, device="cuda")} + else: + inp = {"x" : torch.randn(batch_size, FRAME_COUNT, device="cuda")} + elif network_name in speech_synthesis_models: + if "wavernn" in network_name: + spec_frames = 64 + waveform_length = HOP_LENGTH * (spec_frames - 4) + + inp = {"waveform" : torch.rand(batch_size, 1, waveform_length, device="cuda"), + "specgram": torch.rand(batch_size, 1, N_FREQ, spec_frames, device="cuda")} + elif "tacotron2" in network_name: + n_mels = 80 + max_mel_specgram_length = 300 + max_text_length = 100 + inp = {"tokens" : torch.randint(0, 148, (batch_size, max_text_length), dtype=torch.int32, device="cuda"), + "token_lengths" : max_text_length * torch.ones((batch_size,), device="cuda"), + "mel_specgram": torch.rand(batch_size, n_mels, max_mel_specgram_length, device="cuda"), + "mel_specgram_lengths" : max_mel_specgram_length * torch.ones((batch_size,), dtype=torch.int32, device="cuda")} + elif network_name in speech_representation_models: + inp = {"waveforms" : torch.rand(batch_size, FRAME_COUNT, device="cuda"), + "labels" : torch.randint(0, 100, (batch_size, 2), dtype=torch.int32, device="cuda")} + else: + print (f"Input for {network_name} not defined") + sys.exit(1) + return inp \ No newline at end of file diff --git a/audio/audio_loss.py b/audio/audio_loss.py new file mode 100644 index 0000000..817437a --- /dev/null +++ b/audio/audio_loss.py @@ -0,0 +1,73 @@ +import torch +from audio.languagemodels import LanguageModel +import string +from audio.hubert_loss import hubert_loss +from audio.sdr import si_sdr_loss +from torch import nn +from audio.audio_model import * + + +def get_criterion(network_name): + criterion = None + if network_name in speech_representation_models: + criterion = hubert_loss + elif network_name in speech_recognition_models or network_name in acoustic_models: + char_blank = "*" + char_space = " " + char_apostrophe = "'" + labels = char_blank + char_space + char_apostrophe + string.ascii_lowercase + language_model = LanguageModel(labels, char_blank, char_space) + criterion = torch.nn.CTCLoss(blank=language_model.mapping[char_blank], zero_infinity=False) + elif "wavernn" in network_name: + criterion = nn.CrossEntropyLoss() + elif "conv_tasnet" in network_name: + criterion = si_sdr_loss + elif "tacotron2" in network_name: + criterion = nn.MSELoss() + elif "hdemucs" in network_name: + criterion = nn.L1Loss(reduction='none') + elif "squim" in network_name: + criterion = nn.L1Loss() + else: + print (f"Criterion for network name {network_name} not defined") + sys.exit(1) + return criterion + + +def calculate_loss(network_name, criterion, output, target, batch_size, input): + loss = 0 + if network_name in speech_representation_models: + logit_m, logit_u, feature_penalty = output + loss = criterion(logit_m, logit_u, feature_penalty) + elif network_name in speech_recognition_models or network_name in acoustic_models: + output = output.transpose(-1, -2).transpose(0, 1) + T, N, C = output.shape + target, target_lengths = target + tensors_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long) + loss = criterion(output, target, tensors_lengths, target_lengths) + elif "wavernn" in network_name: + output = output.squeeze(1) + output = output.transpose(1, 2) + loss = criterion(output, target) + elif "conv_tasnet" in network_name: + target, mask = target + loss = criterion(output, target, mask) + elif "tacotron2" in network_name or "subjective" in network_name: + loss = criterion(output, target) + elif "objective" in network_name: + loss = 0 + weights = [1, 2, 0.5, 2] + for index in range(len(output)): + if index == 0: + loss = criterion(output[index], target[index]) + else: + loss += criterion(output[index], target[index]) + loss += criterion(input["x"], target[3]) + elif "hdemucs" in network_name: + dims = tuple(range(2, target.dim())) + loss = criterion(output, target) + loss = loss.mean(dims).mean(0) + else: + print (f"Loss function for {network_name} not defined") + sys.exit(1) + return loss \ No newline at end of file diff --git a/audio/audio_model.py b/audio/audio_model.py new file mode 100644 index 0000000..e13787b --- /dev/null +++ b/audio/audio_model.py @@ -0,0 +1,103 @@ +import torch +import torchaudio +import sys + +ACOUSTIC_FEATURES_SIZE = 32 +FRAME_COUNT = 1024 +HOP_LENGTH = 36 +N_FREQ = 128 +CLASSES_COUNT = 29 + +#different audio tasks related models +acoustic_models = { + "wav2vec2_base" : torchaudio.models.wav2vec2_base, + "wav2vec2_large" : torchaudio.models.wav2vec2_large, + "wav2vec2_large_lv60k" : torchaudio.models.wav2vec2_large_lv60k, + "wav2vec2_xlsr_300m" : torchaudio.models.wav2vec2_xlsr_300m, + "wav2vec2_xlsr_1b" : torchaudio.models.wav2vec2_xlsr_1b, + "wav2vec2_xlsr_2b" : torchaudio.models.wav2vec2_xlsr_2b, + "hubert_base" : torchaudio.models.hubert_base, + "hubert_large" : torchaudio.models.hubert_large, + "hubert_xlarge" : torchaudio.models.hubert_xlarge, + "wavlm_base" : torchaudio.models.wavlm_base, + "wavlm_large" : torchaudio.models.wavlm_large, +} + +speech_recognition_models = { + "conformer" : torchaudio.models.Conformer, + "deepspeech" : torchaudio.models.DeepSpeech, + "emformer" : torchaudio.models.Emformer, + "wav2letter" : torchaudio.models.Wav2Letter +} + +source_separation_models = { + "conv_tasnet_base" : torchaudio.models.conv_tasnet_base, + "hdemucs_low" : torchaudio.models.hdemucs_low, + "hdemucs_medium" : torchaudio.models.hdemucs_medium, + "hdemucs_high" : torchaudio.models.hdemucs_high, +} + +speech_quality_models = { + "squim_objective_base" : torchaudio.models.squim_objective_base, + "squim_subjective_base" : torchaudio.models.squim_subjective_base +} + +speech_synthesis_models = { + "tacotron2" : torchaudio.models.Tacotron2, + "wavernn" : torchaudio.models.WaveRNN +} + + +speech_representation_models = { + "hubert_pretrain_base" : torchaudio.models.hubert_pretrain_base, + "hubert_pretrain_large" : torchaudio.models.hubert_pretrain_large, + "hubert_pretrain_xlarge" : torchaudio.models.hubert_pretrain_xlarge +} + +def get_network_names(): + return sorted(list(acoustic_models.keys()) + + list(speech_recognition_models.keys()) + + list(source_separation_models.keys()) + + list(speech_quality_models.keys()) + + list(speech_synthesis_models.keys()) + + list(speech_representation_models.keys())) + + +def get_network(network_name): + if network_name in acoustic_models: + return acoustic_models[network_name](aux_num_out=CLASSES_COUNT).to(device="cuda") + elif network_name in source_separation_models: + if "hdemucs" in network_name: + return source_separation_models[network_name](sources = ["vocals"]).to(device="cuda") + else: + return source_separation_models[network_name]().to(device="cuda") + elif network_name in speech_recognition_models: + if "deepspeech" in network_name: + return speech_recognition_models[network_name](n_feature = ACOUSTIC_FEATURES_SIZE).to(device="cuda") + elif "wav2letter" in network_name: + return speech_recognition_models[network_name](num_features = ACOUSTIC_FEATURES_SIZE).to(device="cuda") + elif "emformer" in network_name: + return speech_recognition_models[network_name](input_dim = ACOUSTIC_FEATURES_SIZE, + num_heads=8, + ffn_dim=1024, + num_layers=20, + segment_length=4).to(device="cuda") + elif "conformer" in network_name: + return speech_recognition_models[network_name](input_dim = 80, + num_heads=4, + ffn_dim=128, + num_layers=4, + depthwise_conv_kernel_size=31).to(device="cuda") + elif network_name in speech_quality_models: + return speech_quality_models[network_name]().to(device="cuda") + elif network_name in speech_synthesis_models: + if "wavernn" in network_name: + return speech_synthesis_models[network_name](upsample_scales = [3, 3, 4], n_classes = 10, + hop_length = HOP_LENGTH, n_freq = 128).to(device="cuda") + else: + return speech_synthesis_models[network_name]().to(device="cuda") + elif network_name in speech_representation_models: + return speech_representation_models[network_name]().to(device="cuda") + else: + print ("ERROR: not a supported model '%s'" % network_name) + sys.exit(1) \ No newline at end of file diff --git a/audio/audio_output.py b/audio/audio_output.py new file mode 100644 index 0000000..c5c5cb6 --- /dev/null +++ b/audio/audio_output.py @@ -0,0 +1,54 @@ +from audio.audio_model import * + + +def get_output_selection(network_name): + if network_name in acoustic_models: + return 0 + elif "conformer" in network_name or "emformer" in network_name: + return 0 + elif "tacotron2" in network_name: + return 1 + return None + +def create_target(network_name, network, input, batch_size): + + #get output + output = network(**input) + output_index = get_output_selection(network_name) + if output_index is not None: + output = output[output_index] + + target = None + if network_name in speech_recognition_models or network_name in acoustic_models: + output = output.transpose(-1, -2).transpose(0, 1) + T, N, C = output.shape + target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.long) + target = torch.randint( + low=1, + high=C, + size=(sum(target_lengths),), + dtype=torch.long, + ) + target = [target, target_lengths] + elif "wavernn" in network_name: + target = torch.randn_like(output) + target = target.squeeze(1) + target = target.transpose(1, 2) + elif "conv_tasnet" in network_name: + batch, _, time = output.shape + mask = torch.randint(low=0, high=1, size=(batch,1,time), dtype=torch.long).cuda() + target = torch.randn_like(output) + target = [target, mask] + elif "tacotron2" in network_name: + target = torch.randn_like(output) + elif "hdemucs" in network_name or "subjective" in network_name: + target = torch.randn_like(output) + elif "objective" in network_name: + target = [] + for index in range(len(output)): + target.append(torch.randn_like(output[index])) + target.append(torch.randn_like(input["x"])) + else: + print (f"Target for {network_name} not defined") + sys.exit(1) + return target \ No newline at end of file diff --git a/audio/hubert_loss.py b/audio/hubert_loss.py new file mode 100644 index 0000000..1edaebd --- /dev/null +++ b/audio/hubert_loss.py @@ -0,0 +1,36 @@ +from typing import Optional + +import torch +import torch.nn.functional as F +from torch import Tensor + + +def hubert_loss( + logit_m: Optional[Tensor], + logit_u: Optional[Tensor], + feature_penalty: Tensor, + masked_weight: float = 1.0, + unmasked_weight: float = 0.0, + feature_weight: float = 10.0, + reduction: str = "sum", +) -> Tensor: + """Compute the cross-entropy loss on HuBERT masked and non-masked logits. + Args: + logit_m (Tensor or None): The masked logit Tensor of dimension `(masked_frames, final_dim)`. + logit_u (Tensor or None): The non-masked logit Tensor of dimension `(unmasked_frames, final_dim)`. + feature_penalty (Tensor): The feature mean value for additional penalty loss. + masked_weight (float, optional): The weight for masked cross-entropy loss (Default: ``1.0``). + unmasked_weight (float, optional): The weight for non-masked cross-entropy loss (Default: ``0.0``). + feature_weight (float, optional): The weight for feature penalty loss (Default: ``10.0``). + reduction (str, optional): The reduction method for cross-entropy loss (Default: ``"sum"``). + """ + loss = feature_penalty * feature_weight * logit_m.shape[0] + if logit_m is not None: + target_m = torch.zeros(logit_m.shape[0], dtype=torch.long, device=logit_m.device) + loss_m = F.cross_entropy(logit_m, target_m, reduction=reduction) + loss += loss_m * masked_weight + if logit_u is not None: + target_u = torch.zeros(logit_u.shape[0], dtype=torch.long, device=logit_m.device) + loss_u = F.cross_entropy(logit_u, target_u, reduction=reduction) + loss += loss_u * unmasked_weight + return loss \ No newline at end of file diff --git a/audio/languagemodels.py b/audio/languagemodels.py new file mode 100644 index 0000000..8091568 --- /dev/null +++ b/audio/languagemodels.py @@ -0,0 +1,38 @@ +import collections +import itertools + + +class LanguageModel: + def __init__(self, labels, char_blank, char_space): + + self.char_space = char_space + self.char_blank = char_blank + + labels = list(labels) + self.length = len(labels) + enumerated = list(enumerate(labels)) + flipped = [(sub[1], sub[0]) for sub in enumerated] + + d1 = collections.OrderedDict(enumerated) + d2 = collections.OrderedDict(flipped) + self.mapping = {**d1, **d2} + + def encode(self, iterable): + if isinstance(iterable, list): + return [self.encode(i) for i in iterable] + else: + return [self.mapping[i] + self.mapping[self.char_blank] for i in iterable] + + def decode(self, tensor): + if len(tensor) > 0 and isinstance(tensor[0], list): + return [self.decode(t) for t in tensor] + else: + # not idempotent, since clean string + x = (self.mapping[i] for i in tensor) + x = "".join(i for i, _ in itertools.groupby(x)) + x = x.replace(self.char_blank, "") + # x = x.strip() + return x + + def __len__(self): + return self.length \ No newline at end of file diff --git a/audio/sdr.py b/audio/sdr.py new file mode 100644 index 0000000..4053b6c --- /dev/null +++ b/audio/sdr.py @@ -0,0 +1,218 @@ +import math +from itertools import permutations +from typing import Optional + +import torch + + +def sdr( + estimate: torch.Tensor, reference: torch.Tensor, mask: Optional[torch.Tensor] = None, epsilon: float = 1e-8 +) -> torch.Tensor: + """Computes source-to-distortion ratio. + + 1. scale the reference signal with power(s_est * s_ref) / powr(s_ref * s_ref) + 2. compute SNR between adjusted estimate and reference. + + Args: + estimate (torch.Tensor): Estimtaed signal. + Shape: [batch, speakers (can be 1), time frame] + reference (torch.Tensor): Reference signal. + Shape: [batch, speakers, time frame] + mask (torch.Tensor or None, optional): Binary mask to indicate padded value (0) or valid value (1). + Shape: [batch, 1, time frame] + epsilon (float, optional): constant value used to stabilize division. + + Returns: + torch.Tensor: scale-invariant source-to-distortion ratio. + Shape: [batch, speaker] + + References: + - Single-channel multi-speaker separation using deep clustering + Y. Isik, J. Le Roux, Z. Chen, S. Watanabe, and J. R. Hershey, + - Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation + Luo, Yi and Mesgarani, Nima + https://arxiv.org/abs/1809.07454 + + Notes: + This function is tested to produce the exact same result as + https://github.com/naplab/Conv-TasNet/blob/e66d82a8f956a69749ec8a4ae382217faa097c5c/utility/sdr.py#L34-L56 + """ + reference_pow = reference.pow(2).mean(axis=2, keepdim=True) + mix_pow = (estimate * reference).mean(axis=2, keepdim=True) + scale = mix_pow / (reference_pow + epsilon) + + reference = scale * reference + error = estimate - reference + + reference_pow = reference.pow(2) + error_pow = error.pow(2) + + if mask is None: + reference_pow = reference_pow.mean(axis=2) + error_pow = error_pow.mean(axis=2) + else: + denom = mask.sum(axis=2) + reference_pow = (mask * reference_pow).sum(axis=2) / denom + error_pow = (mask * error_pow).sum(axis=2) / denom + + return 10 * torch.log10(reference_pow) - 10 * torch.log10(error_pow) + + +class PIT(torch.nn.Module): + """Applies utterance-level speaker permutation + + Computes the maxium possible value of the given utility function + over the permutations of the speakers. + + Args: + utility_func (function): + Function that computes the utility (opposite of loss) with signature of + (extimate: torch.Tensor, reference: torch.Tensor) -> torch.Tensor + where input Tensors are shape of [batch, speakers, frame] and + the output Tensor is shape of [batch, speakers]. + + References: + - Multi-talker Speech Separation with Utterance-level Permutation Invariant Training of + Deep Recurrent Neural Networks + Morten Kolbæk, Dong Yu, Zheng-Hua Tan and Jesper Jensen + https://arxiv.org/abs/1703.06284 + """ + + def __init__(self, utility_func): + super().__init__() + self.utility_func = utility_func + + def forward( + self, + estimate: torch.Tensor, + reference: torch.Tensor, + mask: Optional[torch.Tensor] = None, + epsilon: float = 1e-8, + ) -> torch.Tensor: + """Compute utterance-level PIT Loss + + Args: + estimate (torch.Tensor): Estimated source signals. + Shape: [bacth, speakers, time frame] + reference (torch.Tensor): Reference (original) source signals. + Shape: [batch, speakers, time frame] + mask (torch.Tensor or None, optional): Binary mask to indicate padded value (0) or valid value (1). + Shape: [batch, 1, time frame] + epsilon (float, optional): constant value used to stabilize division. + + Returns: + torch.Tensor: Maximum criterion over the speaker permutation. + Shape: [batch, ] + """ + assert estimate.shape == reference.shape + + batch_size, num_speakers = reference.shape[:2] + num_permute = math.factorial(num_speakers) + + util_mat = torch.zeros(batch_size, num_permute, dtype=estimate.dtype, device=estimate.device) + for i, idx in enumerate(permutations(range(num_speakers))): + util = self.utility_func(estimate, reference[:, idx, :], mask=mask, epsilon=epsilon) + util_mat[:, i] = util.mean(dim=1) # take the average over speaker dimension + return util_mat.max(dim=1).values + + +_sdr_pit = PIT(utility_func=sdr) + + +def sdr_pit( + estimate: torch.Tensor, reference: torch.Tensor, mask: Optional[torch.Tensor] = None, epsilon: float = 1e-8 +): + """Computes scale-invariant source-to-distortion ratio. + + 1. adjust both estimate and reference to have 0-mean + 2. scale the reference signal with power(s_est * s_ref) / powr(s_ref * s_ref) + 3. compute SNR between adjusted estimate and reference. + + Args: + estimate (torch.Tensor): Estimtaed signal. + Shape: [batch, speakers (can be 1), time frame] + reference (torch.Tensor): Reference signal. + Shape: [batch, speakers, time frame] + mask (torch.Tensor or None, optional): Binary mask to indicate padded value (0) or valid value (1). + Shape: [batch, 1, time frame] + epsilon (float, optional): constant value used to stabilize division. + + Returns: + torch.Tensor: scale-invariant source-to-distortion ratio. + Shape: [batch, speaker] + + References: + - Single-channel multi-speaker separation using deep clustering + Y. Isik, J. Le Roux, Z. Chen, S. Watanabe, and J. R. Hershey, + - Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation + Luo, Yi and Mesgarani, Nima + https://arxiv.org/abs/1809.07454 + + Notes: + This function is tested to produce the exact same result as the reference implementation, + *when the inputs have 0-mean* + https://github.com/naplab/Conv-TasNet/blob/e66d82a8f956a69749ec8a4ae382217faa097c5c/utility/sdr.py#L107-L153 + """ + return _sdr_pit(estimate, reference, mask, epsilon) + + +def sdri( + estimate: torch.Tensor, + reference: torch.Tensor, + mix: torch.Tensor, + mask: Optional[torch.Tensor] = None, + epsilon: float = 1e-8, +) -> torch.Tensor: + """Compute the improvement of SDR (SDRi). + + This function compute how much SDR is improved if the estimation is changed from + the original mixture signal to the actual estimated source signals. That is, + ``SDR(estimate, reference) - SDR(mix, reference)``. + + For computing ``SDR(estimate, reference)``, PIT (permutation invariant training) is applied, + so that best combination of sources between the reference signals and the esimate signals + are picked. + + Args: + estimate (torch.Tensor): Estimated source signals. + Shape: [batch, speakers, time frame] + reference (torch.Tensor): Reference (original) source signals. + Shape: [batch, speakers, time frame] + mix (torch.Tensor): Mixed souce signals, from which the setimated signals were generated. + Shape: [batch, speakers == 1, time frame] + mask (torch.Tensor or None, optional): Binary mask to indicate padded value (0) or valid value (1). + Shape: [batch, 1, time frame] + epsilon (float, optional): constant value used to stabilize division. + + Returns: + torch.Tensor: Improved SDR. Shape: [batch, ] + + References: + - Conv-TasNet: Surpassing Ideal Time--Frequency Magnitude Masking for Speech Separation + Luo, Yi and Mesgarani, Nima + https://arxiv.org/abs/1809.07454 + """ + sdr_ = sdr_pit(estimate, reference, mask=mask, epsilon=epsilon) # [batch, ] + base_sdr = sdr(mix, reference, mask=mask, epsilon=epsilon) # [batch, speaker] + return sdr_ - base_sdr.mean(dim=1) + + +def si_sdr_loss(estimate: torch.Tensor, reference: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: + """Compute the Si-SDR loss. + + Args: + estimate (torch.Tensor): Estimated source signals. + Tensor of dimension (batch, speakers, time) + reference (torch.Tensor): Reference (original) source signals. + Tensor of dimension (batch, speakers, time) + mask (torch.Tensor): Mask to indicate padded value (0) or valid value (1). + Tensor of dimension (batch, 1, time) + + Returns: + torch.Tensor: Si-SDR loss. Tensor of dimension (batch, ) + """ + estimate = estimate - estimate.mean(axis=2, keepdim=True) + reference = reference - reference.mean(axis=2, keepdim=True) + + si_sdri = sdr_pit(estimate, reference, mask=mask) + return -si_sdri.mean() \ No newline at end of file diff --git a/micro_benchmarking_apex.py b/micro_benchmarking_apex.py new file mode 100644 index 0000000..e137ecc --- /dev/null +++ b/micro_benchmarking_apex.py @@ -0,0 +1,563 @@ +import torch +import torchvision +import random +import time +import argparse +import os +import sys +import ast +import copy +import math +import torch.nn as nn +import torch.multiprocessing as mp +try: + import apex +except: + print ("ERROR: You must install apex to run apex microbenchmarking") + sys.exit(1) +from apex.fp16_utils import FP16Model +from shufflenet import shufflenet +from shufflenet_v2 import shufflenet as shufflenet_v2 +from xception import xception +import csv +import json + +try: + import torch._dynamo + torch._dynamo.config.verbose=True + HAVE_DYNAMO = True +except: + HAVE_DYNAMO = False + +IS_PT2 = hasattr(torch, "compile") + +is_torchrun = False +if "LOCAL_RANK" in os.environ: + # this indicates we're using torchrun + is_torchrun = True + +def xform(m: nn.Module) -> nn.Module: + m = m.cuda() + m.to(memory_format=torch.channels_last) + return m + +def weight_init(m): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + +# num_classes=1000 +models = { + "alexnet" : torchvision.models.alexnet, + "densenet121" : torchvision.models.densenet121, + "densenet161" : torchvision.models.densenet161, + "densenet169" : torchvision.models.densenet169, + "densenet201" : torchvision.models.densenet201, + "googlenet" : torchvision.models.googlenet, + "inception_v3" : torchvision.models.inception_v3, + "mnasnet0_5" : torchvision.models.mnasnet0_5, + "mnasnet0_75" : torchvision.models.mnasnet0_75, + "mnasnet1_0" : torchvision.models.mnasnet1_0, + "mnasnet1_3" : torchvision.models.mnasnet1_3, + "mobilenet_v2" : torchvision.models.mobilenet_v2, + "resnet18" : torchvision.models.resnet18, + "resnet34" : torchvision.models.resnet34, + "resnet50" : torchvision.models.resnet50, + "resnet101" : torchvision.models.resnet101, + "resnet152" : torchvision.models.resnet152, + "resnext50" : torchvision.models.resnext50_32x4d, + "resnext50_32x4d" : torchvision.models.resnext50_32x4d, + "resnext101" : torchvision.models.resnext101_32x8d, + "resnext101_32x8d" : torchvision.models.resnext101_32x8d, + "shufflenet" : shufflenet, + "shufflenet_v2" : shufflenet_v2, + "shufflenet_v2_x05" : torchvision.models.shufflenet_v2_x0_5, + "shufflenet_v2_x10" : torchvision.models.shufflenet_v2_x1_0, + "shufflenet_v2_x15" : torchvision.models.shufflenet_v2_x1_5, + "shufflenet_v2_x20" : torchvision.models.shufflenet_v2_x2_0, + "shufflenet_v2_x0_5" : torchvision.models.shufflenet_v2_x0_5, + "shufflenet_v2_x1_0" : torchvision.models.shufflenet_v2_x1_0, + "shufflenet_v2_x1_5" : torchvision.models.shufflenet_v2_x1_5, + "shufflenet_v2_x2_0" : torchvision.models.shufflenet_v2_x2_0, + "SqueezeNet" : torchvision.models.squeezenet1_0, + "squeezenet1_0" : torchvision.models.squeezenet1_0, + "SqueezeNet1.1" : torchvision.models.squeezenet1_1, + "squeezenet1_1" : torchvision.models.squeezenet1_1, + "vgg11" : torchvision.models.vgg11, + "vgg13" : torchvision.models.vgg13, + "vgg16" : torchvision.models.vgg16, + "vgg19" : torchvision.models.vgg19, + "vgg11_bn" : torchvision.models.vgg11_bn, + "vgg13_bn" : torchvision.models.vgg13_bn, + "vgg16_bn" : torchvision.models.vgg16_bn, + "vgg19_bn" : torchvision.models.vgg19_bn, + "wide_resnet50_2" : torchvision.models.wide_resnet50_2, + "wide_resnet101_2" : torchvision.models.wide_resnet101_2, + "xception" : xception, +} + +# newer torchvision models, for backwards compat +try: + models["swin_t"] = torchvision.models.swin_t + models["swin_s"] = torchvision.models.swin_s + models["swin_b"] = torchvision.models.swin_b + models["swin_v2_t"] = torchvision.models.swin_v2_t + models["swin_v2_s"] = torchvision.models.swin_v2_s + models["swin_v2_b"] = torchvision.models.swin_v2_b + models["vit_b_16"] = torchvision.models.vit_b_16 + models["vit_b_32"] = torchvision.models.vit_b_32 + models["vit_l_16"] = torchvision.models.vit_l_16 + models["vit_l_32"] = torchvision.models.vit_l_32 + models["vit_h_14"] = torchvision.models.vit_h_14 + models["efficientnet_b0"] = torchvision.models.efficientnet_b0 + models["efficientnet_b1"] = torchvision.models.efficientnet_b1 + models["efficientnet_b2"] = torchvision.models.efficientnet_b2 + models["efficientnet_b3"] = torchvision.models.efficientnet_b3 + models["efficientnet_b4"] = torchvision.models.efficientnet_b4 + models["efficientnet_b5"] = torchvision.models.efficientnet_b5 + models["efficientnet_b6"] = torchvision.models.efficientnet_b6 + models["efficientnet_b7"] = torchvision.models.efficientnet_b7 + models["maxvit_t"] = torchvision.models.maxvit_t +except AttributeError: + pass + +try: + models["mobilenet_v3_large"] = torchvision.models.mobilenet_v3_large + models["mobilenet_v3_small"] = torchvision.models.mobilenet_v3_small +except AttributeError: + pass +# segmentation models, num_classes=21 +segmentation_models = { + "fcn_resnet50" : torchvision.models.segmentation.fcn_resnet50, + "fcn_resnet101" : torchvision.models.segmentation.fcn_resnet101, + "deeplabv3_resnet50" : torchvision.models.segmentation.deeplabv3_resnet50, + "deeplabv3_resnet101" : torchvision.models.segmentation.deeplabv3_resnet101, +} + +# newer torchvision segmentation models, for backwards compat +try: + segmentation_models["deeplabv3_mobilenet_v3_large"] = torchvision.models.segmentation.deeplabv3_mobilenet_v3_large + segmentation_models["lraspp_mobilenet_v3_large"] = torchvision.models.segmentation.lraspp_mobilenet_v3_large, +except AttributeError: + pass + +def get_network_names(): + return sorted(list(models.keys()) + list(segmentation_models.keys())) + +def get_network(net, params): + # aux_logits=False only used by inception_v3 + if "inception_v3" == net: + if params.nhwc: + return xform(models[net](aux_logits=False)) + return models[net](aux_logits=False).to(device="cuda") + elif net in models: + if params.nhwc: + return xform(models[net]()) + return models[net]().to(device="cuda") + elif net in segmentation_models: + if params.nhwc: + return xform(segmentation_models[net]()) + return segmentation_models[net]().to(device="cuda") + else: + print ("ERROR: not a supported model '%s'" % net) + sys.exit(1) + +def forwardbackward(inp, optimizer, network, params, target, step=0, opt_step=1, flops_prof_step=0): + if step % opt_step == 0: + optimizer.zero_grad() + if flops_prof_step: + prof = FlopsProfiler(network) + prof.start_profile() + out = network(inp) + # If using HuggingFace model outputs logits, we need to extract them + if hasattr(out, 'logits'): + logits = out.logits + else: + logits = out + loss_fn = torch.nn.CrossEntropyLoss().to(device="cuda") + if params.nhwc: + loss_fn = loss_fn.to(memory_format=torch.channels_last) + loss = loss_fn(logits, target) + + if params.amp_opt_level: + with apex.amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + + if (step + 1) % opt_step == 0: + optimizer.step() + optimizer.zero_grad() + + if flops_prof_step: + # End profiler here to profile both fwd and bwd passes + # flops = prof.get_total_flops(as_string=True) + # params = prof.get_total_params(as_string=True) + prof.print_model_profile(profile_step=flops_prof_step) + prof.end_profile() + +def forward(inp, optimizer, network, params, target, step=0, opt_step=1, flops_prof_step=0): + + if flops_prof_step: + prof = FlopsProfiler(network) + prof.start_profile() + out = network(inp) + # If using HuggingFace model outputs logits, we need to extract them + if hasattr(out, 'logits'): + logits = out.logits + else: + logits = out + + if flops_prof_step: + # End profiler here to profile both fwd and bwd passes + # flops = prof.get_total_flops(as_string=True) + # params = prof.get_total_params(as_string=True) + prof.print_model_profile(profile_step=flops_prof_step) + prof.end_profile() + + return logits + +def rendezvous(distributed_parameters): + print("Initializing process group...") + torch.distributed.init_process_group(backend=distributed_parameters['dist_backend'], init_method=distributed_parameters['dist_url'], rank=distributed_parameters['rank'], world_size=distributed_parameters['world_size']) + print("Rendezvous complete. Created process group...") + +def run_benchmarking_wrapper(params): + params.flops_prof_step = max(0, min(params.flops_prof_step, params.iterations - 1)) + if (params.device_ids): + params.device_ids = [int(x) for x in params.device_ids.split(",")] + else: + params.device_ids = None + params.distributed_parameters = {} + if is_torchrun: + params.distributed_parameters['rank'] = int(os.environ["LOCAL_RANK"]) + params.distributed_parameters['world_size'] = int(os.environ["WORLD_SIZE"]) + params.distributed_parameters['dist_backend'] = "nccl" + params.distributed_parameters['dist_url'] = 'tcp://' + os.environ["MASTER_ADDR"] + ":" + os.environ["MASTER_PORT"] + else: + params.distributed_parameters['rank'] = params.rank + params.distributed_parameters['world_size'] = params.world_size + params.distributed_parameters['dist_backend'] = params.dist_backend + params.distributed_parameters['dist_url'] = params.dist_url + + # Some arguments are required for distributed_dataparallel + if params.distributed_dataparallel: + assert params.distributed_parameters['rank'] is not None and \ + params.distributed_parameters['world_size'] is not None and \ + params.distributed_parameters['dist_backend'] is not None and \ + params.distributed_parameters['dist_url'] is not None, "rank, world-size, dist-backend and dist-url are required arguments for distributed_dataparallel" + + if is_torchrun: + params.ngpus = params.distributed_parameters['world_size'] + elif params.distributed_dataparallel: + params.ngpus = len(params.device_ids) if params.device_ids else torch.cuda.device_count() + else: + params.ngpus = 1 + + if is_torchrun: + run_benchmarking(params.distributed_parameters['rank'], params) + elif params.distributed_dataparallel: + # Assumption below that each process launched with --distributed_dataparallel has the same number of devices visible/specified + params.distributed_parameters['world_size'] = params.ngpus * params.distributed_parameters['world_size'] + params.distributed_parameters['rank'] = params.ngpus * params.distributed_parameters['rank'] + mp.spawn(run_benchmarking, nprocs=params.ngpus, args=(params,)) + else: + run_benchmarking(0, params) + +def run_benchmarking(local_rank, params): + device_ids = params.device_ids + ngpus = params.ngpus + net = params.network + run_fp16 = params.fp16 + amp_opt_level = params.amp_opt_level + distributed_dataparallel = params.distributed_dataparallel + distributed_parameters = params.distributed_parameters + batch_size = params.batch_size + kineto = params.kineto + iterations = params.iterations + autograd_profiler = params.autograd_profiler + flops_prof_step = params.flops_prof_step + + if is_torchrun: + torch.cuda.set_device("cuda:%d" % local_rank) + elif device_ids: + assert ngpus == len(device_ids) + torch.cuda.set_device("cuda:%d" % device_ids[local_rank]) + else: + torch.cuda.set_device("cuda:0") + + network = get_network(net, params) + if "shufflenet" == net: + network.apply(weight_init) + + if params.compile: + compile_ctx = {"mode": None, + "dynamic": False, + "fullgraph": False, + "backend": "inductor", + "options": None, + "disable": False} + options = None # needed for internal pytorch checks + if params.compileContext: + compile_ctx.update(ast.literal_eval(params.compileContext)) + if compile_ctx["mode"] is not None and compile_ctx["options"] is not None: + raise RuntimeError("Cannot specify mode and options simultaneously") + if compile_ctx["options"] is not None: + options = {} # needed to save multiple options + for compiler_pass in compile_ctx["options"].keys(): + options.update({compiler_pass: bool(compile_ctx["options"][compiler_pass])}) + if IS_PT2: + network = torch.compile(network, + mode=compile_ctx["mode"], + dynamic=bool(compile_ctx["dynamic"]), + fullgraph=bool(compile_ctx["fullgraph"]), + backend=compile_ctx["backend"], + options=options, + disable=compile_ctx["disable"]) + else: + print ("ERROR: requested torch.compile but this isn't pytorch 2.x") + sys.exit(1) + + ## MLPerf Setting + sgd_opt_base_learning_rate = 0.01 + sgd_opt_end_learning_rate = 1e-4 + sgd_opt_learning_rate_decay_poly_power = 2 + sgd_opt_weight_decay = 0.0001 + sgd_opt_momentum = 0.9 + opt_learning_rate_warmup_epochs = 5 + + total_epochs = params.iterations + optimizer = torch.optim.SGD(param_copy, lr = sgd_opt_base_learning_rate, momentum = sgd_opt_momentum, weight_decay=sgd_opt_weight_decay) + + if (run_fp16): + network = FP16Model(network) + + #use apex syncbn + if args.sync_bn: + network = apex.parallel.convert_syncbn_model(network) + + optimizer = apex.optimizers.FusedSGD(network.parameters(), lr = 0.01, momentum = 0.9) + + if (amp_opt_level): + network, optimizer = apex.amp.initialize(network, optimizer, opt_level="O%d"%amp_opt_level, + keep_batchnorm_fp32=args.keep_batchnorm_fp32, + loss_scale=args.loss_scale) + + if is_torchrun: + rendezvous(distributed_parameters) + devices_to_run_on = [local_rank] + print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on))) + network = apex.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) + batch_size = int(batch_size / ngpus) + elif (distributed_dataparallel): + distributed_parameters['rank'] += local_rank + rendezvous(distributed_parameters) + devices_to_run_on = [(device_ids[local_rank] if device_ids else local_rank)] + print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on))) + network = apex.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) + batch_size = int(batch_size / ngpus) + + if (net == "inception_v3"): + inp = torch.randn(batch_size, 3, 299, 299, device="cuda") + else: + inp = torch.randn(batch_size, 3, 224, 224, device="cuda") + if (run_fp16): + inp = inp.half() + if params.nhwc: + inp = inp.to(memory_format=torch.channels_last) + if net in models: + # number of classes is 1000 for imagenet + target = torch.randint(0, 1000, (batch_size,), device="cuda") + elif net in segmentation_models: + # number of classes is 21 for segmentation + target = torch.randint(0, 21, (batch_size,), device="cuda") + + if params.mode == "training": + forward_fn = forwardbackward + network.train() + else: + forward_fn = forward + network.eval() + + ## warmup. + print ("INFO: running forward and backward for warmup.") + for i in range(2): + forward_fn(inp, optimizer, network, params, target, step=0, opt_step=params.opt_step) + + time.sleep(1) + torch.cuda.synchronize() + + ## benchmark. + print ("INFO: running the benchmark..") + if kineto: + from torch.profiler import schedule, profile, ProfilerActivity, record_function + profiler_schedule = schedule( + skip_first = 0, + wait = 1, + warmup = 2, + active = 5, + repeat = 1, + ) + + def trace_ready_callback(prof): + rank = 0 + if torch.distributed.is_available() and torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + if rank == 0: + print("----------- Trace Ready -----------") + prof.export_chrome_trace(f"{params.profiler_output}.json") + # print(f"----------- Rank {rank} Trace Ready -----------") + # prof.export_chrome_trace(f"{params.profiler_output}_rank{rank}.json") + + tm = time.time() + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + schedule=profiler_schedule, + on_trace_ready=trace_ready_callback) as prof: + for i in range(iterations): + with record_function(f"iteration {i}"): + forward_fn(inp, optimizer, network, params, target, step=i, opt_step=params.opt_step) + prof.step() + torch.cuda.synchronize() + print(prof.key_averages().table(sort_by="cuda_time_total")) + else: + tm = time.time() + with torch.autograd.profiler.emit_nvtx(enabled=autograd_profiler): + for i in range(iterations): + if i == flops_prof_step: + forward_fn(inp, optimizer, network, params, target, step=i, opt_step=params.opt_step, + flops_prof_step = i) + else: + forward_fn(inp, optimizer, network, params, target, step=i, opt_step=params.opt_step) + torch.cuda.synchronize() + + tm2 = time.time() + time_per_batch = (tm2 - tm) / iterations + + if run_fp16: + dtype = 'FP16' + elif amp_opt_level == 1: + dtype = 'AMP-O1: Insert automatic FP16 casts around safe Pytorch functions and Tensor methods.' + elif amp_opt_level == 2: + dtype = 'AMP-O2: FP16 training with FP32 batchnorm and FP32 master weights.' + elif amp_opt_level == 3: + dtype = 'AMP-O3: Pure FP16 training.' + elif amp_opt_level == 4: + dtype = 'AMP-O4: Insert automatic BFLOAT16 casts around safe Pytorch functions and Tensor methods.' + elif amp_opt_level == 5: + dtype = 'AMP-O5: BFLOAT16 training with FP32 batchnorm and FP32 master weights.' + else: + dtype = 'FP32' + + result = None + if not params.output_dir: + params.output_dir = "." + + print ("OK: finished running benchmark..") + print ("--------------------SUMMARY--------------------------") + print ("Microbenchmark for network : {}".format(net)) + if distributed_dataparallel or is_torchrun: + print ("--------This process: rank " + str(distributed_parameters['rank']) + "--------"); + print ("Num devices: 1") + else: + print ("Num devices: {}".format(ngpus)) + result = { + "Name": params.output_file, + "GPUs": 1, + "Mini batch size [img]": batch_size, + "Mini batch size [img/gpu]": batch_size, + "Throughput [img/sec]": batch_size / time_per_batch, + "Time per mini-batch": time_per_batch + } + with open(f"{params.output_dir}/{params.output_file}.json", "w") as f: + json.dump(result, f, indent=2) + + print ("Dtype: {}".format(dtype)) + print ("Mini batch size [img] : {}".format(batch_size)) + print ("Time per mini-batch : {}".format(time_per_batch)) + print ("Throughput [img/sec] : {}".format(batch_size/time_per_batch)) + if (distributed_dataparallel or is_torchrun) and distributed_parameters['rank'] == 0: + print ("") + print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------") + world_size = distributed_parameters['world_size'] + print ("Num devices: {}".format(world_size)) + print ("Dtype: {}".format(dtype)) + print ("Mini batch size [img] : {}".format(batch_size*world_size)) + print ("Time per mini-batch : {}".format(time_per_batch)) + print ("Throughput [img/sec] : {}".format(batch_size*world_size/time_per_batch)) + result = { + "Name": params.output_file, + "GPUs": distributed_parameters['world_size'], + "Mini batch size [img]": batch_size * distributed_parameters['world_size'], + "Mini batch size [img/gpu]": batch_size, + "Throughput [img/sec]": batch_size * distributed_parameters['world_size'] / time_per_batch, + "Time per mini-batch": time_per_batch + } + with open(f"{params.output_dir}/{params.output_file}.json", "w") as f: + json.dump(result, f, indent=2) + + csv_filename = f"{params.output_dir}/benchmark_summary.csv" + if params.csv_file: + csv_filename = params.csv_file + file_exists = os.path.isfile(csv_filename) + if result: + with open(csv_filename, "a", newline='') as csvfile: + writer = csv.writer(csvfile) + if not file_exists: + writer.writerow(result.keys()) + writer.writerow(result.values()) + print(f"Benchmark result saved to {csv_filename}") + +def main(): + run_benchmarking_wrapper(copy.deepcopy(args)) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--network", type=str, choices=get_network_names(), required=True, help="Network to run.") + parser.add_argument("--batch-size" , type=int, required=False, default=64, help="Batch size (will be split among devices used by this invocation)") + parser.add_argument("--iterations", type=int, required=False, default=20, help="Iterations") + parser.add_argument("--flops-prof-step", type=int, required=False, default=0, help="The flops profiling step") + parser.add_argument("--kineto", action='store_true', required=False, help="Turn kineto profiling on") + parser.add_argument("--autograd_profiler", action='store_true', required=False, help="Use PyTorch autograd (old) profiler") + parser.add_argument("--fp16", type=int, required=False, default=0,help="FP16 mixed precision benchmarking") + parser.add_argument("--amp-opt-level", type=int, required=False, default=0,help="apex.amp mixed precision benchmarking opt level") + parser.add_argument("--distributed_dataparallel", action='store_true', required=False, help="Use apex.parallel.DistributedDataParallel api to run on multiple processes/nodes. The multiple processes need to be launched manually, this script will only launch ONE process per invocation. Either use --distributed_dataparallel and manually launch multiple processes or launch this script with `torchrun`") + parser.add_argument("--device_ids", type=str, required=False, default=None, help="Comma-separated list (no spaces) to specify which HIP devices (0-indexed) to run distributedDataParallel api on. Might need to use HIP_VISIBLE_DEVICES to limit visiblity of devices to different processes.") + parser.add_argument("--rank", type=int, required=False, default=None, help="Rank of this process. Required for --distributed_dataparallel") + parser.add_argument("--world-size", type=int, required=False, default=None, help="Total number of ranks/processes. Required for --distributed_dataparallel") + parser.add_argument("--dist-backend", type=str, required=False, default=None, help="Backend used for distributed training. Can be one of 'nccl' or 'gloo'. Required for --distributed_dataparallel") + parser.add_argument("--dist-url", type=str, required=False, default=None, help="url used for rendezvous of processes in distributed training. Needs to contain IP and open port of master rank0 eg. 'tcp://172.23.2.1:54321'. Required for --distributed_dataparallel") + parser.add_argument("--compile", action='store_true', required=False, help="use pytorch 2.0") + parser.add_argument("--compileContext", default={}, required=False, help="additional compile options") + parser.add_argument('--sync_bn', action='store_true', help='enabling apex sync BN.') + parser.add_argument('--keep-batchnorm-fp32', type=str, default=None) + parser.add_argument('--loss-scale', type=str, default=None) + parser.add_argument("--csv-file", type=str, default=None, required=False, help="assign output csv file name.") + parser.add_argument("--mode", type=str, choices=['training', 'inference'], default="training", help="Select mode: training or inference") + parser.add_argument("--nhwc", action='store_true', default=False, help="Use nhwc format") + parser.add_argument("--opt-step", type=int, required=False, default=1, help="Optimizer update step") + parser.add_argument("--output-dir", type=str, default="", help="assign output directory name.") + parser.add_argument("--output-file", type=str, default="", help="assign output file name.") + parser.add_argument("--profiler-output", type=str, default="", help="assign profiler output name.") + + + args = parser.parse_args() + + if args.flops_prof_step: + try: + from deepspeed.profiling.flops_profiler import FlopsProfiler + except: + print("ERROR: You must install (or copy) deepspeed.profiling to use --flops-prof-step") + sys.exit(1) + + if args.fp16 and args.amp_opt_level: + print ("ERROR: Cannot use both --fp16 and --amp-opt-level") + sys.exit(1) + + + main() diff --git a/micro_benchmarking_audio.py b/micro_benchmarking_audio.py new file mode 100644 index 0000000..efb5915 --- /dev/null +++ b/micro_benchmarking_audio.py @@ -0,0 +1,414 @@ +import torch +import time +import argparse +import os +import sys +import ast +import copy +import math +import torch.nn as nn +import torch.multiprocessing as mp +from fp16util import network_to_half, get_param_copy +import torch.nn.functional as F +from audio.audio_model import get_network_names, get_network +from audio.audio_loss import get_criterion, calculate_loss +from audio.audio_input import get_input_type, get_input +from audio.audio_output import get_output_selection, create_target +import csv +import json +from torch.amp import autocast, GradScaler + +try: + import torch._dynamo + torch._dynamo.config.verbose=True + HAVE_DYNAMO = True +except: + HAVE_DYNAMO = False + +IS_PT2 = hasattr(torch, "compile") + +is_torchrun = False +if "LOCAL_RANK" in os.environ: + # this indicates we're using torchrun + is_torchrun = True + + +def weight_init(m): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + +def forwardbackward(inp, optimizer, network, params, network_name, batch_size, criterion, target, step=0, opt_step=1, flops_prof_step=0): + if step % opt_step == 0: + optimizer.zero_grad() + if flops_prof_step: + prof = FlopsProfiler(network) + prof.start_profile() + + if params.amp: + with autocast('cuda'): + out = network(**inp) + output_index = get_output_selection(network_name) + if output_index is not None: + out = out[output_index] + + loss = calculate_loss(network_name, criterion, out, target, batch_size, inp) + scaler.scale(loss).backward() + if (step + 1) % opt_step == 0: + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad() + else: + out = network(**inp) + output_index = get_output_selection(network_name) + if output_index is not None: + out = out[output_index] + loss = calculate_loss(network_name, criterion, out, target, batch_size, inp) + loss.backward() + if (step + 1) % opt_step == 0: + optimizer.step() + optimizer.zero_grad() + + # End profiler here if only to profile forward pass + + if flops_prof_step: + prof.print_model_profile(profile_step=flops_prof_step) + prof.end_profile() + +def forward(inp, optimizer, network, params, network_name, batch_size, criterion, target, step=0, opt_step=1, flops_prof_step=0): + if step % opt_step == 0: + optimizer.zero_grad() + if flops_prof_step: + prof = FlopsProfiler(network) + prof.start_profile() + + if params.amp: + with autocast('cuda'): + out = network(**inp) + output_index = get_output_selection(network_name) + if output_index is not None: + out = out[output_index] + else: + out = network(**inp) + output_index = get_output_selection(network_name) + if output_index is not None: + out = out[output_index] + + # End profiler here if only to profile forward pass + + if flops_prof_step: + prof.print_model_profile(profile_step=flops_prof_step) + prof.end_profile() + + +def rendezvous(distributed_parameters): + print("Initializing process group...") + torch.distributed.init_process_group(backend=distributed_parameters['dist_backend'], init_method=distributed_parameters['dist_url'], rank=distributed_parameters['rank'], world_size=distributed_parameters['world_size']) + print("Rendezvous complete. Created process group...") + +def run_benchmarking_wrapper(params): + params.flops_prof_step = max(0, min(params.flops_prof_step, params.iterations - 1)) + if (params.device_ids): + params.device_ids = [int(x) for x in params.device_ids.split(",")] + else: + params.device_ids = None + params.distributed_parameters = {} + if is_torchrun: + params.distributed_parameters['rank'] = int(os.environ["LOCAL_RANK"]) + params.distributed_parameters['world_size'] = int(os.environ["WORLD_SIZE"]) + params.distributed_parameters['dist_backend'] = "nccl" + params.distributed_parameters['dist_url'] = 'tcp://' + os.environ["MASTER_ADDR"] + ":" + os.environ["MASTER_PORT"] + else: + params.distributed_parameters['rank'] = params.rank + params.distributed_parameters['world_size'] = params.world_size + params.distributed_parameters['dist_backend'] = params.dist_backend + params.distributed_parameters['dist_url'] = params.dist_url + + # Some arguments are required for distributed_dataparallel + if params.distributed_dataparallel: + assert params.distributed_parameters['rank'] is not None and \ + params.distributed_parameters['world_size'] is not None and \ + params.distributed_parameters['dist_backend'] is not None and \ + params.distributed_parameters['dist_url'] is not None, "rank, world-size, dist-backend and dist-url are required arguments for distributed_dataparallel" + + if is_torchrun: + params.ngpus = params.distributed_parameters['world_size'] + elif params.distributed_dataparallel: + params.ngpus = len(params.device_ids) if params.device_ids else torch.cuda.device_count() + else: + params.ngpus = 1 + + if is_torchrun: + run_benchmarking(params.distributed_parameters['rank'], params) + elif params.distributed_dataparallel: + # Assumption below that each process launched with --distributed_dataparallel has the same number of devices visible/specified + params.distributed_parameters['world_size'] = params.ngpus * params.distributed_parameters['world_size'] + params.distributed_parameters['rank'] = params.ngpus * params.distributed_parameters['rank'] + mp.spawn(run_benchmarking, nprocs=params.ngpus, args=(params,)) + else: + run_benchmarking(0, params) + +def run_benchmarking(local_rank, params): + device_ids = params.device_ids + ngpus = params.ngpus + net = params.network + run_fp16 = params.fp16 + run_amp = params.amp + distributed_dataparallel = params.distributed_dataparallel + distributed_parameters = params.distributed_parameters + batch_size = params.batch_size + kineto = params.kineto + iterations = params.iterations + autograd_profiler = params.autograd_profiler + flops_prof_step = params.flops_prof_step + + if is_torchrun: + torch.cuda.set_device("cuda:%d" % local_rank) + elif device_ids: + assert ngpus == len(device_ids) + torch.cuda.set_device("cuda:%d" % device_ids[local_rank]) + else: + torch.cuda.set_device("cuda:0") + + network = get_network(net) + criterion = get_criterion(net) + if "shufflenet" == net: + network.apply(weight_init) + + if (run_fp16): + network = network_to_half(network) + + if params.compile: + compile_ctx = {"mode": None, + "dynamic": False, + "fullgraph": False, + "backend": "inductor", + "options": None, + "disable": False} + options = None # needed for internal pytorch checks + if params.compileContext: + compile_ctx.update(ast.literal_eval(params.compileContext)) + if compile_ctx["mode"] is not None and compile_ctx["options"] is not None: + raise RuntimeError("Cannot specify mode and options simultaneously") + if compile_ctx["options"] is not None: + options = {} # needed to save multiple options + for compiler_pass in compile_ctx["options"].keys(): + options.update({compiler_pass: bool(compile_ctx["options"][compiler_pass])}) + if IS_PT2: + network = torch.compile(network, + mode=compile_ctx["mode"], + dynamic=bool(compile_ctx["dynamic"]), + fullgraph=bool(compile_ctx["fullgraph"]), + backend=compile_ctx["backend"], + options=options, + disable=compile_ctx["disable"]) + else: + print ("ERROR: requested torch.compile but this isn't pytorch 2.x") + sys.exit(1) + + param_copy = network.parameters() + if (run_fp16): + param_copy = get_param_copy(network) + + ## MLPerf Setting + sgd_opt_base_learning_rate = 0.01 + sgd_opt_end_learning_rate = 1e-4 + sgd_opt_learning_rate_decay_poly_power = 2 + sgd_opt_weight_decay = 0.0001 + sgd_opt_momentum = 0.9 + opt_learning_rate_warmup_epochs = 5 + + total_epochs = params.iterations + optimizer = torch.optim.SGD(param_copy, lr = sgd_opt_base_learning_rate, momentum = sgd_opt_momentum, weight_decay=sgd_opt_weight_decay) + + if is_torchrun: + rendezvous(distributed_parameters) + devices_to_run_on = [local_rank] + print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on))) + network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) + batch_size = int(batch_size / ngpus) + elif (distributed_dataparallel): + distributed_parameters['rank'] += local_rank + rendezvous(distributed_parameters) + devices_to_run_on = [(device_ids[local_rank] if device_ids else local_rank)] + print ("INFO: Rank {} running distributed_dataparallel on devices: {}".format(distributed_parameters['rank'], str(devices_to_run_on))) + network = torch.nn.parallel.DistributedDataParallel(network, device_ids=devices_to_run_on) + batch_size = int(batch_size / ngpus) + + inp = get_input(net, network, batch_size) + + if (run_fp16): + inp = inp.half() + + target = create_target(net, network, inp, batch_size) + + if params.mode == "training": + forward_fn = forwardbackward + network.train() + else: + forward_fn = forward + network.eval() + + ## warmup. + print ("INFO: running forward and backward for warmup.") + for i in range(2): + forward_fn(inp, optimizer, network, params, net, batch_size, criterion, target, step=0, opt_step=params.opt_step) + + time.sleep(1) + torch.cuda.synchronize() + + ## benchmark. + print ("INFO: running the benchmark..") + if kineto: + from torch.profiler import schedule, profile, ProfilerActivity, record_function + profiler_schedule = schedule( + skip_first = 0, + wait = 1, + warmup = 2, + active = 5, + repeat = 1, + ) + + def trace_ready_callback(prof): + rank = 0 + if torch.distributed.is_available() and torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + if rank == 0: + print("----------- Trace Ready -----------") + prof.export_chrome_trace(f"{params.profiler_output}.json") + # print(f"----------- Rank {rank} Trace Ready -----------") + # prof.export_chrome_trace(f"{params.profiler_output}_rank{rank}.json") + + tm = time.time() + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + schedule=profiler_schedule, + on_trace_ready=trace_ready_callback) as prof: + for i in range(iterations): + with record_function(f"iteration {i}"): + forward_fn(inp, optimizer, network, params, net, batch_size, criterion, target, step=i, opt_step=params.opt_step) + prof.step() + torch.cuda.synchronize() + print(prof.key_averages().table(sort_by="cuda_time_total")) + else: + tm = time.time() + with torch.autograd.profiler.emit_nvtx(enabled=autograd_profiler): + for i in range(iterations): + if i == flops_prof_step: + forward_fn(inp, optimizer, network, params, net, batch_size, criterion, target, step=i, opt_step=params.opt_step, flops_prof_step=i) + else: + forward_fn(inp, optimizer, network, params, net, batch_size, criterion, target, step=i, opt_step=params.opt_step) + torch.cuda.synchronize() + + tm2 = time.time() + time_per_batch = (tm2 - tm) / iterations + + if run_fp16: + dtype = 'FP16' + elif run_amp: + dtype = 'AMP: PyTorch Native Automatic Mixed Precision' + else: + dtype = 'FP32' + + result = None + if not params.output_dir: + params.output_dir = "." + + print ("OK: finished running benchmark..") + print ("--------------------SUMMARY--------------------------") + print ("Microbenchmark for network : {}".format(net)) + if distributed_dataparallel or is_torchrun: + print ("--------This process: rank " + str(distributed_parameters['rank']) + "--------"); + print ("Num devices: 1") + else: + print ("Num devices: {}".format(ngpus)) + result = { + "Name": params.output_file, + "GPUs": 1, + "Mini batch size [img]": batch_size, + "Mini batch size [img/gpu]": batch_size, + "Throughput [img/sec]": batch_size / time_per_batch, + "Time per mini-batch": time_per_batch + } + with open(f"{params.output_dir}/{params.output_file}.json", "w") as f: + json.dump(result, f, indent=2) + print ("Dtype: {}".format(dtype)) + print ("Mini batch size [", get_input_type(net), "] : {}".format(batch_size)) + print ("Time per mini-batch : {}".format(time_per_batch)) + print ("Throughput [", get_input_type(net), "/sec] : {}".format(batch_size/time_per_batch)) + if (distributed_dataparallel or is_torchrun) and distributed_parameters['rank'] == 0: + print ("") + print ("--------Overall (all ranks) (assuming same num/type devices for each rank)--------") + world_size = distributed_parameters['world_size'] + print ("Num devices: {}".format(world_size)) + print ("Dtype: {}".format(dtype)) + print ("Mini batch size [", get_input_type(net), "] : {}".format(batch_size*world_size)) + print ("Time per mini-batch : {}".format(time_per_batch)) + print ("Throughput [", get_input_type(net), "/sec] : {}".format(batch_size*world_size/time_per_batch)) + result = { + "Name": params.output_file, + "GPUs": distributed_parameters['world_size'], + "Mini batch size [img]": batch_size * distributed_parameters['world_size'], + "Mini batch size [img/gpu]": batch_size, + "Throughput [img/sec]": batch_size * distributed_parameters['world_size'] / time_per_batch, + "Time per mini-batch": time_per_batch + } + with open(f"{params.output_dir}/{params.output_file}.json", "w") as f: + json.dump(result, f, indent=2) + + csv_filename = f"{params.output_dir}/benchmark_summary.csv" + if params.csv_file: + csv_filename = params.csv_file + file_exists = os.path.isfile(csv_filename) + if result: + with open(csv_filename, "a", newline='') as csvfile: + writer = csv.writer(csvfile) + if not file_exists: + writer.writerow(result.keys()) + writer.writerow(result.values()) + print(f"Benchmark result saved to {csv_filename}") + +def main(): + run_benchmarking_wrapper(copy.deepcopy(args)) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--network", type=str, choices=get_network_names(), required=True, help="Network to run.") + parser.add_argument("--batch-size" , type=int, required=False, default=64, help="Batch size (will be split among devices used by this invocation)") + parser.add_argument("--iterations", type=int, required=False, default=20, help="Iterations") + parser.add_argument("--flops-prof-step", type=int, required=False, default=0, help="The flops profiling step") + parser.add_argument("--kineto", action='store_true', required=False, help="Turn kineto profiling on") + parser.add_argument("--autograd_profiler", action='store_true', required=False, help="Use PyTorch autograd (old) profiler") + parser.add_argument("--fp16", type=int, required=False, default=0,help="FP16 mixed precision benchmarking") + parser.add_argument("--distributed_dataparallel", action='store_true', required=False, help="Use torch.nn.parallel.DistributedDataParallel api to run on multiple processes/nodes. The multiple processes need to be launched manually, this script will only launch ONE process per invocation. Either use --distributed_dataparallel and manually launch multiple processes or launch this script with `torchrun`") + parser.add_argument("--device_ids", type=str, required=False, default=None, help="Comma-separated list (no spaces) to specify which HIP devices (0-indexed) to run distributedDataParallel api on. Might need to use HIP_VISIBLE_DEVICES to limit visiblity of devices to different processes.") + parser.add_argument("--rank", type=int, required=False, default=None, help="Rank of this process. Required for --distributed_dataparallel") + parser.add_argument("--world-size", type=int, required=False, default=None, help="Total number of ranks/processes. Required for --distributed_dataparallel") + parser.add_argument("--dist-backend", type=str, required=False, default=None, help="Backend used for distributed training. Can be one of 'nccl' or 'gloo'. Required for --distributed_dataparallel") + parser.add_argument("--dist-url", type=str, required=False, default=None, help="url used for rendezvous of processes in distributed training. Needs to contain IP and open port of master rank0 eg. 'tcp://172.23.2.1:54321'. Required for --distributed_dataparallel") + parser.add_argument("--compile", action='store_true', required=False, help="use pytorch 2.0") + parser.add_argument("--compileContext", default={}, required=False, help="additional compile options") + parser.add_argument("--amp", action='store_true', default=False, required=False, help="Automatic mixed precision benchmarking") + parser.add_argument("--csv-file", type=str, default=None, required=False, help="assign output csv file name.") + parser.add_argument("--mode", type=str, choices=['training', 'inference'], default="training", help="Select mode: training or inference") + parser.add_argument("--opt-step", type=int, required=False, default=1, help="Optimizer update step") + parser.add_argument("--output-dir", type=str, default="", help="assign output directory name.") + parser.add_argument("--output-file", type=str, default="", help="assign output file name.") + parser.add_argument("--profiler-output", type=str, default="", help="assign profiler output name.") + + args = parser.parse_args() + + if args.flops_prof_step: + try: + from deepspeed.profiling.flops_profiler import FlopsProfiler + except: + print("ERROR: You must install (or copy) deepspeed.profiling to use --flops-prof-step") + sys.exit(1) + + main() \ No newline at end of file