diff --git a/README.md b/README.md index 98f4c02..74c0144 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,8 @@ We supply a small microbenchmarking script for PyTorch training on ROCm. To execute: -`python micro_benchmarking_pytorch.py --network <network name> [--batch-size <batch size> ] [--iterations <number of iterations> ] [--fp16 <0 or 1> ] [--dataparallel|--distributed_dataparallel] [--device_ids <comma-separated list of GPU indices>]` + +`./micro_benchmarking_pytorch.py --network <network name> [--batch-size <batch size> ] [--iterations <number of iterations> ] [--fp16 <0 or 1> ] [--dataparallel|--distributed_dataparallel] [--device_ids <comma-separated list of GPU indices>]` Possible network names are: `alexnet`, `densenet121`, `inception_v3`, `resnet50`, `resnet101`, `SqueezeNet`, `vgg16` etc. @@ -14,17 +15,18 @@ For mGPU runs, `--distributed_dataparallel` with 1 GPU per process is recommende Eg. for a 1-GPU resnet50 run: ``` -python3 micro_benchmarking_pytorch.py --network resnet50 +./micro_benchmarking_pytorch.py --network resnet50 ``` for a 2-GPU run on a single node: ``` -python3 micro_benchmarking_pytorch.py --device_ids=0 --network resnet50 --distributed_dataparallel --rank 0 --world-size 2 --dist-backend nccl --dist-url tcp://127.0.0.1:4332 & -python3 micro_benchmarking_pytorch.py --device_ids=1 --network resnet50 --distributed_dataparallel --rank 1 --world-size 2 --dist-backend nccl --dist-url tcp://127.0.0.1:4332 & +./micro_benchmarking_pytorch.py --device_ids=0 --network resnet50 --distributed_dataparallel --rank 0 --world-size 2 --dist-backend nccl --dist-url tcp://127.0.0.1:4332 & +./micro_benchmarking_pytorch.py --device_ids=1 --network resnet50 --distributed_dataparallel --rank 1 --world-size 2 --dist-backend nccl --dist-url tcp://127.0.0.1:4332 & ``` Specify any available port in the `dist-url`. 
To run FlopsProfiler (with deepspeed.profiling.flops_profiler imported): -`python micro_benchmarking_pytorch.py --network resnet50 --amp-opt-level=2 --batch-size=256 --iterations=20 --flops-prof-step 10` + +`./micro_benchmarking_pytorch.py --network resnet50 --amp-opt-level=2 --batch-size=256 --iterations=20 --flops-prof-step 10` ## Performance tuning If performance on a specific card and/or model is found to be lacking, typically some gains can be made by tuning MIOpen. For this, `export MIOPEN_FIND_ENFORCE=3` prior to running the model. This will take some time if untuned configurations are encountered and write to a local performance database. More information on this can be found in the [MIOpen documentation](https://rocmsoftwareplatform.github.io/MIOpen/doc/html/perfdatabase.html). @@ -53,14 +55,15 @@ Added the `--compile` option opens up PyTorch 2.0 capabilities, which comes with With the required `--compile` option, these additional options are now available from the command line with the `--compileContext` flag. Here are a few examples: ```bash -python micro_benchmarking_pytorch.py --network resnet50 --compile # default run +./micro_benchmarking_pytorch.py --network resnet50 --compile # default run ``` ```bash -python micro_benchmarking_pytorch.py --network resnet50 --compile --compileContext "{'mode': 'max-autotune', 'fullgraph': 'True'}" +./micro_benchmarking_pytorch.py --network resnet50 --compile --compileContext "{'mode': 'max-autotune', 'fullgraph': 'True'}" ``` ```bash -python micro_benchmarking_pytorch.py --network resnet50 --compile --compileContext "{'options': {'static-memory': 'True', 'matmul-padding': 'True'}}" +./micro_benchmarking_pytorch.py --network resnet50 --compile --compileContext "{'options': {'static-memory': 'True', 'matmul-padding': 'True'}}" ``` + Note: you cannot pass the `mode` and `options` options together. 
diff --git a/TorchTensorOpsBench/README.md b/TorchTensorOpsBench/README.md index 7c8d6f5..d9417e7 100644 --- a/TorchTensorOpsBench/README.md +++ b/TorchTensorOpsBench/README.md @@ -1,11 +1,14 @@ To run the microbenchmark for an op: + ``` -python torch_tensor_ops_bench.py --op <op name> +./torch_tensor_ops_bench.py --op <op name> ``` The script also takes optional arguments: + ``` --dtype [=fp32 | fp16 | bf16] --device [=cuda | cpu] --input-dim dims separated by '-', default "64-1024-1024" --op-type [=None | binary(for a binary op)] +``` diff --git a/TorchTensorOpsBench/run.sh b/TorchTensorOpsBench/run.sh index 7e62dd8..0909384 100755 --- a/TorchTensorOpsBench/run.sh +++ b/TorchTensorOpsBench/run.sh @@ -1,18 +1,15 @@ -#!/usr/bin/env bash +#!/bin/bash # run model ops in fp32, fp16 and bf16 -printf "########## Running model ops with fp32 type ###########\n" -python3 torch_tensor_ops_bench.py --run-model-ops --dtype fp32 |& tee model_ops_fp32.log -printf "\n########## Running model ops with fp16 type ###########\n" -python3 torch_tensor_ops_bench.py --run-model-ops --dtype fp16 |& tee model_ops_fp16.log -printf "\n########## Running model ops with bf16 type ###########\n" -python3 torch_tensor_ops_bench.py --run-model-ops --dtype bf16 |& tee model_ops_bf16.log +echo -e "########## Running model ops with fp32 type ###########" +./torch_tensor_ops_bench.py --run-model-ops --dtype fp32 |& tee model_ops_fp32.log +echo -e "\n########## Running model ops with fp16 type ###########" +./torch_tensor_ops_bench.py --run-model-ops --dtype fp16 |& tee model_ops_fp16.log +echo -e "\n########## Running model ops with bf16 type ###########" +./torch_tensor_ops_bench.py --run-model-ops --dtype bf16 |& tee model_ops_bf16.log # run predefined ops with generic tensor size of 64-1024-1024 -printf "\n########## Running pre-defined ops with fp32 type ###########\n" -python3 torch_tensor_ops_bench.py --run-predefined --dtype fp32 |& tee predefined_ops_fp32.log -printf "\n########## Running pre-defined ops with 
fp16 type ###########\n" -python3 torch_tensor_ops_bench.py --run-predefined --dtype fp16 |& tee predefined_ops_fp16.log - -printf "Done\n" - +echo -e "\n########## Running pre-defined ops with fp32 type ###########" +./torch_tensor_ops_bench.py --run-predefined --dtype fp32 |& tee predefined_ops_fp32.log +echo -e "\n########## Running pre-defined ops with fp16 type ###########" +./torch_tensor_ops_bench.py --run-predefined --dtype fp16 |& tee predefined_ops_fp16.log diff --git a/TorchTensorOpsBench/torch_tensor_ops_bench.py b/TorchTensorOpsBench/torch_tensor_ops_bench.py old mode 100644 new mode 100755 index 28bfdb8..b810eb9 --- a/TorchTensorOpsBench/torch_tensor_ops_bench.py +++ b/TorchTensorOpsBench/torch_tensor_ops_bench.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import torch import torch.nn as nn diff --git a/micro_benchmarking_pytorch.py b/micro_benchmarking_pytorch.py old mode 100644 new mode 100755 index 4614074..757ede9 --- a/micro_benchmarking_pytorch.py +++ b/micro_benchmarking_pytorch.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import torch import torchvision import random diff --git a/shufflenet.py b/shufflenet.py old mode 100644 new mode 100755 index cc0daee..6d5e4dd --- a/shufflenet.py +++ b/shufflenet.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import os import sys import torch diff --git a/shufflenet_v2.py b/shufflenet_v2.py old mode 100644 new mode 100755 index 46e8f40..d1b41a8 --- a/shufflenet_v2.py +++ b/shufflenet_v2.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import os import sys import torch diff --git a/xception.py b/xception.py old mode 100644 new mode 100755 index a2c5630..353ea8f --- a/xception.py +++ b/xception.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import math import torch.nn as nn import torch.nn.functional as F