From e574b9e4c142516239fd8cdd4ba5e04f97a3fb00 Mon Sep 17 00:00:00 2001 From: zhangxiao-stack <1244360827@qq.com> Date: Thu, 12 Jan 2023 18:23:47 +0800 Subject: [PATCH 1/3] fix hip d_conv launch failed --- .../xla/ral/context/stream_executor_based_impl.cc | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tao_compiler/mlir/xla/ral/context/stream_executor_based_impl.cc b/tao_compiler/mlir/xla/ral/context/stream_executor_based_impl.cc index b3a98ca4297..5b06430bae5 100644 --- a/tao_compiler/mlir/xla/ral/context/stream_executor_based_impl.cc +++ b/tao_compiler/mlir/xla/ral/context/stream_executor_based_impl.cc @@ -710,7 +710,9 @@ struct CudnnConvParams { std::vector filter_shape; std::vector output_shape; std::vector metadata; - +#if TENSORFLOW_USE_ROCM + std::optional workspace_size; +#endif DataLayout input_dl; FilterLayout filter_dl; DataLayout output_dl; @@ -1314,8 +1316,14 @@ Status RunCudnnConvolution(CudnnConvParams& params, auto& filter_descriptor = params.filter_descriptor; auto& convolution_descriptor = params.convolution_descriptor; auto& output_descriptor = params.output_descriptor; +#if TENSORFLOW_USE_ROCM + AlgorithmConfig algorithm{AlgorithmDesc( + params.algo_id, params.tensor_ops_enabled, params.workspace_size)}; +#else AlgorithmConfig algorithm{ AlgorithmDesc(params.algo_id, params.tensor_ops_enabled)}; +#endif + #if TENSORFLOW_USE_ROCM if (profile_result) { algorithm.set_scratch_size(profile_result->scratch_size()); @@ -1505,6 +1513,7 @@ bool PickBestAlgorithm(CudnnConvParams& params, result_buffer, &scratch_allocator)) { params.algo_id = profile_result.algorithm().algo_id(); params.tensor_ops_enabled = profile_result.algorithm().tensor_ops_enabled(); + params.workspace_size = profile_result.algorithm().workspace_size(); #else for (const AlgorithmDesc& alg : GetAlgorithms(params.kind, stream_exec)) { params.algo_id = alg.algo_id(); From 530966d6437aedcb9fde26f20e390b7e6b59d4e2 Mon Sep 17 00:00:00 2001 From: zhangxiao-stack <1244360827@qq.com> Date: Wed, 18 Jan 2023 16:47:21 +0800 Subject: [PATCH 2/3] fix hip conv fp16 performance --- tao_compiler/mlir/disc/BUILD | 3 +++ tao_compiler/mlir/disc/transforms/conv_rewriter.cc | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tao_compiler/mlir/disc/BUILD b/tao_compiler/mlir/disc/BUILD index 28c26ea3ed9..e770a988dba 100644 --- a/tao_compiler/mlir/disc/BUILD +++ b/tao_compiler/mlir/disc/BUILD @@ -443,6 +443,9 @@ cc_library( "tensorflow/compiler/xla/mlir_hlo/include", "." ], + copts = tf_copts() + if_rocm_is_configured([ + "-DTENSORFLOW_USE_ROCM=1" + ]), deps = [ "@com_google_absl//absl/strings", ":pass_details", diff --git a/tao_compiler/mlir/disc/transforms/conv_rewriter.cc b/tao_compiler/mlir/disc/transforms/conv_rewriter.cc index 7bec61ecdf1..521f8c75936 100644 --- a/tao_compiler/mlir/disc/transforms/conv_rewriter.cc +++ b/tao_compiler/mlir/disc/transforms/conv_rewriter.cc @@ -270,7 +270,7 @@ struct DiscConvRewriterPass &use_tf32)); if (cc_major >= 8 && (!is_fp32 || use_tf32) || inputTy.getElementType().isF16() && - filterTy.getElementType().isF16()) { + filterTy.getElementType().isF16() && (!TENSORFLOW_USE_ROCM)) { // TensorCore prefers NHWC layouts fillNHWC(inputLayout, num_spatial_dims); fillNHWC(outputLayout, num_spatial_dims); From e30d043f390998d306bf7215a9e6f3c3d4ee4251 Mon Sep 17 00:00:00 2001 From: zhangxiao-stack <1244360827@qq.com> Date: Thu, 9 Mar 2023 15:15:33 +0800 Subject: [PATCH 3/3] update dynamo api and fix hip compiler error --- pytorch_blade/torch_blade/dynamo/__init__.py | 18 +++++++----------- tao_compiler/mlir/xla/ral/BUILD | 8 ++++---- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/pytorch_blade/torch_blade/dynamo/__init__.py b/pytorch_blade/torch_blade/dynamo/__init__.py index 4d92ba9372d..cb2a8aca791 100644 --- a/pytorch_blade/torch_blade/dynamo/__init__.py +++ b/pytorch_blade/torch_blade/dynamo/__init__.py @@ -11,10 +11,10 @@ import torch_blade.dynamo.patch_user_defined -from torch._dynamo.optimizations.training import aot_autograd -from torch._dynamo.optimizations.backends import BACKENDS, create_backend -from torch._dynamo.optimizations.subgraph import SubGraph +from torch._dynamo.backends.common import aot_autograd +from torch._dynamo.backends.registry import register_backend from torch._functorch import compilers +from torch._dynamo.utils import torchscript from functorch.compile import min_cut_rematerialization_partition import torch @@ -85,14 +85,13 @@ def _disc_compile(fx_g: fx.GraphModule, inps, use_ts=False, is_training=True) -> return f -@compilers.make_boxed_compiler def disc_compile(fx_g: fx.GraphModule, inps, use_ts=False) -> Callable: return _disc_compile(fx_g, inps, use_ts=False) def disc(fx_g: fx.GraphModule, inps) -> Callable: import tempfile with tempfile.TemporaryDirectory() as tmp: - scripted = SubGraph(fx_g, inps, tmp).scripted + scripted = torchscript(fx_g, inps) torch._C._jit_pass_remove_mutation(scripted.graph) f = torch.jit.freeze(scripted.eval()) cfg = torch_blade.Config() @@ -101,7 +100,6 @@ def disc(fx_g: fx.GraphModule, inps) -> Callable: f = torch_blade.optimize(f, True, tuple(inps)) return f -@compilers.make_boxed_compiler def disc_compile_ts(fx_g: fx.GraphModule, inps, use_ts=False) -> Callable: return _disc_compile(fx_g, inps, use_ts=True) @@ -194,7 +192,6 @@ def _get_disc_decomp(): ] ) return decompositions_dict - aot_disc = aot_autograd( # these are taken from memory_efficient_fusion() fw_compiler=disc_compile, @@ -203,7 +200,6 @@ def _get_disc_decomp(): decompositions=_get_disc_decomp(), partition_fn=min_cut_rematerialization_partition) - aot_disc_debug = aot_autograd( # these are taken from memory_efficient_fusion() fw_compiler=disc_compile_ts, @@ -212,6 +208,6 @@ def _get_disc_decomp(): decompositions=_get_disc_decomp(), partition_fn=min_cut_rematerialization_partition) -BACKENDS["disc"] = disc -BACKENDS["aot_disc"] = aot_disc -BACKENDS["aot_disc_debug"] = aot_disc_debug +register_backend(name="disc", compiler_fn=disc) +register_backend(name="aot_disc", compiler_fn=aot_disc) +register_backend(name="aot_disc_debug", compiler_fn=aot_disc_debug) diff --git a/tao_compiler/mlir/xla/ral/BUILD b/tao_compiler/mlir/xla/ral/BUILD index 36841fe73e6..41657a075fb 100644 --- a/tao_compiler/mlir/xla/ral/BUILD +++ b/tao_compiler/mlir/xla/ral/BUILD @@ -332,7 +332,7 @@ cc_library( "@local_config_cuda//cuda:cuda_driver", "@local_config_cuda//cuda:cuda_headers", ]) + if_rocm_is_configured([ - "@org_tensorflow//tensorflow/stream_executor:rocm_platform", + "@org_tensorflow//tensorflow/compiler/xla/stream_executor/rocm:all_runtime", "@org_tensorflow//tensorflow/stream_executor/rocm:rocm_driver", "@local_config_rocm//rocm:rocm_headers", ]) + if_cuda_or_rocm([ @@ -370,7 +370,7 @@ tf_gpu_library( ] + if_cuda_is_configured([ "@local_config_cuda//cuda:cuda_driver", ]) + if_rocm_is_configured([ - "//tensorflow/stream_executor/rocm:rocm_driver", + "@org_tensorflow//tensorflow/stream_executor/rocm:rocm_driver", ]), alwayslink = 1, ) @@ -429,7 +429,7 @@ cc_library( "@local_config_cuda//cuda:cuda_driver", "@local_config_cuda//cuda:cuda_headers", ]) + if_rocm_is_configured([ - "@org_tensorflow//tensorflow/stream_executor:rocm_platform", + "@org_tensorflow//tensorflow/compiler/xla/stream_executor/rocm:all_runtime", "@org_tensorflow//tensorflow/stream_executor/rocm:rocm_driver", "@local_config_rocm//rocm:rocm_headers", ]), @@ -485,7 +485,7 @@ cc_library( "@local_config_cuda//cuda:cuda_driver", "@local_config_cuda//cuda:cuda_headers", ]) + if_rocm_is_configured([ - "@org_tensorflow//tensorflow/stream_executor:rocm_platform", + "@org_tensorflow//tensorflow/compiler/xla/stream_executor/rocm:all_runtime", "@org_tensorflow//tensorflow/stream_executor/rocm:rocm_driver", "@local_config_rocm//rocm:rocm_headers", ]),