From e574b9e4c142516239fd8cdd4ba5e04f97a3fb00 Mon Sep 17 00:00:00 2001
From: zhangxiao-stack <1244360827@qq.com>
Date: Thu, 12 Jan 2023 18:23:47 +0800
Subject: [PATCH 1/3] fix hip d_conv launch failure

---
 .../xla/ral/context/stream_executor_based_impl.cc     | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/tao_compiler/mlir/xla/ral/context/stream_executor_based_impl.cc b/tao_compiler/mlir/xla/ral/context/stream_executor_based_impl.cc
index b3a98ca4297..5b06430bae5 100644
--- a/tao_compiler/mlir/xla/ral/context/stream_executor_based_impl.cc
+++ b/tao_compiler/mlir/xla/ral/context/stream_executor_based_impl.cc
@@ -710,7 +710,9 @@ struct CudnnConvParams {
   std::vector<int64_t> filter_shape;
   std::vector<int64_t> output_shape;
   std::vector<int64_t> metadata;
-
+#if TENSORFLOW_USE_ROCM
+  std::optional<uint64_t> workspace_size;
+#endif
   DataLayout input_dl;
   FilterLayout filter_dl;
   DataLayout output_dl;
@@ -1314,8 +1316,14 @@ Status RunCudnnConvolution(CudnnConvParams& params,
   auto& filter_descriptor = params.filter_descriptor;
   auto& convolution_descriptor = params.convolution_descriptor;
   auto& output_descriptor = params.output_descriptor;
+#if TENSORFLOW_USE_ROCM
+  AlgorithmConfig algorithm{AlgorithmDesc(
+      params.algo_id, params.tensor_ops_enabled, params.workspace_size)};
+#else
   AlgorithmConfig algorithm{
       AlgorithmDesc(params.algo_id, params.tensor_ops_enabled)};
+#endif
+
 #if TENSORFLOW_USE_ROCM
   if (profile_result) {
     algorithm.set_scratch_size(profile_result->scratch_size());
@@ -1505,6 +1513,7 @@ bool PickBestAlgorithm(CudnnConvParams& params,
                               result_buffer, &scratch_allocator)) {
     params.algo_id = profile_result.algorithm().algo_id();
     params.tensor_ops_enabled = profile_result.algorithm().tensor_ops_enabled();
+    params.workspace_size = profile_result.algorithm().workspace_size();
 #else
   for (const AlgorithmDesc& alg : GetAlgorithms(params.kind, stream_exec)) {
     params.algo_id = alg.algo_id();

From 530966d6437aedcb9fde26f20e390b7e6b59d4e2 Mon Sep 17 00:00:00 2001
From: zhangxiao-stack <1244360827@qq.com>
Date: Wed, 18 Jan 2023 16:47:21 +0800
Subject: [PATCH 2/3] fix hip conv fp16 performance

---
 tao_compiler/mlir/disc/BUILD                       |  3 +++
 tao_compiler/mlir/disc/transforms/conv_rewriter.cc | 10 ++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/tao_compiler/mlir/disc/BUILD b/tao_compiler/mlir/disc/BUILD
index 28c26ea3ed9..e770a988dba 100644
--- a/tao_compiler/mlir/disc/BUILD
+++ b/tao_compiler/mlir/disc/BUILD
@@ -443,6 +443,9 @@ cc_library(
         "tensorflow/compiler/xla/mlir_hlo/include",
         "."
     ],
+    copts = tf_copts() + if_rocm_is_configured([
+        "-DTENSORFLOW_USE_ROCM=1"
+    ]),
     deps = [
         "@com_google_absl//absl/strings",
         ":pass_details",
diff --git a/tao_compiler/mlir/disc/transforms/conv_rewriter.cc b/tao_compiler/mlir/disc/transforms/conv_rewriter.cc
index 7bec61ecdf1..521f8c75936 100644
--- a/tao_compiler/mlir/disc/transforms/conv_rewriter.cc
+++ b/tao_compiler/mlir/disc/transforms/conv_rewriter.cc
@@ -270,7 +270,13 @@ struct DiscConvRewriterPass
                                                  &use_tf32));
+#if TENSORFLOW_USE_ROCM
+      // ROCm builds skip the fp16-only NHWC preference below.
+      const bool fp16_prefers_nhwc = false;
+#else
+      const bool fp16_prefers_nhwc = true;
+#endif
       if (cc_major >= 8 && (!is_fp32 || use_tf32) ||
-          inputTy.getElementType().isF16() &&
-              filterTy.getElementType().isF16()) {
+          (fp16_prefers_nhwc && inputTy.getElementType().isF16() &&
+           filterTy.getElementType().isF16())) {
         // TensorCore prefers NHWC layouts
         fillNHWC(inputLayout, num_spatial_dims);
         fillNHWC(outputLayout, num_spatial_dims);

From e30d043f390998d306bf7215a9e6f3c3d4ee4251 Mon Sep 17 00:00:00 2001
From: zhangxiao-stack <1244360827@qq.com>
Date: Thu, 9 Mar 2023 15:15:33 +0800
Subject: [PATCH 3/3] update dynamo api and fix hip compiler error

---
 pytorch_blade/torch_blade/dynamo/__init__.py | 18 +++++++-----------
 tao_compiler/mlir/xla/ral/BUILD              |  8 ++++----
 2 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/pytorch_blade/torch_blade/dynamo/__init__.py b/pytorch_blade/torch_blade/dynamo/__init__.py
index 4d92ba9372d..cb2a8aca791 100644
--- a/pytorch_blade/torch_blade/dynamo/__init__.py
+++ b/pytorch_blade/torch_blade/dynamo/__init__.py
@@ -11,10 +11,10 @@
 
 import torch_blade.dynamo.patch_user_defined
 
-from torch._dynamo.optimizations.training import aot_autograd
-from torch._dynamo.optimizations.backends import BACKENDS, create_backend
-from torch._dynamo.optimizations.subgraph import SubGraph
+from torch._dynamo.backends.common import aot_autograd
+from torch._dynamo.backends.registry import register_backend
 from torch._functorch import compilers
+from torch._dynamo.utils import torchscript
 from functorch.compile import min_cut_rematerialization_partition
 
 import torch
@@ -85,14 +85,13 @@ def _disc_compile(fx_g: fx.GraphModule, inps, use_ts=False, is_training=True) ->
 
     return f
 
-@compilers.make_boxed_compiler
 def disc_compile(fx_g: fx.GraphModule, inps, use_ts=False) -> Callable:
     return _disc_compile(fx_g, inps, use_ts=False)
 
 def disc(fx_g: fx.GraphModule, inps) -> Callable:
     import tempfile
     with tempfile.TemporaryDirectory() as tmp:
-        scripted = SubGraph(fx_g, inps, tmp).scripted
+        scripted = torchscript(fx_g, inps)
         torch._C._jit_pass_remove_mutation(scripted.graph)
         f = torch.jit.freeze(scripted.eval())
         cfg = torch_blade.Config()
@@ -101,7 +100,6 @@ def disc(fx_g: fx.GraphModule, inps) -> Callable:
             f = torch_blade.optimize(f, True, tuple(inps))
         return f
 
-@compilers.make_boxed_compiler
 def disc_compile_ts(fx_g: fx.GraphModule, inps, use_ts=False) -> Callable:
     return _disc_compile(fx_g, inps, use_ts=True)
 
@@ -194,7 +192,6 @@ def _get_disc_decomp():
         ]
     )
     return decompositions_dict
-
 aot_disc = aot_autograd(
     # these are taken from memory_efficient_fusion()
     fw_compiler=disc_compile,
@@ -203,7 +200,6 @@ def _get_disc_decomp():
     decompositions=_get_disc_decomp(),
     partition_fn=min_cut_rematerialization_partition)
 
-
 aot_disc_debug = aot_autograd(
     # these are taken from memory_efficient_fusion()
     fw_compiler=disc_compile_ts,
@@ -212,6 +208,6 @@ def _get_disc_decomp():
     decompositions=_get_disc_decomp(),
     partition_fn=min_cut_rematerialization_partition)
 
-BACKENDS["disc"] = disc
-BACKENDS["aot_disc"] = aot_disc
-BACKENDS["aot_disc_debug"] = aot_disc_debug
+register_backend(name="disc", compiler_fn=disc)
+register_backend(name="aot_disc", compiler_fn=aot_disc)
+register_backend(name="aot_disc_debug", compiler_fn=aot_disc_debug)
diff --git a/tao_compiler/mlir/xla/ral/BUILD b/tao_compiler/mlir/xla/ral/BUILD
index 36841fe73e6..41657a075fb 100644
--- a/tao_compiler/mlir/xla/ral/BUILD
+++ b/tao_compiler/mlir/xla/ral/BUILD
@@ -332,7 +332,7 @@ cc_library(
         "@local_config_cuda//cuda:cuda_driver",
         "@local_config_cuda//cuda:cuda_headers",
     ]) + if_rocm_is_configured([
-        "@org_tensorflow//tensorflow/stream_executor:rocm_platform",
+        "@org_tensorflow//tensorflow/compiler/xla/stream_executor/rocm:all_runtime",
         "@org_tensorflow//tensorflow/stream_executor/rocm:rocm_driver",
         "@local_config_rocm//rocm:rocm_headers",
     ]) + if_cuda_or_rocm([
@@ -370,7 +370,7 @@ tf_gpu_library(
     ] + if_cuda_is_configured([
         "@local_config_cuda//cuda:cuda_driver",
     ]) + if_rocm_is_configured([
-        "//tensorflow/stream_executor/rocm:rocm_driver",
+        "@org_tensorflow//tensorflow/stream_executor/rocm:rocm_driver",
     ]),
     alwayslink = 1,
 )
@@ -429,7 +429,7 @@ cc_library(
         "@local_config_cuda//cuda:cuda_driver",
         "@local_config_cuda//cuda:cuda_headers",
     ]) + if_rocm_is_configured([
-        "@org_tensorflow//tensorflow/stream_executor:rocm_platform",
+        "@org_tensorflow//tensorflow/compiler/xla/stream_executor/rocm:all_runtime",
         "@org_tensorflow//tensorflow/stream_executor/rocm:rocm_driver",
         "@local_config_rocm//rocm:rocm_headers",
     ]),
@@ -485,7 +485,7 @@ cc_library(
         "@local_config_cuda//cuda:cuda_driver",
         "@local_config_cuda//cuda:cuda_headers",
     ]) + if_rocm_is_configured([
-        "@org_tensorflow//tensorflow/stream_executor:rocm_platform",
+        "@org_tensorflow//tensorflow/compiler/xla/stream_executor/rocm:all_runtime",
         "@org_tensorflow//tensorflow/stream_executor/rocm:rocm_driver",
         "@local_config_rocm//rocm:rocm_headers",
     ]),