From 77e3e3c5529fa8e69f27bd3a8023586d5d6c64c4 Mon Sep 17 00:00:00 2001 From: Wen Di Date: Mon, 12 Jan 2026 17:23:25 +0800 Subject: [PATCH 1/3] add xnnpack for softmax --- third_party/xla/xla/debug_options_flags.cc | 6 + third_party/xla/xla/service/cpu/BUILD | 35 + third_party/xla/xla/service/cpu/BUILD.orig | 2189 +++++++++++++ .../xla/xla/service/cpu/cpu_compiler.cc | 8 + .../xla/xla/service/cpu/cpu_compiler.cc.orig | 2712 +++++++++++++++++ .../xla/xla/service/cpu/cpu_runtime.cc | 2 + third_party/xla/xla/service/cpu/cpu_runtime.h | 1 + third_party/xla/xla/service/cpu/ir_emitter.cc | 39 + third_party/xla/xla/service/cpu/ir_emitter.h | 1 + .../service/cpu/runtime_symbol_generator.cc | 2 + .../xla/xla/service/cpu/xnnpack_ops.cc | 76 + third_party/xla/xla/service/cpu/xnnpack_ops.h | 36 + .../xla/service/cpu/xnnpack_ops_rewriter.cc | 228 ++ .../xla/service/cpu/xnnpack_ops_rewriter.h | 45 + .../xla/service/cpu/xnnpack_pattern_utils.h | 65 + third_party/xla/xla/xla.proto | 2 + 16 files changed, 5447 insertions(+) create mode 100644 third_party/xla/xla/service/cpu/BUILD.orig create mode 100644 third_party/xla/xla/service/cpu/cpu_compiler.cc.orig create mode 100644 third_party/xla/xla/service/cpu/xnnpack_ops.cc create mode 100644 third_party/xla/xla/service/cpu/xnnpack_ops.h create mode 100644 third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.cc create mode 100644 third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.h create mode 100644 third_party/xla/xla/service/cpu/xnnpack_pattern_utils.h diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc index 33fa90f7e35e9e..7ab70838950d98 100644 --- a/third_party/xla/xla/debug_options_flags.cc +++ b/third_party/xla/xla/debug_options_flags.cc @@ -103,6 +103,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_cpu_use_fusion_emitters(true); opts.set_xla_cpu_use_thunk_runtime(true); opts.set_xla_cpu_use_xnnpack(false); + opts.set_xla_cpu_enable_xnnpack(false); // For 
softmax opts.set_xla_cpu_experimental_xnn_graph_fusion_mode( DebugOptions::XNN_GRAPH_FUSION_MODE_DISABLED); opts.set_xla_cpu_parallel_codegen_split_count(32); @@ -994,6 +995,11 @@ void MakeDebugOptionsFlags(std::vector* flag_list, bool_setter_for(&DebugOptions::set_xla_cpu_use_xnnpack), debug_options->xla_cpu_use_xnnpack(), "Use XNNPACK for supported operations.")); + flag_list->push_back(tsl::Flag( + "xla_cpu_enable_xnnpack", + bool_setter_for(&DebugOptions::set_xla_cpu_enable_xnnpack), + debug_options->xla_cpu_enable_xnnpack(), + "Enable XNNPACK ops rewriter.")); flag_list->push_back(tsl::Flag( "xla_cpu_experimental_xnn_graph_fusion_mode", setter_for_xla_cpu_experimental_xnn_graph_fusion_mode, diff --git a/third_party/xla/xla/service/cpu/BUILD b/third_party/xla/xla/service/cpu/BUILD index 90388079ca2fcf..f951a6ac93b626 100644 --- a/third_party/xla/xla/service/cpu/BUILD +++ b/third_party/xla/xla/service/cpu/BUILD @@ -76,6 +76,7 @@ filegroup( "runtime_single_threaded_matmul_s32.cc", "runtime_single_threaded_matmul_u8.cc", "runtime_topk.cc", + "xnnpack_ops.cc", # Multi-threaded support. "runtime_conv2d.cc", "runtime_conv3d.cc", @@ -109,6 +110,7 @@ filegroup( "runtime_single_threaded_fft.h", "runtime_single_threaded_matmul.h", "runtime_topk.h", + "xnnpack_ops.h", # Multi-threaded support. 
"runtime_conv2d.h", "runtime_conv3d.h", @@ -218,6 +220,7 @@ cc_library( ":small_while_loop_hoisting_pass", ":thunk_emitter", ":xla_framework", + ":xnnpack_ops_rewriter", "//xla:cpu_function_runtime", "//xla:debug_options_flags", "//xla:literal", @@ -617,6 +620,7 @@ cc_library( ":runtime_single_threaded_fft", ":runtime_single_threaded_matmul", ":runtime_topk", + ":xnnpack_ops", "//xla/service:custom_call_target_registry", "@com_google_absl//absl/functional:any_invocable", "@com_google_absl//absl/strings:string_view", @@ -838,6 +842,8 @@ cc_library( ":onednn_config_proto_cc", ":onednn_memory_util", ":parallel_loop_emitter", + ":xnnpack_ops_rewriter", + ":xnnpack_ops", "//xla:literal", "//xla:literal_util", "//xla:shape_util", @@ -2187,3 +2193,32 @@ xla_cc_test( "@local_tsl//tsl/platform:test", ], ) + +cc_library( + name = "xnnpack_ops_rewriter", + srcs = ["xnnpack_ops_rewriter.cc"], + hdrs = [ + "xnnpack_ops_rewriter.h", + "xnnpack_pattern_utils.h", + ], + visibility = ["//visibility:public"], + deps = [ + "//xla/hlo/ir:hlo", + "//xla:literal_comparison", + "//xla:literal_util", + "//xla:status_macros", + "//xla/hlo/pass:hlo_pass", + "//xla/service:pattern_matcher", + ], +) + +cc_library( + name = "xnnpack_ops", + srcs = ["xnnpack_ops.cc"], + hdrs = ["xnnpack_ops.h"], + visibility = ["//visibility:public"], + deps = [ + "@XNNPACK", + "@com_google_absl//absl/base", + ], +) diff --git a/third_party/xla/xla/service/cpu/BUILD.orig b/third_party/xla/xla/service/cpu/BUILD.orig new file mode 100644 index 00000000000000..90388079ca2fcf --- /dev/null +++ b/third_party/xla/xla/service/cpu/BUILD.orig @@ -0,0 +1,2189 @@ +# Description: +# LLVM-based CPU backend for XLA. 
+ +load("@bazel_skylib//rules:build_test.bzl", "build_test") +load( + "//third_party/compute_library:build_defs.bzl", + "acl_deps", + "if_enable_acl", +) +load( + "//xla:xla.default.bzl", + "xla_cc_binary", + "xla_cc_test", +) +load("//xla/tests:build_defs.bzl", "xla_test") +load("//xla/tsl:tsl.bzl", "internal_visibility", "tf_openmp_copts", "tsl_copts") +load("//xla/tsl:tsl.default.bzl", "filegroup", "get_compatible_with_portable") +load("//xla/tsl/mkl:build_defs.bzl", "if_graph_api", "mkl_deps") +load("//xla/tsl/platform:build_config.bzl", "tf_proto_library") +load( + "//xla/tsl/platform:build_config_root.bzl", + "if_llvm_aarch64_available", + "if_llvm_powerpc_available", + "if_llvm_system_z_available", + "if_llvm_x86_available", +) +load("//xla/tsl/platform:rules_cc.bzl", "cc_library") +load(":build_defs.bzl", "runtime_copts") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = internal_visibility([":friends"]), + licenses = ["notice"], +) + +package_group( + name = "friends", + includes = [ + "//xla:friends", + ], +) + +# Filegroup used to collect source files for dependency checking. +filegroup( + name = "c_srcs", + data = glob([ + "**/*.cc", + "**/*.h", + ]), +) + +cc_library( + name = "test_header_helper", + testonly = True, + hdrs = ["test_target_triple_helper.h"], +) + +filegroup( + name = "runtime_srcs", + srcs = [ + # Single-threaded support. 
+ "runtime_custom_call_status.cc", + "runtime_fp16.cc", + "runtime_key_value_sort.cc", + "runtime_pow.cc", + "runtime_single_threaded_conv2d.cc", + "runtime_single_threaded_conv3d.cc", + "runtime_single_threaded_fft.cc", + "runtime_single_threaded_matmul_c128.cc", + "runtime_single_threaded_matmul_c64.cc", + "runtime_single_threaded_matmul_common.h", + "runtime_single_threaded_matmul_f8.cc", + "runtime_single_threaded_matmul_f16.cc", + "runtime_single_threaded_matmul_f32.cc", + "runtime_single_threaded_matmul_f64.cc", + "runtime_single_threaded_matmul_s32.cc", + "runtime_single_threaded_matmul_u8.cc", + "runtime_topk.cc", + # Multi-threaded support. + "runtime_conv2d.cc", + "runtime_conv3d.cc", + "runtime_fft.cc", + "runtime_matmul_c128.cc", + "runtime_matmul_c64.cc", + "runtime_matmul_common.h", + "runtime_matmul_f16.cc", + "runtime_matmul_f32.cc", + "runtime_matmul_f64.cc", + "runtime_matmul_s32.cc", + "runtime_fork_join.cc", + "//xla/backends/cpu/runtime:runtime_srcs", + #"runtime_handle_ffi_call.cc", # TODO(b/338344732): Add "runtime_handle_ffi_call.cc". + ], + visibility = internal_visibility([":friends"]), +) + +filegroup( + name = "runtime_hdrs", + srcs = [ + # XLA Runtime support. + "buffer_desc.h", + # Single-threaded support. + "runtime_custom_call_status.h", + "runtime_fp16.h", + "runtime_key_value_sort.h", + "runtime_pow.h", + "runtime_single_threaded_conv2d.h", + "runtime_single_threaded_conv3d.h", + "runtime_single_threaded_fft.h", + "runtime_single_threaded_matmul.h", + "runtime_topk.h", + # Multi-threaded support. 
+ "runtime_conv2d.h", + "runtime_conv3d.h", + "runtime_fft.h", + "runtime_fork_join.h", + "runtime_lightweight_check.h", + "runtime_matmul.h", + "//xla/backends/cpu/runtime:runtime_hdrs", + #"runtime_handle_ffi_call.h", # TODO(b/338344732): Add "runtime_handle_ffi_call.h" + ], + visibility = internal_visibility([":friends"]), +) + +cc_library( + name = "cpu_xfeed", + srcs = ["cpu_xfeed.cc"], + hdrs = ["cpu_xfeed.h"], + deps = [ + ":cpu_runtime", + "//xla:literal", + "//xla:literal_util", + "//xla:shape_util", + "//xla:status_macros", + "//xla:types", + "//xla:util", + "//xla/service:hlo_cost_analysis", + "//xla/service:shaped_buffer", + "@com_google_absl//absl/base", + "@com_google_absl//absl/cleanup", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/types:span", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:notification", + ], +) + +cc_library( + name = "cpu_transfer_manager", + srcs = ["cpu_transfer_manager.cc"], + hdrs = ["cpu_transfer_manager.h"], + deps = [ + ":cpu_runtime", + ":cpu_xfeed", + "//xla:literal", + "//xla:literal_util", + "//xla:shape_util", + "//xla:status_macros", + "//xla:types", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla/service:compiler", + "//xla/service:generic_transfer_manager", + "//xla/service:transfer_manager", + "//xla/stream_executor:device_memory", + "//xla/stream_executor:platform_manager", + "//xla/stream_executor:stream_executor_h", + "//xla/stream_executor/host:host_platform_id", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/types:span", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:logging", + ], + alwayslink = True, # Contains per-platform transfer manager registration +) + +cc_library( + name = "buffer_info_util", + srcs = ["buffer_info_util.cc"], + hdrs = ["buffer_info_util.h"], + deps = [ + "//xla:cpu_function_runtime", + 
"//xla/hlo/ir:hlo", + "//xla/service:buffer_assignment", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "cpu_compiler_pure", + srcs = ["cpu_compiler.cc"], + hdrs = ["cpu_compiler.h"], + copts = tsl_copts(), + deps = [ + ":buffer_info_util", + ":conv_canonicalization", + ":cpu_aot_compilation_result", + ":cpu_executable", + ":cpu_float_support", + ":cpu_instruction_fusion", + ":cpu_layout_assignment", + ":cpu_options", + ":dot_op_emitter", + ":executable_proto_cc", + ":fusion_wrapper", + ":ir_emission_utils", + ":ir_emitter", + ":ir_emitter2", + ":metrics", + ":onednn_contraction_rewriter", + ":onednn_float_support", + ":onednn_ops_rewriter", + ":parallel_task_assignment", + ":runtime_symbol_generator", + ":small_while_loop_hoisting_pass", + ":thunk_emitter", + ":xla_framework", + "//xla:cpu_function_runtime", + "//xla:debug_options_flags", + "//xla:literal", + "//xla:literal_pool", + "//xla:protobuf_util", + "//xla:shape_util", + "//xla:status_macros", + "//xla:types", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla:xla_proto_cc", + "//xla/backends/cpu:constant_allocation", + "//xla/backends/cpu:xnn_fusion", + "//xla/backends/cpu/codegen:compiled_function_library", + "//xla/backends/cpu/codegen:cpu_features", + "//xla/backends/cpu/codegen:execution_engine", + "//xla/backends/cpu/codegen:ir_compiler", + "//xla/backends/cpu/codegen:jit_compiler", + "//xla/backends/cpu/codegen:object_loader", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/backends/cpu/codegen/emitters:cpu_fusion_emitter_config", + "//xla/backends/cpu/runtime:function_library", + "//xla/backends/cpu/runtime:kernel_thunk", + "//xla/backends/cpu/runtime:thunk", + "//xla/backends/cpu/runtime:thunk_proto_cc_impl", + "//xla/backends/cpu/runtime:thunk_proto_serdes", + "//xla/backends/cpu/transforms:xnn_graph_fusion", + "//xla/hlo/analysis:hlo_ordering", + "//xla/hlo/analysis:indexed_array_analysis", + "//xla/hlo/ir:hlo", + "//xla/hlo/ir:hlo_module_group", + 
"//xla/hlo/pass:hlo_pass", + "//xla/hlo/pass:hlo_pass_pipeline", + "//xla/hlo/transforms:literal_canonicalizer", + "//xla/hlo/transforms:operand_upcaster", + "//xla/hlo/transforms:while_loop_trip_count_annotator", + "//xla/hlo/transforms/expanders:bitcast_dtypes_expander", + "//xla/hlo/transforms/expanders:cholesky_expander", + "//xla/hlo/transforms/expanders:comparison_expander", + "//xla/hlo/transforms/expanders:dot_decomposer", + "//xla/hlo/transforms/expanders:dynamic_index_splitter", + "//xla/hlo/transforms/expanders:eigh_expander", + "//xla/hlo/transforms/expanders:logistic_expander", + "//xla/hlo/transforms/expanders:optimization_barrier_expander", + "//xla/hlo/transforms/expanders:qr_expander", + "//xla/hlo/transforms/expanders:reduce_decomposer", + "//xla/hlo/transforms/expanders:reshape_decomposer", + "//xla/hlo/transforms/expanders:rng_bit_generator_expander", + "//xla/hlo/transforms/expanders:rng_expander", + "//xla/hlo/transforms/expanders:stochastic_convert_decomposer", + "//xla/hlo/transforms/simplifiers:algebraic_simplifier", + "//xla/hlo/transforms/simplifiers:batch_dot_simplification", + "//xla/hlo/transforms/simplifiers:broadcast_canonicalizer", + "//xla/hlo/transforms/simplifiers:conditional_canonicalizer", + "//xla/hlo/transforms/simplifiers:convolution_group_converter", + "//xla/hlo/transforms/simplifiers:dynamic_dimension_simplifier", + "//xla/hlo/transforms/simplifiers:flatten_call_graph", + "//xla/hlo/transforms/simplifiers:float_normalization", + "//xla/hlo/transforms/simplifiers:gather_simplifier", + "//xla/hlo/transforms/simplifiers:hlo_constant_folding", + "//xla/hlo/transforms/simplifiers:hlo_dce", + "//xla/hlo/transforms/simplifiers:hlo_memory_scheduler", + "//xla/hlo/transforms/simplifiers:optimize_input_output_buffer_alias", + "//xla/hlo/transforms/simplifiers:reduce_window_rewriter", + "//xla/hlo/transforms/simplifiers:reshape_mover", + "//xla/hlo/transforms/simplifiers:result_caster", + 
"//xla/hlo/transforms/simplifiers:simplify_fp_conversions", + "//xla/hlo/transforms/simplifiers:slice_sinker", + "//xla/hlo/transforms/simplifiers:sort_simplifier", + "//xla/hlo/transforms/simplifiers:sub_byte_normalization", + "//xla/hlo/transforms/simplifiers:tree_reduction_rewriter", + "//xla/hlo/transforms/simplifiers:tuple_simplifier", + "//xla/hlo/transforms/simplifiers:zero_sized_hlo_elimination", + "//xla/mlir_hlo", + "//xla/mlir_hlo:all_passes", + "//xla/mlir_hlo:transforms_passes", + "//xla/service:all_reduce_promotion", + "//xla/service:outer_dimension_propagation", + "//xla/service:get_outer_batch_value_simplifier", + "//xla/service:all_to_all_decomposer", + "//xla/service:batched_gather_scatter_normalizer", + "//xla/service:batchnorm_expander", + "//xla/service:buffer_assignment", + "//xla/service:call_graph", + "//xla/service:call_inliner", + "//xla/service:change_op_data_type", + "//xla/service:compiler", + "//xla/service:conditional_simplifier", + "//xla/service:conditional_to_select", + "//xla/service:copy_insertion", + "//xla/service:cpu_gpu_shape_verifier", + "//xla/service:dump", + "//xla/service:dynamic_dimension_inference", + "//xla/service:dynamic_padder", + "//xla/service:executable", + "//xla/service:float_support", + "//xla/service:gather_expander", + "//xla/service:hlo_cost_analysis", + "//xla/service:hlo_cse", + "//xla/service:hlo_execution_profile", + "//xla/service:hlo_module_config", + "//xla/service:hlo_profile_printer_data_cc", + "//xla/service:hlo_proto_cc", + "//xla/service:hlo_proto_util", + "//xla/service:hlo_verifier", + "//xla/service:layout_assignment", + "//xla/service:llvm_compiler", + "//xla/service:logical_buffer", + "//xla/service:map_inliner", + "//xla/service:scatter_expander", + "//xla/service:scatter_simplifier", + "//xla/service:select_and_scatter_expander", + "//xla/service:sharding_propagation", + "//xla/service:sharding_remover", + "//xla/service:slow_operation_alarm", + "//xla/service:topk_rewriter", + 
"//xla/service:transpose_folding", + "//xla/service:triangular_solve_expander", + "//xla/service:while_loop_constant_sinking", + "//xla/service:while_loop_invariant_code_motion", + "//xla/service:while_loop_simplifier", + "//xla/service/llvm_ir:llvm_command_line_options", + "//xla/service/llvm_ir:llvm_util", + "//xla/service/spmd:stateful_rng_spmd_partitioner", + "//xla/service/spmd/shardy:shardy_xla_pass", + "//xla/stream_executor:platform", + "//xla/stream_executor:stream_executor_h", + "//xla/stream_executor/host:host_platform_id", + "//xla/tsl/concurrency:async_value", + "//xla/tsl/platform:env", + "//xla/tsl/platform:status", + "//xla/tsl/platform:statusor", + "//xla/tsl/protobuf:error_codes_proto_impl_cc", + "@com_google_absl//absl/base", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/cleanup", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/functional:any_invocable", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:BitReader", + "@llvm-project//llvm:BitWriter", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:Linker", + "@llvm-project//llvm:MC", + "@llvm-project//llvm:Object", + "@llvm-project//llvm:OrcJIT", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:Target", + "@llvm-project//llvm:TargetParser", + "@llvm-project//llvm:TransformUtils", + "@llvm-project//mlir:AffineDialect", + "@llvm-project//mlir:AffineToStandard", + "@llvm-project//mlir:ArithDialect", + "@llvm-project//mlir:ArithTransforms", + "@llvm-project//mlir:BufferizationTransforms", + "@llvm-project//mlir:BuiltinToLLVMIRTranslation", + 
"@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LLVMToLLVMIRTranslation", + "@llvm-project//mlir:LinalgDialect", + "@llvm-project//mlir:LinalgTransforms", + "@llvm-project//mlir:MemRefTransforms", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:ReconcileUnrealizedCasts", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TensorDialect", + "@llvm-project//mlir:ToLLVMIRTranslation", + "@llvm-project//mlir:TransformUtils", + "@llvm-project//mlir:Transforms", + "@llvm-project//mlir:VectorDialect", + "@local_tsl//tsl/platform:casts", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:platform_port", + "@local_tsl//tsl/platform:status", + "@local_tsl//tsl/platform:statusor", + "@local_tsl//tsl/platform:threadpool_async_executor", + "@local_tsl//tsl/profiler/lib:traceme", + "@local_tsl//tsl/profiler/lib:traceme_encode", + ] + if_llvm_aarch64_available([ + "@llvm-project//llvm:AArch64CodeGen", # fixdeps: keep + ]) + if_llvm_powerpc_available([ + "@llvm-project//llvm:PowerPCCodeGen", # fixdeps: keep + ]) + if_llvm_system_z_available([ + "@llvm-project//llvm:SystemZCodeGen", # fixdeps: keep + ]) + if_llvm_x86_available([ + "@llvm-project//llvm:X86CodeGen", # fixdeps: keep + ]), +) + +cc_library( + name = "cpu_aot_compilation_result", + srcs = ["cpu_aot_compilation_result.cc"], + hdrs = ["cpu_aot_compilation_result.h"], + deps = [ + ":buffer_info_util", + ":cpu_executable", + ":executable_proto_cc", + "//xla:cpu_function_runtime", + "//xla:util", + "//xla/backends/cpu:constant_allocation", + "//xla/backends/cpu/runtime:function_library", + "//xla/backends/cpu/runtime:thunk", + "//xla/backends/cpu/runtime:thunk_proto_cc", + "//xla/backends/cpu/runtime:thunk_proto_serdes", + "//xla/hlo/ir:hlo", + "//xla/service:buffer_assignment", + "//xla/service:buffer_value", + 
"//xla/service:compiler", + "//xla/service:executable", + "//xla/service:hlo_cost_analysis", + "//xla/service:hlo_module_config", + "//xla/service:hlo_profile_printer_data_cc", + "//xla/service:hlo_proto_cc", + "//xla/stream_executor:platform", + "//xla/stream_executor/host:host_platform_id", + "//xla/tsl/platform:statusor", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + ], +) + +cc_library( + # The old target name will still be used so that dependencies won't break. + # In the future, dependencies should be cleaned up and relinked to the above + # target if registration is not necesary. + name = "cpu_compiler", + srcs = ["cpu_compiler_registerer.cc"], + hdrs = ["cpu_compiler.h"], + deps = [ + "cpu_compiler_pure", + ":cpu_aot_compilation_result", + ":executable_proto_cc", + "//xla:util", + "//xla/backends/cpu/codegen:ir_compiler", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/ir:hlo", + "//xla/hlo/ir:hlo_module_group", + "//xla/service:buffer_assignment", + "//xla/service:compiler", + "//xla/service:executable", + "//xla/service:hlo_cost_analysis", + "//xla/service:hlo_profile_printer_data_cc", + "//xla/service:hlo_proto_cc", + "//xla/service:llvm_compiler", + "//xla/stream_executor:platform", + "//xla/stream_executor:stream_executor_h", + "//xla/stream_executor/host:host_platform_id", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:Target", + "@llvm-project//llvm:TargetParser", + ], + alwayslink = True, # Contains compiler registration +) + +xla_test( + name = "cpu_compiler_test", + srcs = ["cpu_compiler_test.cc"], + backends = [ + "cpu", + ], + tags = [ + "test_migrated_to_hlo_runner_pjrt", + "test_xla_cpu_no_thunks", + ], + deps = [ + "//xla/hlo/testlib:verified_hlo_module", + 
"//xla/tests:hlo_pjrt_test_base", + "//xla/tests:xla_internal_test_main", + "//xla/tsl/lib/monitoring:collected_metrics", + "//xla/tsl/lib/monitoring:collection_registry", + "@com_google_absl//absl/strings:string_view", + "@local_tsl//tsl/platform:statusor", + "@local_tsl//tsl/platform:test", + ], +) + +xla_test( + name = "cpu_compiler_internals_test", + srcs = ["cpu_compiler_internals_test.cc"], + backends = [ + "cpu", + ], + deps = [ + "//xla/backends/cpu/codegen/emitters:cpu_fusion_emitter_config", + "//xla/hlo/ir:hlo", + "//xla/hlo/testlib:verified_hlo_module", + "//xla/service:llvm_compiler", + "//xla/tests:hlo_test_base", + "//xla/tests:xla_internal_test_main", + "//xla/tsl/platform:statusor", + "//xla/tsl/platform:test", + "@com_google_absl//absl/base:nullability", + "@com_google_absl//absl/strings:string_view", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:Support", + ], +) + +xla_test( + name = "cpu_aot_compiler_test", + srcs = ["cpu_aot_compiler_test.cc"], + backends = [ + "cpu", + ], + deps = [ + ":cpu_aot_compilation_result", + ":test_header_helper", + "//xla:literal", + "//xla:literal_util", + "//xla/hlo/ir:hlo", + "//xla/hlo/ir:hlo_module_group", + "//xla/service:compiler", + "//xla/service:executable", + "//xla/service:hlo_runner", + "//xla/service:hlo_runner_interface", + "//xla/stream_executor:platform", + "//xla/stream_executor:platform_manager", + "//xla/tests:hlo_test_base", + "//xla/tests:literal_test_util", + "//xla/tests:xla_internal_test_main", + "//xla/tsl/platform:statusor", + "//xla/tsl/platform:test", + "@com_google_absl//absl/strings:string_view", + ], +) + +tf_proto_library( + name = "executable_proto", + srcs = ["executable.proto"], + protodeps = [ + ":xla_framework_proto", + "//xla/service:hlo_proto", + "//xla:xla_proto", + "//xla/backends/cpu/runtime:thunk_proto", + ], +) + +tf_proto_library( + name = "xla_framework_proto", + srcs = ["xla_framework.proto"], +) + +cc_library( + name = "xla_framework", + hdrs = 
["xla_framework.h"], + deps = [":xla_framework_proto_cc"], +) + +cc_library( + name = "runtime_symbol_generator", + srcs = [ + "runtime_symbol_generator.cc", + "windows_compatibility.cc", + "windows_compatibility.h", + ], + hdrs = ["runtime_symbol_generator.h"], + copts = if_enable_acl(["-DXLA_CPU_USE_ACL=1"]) + tsl_copts(), + deps = [ + ":cpu_runtime", + ":onednn_convolution", + ":onednn_layer_norm", + ":onednn_matmul", + ":onednn_softmax", + ":runtime_conv2d", + ":runtime_conv2d_acl", + ":runtime_conv2d_mkl", + ":runtime_conv3d", + ":runtime_custom_call_status", + ":runtime_fft", + ":runtime_fork_join", + ":runtime_fp16", + ":runtime_handle_ffi_call", + ":runtime_key_value_sort", + ":runtime_matmul", + ":runtime_matmul_acl", + ":runtime_pow", + ":runtime_single_threaded_conv2d", + ":runtime_single_threaded_conv3d", + ":runtime_single_threaded_fft", + ":runtime_single_threaded_matmul", + ":runtime_topk", + "//xla/service:custom_call_target_registry", + "@com_google_absl//absl/functional:any_invocable", + "@com_google_absl//absl/strings:string_view", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:OrcJIT", + "@llvm-project//llvm:OrcShared", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:mlir_c_runner_utils", + "@local_tsl//tsl/platform:logging", + ], +) + +cc_library( + name = "runtime_lightweight_check", + hdrs = ["runtime_lightweight_check.h"], + compatible_with = get_compatible_with_portable(), + copts = runtime_copts(), +) + +cc_library( + name = "runtime_fp16", + srcs = [ + "runtime_fp16.cc", + ], + hdrs = [ + "runtime_fp16.h", + ], + copts = runtime_copts(), + deps = ["@com_google_absl//absl/base:core_headers"], +) + +cc_library( + name = "runtime_pow", + srcs = [ + "runtime_pow.cc", + ], + hdrs = [ + "runtime_pow.h", + ], + copts = runtime_copts(), + deps = ["@com_google_absl//absl/base:core_headers"], +) + +cc_library( + name = "buffer_desc", + hdrs = ["buffer_desc.h"], +) + +cc_library( + name = "cpu_executable", + srcs = 
["cpu_executable.cc"], + hdrs = ["cpu_executable.h"], + deps = [ + ":cpu_runtime", + ":executable_proto_cc", + "//xla:executable_run_options", + "//xla:literal", + "//xla:shape_tree", + "//xla:shape_util", + "//xla:status_macros", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla/backends/cpu:constant_allocation", + "//xla/backends/cpu/runtime:buffer_allocations", + "//xla/backends/cpu/runtime:function_library", + "//xla/backends/cpu/runtime:thread_pool_task_runner", + "//xla/backends/cpu/runtime:thunk", + "//xla/backends/cpu/runtime:thunk_executor", + "//xla/hlo/ir:hlo", + "//xla/service:buffer_assignment", + "//xla/service:custom_call_status", + "//xla/service:custom_call_status_internal", + "//xla/service:executable", + "//xla/service:hlo_execution_profile", + "//xla/service:hlo_profile_printer_data_cc", + "//xla/service:hlo_value", + "//xla/service:maybe_owning_device_memory", + "//xla/service:shaped_buffer", + "//xla/service:xla_debug_info_manager", + "//xla/stream_executor:device_memory", + "//xla/stream_executor:device_memory_allocator", + "//xla/stream_executor/host:host_stream", + "//xla/tsl/concurrency:async_value", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:statusor", + ], +) + +cc_library( + name = "elemental_math_emitter", + srcs = ["elemental_math_emitter.cc"], + hdrs = ["elemental_math_emitter.h"], + deps = [ + "//xla:xla_data_proto_cc", + "//xla/service/llvm_ir:math_ops", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:Support", + ], 
+) + +cc_library( + name = "ir_emitter2", + srcs = ["ir_emitter2.cc"], + hdrs = ["ir_emitter2.h"], + deps = [ + ":backend_config_proto_cc", + ":dot_op_emitter", + ":elemental_ir_emitter", + ":ir_emitter", + ":parallel_loop_emitter", + "//xla:shape_util", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla:xla_proto_cc", + "//xla/backends/cpu/codegen:fusion_compiler", + "//xla/backends/cpu/codegen:kernel_api_ir_builder", + "//xla/backends/cpu/codegen:symbol_name_util", + "//xla/backends/cpu/codegen/emitters:cpu_fusion_emitter_config", + "//xla/backends/cpu/codegen/emitters:cpu_fusion_emitters", + "//xla/hlo/ir:hlo", + "//xla/service:buffer_assignment", + "//xla/service:hlo_module_config", + "//xla/service/llvm_ir:dynamic_update_slice_util", + "//xla/service/llvm_ir:fused_ir_emitter", + "//xla/service/llvm_ir:ir_array", + "//xla/service/llvm_ir:llvm_util", + "//xla/service/llvm_ir:loop_emitter", + "//xla/stream_executor:launch_dim", + "//xla/tsl/platform:errors", + "//xla/tsl/platform:statusor", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/strings:string_view", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:Linker", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + ], +) + +xla_cc_test( + name = "ir_emitter_test", + srcs = ["ir_emitter_test.cc"], + deps = [ + ":cpu_compiler", + ":cpu_executable", + ":cpu_options", + ":ir_emitter", + ":ir_function", + ":runtime_symbol_generator", + ":target_machine_features_stub", + "//xla:cpu_function_runtime", + "//xla:shape_util", + "//xla/backends/cpu/codegen:cpu_features", + "//xla/backends/cpu/codegen:execution_engine", + "//xla/backends/cpu/codegen:ir_compiler", 
+ "//xla/backends/cpu/codegen:jit_compiler", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/analysis:hlo_ordering", + "//xla/hlo/ir:hlo", + "//xla/hlo/parser:hlo_parser", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/hlo/transforms/simplifiers:hlo_memory_scheduler", + "//xla/service:buffer_assignment", + "//xla/service:buffer_value", + "//xla/service:hlo_module_config", + "//xla/service:logical_buffer", + "//xla/service/llvm_ir:llvm_util", + "//xla/tsl/lib/core:status_test_util", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:Target", + "@llvm-project//mlir:IR", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:statusor", + "@local_tsl//tsl/platform:test", + ], +) + +cc_library( + name = "ir_emitter", + srcs = ["ir_emitter.cc"], + hdrs = ["ir_emitter.h"], + copts = tsl_copts(), + deps = [ + ":backend_config_proto_cc", + ":cpu_instruction_fusion", + ":cpu_options", + ":cpu_runtime", + ":dot_op_emitter", + ":elemental_ir_emitter", + ":ir_emission_utils", + ":ir_function", + ":onednn_config_proto_cc", + ":onednn_memory_util", + ":parallel_loop_emitter", + "//xla:literal", + "//xla:literal_util", + "//xla:shape_util", + "//xla:status_macros", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/ir:hlo", + "//xla/service:buffer_assignment", + "//xla/service:collective_ops_utils", + "//xla/service:elemental_ir_emitter", + "//xla/service:hlo_module_config", + "//xla/service:name_uniquer", + "//xla/service/llvm_ir:alias_analysis", + "//xla/service/llvm_ir:buffer_assignment_util", + "//xla/service/llvm_ir:dynamic_update_slice_util", + 
"//xla/service/llvm_ir:fused_ir_emitter", + "//xla/service/llvm_ir:ir_array", + "//xla/service/llvm_ir:ir_builder_mixin", + "//xla/service/llvm_ir:llvm_loop", + "//xla/service/llvm_ir:llvm_type_conversion_util", + "//xla/service/llvm_ir:llvm_util", + "//xla/service/llvm_ir:loop_emitter", + "//xla/service/llvm_ir:tuple_ops", + "//xla/tsl/lib/math:math_util", + "//xla/tsl/platform:errors", + "//xla/tsl/platform:status", + "//xla/tsl/platform:statusor", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/cleanup", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/meta:type_traits", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:TargetParser", + "@llvm-project//mlir:IR", + ], +) + +cc_library( + name = "target_machine_features_stub", + testonly = 1, + hdrs = ["target_machine_features_stub.h"], + deps = [ + "//xla/backends/cpu/codegen:target_machine_features", + "@llvm-project//llvm:Core", + "@local_tsl//tsl/platform:logging", + ], +) + +cc_library( + name = "ir_function", + srcs = ["ir_function.cc"], + hdrs = ["ir_function.h"], + deps = [ + ":cpu_runtime", + ":ir_emission_utils", + "//xla:shape_util", + "//xla:status_macros", + "//xla:types", + "//xla/service:hlo_module_config", + "//xla/service/llvm_ir:llvm_util", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:Core", + ], +) + +cc_library( + name = "parallel_loop_emitter", + srcs = 
["parallel_loop_emitter.cc"], + hdrs = ["parallel_loop_emitter.h"], + deps = [ + ":ir_emission_utils", + "//xla/service/llvm_ir:ir_array", + "//xla/service/llvm_ir:llvm_loop", + "//xla/service/llvm_ir:llvm_util", + "//xla/service/llvm_ir:loop_emitter", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/strings:string_view", + "@llvm-project//llvm:Core", + ], +) + +cc_library( + name = "thunk_emitter", + srcs = ["thunk_emitter.cc"], + hdrs = ["thunk_emitter.h"], + local_defines = if_graph_api(["XLA_ONEDNN_USE_GRAPH_API=1"]), + deps = [ + ":backend_config_proto_cc", + ":dot_op_emitter", + ":ir_emission_utils", + ":ir_emitter2", + "//xla:comparison_util", + "//xla:cpu_function_runtime", + "//xla:shape_util", + "//xla:status_macros", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla/backends/cpu:onednn_emitter", + "//xla/backends/cpu:onednn_fusion", + "//xla/backends/cpu:xnn_emitter", + "//xla/backends/cpu:xnn_fusion", + "//xla/backends/cpu/codegen:computation_kernel_emitter", + "//xla/backends/cpu/codegen:fusion_compiler", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/backends/cpu/codegen/dot:dot_kernel_emitter", + "//xla/backends/cpu/codegen/elemental:concatenate_kernel_emitter", + "//xla/backends/cpu/codegen/elemental:elemental_kernel_emitter", + "//xla/backends/cpu/codegen/emitters:cpu_fusion_emitters", + "//xla/backends/cpu/runtime:all_gather_thunk", + "//xla/backends/cpu/runtime:all_reduce_thunk", + "//xla/backends/cpu/runtime:all_to_all_thunk", + "//xla/backends/cpu/runtime:call_thunk", + "//xla/backends/cpu/runtime:collective_permute_thunk", + "//xla/backends/cpu/runtime:collective_thunk", + "//xla/backends/cpu/runtime:conditional_thunk", + "//xla/backends/cpu/runtime:convolution_thunk", + "//xla/backends/cpu/runtime:copy_thunk", + "//xla/backends/cpu/runtime:custom_call_thunk", + "//xla/backends/cpu/runtime:dot_thunk", + "//xla/backends/cpu/runtime:fft_thunk", + 
"//xla/backends/cpu/runtime:infeed_thunk", + "//xla/backends/cpu/runtime:kernel_thunk", + "//xla/backends/cpu/runtime:logical_id_thunk", + "//xla/backends/cpu/runtime:outfeed_thunk", + "//xla/backends/cpu/runtime:reduce_scatter_thunk", + "//xla/backends/cpu/runtime:rng_state_thunk", + "//xla/backends/cpu/runtime:sort_thunk", + "//xla/backends/cpu/runtime:thunk", + "//xla/backends/cpu/runtime:topk_thunk", + "//xla/backends/cpu/runtime:while_thunk", + "//xla/backends/cpu/runtime/onednn:onednn_fusion_thunk", + "//xla/backends/cpu/runtime/xnnpack:xnn_dot_thunk", + "//xla/backends/cpu/runtime/xnnpack:xnn_fusion_thunk", + "//xla/codegen:kernel_definition", + "//xla/codegen:kernel_spec", + "//xla/codegen:llvm_ir_kernel_source", + "//xla/codegen:mlir_kernel_source", + "//xla/hlo/ir:hlo", + "//xla/runtime:resource_use", + "//xla/service:buffer_assignment", + "//xla/service:collective_ops_utils", + "//xla/service:hlo_module_config", + "//xla/service:hlo_proto_cc", + "//xla/service:pattern_matcher", + "//xla/tsl/platform:errors", + "//xla/tsl/platform:logging", + "//xla/tsl/platform:statusor", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:JITLink", + "@llvm-project//llvm:ir_headers", + "@local_tsl//tsl/platform:casts", + "@local_tsl//tsl/profiler/lib:traceme", + ], +) + +cc_library( + name = "tiled_dot_emitter", + srcs = ["tiled_dot_emitter.cc"], + hdrs = ["tiled_dot_emitter.h"], + deps = [ + "//xla:xla_data_proto_cc", + "//xla/backends/cpu/codegen:vector_ir_builder", + "//xla/service:hlo_module_config", + "//xla/service/llvm_ir:kernel_support_library", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/numeric:bits", + "@com_google_absl//absl/strings", + 
"@com_google_absl//absl/types:span", + "@llvm-project//llvm:Core", + ], +) + +cc_library( + name = "dot_op_emitter", + srcs = ["dot_op_emitter.cc"], + hdrs = [ + "dot_op_emitter.h", + ], + deps = [ + ":backend_config_proto_cc", + ":cpu_options", + ":cpu_runtime", + ":tiled_dot_emitter", + "//xla:shape_util", + "//xla:status_macros", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/ir:hlo", + "//xla/service:hlo_module_config", + "//xla/service/llvm_ir:ir_array", + "//xla/service/llvm_ir:kernel_support_library", + "//xla/service/llvm_ir:llvm_loop", + "//xla/service/llvm_ir:llvm_util", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/status", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:Support", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:logging", + ], +) + +build_test( + name = "sample_harness_build_test", + targets = [ + ":sample_harness", + ], +) + +xla_cc_binary( + name = "sample_harness", + srcs = ["sample_harness.cc"], + deps = [ + "//xla:array4d", + "//xla:literal", + "//xla:types", + "//xla:xla_data_proto_cc", + "//xla/client", + "//xla/client:client_library", + "//xla/client:local_client", + "//xla/hlo/builder:xla_builder", + "//xla/hlo/builder:xla_computation", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:str_format", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:platform_port", + ], +) + +cc_library( + name = "cpu_runtime", + srcs = [ + "cpu_runtime.cc", + "xfeed_manager.cc", + ], + hdrs = [ + "cpu_runtime.h", + "xfeed_manager.h", + ], + copts = runtime_copts(), + deps = [ + ":cpu_executable_run_options", + "//xla:executable_run_options", + "//xla:shape_util", + "//xla:types", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla/backends/cpu/collectives:cpu_clique_key", + "//xla/backends/cpu/collectives:cpu_cliques", + 
"//xla/backends/cpu/collectives:cpu_collectives", + "//xla/backends/cpu/collectives:in_process_collectives", + "//xla/core/collectives:communicator", + "//xla/core/collectives:rank_id", + "//xla/hlo/parser:hlo_parser", + "//xla/service:collective_ops_utils", + "//xla/service:computation_placer", + "//xla/service:global_device_id", + "//xla/stream_executor:device_memory", + "//xla/stream_executor:stream_executor_h", + "//xla/tsl/concurrency:async_value", + "//xla/tsl/platform:errors", + "//xla/tsl/platform:logging", + "//xla/tsl/platform:status", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/time", + "@com_google_absl//absl/types:span", + "@local_tsl//tsl/profiler/lib:traceme", + ], +) + +cc_library( + name = "runtime_conv2d", + srcs = ["runtime_conv2d.cc"], + hdrs = ["runtime_conv2d.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + ":runtime_lightweight_check", + "//xla:executable_run_options", + "//xla/backends/cpu/runtime:convolution_thunk_internal", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/synchronization", # build_cleaner: keep + "@eigen_archive//:eigen3", + ], +) + +cc_library( + name = "runtime_conv3d", + srcs = ["runtime_conv3d.cc"], + hdrs = ["runtime_conv3d.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + ":runtime_lightweight_check", + "//xla:executable_run_options", + "//xla/backends/cpu/runtime:convolution_thunk_internal", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/synchronization", # build_cleaner: keep 
+ "@eigen_archive//:eigen3", + ], +) + +cc_library( + name = "runtime_custom_call_status", + srcs = ["runtime_custom_call_status.cc"], + hdrs = ["runtime_custom_call_status.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "//xla/service:custom_call_status_internal", + "@com_google_absl//absl/base:core_headers", + ], +) + +cc_library( + name = "runtime_conv2d_mkl", + srcs = [ + "runtime_conv2d_mkl.cc", + ], + hdrs = ["runtime_conv2d_mkl.h"], + copts = runtime_copts() + tf_openmp_copts(), + visibility = ["//visibility:public"], + deps = [ + ":runtime_conv2d", + ":runtime_single_threaded_conv2d", + "//xla:executable_run_options", + "//xla/tsl/framework/convolution:eigen_helpers", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + "@eigen_archive//:eigen3", + ] + mkl_deps(), +) + +cc_library( + name = "runtime_fft", + srcs = [ + "runtime_fft.cc", + ], + hdrs = ["runtime_fft.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "//xla:executable_run_options", + "@com_google_absl//absl/base:core_headers", + "@ducc//:fft_wrapper", + "@eigen_archive//:eigen3", + ], +) + +cc_library( + name = "runtime_matmul", + srcs = [ + "runtime_matmul_c128.cc", + "runtime_matmul_c64.cc", + "runtime_matmul_common.h", + "runtime_matmul_f16.cc", + "runtime_matmul_f32.cc", + "runtime_matmul_f64.cc", + "runtime_matmul_s32.cc", + ], + hdrs = ["runtime_matmul.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + ":runtime_lightweight_check", + "//xla:executable_run_options", + "//xla/tsl/framework/contraction:eigen_contraction_kernel", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/synchronization", # build_cleaner: keep + "@eigen_archive//:eigen3", + ], +) + +cc_library( + name = "runtime_matmul_acl", + srcs = ["runtime_matmul_acl.cc"], + hdrs = ["runtime_matmul_acl.h"], 
+ copts = tsl_copts(), + visibility = ["//visibility:public"], + deps = [ + ":runtime_lightweight_check", + ":runtime_matmul", + "//xla:executable_run_options", + "//xla/tsl/platform:dynamic_annotations", + "@com_google_absl//absl/base", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:types", + ] + acl_deps(), +) + +cc_library( + name = "runtime_conv2d_acl", + srcs = [ + "runtime_conv2d_acl.cc", + ], + hdrs = ["runtime_conv2d_acl.h"], + copts = tsl_copts(), + visibility = ["//visibility:public"], + deps = [ + ":runtime_conv2d", + ":runtime_lightweight_check", + ":runtime_single_threaded_conv2d", + "//xla:executable_run_options", + "//xla/tsl/framework/convolution:eigen_helpers", + "//xla/tsl/platform:dynamic_annotations", + "@com_google_absl//absl/base", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:types", + ] + acl_deps(), +) + +cc_library( + name = "runtime_single_threaded_conv2d", + srcs = ["runtime_single_threaded_conv2d.cc"], + hdrs = ["runtime_single_threaded_conv2d.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "//xla/backends/cpu/runtime:convolution_thunk_internal", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/synchronization", # build_cleaner: keep + "@eigen_archive//:eigen3", + ], +) + +cc_library( + name = "runtime_single_threaded_conv3d", + srcs = ["runtime_single_threaded_conv3d.cc"], + hdrs = ["runtime_single_threaded_conv3d.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "//xla/backends/cpu/runtime:convolution_thunk_internal", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/synchronization", # build_cleaner: keep + "@eigen_archive//:eigen3", + ], +) + +cc_library( + name = "runtime_single_threaded_fft", + srcs = [ + "runtime_single_threaded_fft.cc", + ], + hdrs = ["runtime_single_threaded_fft.h"], + copts = runtime_copts(), + 
visibility = ["//visibility:public"], + deps = [ + ":runtime_fft", + "@com_google_absl//absl/base:core_headers", + ], +) + +cc_library( + name = "runtime_single_threaded_matmul_impl", + srcs = [ + "runtime_single_threaded_matmul_c128.cc", + "runtime_single_threaded_matmul_c64.cc", + "runtime_single_threaded_matmul_common.h", + "runtime_single_threaded_matmul_f16.cc", + "runtime_single_threaded_matmul_f32.cc", + "runtime_single_threaded_matmul_f64.cc", + "runtime_single_threaded_matmul_f8.cc", + "runtime_single_threaded_matmul_s32.cc", + "runtime_single_threaded_matmul_u8.cc", + ], + hdrs = ["runtime_single_threaded_matmul.h"], + compatible_with = get_compatible_with_portable(), + copts = runtime_copts(), + linkstatic = 1, + visibility = ["//visibility:private"], + deps = [ + "//xla/tsl/framework/contraction:eigen_contraction_kernel_no_mkl", + "@com_google_absl//absl/base:core_headers", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:ml_dtypes", + ], +) + +cc_library( + name = "runtime_single_threaded_matmul", + hdrs = ["runtime_single_threaded_matmul.h"], + compatible_with = get_compatible_with_portable(), + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + ":runtime_single_threaded_matmul_impl", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:ml_dtypes", + ], +) + +cc_library( + name = "runtime_single_threaded_matmul_nomkl", + compatible_with = get_compatible_with_portable(), + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + ":runtime_single_threaded_matmul_impl", + "//xla/tsl/framework/contraction:eigen_contraction_kernel_no_mkl", + "@com_google_absl//absl/base:core_headers", + "@eigen_archive//:eigen3", + ], +) + +cc_library( + name = "runtime_key_value_sort", + srcs = ["runtime_key_value_sort.cc"], + hdrs = ["runtime_key_value_sort.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "@com_google_absl//absl/base:core_headers", + 
"@com_google_absl//absl/base:dynamic_annotations", + "@eigen_archive//:eigen3", + ], +) + +cc_library( + name = "runtime_topk", + srcs = ["runtime_topk.cc"], + hdrs = ["runtime_topk.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "@com_google_absl//absl/base", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + ], +) + +cc_library( + name = "runtime_fork_join", + srcs = ["runtime_fork_join.cc"], + hdrs = ["runtime_fork_join.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "//xla:executable_run_options", + "//xla/service:custom_call_status_internal", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/synchronization", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:logging", + ], +) + +cc_library( + name = "runtime_handle_ffi_call", + srcs = ["runtime_handle_ffi_call.cc"], + hdrs = ["runtime_handle_ffi_call.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "//xla:executable_run_options", + "//xla:shape_util", + "//xla:xla_data_proto_cc", + "//xla/ffi:attribute_map", + "//xla/ffi:call_frame", + "//xla/ffi:execution_state", + "//xla/ffi:ffi_api", + "//xla/ffi/api:c_api", + "//xla/service:custom_call_status_public_headers", + "//xla/tsl/platform:errors", + "//xla/tsl/platform:statusor", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@llvm-project//mlir:AsmParser", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) + +xla_cc_test( + name = "cpu_runtime_test", + srcs = ["cpu_runtime_test.cc"], + 
shard_count = 10, + tags = ["optonly"], + deps = [ + ":cpu_runtime", + ":runtime_custom_call_status", + ":runtime_matmul", + ":runtime_matmul_acl", + ":runtime_single_threaded_matmul", + "//xla:array2d", + "//xla:executable_run_options", + "//xla:types", + "//xla/client:local_client", + "//xla/service:custom_call_status_internal", + "//xla/tests:xla_internal_test_main", + "@com_google_absl//absl/strings:str_format", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:test", + ], +) + +xla_cc_test( + name = "cpu_instruction_fusion_test", + srcs = ["cpu_instruction_fusion_test.cc"], + tags = ["not_run:arm"], + deps = [ + ":cpu_instruction_fusion", + "//xla:literal_util", + "//xla:shape_util", + "//xla:xla_data_proto_cc", + "//xla/hlo/ir:hlo", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/hlo/utils:hlo_matchers", + "//xla/service:transpose_folding", + "//xla/tests:test_utils", + "//xla/tests:xla_internal_test_main", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@com_google_googletest//:gtest", + "@local_tsl//tsl/platform:statusor", + ], +) + +xla_cc_test( + name = "xfeed_manager_test", + size = "small", + srcs = ["xfeed_manager_test.cc"], + deps = [ + ":cpu_runtime", + "//xla:shape_util", + "//xla:xla_data_proto_cc", + "//xla/tests:xla_internal_test_main", + "//xla/tsl/lib/core:status_test_util", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status:statusor", + "@com_google_googletest//:gtest", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:test", + ], +) + +cc_library( + name = "cpu_instruction_fusion", + srcs = ["cpu_instruction_fusion.cc"], + hdrs = ["cpu_instruction_fusion.h"], + deps = [ + "//xla:shape_util", + "//xla/hlo/ir:hlo", + "//xla/service:fusion_node_indexing_evaluation", + "//xla/service:instruction_fusion", + "@com_google_absl//absl/algorithm:container", 
+ "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + ], +) + +cc_library( + name = "fusion_wrapper", + srcs = ["fusion_wrapper.cc"], + hdrs = ["fusion_wrapper.h"], + deps = [ + "//xla/codegen/emitters:fusion_wrapper_base", + "//xla/hlo/ir:hlo", + "@com_google_absl//absl/strings:string_view", + ], +) + +xla_cc_test( + name = "fusion_wrapper_test", + srcs = ["fusion_wrapper_test.cc"], + deps = [ + ":fusion_wrapper", + "//xla/hlo/ir:hlo", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/tests:xla_internal_test_main", + "//xla/tsl/platform:statusor", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest", + ], +) + +cc_library( + name = "ir_emission_utils", + srcs = ["ir_emission_utils.cc"], + hdrs = ["ir_emission_utils.h"], + deps = [ + ":cpu_runtime", + "//xla:shape_util", + "//xla:window_util", + "//xla:xla_data_proto_cc", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/ir:hlo", + "@com_google_absl//absl/log:check", + "@llvm-project//llvm:Core", + ], +) + +xla_cc_test( + name = "ir_emission_utils_test", + srcs = ["ir_emission_utils_test.cc"], + deps = [ + ":ir_emission_utils", + ":target_machine_features_stub", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/hlo/testlib:test", + "//xla/tests:xla_internal_test_main", + ], +) + +cc_library( + name = "cpu_layout_assignment", + srcs = ["cpu_layout_assignment.cc"], + hdrs = ["cpu_layout_assignment.h"], + deps = [ + ":dot_op_emitter", + ":ir_emission_utils", + "//xla:shape_util", + "//xla:util", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/ir:hlo", + "//xla/service:computation_layout", + "//xla/service:layout_assignment", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", + 
"@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:status", + ], +) + +xla_cc_test( + name = "cpu_layout_assignment_test", + size = "small", + srcs = ["cpu_layout_assignment_test.cc"], + deps = [ + ":cpu_layout_assignment", + ":target_machine_features_stub", + "//xla:literal", + "//xla:shape_layout", + "//xla:shape_util", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla/hlo/ir:hlo", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/hlo/testlib:test", + "//xla/hlo/testlib:test_helpers", + "//xla/hlo/utils:hlo_matchers", + "//xla/service:computation_layout", + "//xla/tests:test_utils", + "//xla/tests:xla_internal_test_main", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/types:span", + "@local_tsl//tsl/platform:status", + ], +) + +cc_library( + name = "conv_canonicalization", + srcs = ["conv_canonicalization.cc"], + hdrs = ["conv_canonicalization.h"], + deps = [ + ":cpu_runtime", + ":ir_emission_utils", + "//xla:permutation_util", + "//xla:shape_util", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/ir:hlo", + "//xla/hlo/pass:hlo_pass", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + "@local_tsl//tsl/platform:errors", + ], +) + +xla_cc_test( + name = "conv_canonicalization_test", + srcs = ["conv_canonicalization_test.cc"], + deps = [ + ":conv_canonicalization", + ":target_machine_features_stub", + "//xla:literal_util", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla/hlo/ir:hlo", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/hlo/testlib:test", + "//xla/hlo/testlib:test_helpers", + "//xla/tests:xla_internal_test_main", + ], +) + +cc_library( + name = "parallel_task_assignment", + srcs = 
["parallel_task_assignment.cc"], + hdrs = ["parallel_task_assignment.h"], + deps = [ + ":backend_config_proto_cc", + ":ir_emission_utils", + "//xla:shape_util", + "//xla:util", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/ir:hlo", + "//xla/hlo/pass:hlo_pass", + "//xla/service:hlo_cost_analysis", + "//xla/service/llvm_ir:dynamic_update_slice_util", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:platform_port", + "@local_tsl//tsl/platform:status", + ], +) + +xla_cc_test( + name = "parallel_task_assignment_test", + srcs = ["parallel_task_assignment_test.cc"], + deps = [ + ":backend_config_proto_cc", + ":cpu_executable", + ":parallel_task_assignment", + ":target_machine_features_stub", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/ir:hlo", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/hlo/testlib:test", + "//xla/service:hlo_cost_analysis", + "//xla/tests:xla_internal_test_main", + "@com_google_absl//absl/status:statusor", + "@local_tsl//tsl/platform:statusor", + ], +) + +cc_library( + name = "cpu_options", + srcs = ["cpu_options.cc"], + hdrs = ["cpu_options.h"], + deps = [ + "//xla/service:hlo_module_config", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + ], +) + +cc_library( + name = "orc_jit_memory_mapper", + srcs = ["orc_jit_memory_mapper.cc"], + hdrs = ["orc_jit_memory_mapper.h"], + deps = [ + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/synchronization", + "@llvm-project//llvm:ExecutionEngine", + "@local_tsl//tsl/platform:logging", + ], +) + +xla_cc_test( + name = 
"cpu_eigen_tensor_alignment_test", + size = "small", + srcs = ["cpu_eigen_tensor_alignment_test.cc"], + deps = [ + ":ir_emission_utils", + ":target_machine_features_stub", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/hlo/testlib:test", + "//xla/tests:xla_internal_test_main", + ], +) + +xla_cc_test( + name = "vectorized_reduce_with_no_vector_registers_test", + size = "small", + srcs = ["vectorized_reduce_with_no_vector_registers_test.cc"], + tags = ["not_run:arm"], + target_compatible_with = ["@platforms//cpu:x86_64"], + deps = [ + ":cpu_compiler", + ":cpu_transfer_manager", + ":test_header_helper", + "//xla:util", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/ir:hlo", + "//xla/hlo/ir:hlo_module_group", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/hlo/testlib:test", + "//xla/service:compiler", + "//xla/tests:xla_internal_test_main", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:MC", + "@llvm-project//llvm:Target", + "@local_tsl//tsl/platform:statusor", + ], +) + +xla_cc_test( + name = "scoped_ir_builder_test", + srcs = ["scoped_ir_builder_test.cc"], + deps = [ + ":cpu_executable", + ":ir_emitter", + ":target_machine_features_stub", + "//xla/hlo/analysis:hlo_ordering", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/service:buffer_assignment", + "//xla/service:buffer_value", + "//xla/service:logical_buffer", + "@com_google_googletest//:gtest_main", + "@llvm-project//llvm:Core", + "@local_tsl//tsl/platform:test", + ], +) + +tf_proto_library( + name = "onednn_config_proto", + srcs = ["onednn_config.proto"], +) + +tf_proto_library( + name = "backend_config_proto", + srcs = ["backend_config.proto"], + protodeps = [ + ":onednn_config_proto", + ], +) + +cc_library( + name = "onednn_util", + srcs = ["onednn_util.cc"], + 
hdrs = [ + "onednn_util.h", + "//xla/tsl/util:onednn_util_hdrs", + ], + copts = runtime_copts() + tsl_copts(), + visibility = ["//visibility:public"], + deps = [ + ":backend_config_proto_cc", + ":onednn_config_proto_cc", + "//xla:xla_data_proto_cc", + "//xla/hlo/ir:hlo", + "//xla/tsl/platform:env", + "@com_google_absl//absl/synchronization", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:platform_port", + ] + mkl_deps(), +) + +cc_library( + name = "onednn_memory_util", + srcs = ["onednn_memory_util.cc"], + hdrs = ["onednn_memory_util.h"], + copts = runtime_copts() + tsl_copts(), + visibility = ["//visibility:public"], + deps = [ + ":runtime_lightweight_check", + "//xla:literal", + "//xla:shape_util", + "//xla:status_macros", + "//xla:types", + "//xla:util", + "//xla:xla_data_proto_cc", + "//xla/hlo/ir:hlo", + "//xla/service/llvm_ir:ir_array", + "//xla/service/llvm_ir:ir_builder_mixin", + "//xla/service/llvm_ir:llvm_util", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:TargetParser", + "@llvm-project//mlir:IR", + "@local_tsl//tsl/platform:errors", + "@local_tsl//tsl/platform:logging", + ] + mkl_deps(), +) + +cc_library( + name = "onednn_matmul", + srcs = ["onednn_matmul.cc"], + hdrs = ["onednn_matmul.h"], + copts = runtime_copts() + tsl_copts(), + visibility = ["//visibility:public"], + deps = [ + ":backend_config_proto_cc", + ":onednn_config_proto_cc", + ":onednn_memory_util", + ":onednn_util", + ":runtime_lightweight_check", + "//xla:executable_run_options", + "//xla:shape_util", + "//xla/hlo/ir:hlo", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/synchronization", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:env", + 
"@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:platform_port", + ] + mkl_deps(), +) + +cc_library( + name = "onednn_convolution", + srcs = ["onednn_convolution.cc"], + hdrs = ["onednn_convolution.h"], + copts = runtime_copts() + tsl_copts(), + visibility = ["//visibility:public"], + deps = [ + ":backend_config_proto_cc", + ":onednn_config_proto_cc", + ":onednn_memory_util", + ":onednn_util", + ":runtime_lightweight_check", + "//xla:executable_run_options", + "//xla:shape_util", + "//xla/hlo/ir:hlo", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/synchronization", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:platform_port", + ] + mkl_deps(), +) + +cc_library( + name = "onednn_layer_norm", + srcs = ["onednn_layer_norm.cc"], + hdrs = [ + "onednn_layer_norm.h", + "//xla/tsl/util:onednn_util_hdrs", + ], + copts = runtime_copts() + tsl_copts(), + visibility = ["//visibility:public"], + deps = [ + ":backend_config_proto_cc", + ":onednn_config_proto_cc", + ":onednn_memory_util", + ":runtime_lightweight_check", + "//xla:executable_run_options", + "//xla/tsl/platform:env", + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/synchronization", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:platform_port", + ] + mkl_deps(), +) + +cc_library( + name = "onednn_softmax", + srcs = ["onednn_softmax.cc"], + hdrs = [ + "onednn_softmax.h", + "//xla/tsl/util:onednn_util_hdrs", + ], + copts = runtime_copts() + tsl_copts(), + visibility = ["//visibility:public"], + deps = [ + ":backend_config_proto_cc", + ":onednn_config_proto_cc", + ":onednn_memory_util", + ":runtime_lightweight_check", + "//xla:executable_run_options", + "//xla/tsl/platform:env", + "@com_google_absl//absl/base:core_headers", + 
"@com_google_absl//absl/base:dynamic_annotations", + "@com_google_absl//absl/synchronization", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:platform_port", + ] + mkl_deps(), +) + +cc_library( + name = "onednn_pattern_utils", + hdrs = ["onednn_pattern_utils.h"], + visibility = ["//visibility:public"], + deps = [ + ":onednn_util", + "//xla/hlo/ir:hlo", + "//xla/service:pattern_matcher", + ] + mkl_deps(), +) + +cc_library( + name = "onednn_contraction_rewriter", + srcs = ["onednn_contraction_rewriter.cc"], + hdrs = [ + "onednn_contraction_rewriter.h", + "onednn_convolution.h", + "onednn_matmul.h", + "//xla/tsl/util:onednn_util_hdrs", + ], + copts = tsl_copts(), + deps = [ + ":backend_config_proto_cc", + ":onednn_config_proto_cc", + ":onednn_convolution", + ":onednn_matmul", + ":onednn_memory_util", + ":onednn_pattern_utils", + ":onednn_util", + "//xla:executable_run_options", + "//xla:shape_util", + "//xla:status_macros", + "//xla:xla_data_proto_cc", + "//xla/hlo/evaluator:hlo_evaluator", + "//xla/hlo/ir:hlo", + "//xla/hlo/pass:hlo_pass", + "//xla/service:hlo_cost_analysis", + "//xla/service:hlo_creation_utils", + "//xla/service:pattern_matcher", + "//xla/tsl/platform:env", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/synchronization", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:logging", + "@local_tsl//tsl/platform:platform_port", + ] + mkl_deps(), +) + +cc_library( + name = "onednn_ops_rewriter", + srcs = ["onednn_ops_rewriter.cc"], + hdrs = ["onednn_ops_rewriter.h"], + copts = tsl_copts(), + deps = [ + ":backend_config_proto_cc", + ":onednn_config_proto_cc", + ":onednn_memory_util", + ":onednn_pattern_utils", + ":onednn_util", + "//xla:literal_comparison", + "//xla:literal_util", + "//xla:status_macros", + "//xla:xla_data_proto_cc", + "//xla/hlo/ir:hlo", + "//xla/hlo/pass:hlo_pass", + 
"//xla/service:hlo_creation_utils", + "//xla/service:pattern_matcher", + "@com_google_absl//absl/algorithm:container", + "@local_tsl//tsl/platform:platform_port", + ] + mkl_deps(), +) + +cc_library( + name = "onednn_float_support", + srcs = ["onednn_float_support.cc"], + hdrs = ["onednn_float_support.h"], + copts = tsl_copts(), + deps = [ + ":onednn_contraction_rewriter", + "//xla/service:float_support", + ], +) + +cc_library( + name = "cpu_float_support", + hdrs = ["cpu_float_support.h"], + copts = tsl_copts(), + deps = [ + "//xla/backends/cpu:xnn_fusion", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/hlo/ir:hlo", + "//xla/service:float_support", + ], +) + +xla_cc_test( + name = "cpu_float_support_test", + srcs = ["cpu_float_support_test.cc"], + deps = [ + ":cpu_float_support", + "//xla:shape_util", + "//xla:xla_data_proto_cc", + "//xla/backends/cpu/codegen:target_machine_features", + "//xla/backends/cpu/codegen:target_machine_test_base", + "//xla/hlo/ir:hlo", + "//xla/hlo/testlib:verified_hlo_module", + "//xla/hlo/transforms/simplifiers:float_normalization", + "//xla/service:hlo_module_config", + "//xla/tsl/platform:statusor", + "//xla/tsl/platform:test", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest_main", + ], +) + +cc_library( + name = "cpu_symbol_repository", + hdrs = ["cpu_symbol_repository.h"], + deps = [ + "//xla:xla_proto_cc", + "//xla/service:symbol_repository", + ], +) + +cc_library( + name = "cpu_executable_run_options", + hdrs = ["cpu_executable_run_options.h"], + deps = ["//xla/backends/cpu/collectives:cpu_collectives"], +) + +cc_library( + name = "metrics", + srcs = ["metrics.cc"], + hdrs = ["metrics.h"], + deps = [ + "//xla/tsl/lib/monitoring:counter", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", + "@local_tsl//tsl/platform:stacktrace", + "@local_tsl//tsl/profiler/lib:traceme", + ], +) + +cc_library( + name = 
"elemental_ir_emitter", + srcs = ["elemental_ir_emitter.cc"], + hdrs = ["elemental_ir_emitter.h"], + deps = [ + ":elemental_math_emitter", + "//xla/hlo/ir:hlo", + "//xla/service:elemental_ir_emitter", + "@com_google_absl//absl/functional:any_invocable", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:ir_headers", + ], +) + +cc_library( + name = "small_while_loop_hoisting_pass", + srcs = ["small_while_loop_hoisting_pass.cc"], + hdrs = ["small_while_loop_hoisting_pass.h"], + deps = [ + ":cpu_executable", + "//xla:util", + "//xla/hlo/ir:hlo", + "//xla/hlo/pass:hlo_pass", + "//xla/service:collective_ops_utils", + "//xla/service:hlo_cost_analysis", + "//xla/tsl/platform:errors", + "//xla/tsl/platform:statusor", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", + ], +) + +xla_cc_test( + name = "small_while_loop_hoisting_pass_test", + srcs = ["small_while_loop_hoisting_pass_test.cc"], + deps = [ + ":backend_config_proto_cc", + ":small_while_loop_hoisting_pass", + "//xla/hlo/ir:hlo", + "//xla/hlo/testlib:hlo_hardware_independent_test_base", + "//xla/hlo/testlib:test", + "//xla/tsl/platform:statusor", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest_main", + ], +) + +xla_cc_test( + name = "metrics_test", + srcs = ["metrics_test.cc"], + deps = [ + ":metrics", + "//xla/tests:xla_internal_test_main", + "//xla/tsl/lib/monitoring:collected_metrics", + "//xla/tsl/lib/monitoring:collection_registry", + "@local_tsl//tsl/platform:test", + ], +) diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc 
b/third_party/xla/xla/service/cpu/cpu_compiler.cc index 9ba0085b24d372..4a1402c6934cba 100644 --- a/third_party/xla/xla/service/cpu/cpu_compiler.cc +++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc @@ -236,6 +236,8 @@ limitations under the License. #include "tsl/profiler/lib/traceme.h" #include "tsl/profiler/lib/traceme_encode.h" +#include "xnnpack_ops_rewriter.h" + #ifdef TF_LLVM_X86_AVAILABLE #include "llvm/TargetParser/X86TargetParser.h" #endif @@ -591,6 +593,12 @@ absl::Status CpuCompiler::RunHloPassesThroughLayoutAssn( }; pipeline.AddPass(upcaster_filter); + // For softmax, rewrite to custom calls with XNNPACK targets. + bool enable_xnnpack = + xla::GetDebugOptionsFromFlags().xla_cpu_enable_xnnpack(); + if (enable_xnnpack) + pipeline.AddPass(); + // Expand random number generation. pipeline.AddPass(); pipeline.AddPass(RandomAlgorithm::RNG_PHILOX); diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc.orig b/third_party/xla/xla/service/cpu/cpu_compiler.cc.orig new file mode 100644 index 00000000000000..9ba0085b24d372 --- /dev/null +++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc.orig @@ -0,0 +1,2712 @@ +/* Copyright 2017 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "xla/service/cpu/cpu_compiler.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// IWYU pragma: no_include "llvm/Config/Disassemblers.def.inc" +// IWYU pragma: no_include "llvm/Config/Targets.def.inc" + +#include "absl/cleanup/cleanup.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "absl/memory/memory.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "absl/strings/str_join.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Mangler.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Linker/Linker.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MemoryBufferRef.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/TargetParser/Host.h" +#include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/SplitModule.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include 
"mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Export.h" +#include "mlir/Transforms/DialectConversion.h" +#include "xla/backends/cpu/codegen/cpu_features.h" +#include "xla/backends/cpu/codegen/emitters/cpu_fusion_emitter_config.h" +#include "xla/backends/cpu/codegen/execution_engine.h" +#include "xla/backends/cpu/codegen/ir_compiler.h" +#include "xla/backends/cpu/codegen/jit_compiler.h" +#include "xla/backends/cpu/codegen/object_loader.h" +#include "xla/backends/cpu/codegen/target_machine_features.h" +#include "xla/backends/cpu/constant_allocation.h" +#include "xla/backends/cpu/runtime/function_library.h" +#include "xla/backends/cpu/runtime/thunk.h" +#include "xla/backends/cpu/runtime/thunk.pb.h" +#include "xla/backends/cpu/runtime/thunk_proto_serdes.h" +#include "xla/backends/cpu/transforms/xnn_graph_fusion.h" +#include "xla/backends/cpu/xnn_fusion.h" +#include "xla/cpu_function_runtime.h" +#include "xla/hlo/analysis/hlo_ordering.h" +#include "xla/hlo/analysis/indexed_array_analysis.h" +#include "xla/hlo/ir/dfs_hlo_visitor_with_default.h" +#include "xla/hlo/ir/hlo_casting_utils.h" +#include "xla/hlo/ir/hlo_computation.h" +#include "xla/hlo/ir/hlo_instruction.h" +#include "xla/hlo/ir/hlo_instructions.h" +#include "xla/hlo/ir/hlo_module.h" +#include "xla/hlo/ir/hlo_module_group.h" +#include "xla/hlo/ir/hlo_opcode.h" +#include "xla/hlo/ir/hlo_schedule.h" +#include "xla/hlo/pass/hlo_pass_fix.h" +#include "xla/hlo/pass/hlo_pass_pipeline.h" +#include "xla/hlo/transforms/expanders/bitcast_dtypes_expander.h" +#include "xla/hlo/transforms/expanders/cholesky_expander.h" +#include "xla/hlo/transforms/expanders/comparison_expander.h" +#include "xla/hlo/transforms/expanders/dot_decomposer.h" +#include 
"xla/hlo/transforms/expanders/dynamic_index_splitter.h" +#include "xla/hlo/transforms/expanders/eigh_expander.h" +#include "xla/hlo/transforms/expanders/logistic_expander.h" +#include "xla/hlo/transforms/expanders/optimization_barrier_expander.h" +#include "xla/hlo/transforms/expanders/qr_expander.h" +#include "xla/hlo/transforms/expanders/reduce_decomposer.h" +#include "xla/hlo/transforms/expanders/reshape_decomposer.h" +#include "xla/hlo/transforms/expanders/rng_bit_generator_expander.h" +#include "xla/hlo/transforms/expanders/rng_expander.h" +#include "xla/hlo/transforms/expanders/stochastic_convert_decomposer.h" +#include "xla/hlo/transforms/literal_canonicalizer.h" +#include "xla/hlo/transforms/operand_upcaster.h" +#include "xla/hlo/transforms/simplifiers/algebraic_simplifier.h" +#include "xla/hlo/transforms/simplifiers/batch_dot_simplification.h" +#include "xla/hlo/transforms/simplifiers/broadcast_canonicalizer.h" +#include "xla/hlo/transforms/simplifiers/conditional_canonicalizer.h" +#include "xla/hlo/transforms/simplifiers/convolution_group_converter.h" +#include "xla/hlo/transforms/simplifiers/dynamic_dimension_simplifier.h" +#include "xla/hlo/transforms/simplifiers/flatten_call_graph.h" +#include "xla/hlo/transforms/simplifiers/float_normalization.h" +#include "xla/hlo/transforms/simplifiers/gather_simplifier.h" +#include "xla/hlo/transforms/simplifiers/hlo_constant_folding.h" +#include "xla/hlo/transforms/simplifiers/hlo_dce.h" +#include "xla/hlo/transforms/simplifiers/hlo_memory_scheduler.h" +#include "xla/hlo/transforms/simplifiers/optimize_input_output_buffer_alias.h" +#include "xla/hlo/transforms/simplifiers/reduce_window_rewriter.h" +#include "xla/hlo/transforms/simplifiers/reshape_mover.h" +#include "xla/hlo/transforms/simplifiers/result_caster.h" +#include "xla/hlo/transforms/simplifiers/sort_simplifier.h" +#include "xla/hlo/transforms/simplifiers/sub_byte_normalization.h" +#include "xla/hlo/transforms/simplifiers/tree_reduction_rewriter.h" 
+#include "xla/hlo/transforms/simplifiers/tuple_simplifier.h" +#include "xla/hlo/transforms/simplifiers/zero_sized_hlo_elimination.h" +#include "xla/hlo/transforms/while_loop_trip_count_annotator.h" +#include "xla/literal_pool.h" +#include "xla/map_util.h" +#include "xla/mlir_hlo/transforms/passes.h" +#include "xla/service/all_reduce_promotion.h" +#include "xla/service/outer_dimension_propagation.h" +#include "xla/service/get_outer_batch_value_simplifier.h" +#include "xla/service/all_to_all_decomposer.h" +#include "xla/service/batched_gather_scatter_normalizer.h" +#include "xla/service/batchnorm_expander.h" +#include "xla/service/buffer_assignment.h" +#include "xla/service/call_graph.h" +#include "xla/service/call_inliner.h" +#include "xla/service/change_op_data_type.h" +#include "xla/service/compiler.h" +#include "xla/service/conditional_simplifier.h" +#include "xla/service/conditional_to_select.h" +#include "xla/service/copy_insertion.h" +#include "xla/service/cpu/buffer_info_util.h" +#include "xla/service/cpu/conv_canonicalization.h" +#include "xla/service/cpu/cpu_aot_compilation_result.h" +#include "xla/service/cpu/cpu_executable.h" +#include "xla/service/cpu/cpu_float_support.h" +#include "xla/service/cpu/cpu_instruction_fusion.h" +#include "xla/service/cpu/cpu_layout_assignment.h" +#include "xla/service/cpu/cpu_options.h" +#include "xla/service/cpu/dot_op_emitter.h" +#include "xla/service/cpu/executable.pb.h" +#include "xla/service/cpu/fusion_wrapper.h" +#include "xla/service/cpu/ir_emitter.h" +#include "xla/service/cpu/ir_emitter2.h" +#include "xla/service/cpu/metrics.h" +#include "xla/service/cpu/parallel_task_assignment.h" +#include "xla/service/cpu/runtime_symbol_generator.h" +#include "xla/service/cpu/small_while_loop_hoisting_pass.h" +#include "xla/service/cpu/thunk_emitter.h" +#include "xla/service/cpu_gpu_shape_verifier.h" +#include "xla/service/dump.h" +#include "xla/service/dynamic_dimension_inference.h" +#include "xla/service/dynamic_padder.h" 
+#include "xla/service/executable.h" +#include "xla/service/float_support.h" +#include "xla/service/gather_expander.h" +#include "xla/service/hlo.pb.h" +#include "xla/service/hlo_cost_analysis.h" +#include "xla/service/hlo_cse.h" +#include "xla/service/hlo_execution_profile.h" +#include "xla/service/hlo_module_config.h" +#include "xla/service/hlo_profile_printer_data.pb.h" +#include "xla/service/hlo_verifier.h" +#include "xla/service/layout_assignment.h" +#include "xla/service/llvm_compiler.h" +#include "xla/service/llvm_ir/llvm_command_line_options.h" +#include "xla/service/llvm_ir/llvm_util.h" +#include "xla/service/logical_buffer.h" +#include "xla/service/map_inliner.h" +#include "xla/service/scatter_expander.h" +#include "xla/service/scatter_simplifier.h" +#include "xla/service/select_and_scatter_expander.h" +#include "xla/service/sharding_propagation.h" +#include "xla/service/sharding_remover.h" +#include "xla/service/slow_operation_alarm.h" +#include "xla/service/spmd/shardy/shardy_xla_pass.h" +#include "xla/service/spmd/stateful_rng_spmd_partitioner.h" +#include "xla/service/topk_rewriter.h" +#include "xla/service/transpose_folding.h" +#include "xla/service/triangular_solve_expander.h" +#include "xla/service/while_loop_constant_sinking.h" +#include "xla/service/while_loop_invariant_code_motion.h" +#include "xla/service/while_loop_simplifier.h" +#include "xla/shape.h" +#include "xla/shape_util.h" +#include "xla/status_macros.h" +#include "xla/stream_executor/host/host_platform_id.h" +#include "xla/stream_executor/platform.h" +#include "xla/stream_executor/stream_executor.h" +#include "xla/tsl/platform/env.h" +#include "xla/tsl/platform/status.h" +#include "xla/tsl/platform/statusor.h" +#include "xla/tsl/platform/threadpool.h" +#include "xla/util.h" +#include "xla/xla.pb.h" +#include "xla/xla_data.pb.h" +#include "tsl/platform/casts.h" +#include "tsl/platform/cpu_info.h" +#include "tsl/platform/logging.h" // IWYU pragma: keep +#include 
"tsl/profiler/lib/traceme.h" +#include "tsl/profiler/lib/traceme_encode.h" + +#ifdef TF_LLVM_X86_AVAILABLE +#include "llvm/TargetParser/X86TargetParser.h" +#endif + +#if defined(INTEL_MKL) +#include "xla/hlo/transforms/simplifiers/simplify_fp_conversions.h" +#include "xla/service/cpu/onednn_contraction_rewriter.h" +#include "xla/service/cpu/onednn_float_support.h" +#include "xla/service/cpu/onednn_ops_rewriter.h" +#endif + +namespace xla { +namespace { + +using tsl::profiler::TraceMe; +using tsl::profiler::TraceMeEncode; + +// A module identifier (prefix) for emitted LLVM modules. +static constexpr absl::string_view kXlaModuleIdentifier = "__compute_module"; + +// Returns a global (per-process) thread pool for XLA CPU compilation tasks. +static tsl::thread::ThreadPool* GetCompilationThreadPool() { + // LLVM compilation has a lot of memory-bound pointer chasing and not + // so much CPU-bound work. Based on profiling a few examples, 32 threads seems + // to be enough to achieve maximum parallel compilation speedup. + static constexpr int kMaxCompilationThreads = 32; + + // On Mac OS the default stack size is 512KiB, this is too small for compiling + // reasonably sized programs + tsl::ThreadOptions thread_options; + thread_options.stack_size = 4 * 1024 * 1024; // 4 MB + + static auto* const thread_pool = new tsl::thread::ThreadPool( + tsl::Env::Default(), thread_options, "xla-cpu-llvm-codegen", + std::min(kMaxCompilationThreads, tsl::port::MaxParallelism())); + return thread_pool; +} + +// Returns task runner that uses the global compilation thread pool. +static cpu::JitCompiler::TaskRunner GetCompilationTaskRunner() { + return [](cpu::JitCompiler::Task task) { + GetCompilationThreadPool()->Schedule(std::move(task)); + }; +} + +// For each computation in the module, determines whether that computation +// calls a custom-call function, either directly or indirectly (e.g. because it +// calls another computation that does). 
+absl::flat_hash_map +ModuleComputationsTransitivelyContainCustomCall(const HloModule& module) { + absl::flat_hash_map custom_call_map; + std::unique_ptr call_graph = CallGraph::Build(&module); + + // Can never fail because we always return an OK status from the visitor. + TF_CHECK_OK(call_graph->VisitNodes([&custom_call_map]( + const CallGraphNode& node) { + const HloComputation* computation = node.computation(); + + for (const HloInstruction* instruction : computation->instructions()) { + // The computation contains a custom-call instruction directly. + if (DynCast(instruction)) { + custom_call_map[computation] = true; + return absl::OkStatus(); + } + // The computation calls something that contains a custom-call + // instruction (directly or indirectly). This lookup relies on the call + // graph traversing callees before callers, so that the map is always + // populated for all callees at this point. + for (const HloComputation* callee : instruction->called_computations()) { + bool callee_contains_custom_call = FindOrDie(custom_call_map, callee); + if (callee_contains_custom_call) { + custom_call_map[computation] = true; + return absl::OkStatus(); + } + } + } + + custom_call_map[computation] = false; + return absl::OkStatus(); + })); + + return custom_call_map; +} + +} // namespace + +namespace cpu { + +CpuCompiler::CpuCompiler() { + // Initialize LLVM the first time the CpuCompiler is initialized. 
+ static bool llvm_initialized = []() { + InitializeLLVMTarget(); + return true; + }(); + (void)llvm_initialized; +} + +absl::StatusOr>> CpuCompiler::Compile( + std::unique_ptr module_group, + std::vector> stream_execs, + const CompileOptions& options) { + for (const std::vector& se_vector : stream_execs) { + if (se_vector.size() != 1) { + return Unimplemented( + "Model partitioning not implemented for the CPU compiler"); + } + } + return LLVMCompiler::Compile(std::move(module_group), stream_execs, options); +} + +/* static */ void CpuCompiler::InitializeLLVMTarget() { + // Initialize LLVM's MC layer for the native target. + llvm::InitializeNativeTarget(); + llvm::InitializeNativeTargetAsmPrinter(); +} + +namespace { + +// This visitor records which HLO instructions should have profiling information +// recorded. +class CollectProfileCandidates : public DfsHloVisitorWithDefault { + public: + static absl::StatusOr> + GetCandidatesForComputation( + const HloComputation& computation, + const absl::flat_hash_map& + assigned_indices) { + absl::flat_hash_map hlo_to_profile_idx; + CollectProfileCandidates profile_candidates_for_computation( + &hlo_to_profile_idx, assigned_indices); + TF_RETURN_IF_ERROR(computation.Accept(&profile_candidates_for_computation)); + return hlo_to_profile_idx; + } + + private: + CollectProfileCandidates( + absl::flat_hash_map* hlo_to_profile_idx, + const absl::flat_hash_map& + assigned_indices) + : hlo_to_profile_idx_(hlo_to_profile_idx), + assigned_indices_(assigned_indices) {} + + absl::Status DefaultAction(HloInstruction* hlo_instruction) override { + hlo_to_profile_idx_->insert( + {hlo_instruction, FindOrDie(assigned_indices_, hlo_instruction)}); + return absl::OkStatus(); + } + + absl::Status HandleCall(HloInstruction* call) override { + TF_RETURN_IF_ERROR(DefaultAction(call)); + CollectProfileCandidates candidates_for_call(hlo_to_profile_idx_, + assigned_indices_); + TF_RETURN_IF_ERROR(call->to_apply()->Accept(&candidates_for_call)); + 
return absl::OkStatus(); + } + // Recurse into "conditional" so we can profile inside of it. + absl::Status HandleConditional(HloInstruction* conditional) override { + TF_RETURN_IF_ERROR(DefaultAction(conditional)); + + CollectProfileCandidates candidates_for_true(hlo_to_profile_idx_, + assigned_indices_); + TF_RETURN_IF_ERROR( + conditional->true_computation()->Accept(&candidates_for_true)); + + CollectProfileCandidates candidates_for_false(hlo_to_profile_idx_, + assigned_indices_); + TF_RETURN_IF_ERROR( + conditional->false_computation()->Accept(&candidates_for_false)); + + return absl::OkStatus(); + } + + // Skip constants, there is nothing to profile. + absl::Status HandleConstant(HloInstruction*) override { + return absl::OkStatus(); + } + // Skip parameters, they are a simple load. + absl::Status HandleParameter(HloInstruction*) override { + return absl::OkStatus(); + } + // It is important to recurse for "while" or else we risk overly coarse + // profiling information. + absl::Status HandleWhile(HloInstruction* xla_while) override { + TF_RETURN_IF_ERROR(DefaultAction(xla_while)); + + CollectProfileCandidates candidates_for_condition(hlo_to_profile_idx_, + assigned_indices_); + TF_RETURN_IF_ERROR( + xla_while->while_condition()->Accept(&candidates_for_condition)); + + CollectProfileCandidates candidates_for_body(hlo_to_profile_idx_, + assigned_indices_); + TF_RETURN_IF_ERROR(xla_while->while_body()->Accept(&candidates_for_body)); + + return absl::OkStatus(); + } + + absl::flat_hash_map* hlo_to_profile_idx_; + const absl::flat_hash_map& assigned_indices_; +}; + +// Adds the HloVerifier for CPU to the given pipeline. 
+void AddHloVerifier(HloPassPipeline* pipeline, HloVerifierOpts&& opts = {}, + bool debug_only = false) { + auto verifier_metadata = + std::make_unique(std::move(opts)); + + if (debug_only) { + pipeline->AddInvariantCheckerDebug( + std::move(verifier_metadata), "hlo verifier (debug)"); + } else { + pipeline->AddInvariantChecker(std::move(verifier_metadata), + "hlo verifier"); + } +} + +std::unique_ptr> CreateSimplificationPipeline( + absl::string_view name, HloModule* module, bool is_fusion_emitters) { + // Run the following passes to a fixed point. + auto pipeline = + std::make_unique>(std::string(name)); + AddHloVerifier(pipeline.get(), HloVerifierOpts{}, + /*debug_only=*/true); + + AlgebraicSimplifierOptions options; + options.set_enable_dot_strength_reduction(false); + // "slow" minmax means we propagate nan. + options.set_minmax_propagate_nan( + !module->config().debug_options().xla_cpu_enable_fast_min_max()); + options.set_supports_non_canonical_dots(false); + options.set_executing_on_cpu(true); + pipeline->AddPass(options); + pipeline->AddPass(); + pipeline->AddPass(); + pipeline->AddPass(GatherExpander::kEliminateSimpleGathers); + if (is_fusion_emitters) { + // Conversion to MLIR only works with simplified gathers. + pipeline->AddPass(); + } + + // Needs to happen after algebraic simplifier. + // pipeline->AddPass(); + + // BatchNormExpander can create zero-sized ops, so zero-sized HLO + // elimination has to come after that pass. + pipeline->AddPass(); + + pipeline->AddPass(); + pipeline->AddPass(); + pipeline->AddPass(); + pipeline->AddPass(); + + // TODO(b/134075051): Re-enable after b/134075051 is fixed. + // pipeline->AddPass(); + + pipeline->AddPass(); + pipeline->AddPass(); + pipeline->AddPass( + options::FoldAllConstants(module->config()) + ? 
HloConstantFolding::Level::kAggressive + : HloConstantFolding::Level::kDefault); + pipeline->AddPass(); + + return pipeline; +} + +} // namespace + +absl::Status CpuCompiler::RunHloPassesThroughLayoutAssn( + HloModule* module, bool is_aot_compile, + TargetMachineFeatures* target_machine_features) { + const int64_t num_partitions = module->config().num_partitions(); + const bool is_thunk_runtime = + module->config().debug_options().xla_cpu_use_thunk_runtime(); + const bool is_fusion_emitters = + is_thunk_runtime && + module->config().debug_options().xla_cpu_use_fusion_emitters(); + bool use_shardy_partitioner = module->config().use_shardy_partitioner(); + if (num_partitions > 1) { + if (!module->config().use_spmd_partitioning()) { + return InvalidArgument( + "num_partitions=%d but SPMD partitioning not enabled.", + num_partitions); + } + HloPassPipeline spmd_pipeline("spmd-partitioner"); + // Run some IR cleanup passes before running the SPMD partitioning + // passes. + AddHloVerifier(&spmd_pipeline); + spmd_pipeline.AddPass(); + spmd_pipeline.AddPass(); + spmd_pipeline.AddPass(); + spmd_pipeline.AddPass(); + if (use_shardy_partitioner) { + spmd_pipeline.AddPass(); + } else { + spmd_pipeline.AddPass( + /*is_spmd=*/true, /*propagate_metadata=*/false, + module->config().allow_spmd_sharding_propagation_to_output(), + module->config().allow_spmd_sharding_propagation_to_parameters()); + } + spmd_pipeline.AddPass( + num_partitions, module->config().replica_count()); + TF_RETURN_IF_ERROR(spmd_pipeline.Run(module).status()); + } else { + HloPassPipeline sharding_removal_pipeline("sharding-removal"); + AddHloVerifier(&sharding_removal_pipeline); + // Remove redundant sharding ops when partition_count == 1. + sharding_removal_pipeline.AddPass(); + // Run ShardyXLA without propagation, which enforces use-tuple-args. 
+ if (use_shardy_partitioner) { + sharding_removal_pipeline.AddPass( + /*runSdyShardingPropagation=*/false); + } + sharding_removal_pipeline.AddPass(); + TF_RETURN_IF_ERROR(sharding_removal_pipeline.Run(module).status()); + } + + { + // SubbytePacker must be run before the rest of the pipeline since it + // modifies the layout of the entry computation inputs/outputs, which is + // passed to LayoutAssignment. + HloPassPipeline subbyte_packer_pipeline("SubbytePacker pipeline"); + subbyte_packer_pipeline.AddPass( + SubByteNormalization::SET_ELEMENT_SIZE); + TF_RETURN_IF_ERROR(subbyte_packer_pipeline.Run(module).status()); + } + HloPassPipeline pipeline("HLO passes through layout assignment"); + AddHloVerifier(&pipeline); + pipeline.AddPass(); + pipeline.AddPass(); + + // If XNNPACK is enabled, we only need to upcast dots that XnnDotThunk does + // not support. `upcaster_filter` returns false if the instruction shouldn't + // be processed. + // TODO(b/406806134): Stop calling XNNPACK from regular Dot thunks. All XNN + // Dots should be wrapped in an `__xnn_fusion` fusion region and processed in + // `XnnFusionThunk`. + bool xnnpack_enabled = module->config().debug_options().xla_cpu_use_xnnpack(); + auto call_library_for_dot = [&](const HloInstruction& instr) { + if (!xnnpack_enabled) return false; + DotImplementationStrategy strategy = GetDotImplementationStrategy( + module->config(), instr, *target_machine_features, + /*allow_runtime_calls=*/true); + return strategy == DotImplementationStrategy::kEigen; + }; + HloPredicate upcaster_filter = [&](const HloInstruction* instr) { + if (!call_library_for_dot(*instr)) return true; + return !IsXnnDotSupported(instr->dot_dimension_numbers(), + instr->operand(0)->shape(), + instr->operand(1)->shape(), instr->shape(), + target_machine_features) + .value_or(false); + }; + pipeline.AddPass(upcaster_filter); + + // Expand random number generation. 
+ pipeline.AddPass(); + pipeline.AddPass(RandomAlgorithm::RNG_PHILOX); + + // Remove zero-sized HLO from the input so that other passes don't have to + // handle it. + pipeline.AddPass(); + + pipeline.AddPass(); + + pipeline.AddPass(); + pipeline.AddPass(); + + // The TopkDecomposer generates a compare op with type=TOTALORDER and must + // run before the ComparisonExpander which rewrites such comparisons. + pipeline.AddPass([&](const HloInstruction* instr) { + return instr->opcode() == HloOpcode::kTopK; + }); + + pipeline.AddPass(); + pipeline.AddPass(); + pipeline.AddPass(); + pipeline.AddPass(); + pipeline.AddPass(); + pipeline.AddPass(); + pipeline.AddPass(); + + // Inline computations with a single call site. + pipeline.AddPass(/*single_call_site=*/true); + pipeline.AddPass(); + pipeline.AddPass(); + + // Rewrite to custom calls with target as oneDNN library calls. +#if defined(INTEL_MKL) + // AOT compiled code runs in single thread. + if (!is_aot_compile && !is_thunk_runtime) { + // Placing OneDnnOpsRewriter here to match the flax patterns + // TODO: Decide where would be the appropriate place for this pass to make + // it more generic + // TODO - intel: Name of the pass might seem redundant as oneDnnRewriter, + // but in future plan to rename oneDNNrewriter to specific to onednn matmul + pipeline.AddPass(); + } +#endif // INTEL_MKL + + // Promote BF16 all-reduce to F32. + const std::pair ar_promoted_types[] = { + {BF16, F32}}; + pipeline.AddPass(ar_promoted_types); + // Convert BF16 and F8 operations to F32 and F16 respectively so that the CPU + // backend can support BF16/F8 operations without directly implementing a + // BF16/F8 lowering for most ops. 
+ CpuFloatSupport bf16_support(BF16, call_library_for_dot, + target_machine_features); +#if defined(INTEL_MKL) + OneDnnFloatSupport onednn_bf16_support(BF16); + if (!is_aot_compile && !is_thunk_runtime) { + pipeline.AddPass(&onednn_bf16_support); + } else { + pipeline.AddPass(&bf16_support); + } +#else + pipeline.AddPass(&bf16_support); +#endif + FloatSupport f8e5m2_support(F8E5M2, F16); + pipeline.AddPass(&f8e5m2_support); + FloatSupport f8e4m3_support(F8E4M3, F16); + pipeline.AddPass(&f8e4m3_support); + FloatSupport f8e4m3fn_support(F8E4M3FN, F16); + pipeline.AddPass(&f8e4m3fn_support); + FloatSupport f8e4m3b11fnuz_support(F8E4M3B11FNUZ, F16); + pipeline.AddPass(&f8e4m3b11fnuz_support); + FloatSupport f8e5m2fnuz_support(F8E5M2FNUZ, F16); + pipeline.AddPass(&f8e5m2fnuz_support); + FloatSupport f8e4m3fnuz_support(F8E4M3FNUZ, F16); + pipeline.AddPass(&f8e4m3fnuz_support); + FloatSupport f8e3m4_support(F8E3M4, F16); + pipeline.AddPass(&f8e3m4_support); + FloatSupport s4_support(S4, S8); + pipeline.AddPass(&s4_support); + FloatSupport u4_support(U4, U8); + pipeline.AddPass(&u4_support); + FloatSupport f4e2m1fn_support(F4E2M1FN, F16); + pipeline.AddPass(&f4e2m1fn_support); + FloatSupport f8e8m0fnu_support(F8E8M0FNU, F32); + pipeline.AddPass(&f8e8m0fnu_support); + // After canonicalization, there may be more batch dots that can be + // simplified. + pipeline.AddPass(); + auto cost_model = [](HloInstruction* conv) { + // We need a cost model for CPUs. Currently, do nothing. 
+ return false; + }; + pipeline.AddPass( + /*should_expand=*/[](HloInstruction* conv) { return true; }, cost_model, + /*convert_batch_groups_only=*/true); + auto feature_group_should_expand = [](HloInstruction* conv) { + switch (conv->shape().element_type()) { + case F16: + case F32: + return false; + default: + return true; + } + }; + pipeline.AddPass( + feature_group_should_expand, cost_model, + /*convert_batch_groups_only=*/false); + pipeline.AddPass( + /*rewrite_training_op=*/true, + /*rewrite_inference_op=*/true, + /*rewrite_grad_op=*/true); + pipeline.AddPass(); + pipeline.AddPass(); + pipeline.AddPass(); + + if (module->config() + .debug_options() + .xla_reduce_window_rewrite_base_length() != 0) { + pipeline.AddPass>( + module->config() + .debug_options() + .xla_reduce_window_rewrite_base_length()); + } + auto dynamic_padder_options = DynamicPadderOptions(); + // TODO(pgavin): ShapeChecks were never implemented correctly by the dynamic + // padder. The mode defaults to kIgnore, and it was not overridden for nested + // computations (such as while bodies or conditional branches), and so cases + // that could not be proven would still be accepted even with compile-time + // checks enabled. Recent changes to the DynamicPadder correctly + // override the mode. However, some models have started to rely on the check + // being ignored, and they would be broken if it is enforced. + dynamic_padder_options.shape_check_mode = + DynamicDimensionInference::ShapeCheckMode::kIgnore; + pipeline.AddPass(dynamic_padder_options); + + pipeline.AddPass(target_machine_features); + + // Run fp16 dots/convs in fp32 and then downcast the result to fp16. + // Justification: + // + // - This is significantly faster on our CPUs today than true fp16. + // - It's numerically more accurate. (Granted, this is not always + // desirable, thus the ability to disable this functionality.) 
+ // - It matches more closely the GPU's behavior on fp16 dot/conv, where + // accumulation happens in f32. + if (!module->config().debug_options().xla_cpu_strict_dot_conv_math()) { + pipeline.AddPass( + F16, F32, HloPredicateIsOp); + } + + pipeline.AddPass(CreateSimplificationPipeline("simplification", module, + is_fusion_emitters)); + + // Scatter expander is sandwiched between two simplification pipelines to + // enable constant folding with the original scatter instructions (which is + // more efficient than with the expanded version) but then to also ensure that + // the resulting while loops are simplified. + pipeline.AddPass(); + if (is_fusion_emitters) { + pipeline.AddPass( + ScatterExpander::kEliminateSimpleScatters); + pipeline.AddPass(); + } + if (!is_fusion_emitters || !kFusionEmitterScatterEnabled) { + pipeline.AddPass(ScatterExpander::kEliminateAllScatters); + } + + pipeline.AddPass(CreateSimplificationPipeline( + "post_scatter_expansion_simplification", module, is_fusion_emitters)); + + pipeline.AddPass(); + + pipeline.AddPass([](const HloSortInstruction* sort, int64_t) { + return sort->operand(0)->shape().element_type() == F32; + }); + pipeline.AddPass(); + pipeline.AddPass( + [&](const HloInstruction& dot, int64_t operand) -> absl::StatusOr { + if (DotImplementationCanHandleTranspose(dot, *target_machine_features, + /*allow_runtime_calls=*/true)) { + return TransposeFolding::IsRowColumnTransposeDotOperand(dot, operand); + } + return false; + }, + TransposeFolding::NeverFoldTranspose); + pipeline.AddPass(/*is_layout_sensitive=*/false); + + pipeline.AddPass(); + pipeline.AddPass(); + + // Annotate while loops with statically known trip counts, so that at run time + // we can avoid running the loop condition computations. + pipeline.AddPass(); + + // Layout assignment uses alias analysis, which requires the call graph to be + // flattened. 
+ pipeline.AddPass(); + ChannelLayoutConstraints layout_constraints; + pipeline.AddPass( + module->mutable_entry_computation_layout(), target_machine_features, + &layout_constraints); + // Run SubByteNormalization because CpuLayoutAssignment may modify a + // Layout's element_size_in_bits field. + pipeline.AddPass( + SubByteNormalization::SET_ELEMENT_SIZE); + + // Finally canonicalize all literals larger than 1024 bytes in the module to + // reuse the same literal across multiple HLO modules. + pipeline.AddPass(LiteralPool::Default(), + /*min_size_bytes=*/1024); + + return pipeline.Run(module).status(); +} + +absl::Status CpuCompiler::RunHloPassesAfterLayoutAssn( + HloModule* module, bool is_aot_compile, + TargetMachineFeatures* target_machine_features, + const CompileOptions& compile_options) { + const auto& debug_options = module->config().debug_options(); + const bool is_thunk_runtime = debug_options.xla_cpu_use_thunk_runtime(); + const bool is_fusion_emitters = + is_thunk_runtime && debug_options.xla_cpu_use_fusion_emitters(); + HloPassPipeline pipeline("HLO passes after layout assignment"); + + { + HloPassPipeline normalization_pipeline("hlo normalization"); + normalization_pipeline.AddPass(); + normalization_pipeline.AddPass(); + normalization_pipeline.AddPass(); + TF_RETURN_IF_ERROR(normalization_pipeline.Run(module).status()); + } + + // After layout assignment, use a layout-sensitive verifier. + pipeline.AddPass("after layout assignment"); + AddHloVerifier(&pipeline, HloVerifierOpts{}.MakeLayoutSensitive(), + /*debug_only=*/true); + + pipeline.AddPass(); + + const int max_parallelism = + module->config().intra_op_parallelism_threads() > 0 + ? module->config().intra_op_parallelism_threads() + : tsl::port::NumSchedulableCPUs(); + +#if defined(INTEL_MKL) + // AOT compiled code runs in single thread. + if (!is_aot_compile && !is_thunk_runtime) { + // Run SimplifyFPConversions pass to simplify the BF16 pattern and make it + // easier to match. 
+ // Remove `f32 -> bf16 -> f32` casts inserted by bf16 normalization. + if (debug_options.xla_allow_excess_precision()) { + pipeline.AddPass(); + } + pipeline.AddPass(max_parallelism, + compile_options.thread_pool); + // Run SimplifyFPConversions pass again to remove redundant Convert ops + // that may exist as a result of running OneDnnContractionRewriter pass. + if (debug_options.xla_allow_excess_precision()) { + pipeline.AddPass(); + } + } +#endif // INTEL_MKL + + if (module->config() + .debug_options() + .xla_cpu_experimental_xnn_graph_fusion_mode() != + DebugOptions::XNN_GRAPH_FUSION_MODE_DISABLED) { + pipeline.AddPass(); + } + + // Add a fusion pass now that layout assignment is done. + pipeline.AddPass(); + if (is_fusion_emitters) { + pipeline.AddPass(); + } + + // The LayoutAssignment pass may leave behind kCopy instructions which are + // duplicate or NOPs, so remove them with algebraic simplification and CSE. + // Run this to a fixed point. + [&pipeline = pipeline.AddPass>( + "simplification after layout assignment"), + &module] { + AddHloVerifier( + &pipeline, + HloVerifierOpts{}.MakeLayoutSensitive().WithInstructionCanChangeLayout( + LayoutAssignment::InstructionCanChangeLayout), + /*debug_only=*/true); + AlgebraicSimplifierOptions options; + options.set_is_layout_sensitive(true); + options.set_supports_non_canonical_dots(false); + options.set_enable_dot_strength_reduction(false); + // "slow" minmax means we propagate nan. + options.set_minmax_propagate_nan( + !module->config().debug_options().xla_cpu_enable_fast_min_max()); + options.set_executing_on_cpu(true); + pipeline.AddPass(options); + pipeline.AddPass(); + pipeline.AddPass(/*is_layout_sensitive=*/true); + }(); + + // Outline ops in the entry computation into calls to subcomputations. + if (!is_aot_compile) { + // Run ParallelTaskAssigner to assign parallel tasks to HLOs in module. 
+ // Note this is not run for AOT because it would bring in thread pool + // and thread synchronization dependencies which would likely increase + // binary size (and most AOT applications are single-threaded). + // TODO(b/29630486) Support multi-threaded AOT. + pipeline.AddPass( + max_parallelism, ShapeSizeBytesFunction(), target_machine_features); + } + // Copy insertion should be performed immediately before IR emission to + // avoid inserting unnecessary copies (later pass adds an instruction which + // materializes the value) or missing a necessary copy (later pass removes + // an instruction which materializes a value). DCE must be run immediately + // before (and sometime after) copy insertion, to avoid dead code from + // interfering with the rewrites. + pipeline.AddPass(); + pipeline.AddPass(true); + + // If enabled we'll use more precise region based analysis for copy removal. + if (debug_options.xla_cpu_copy_insertion_use_region_analysis()) { + pipeline.AddPass( + /*can_share_buffer=*/nullptr, + /*use_region_based_live_range_analysis=*/-1); + } else { + pipeline.AddPass(); + } + + // The hoisting of small while loops is only useful in the context of the + // thunk runtime. 
+ if (module->config().debug_options().xla_cpu_use_thunk_runtime()) { + TF_ASSIGN_OR_RETURN( + int64_t byte_threshold, + xla::cpu::options::SmallWhileLoopByteThreshold(module->config())); + pipeline.AddPass(byte_threshold); + } + + pipeline.AddPass(); + pipeline.AddPass(); + pipeline.AddPass(); + return pipeline.Run(module).status(); +} + +absl::Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile, + llvm::TargetMachine* target_machine, + const CompileOptions& compile_options) { + TargetMachineFeatures target_machine_features(target_machine); + TF_RETURN_IF_ERROR(RunHloPassesThroughLayoutAssn(module, is_aot_compile, + &target_machine_features)); + + return RunHloPassesAfterLayoutAssn(module, is_aot_compile, + &target_machine_features, compile_options); +} + +namespace { + +// Align buffers to XLA:CPU minimal alignment. +int64_t memory_alignment(LogicalBuffer::Color) { + return cpu_function_runtime::MinAlign(); +} + +llvm::TargetOptions CompilerTargetOptions( + const HloModuleConfig& module_config) { + llvm::TargetOptions target_options; + // Always allow FMA fusion. This increases precision instead of decreasing it. + target_options.AllowFPOpFusion = llvm::FPOpFusion::Fast; + return target_options; +} + +std::pair GetIRModuleHooks( + const HloModule& hlo_module, + const LLVMCompiler::ModuleHook& user_pre_optimization_hook, + const LLVMCompiler::ModuleHook& user_post_optimization_hook) { + // Create the IR hooks. If applicable, each IR hook does the following: + // + // * Calls the user supplied module hook. + // * Writes out the IR to a file in the output directory designated by + // --xla_dump_to + const HloModule* hlo_module_ptr = &hlo_module; + auto hook = [user_pre_optimization_hook, user_post_optimization_hook, + hlo_module_ptr](bool optimized, + const llvm::Module& llvm_module) { + const auto& user_hook = + !optimized ? 
user_pre_optimization_hook : user_post_optimization_hook; + if (user_hook) { + user_hook(llvm_module); + } + + // Include LLVM module identifier suffix in case `llvm_module` is just a + // part of the original LLVM module constructed by the XLA. + absl::string_view id = llvm_module.getModuleIdentifier(); + size_t pos = std::min(id.size(), 1 + kXlaModuleIdentifier.size()); + llvm_ir::DumpIrIfEnabled(*hlo_module_ptr, llvm_module, optimized, + /*filename_suffix=*/id.substr(pos)); + }; + return {[hook](const llvm::Module& llvm_module) { + return hook(/*optimized=*/false, llvm_module); + }, + [hook](const llvm::Module& llvm_module) { + return hook(/*optimized=*/true, llvm_module); + }}; +} + +absl::Status VerifyLlvmModule(const llvm::Module& llvm_module) { + XLA_SCOPED_LOGGING_TIMER("CpuCompiler - Running LLVM verifier"); + + std::string err; + llvm::raw_string_ostream err_stream(err); + + // verifyModule() returns true if the module is broken. + TF_RET_CHECK(!llvm::verifyModule(llvm_module, &err_stream)) + << "Invalid LLVM IR before optimizations:\n" + << err_stream.str() + << "\nThis probably indicates a bug in the HLO -> LLVM IR lowering. " + "Rerun with --xla_dump_to to get the IR. "; + return absl::OkStatus(); +} + +absl::Status CreateHloProfilingArtifacts( + const HloModule& module, + absl::flat_hash_map* + instruction_to_profile_idx, + absl::flat_hash_map* + computation_to_profile_idx, + std::unique_ptr* hlo_profile_index_map, + std::unique_ptr* hlo_profile_printer_data) { + *hlo_profile_index_map = std::make_unique(module); + const HloComputation& entry_computation = *module.entry_computation(); + + TF_ASSIGN_OR_RETURN( + *instruction_to_profile_idx, + CollectProfileCandidates::GetCandidatesForComputation( + entry_computation, + (*hlo_profile_index_map)->instruction_to_profile_idx())); + + auto shape_size_bytes = [](const Shape& shape) { + // On the cpu, opaques are pointers. 
+ if (shape.IsOpaque()) { + return static_cast(sizeof(void*)); + } + return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); + }; + + HloCostAnalysis cost_analysis(shape_size_bytes); + TF_RETURN_IF_ERROR(entry_computation.Accept(&cost_analysis)); + *hlo_profile_printer_data = CreateHloProfilePrinterData( + **hlo_profile_index_map, cost_analysis, entry_computation.name()); + *computation_to_profile_idx = + (*hlo_profile_index_map)->computation_to_profile_idx(); + + return absl::OkStatus(); +} + +} // namespace + +absl::StatusOr> CpuCompiler::RunHloPasses( + std::unique_ptr module, se::StreamExecutor* /*stream_exec*/, + const CompileOptions& options) { + auto& config = module->config(); + + TF_ASSIGN_OR_RETURN( + std::unique_ptr jit_target_machine, + IrCompiler::InferTargetMachine( + CompilerTargetOptions(config), IrCompiler::GetCodeGenOptLevel(config), + CpuFeatureFromString(config.debug_options().xla_cpu_max_isa()))); + + TF_RETURN_IF_ERROR(RunHloPasses(module.get(), /*is_aot_compile=*/false, + jit_target_machine.get(), + /*compile_options=*/options)); + return std::move(module); +} + +namespace { + +static void DumpModuleToFile(const llvm::Module& llvm_module, + const llvm::object::ObjectFile& obj_file, + const HloModule& hlo_module) { + absl::string_view id = llvm_module.getModuleIdentifier(); + size_t pos = std::min(id.size(), 1 + kXlaModuleIdentifier.size()); + auto get_file_suffix = [&]() { + std::vector parts = {"obj-file"}; + parts.reserve(3); + absl::string_view middle_name = id.substr(pos); + if (!middle_name.empty()) { + parts.push_back(middle_name); + } + parts.push_back("o"); + return absl::StrJoin(parts, "."); + }; + DumpToFileInDir( + hlo_module, /*file_prefix=*/"", get_file_suffix(), + absl::string_view(obj_file.getData().data(), obj_file.getData().size())); +} + +// Post-compilation callback functor for use by SimpleOrcJIT. +// +// Dumps machine code if dumping is enabled for the module. 
+static std::function +CreateOrcJITPostCompilationHook(const HloModule* hlo_module, + std::vector* obj_files) { + return [=](const llvm::Module& llvm_module, + const llvm::object::ObjectFile& obj_file) { + if (obj_files) obj_files->push_back(obj_file.getData().str()); + + if (DumpingEnabledForHloModule(*hlo_module)) { + DumpModuleToFile(llvm_module, obj_file, *hlo_module); + } + }; +} + +struct ComputationToEmit { + HloComputation* computation; + + // Are we emitting this computation with fast-math reassociation enabled? + // We enable reassociation for reductions because it has a significant + // performance impact. + bool allow_reassociation; + + bool operator==(const ComputationToEmit& other) const { + return computation == other.computation && + allow_reassociation == other.allow_reassociation; + } + + template + friend H AbslHashValue(H h, const ComputationToEmit& c) { + return H::combine(std::move(h), c.computation, c.allow_reassociation); + } +}; + +std::vector SubcomputationEmissionOrder( + HloComputation* root) { + absl::flat_hash_set visited; + std::vector postorder; + + // agenda of (node, leave) pairs. 
+ std::stack> agenda; + agenda.emplace(ComputationToEmit{root, false}, false); + while (!agenda.empty()) { + ComputationToEmit c; + bool leave; + std::tie(c, leave) = agenda.top(); + agenda.pop(); + + if (leave) { + postorder.push_back(c); + continue; + } + + if (visited.insert(c).second) { + agenda.emplace(c, true); + for (auto* instruction : c.computation->instructions()) { + bool allow_reassociation = + instruction->opcode() == HloOpcode::kAllReduce || + instruction->opcode() == HloOpcode::kReduce || + instruction->opcode() == HloOpcode::kReduceWindow; + auto cc = absl::MakeSpan(instruction->called_computations()); + for (auto it = cc.rbegin(); it != cc.rend(); ++it) { + HloComputation* called_computation = *it; + ComputationToEmit callee{ + called_computation, c.allow_reassociation || allow_reassociation}; + if (!visited.contains(callee)) { + agenda.emplace(callee, false); + } + } + } + } + } + DCHECK(!postorder.empty() && postorder.back().computation == root); + postorder.pop_back(); + return postorder; +} + +} // namespace + +// Removes unused globals and function declarations from the LLVM module. +// +// After splitting LLVM module into multiple parts, we end up with unused +// symbols in each part: external globals and function declarations. We don't +// support linking across modules added to SimpleOrcJIT, and we don't need it, +// because we never construct LLVM IR that might require cross-module linking, +// so we can just remove unused symbols from each part. 
+static void RemoveUnusedSymbols(llvm::Module& module) { + llvm::SmallVector unused_globals; + llvm::SmallVector unused_functions; + + for (llvm::GlobalVariable& gv : module.globals()) { + if (gv.use_empty()) unused_globals.push_back(&gv); + } + for (llvm::Function& f : module.functions()) { + if (f.isDeclaration() && f.use_empty()) unused_functions.push_back(&f); + } + + for (auto* gv : unused_globals) { + module.eraseGlobalVariable(gv); + } + for (auto* f : unused_functions) { + f->eraseFromParent(); + } +} + +// Clones a ThreadSafeModule from the given LLVM module in a new LLVM context. +// +// To enable parallel compilation, each LLVM module has to be owned by a +// separate LLVM context. We take each part of the original module after a +// split, and clone it into a new LLVM context. +static llvm::orc::ThreadSafeModule CloneAsThreadSafeModule( + int64_t part, std::unique_ptr module) { + TraceMe trace([&] { + return TraceMeEncode("CpuCompiler::CloneAsThreadSafeModule", + {{"part", part}}); + }); + + // There is no way to clone a module from one context to another, so we need + // to serialize the module to bitcode and parse it back into the new context. + llvm::SmallString<0> bc; + llvm::raw_svector_ostream bcos(bc); + llvm::WriteBitcodeToFile(*module, bcos); + + // Parse module back into its own LLVM context. + auto clone_context = std::make_unique(); + auto clone_module = llvm::parseBitcodeFile( + llvm::MemoryBufferRef( + llvm::StringRef(bc.data(), bc.size()), + absl::StrFormat("%s_part_%02d", kXlaModuleIdentifier, part)), + *clone_context); + + return llvm::orc::ThreadSafeModule(std::move(*clone_module), + std::move(clone_context)); +} + +namespace { +// Compiled symbols (kernels and comparators) from a single LLVM module part. +struct CompiledSymbolsPart { + std::vector kernels; + std::vector comparators; +}; +} // namespace + +// Collect IrEmitter2 symbols that got into the LLVM module part. 
We issue +// compilation tasks in parallel, and to maximize concurrency we don't issue +// separate compilation tasks that compile symbols from the same module. +static CompiledSymbolsPart CollectCompiledSymbolsPart( + const IrEmitter2& ir_emitter, const llvm::Module& module) { + CompiledSymbolsPart syms; + + auto find_kernel = + [&](llvm::StringRef name) -> std::optional { + for (auto& k : ir_emitter.kernels()) { + if (k.name == name) return k; + } + return std::nullopt; + }; + + auto find_comparator = + [&](llvm::StringRef name) -> std::optional { + for (auto& c : ir_emitter.comparators()) { + if (c.name == name) return c; + } + return std::nullopt; + }; + + for (auto& f : module.functions()) { + if (auto kernel = find_kernel(f.getName())) { + syms.kernels.push_back(*kernel); + } + if (auto comparator = find_comparator(f.getName())) { + syms.comparators.push_back(*comparator); + } + } + + return syms; +} + +// If LLVM module has large constants constructed from literals, we don't want +// to split it, because it will cause us to copy large constants across module +// parts. We should not be storing large constants in LLVM IR in a first place, +// but while we do that, we have to be extra-careful, or it leads to extremely +// long compilation times, OOMs and timeouts. +// +// TODO(b/361800465): Figure out how to avoid putting large constants into +// LLVM IR in the first place. +static bool HasLargeConstants(llvm::Module& module) { + static constexpr int kMaxConstantSize = 10000; + for (auto& g : module.globals()) { + if (!g.hasInitializer()) { + continue; + } + + llvm::Constant* initializer = g.getInitializer(); + if (auto* arr = llvm::dyn_cast(initializer->getType())) { + if (arr->getNumElements() > kMaxConstantSize) return true; + } + } + return false; +} + +inline void VlogMaxIsa(absl::string_view max_cpu_isa) { + if (VLOG_IS_ON(1) && !max_cpu_isa.empty()) { + if (tsl::port::IsX86CPU()) { + VLOG(1) << "`xla_cpu_max_isa` is set. 
Will not use features newer than: " + << max_cpu_isa; + } else { + VLOG(1) << "`xla_cpu_max_isa` is set to `" << max_cpu_isa + << "`. This flag is not supported on non-x86 CPUs yet."; + } + } +} + +// We keep HloProto in the CpuExecutable, but we don't need to keep literals +// payload in it as we use it only for debugging and memory analysis. +static void StripPayloadFromLiteralProto(HloProto& proto) { + auto* module = proto.mutable_hlo_module(); + for (auto& computation : *module->mutable_computations()) { + for (auto& instruction : *computation.mutable_instructions()) { + // We only keep literal shape to correctly estimate memory usage of the + // HLO module, but we don't need the actual literal data. + if (instruction.has_literal()) { + LiteralProto literal; + *literal.mutable_shape() = instruction.literal().shape(); + *instruction.mutable_literal() = std::move(literal); + } + } + } +} + +// Extracts the given set of kernels from the original module. +// Returns a new module with the extracted kernels. +static absl::StatusOr> ExtractKernelsFromModule( + llvm::Module* original_module, + absl::flat_hash_set kernels) { + // Clone into a new module, only keeping definitions of the relevant kernels. + auto should_clone_definition = [&kernels](const llvm::GlobalValue* gv) { + if (auto* func = llvm::dyn_cast(gv)) { + return kernels.contains(func->getName()); + } + return false; + }; + llvm::ValueToValueMapTy vmap; + std::unique_ptr module = + llvm::CloneModule(*original_module, vmap, should_clone_definition); + + // Erase the cloned symbols from the original module. 
+ for (const auto& kernel_name : kernels) { + llvm::Function* to_be_removed = original_module->getFunction(kernel_name); + if (to_be_removed == nullptr) { + return Internal("Cannot remove kernel %s: cannot be found in module %s", + kernel_name, original_module->getName()); + } + to_be_removed->eraseFromParent(); + } + return module; +} + +static void AddXlaBackendExtraOptionsAsModuleFlag( + llvm::Module* llvm_module, llvm::StringRef backend_extra_options) { + auto* options_mdstring = + llvm::MDString::get(llvm_module->getContext(), backend_extra_options); + llvm_module->addModuleFlag(llvm::Module::Error, "xla_backend_extra_options", + options_mdstring); +} + +absl::StatusOr> +CpuCompiler::CompileCpuExecutable(std::unique_ptr module) { + TraceMe trace([&] { + return TraceMeEncode("CpuCompiler::CompileCpuExecutable", + {{"name", module->name()}}); + }); + + ModuleHook pre_optimization_ir_hook; + ModuleHook post_optimization_ir_hook; + std::tie(pre_optimization_ir_hook, post_optimization_ir_hook) = + GetIRModuleHooks(*module, user_pre_optimization_hook_, + user_post_optimization_hook_); + + // Compile must be thread-safe so create a new LLVM context for the module. + mlir::MLIRContext mlir_context; + auto llvm_context = std::make_unique(); + auto llvm_module = + std::make_unique(kXlaModuleIdentifier, *llvm_context); + + const DebugOptions& debug_options = module->config().debug_options(); + + // We collect compiled object files (machine code) so we can export + // CpuExecutable to an AOT compilation result. + std::vector obj_files; + + // We split LLVM module and distribute it across separate DyLibs to enable + // parallel compilation at run time. + size_t parallel_codegen_split_count = + debug_options.xla_cpu_parallel_codegen_split_count(); + VlogMaxIsa(debug_options.xla_cpu_max_isa()); + + const HloModuleConfig& config = module->config(); + + // Options for compiling LLVM IR to machine code. 
+ IrCompiler::Options ir_compiler_options{ + /*optimization_level=*/IrCompiler::GetCodeGenOptLevel(config), + /*optimize_for_size=*/options::OptimizeForSizeRequested(config), + /*max_cpu_isa=*/CpuFeatureFromString(debug_options.xla_cpu_max_isa()), + /*fast_math_flags=*/llvm_ir::GetCpuFastMathFlags(config), + /*disable_expensive_passes=*/ + debug_options.xla_llvm_disable_expensive_passes(), + /*slp_vectorizer_disabled=*/options::SlpVectorizerDisabled(config), + /*disable_loop_unrolling=*/options::DisableLoopUnrolling(config), + }; + + // Compiler hooks to intercept compiled LLVM IR modules. + IrCompiler::CompilationHooks ir_compiler_hooks{ + pre_optimization_ir_hook, + post_optimization_ir_hook, + CreateOrcJITPostCompilationHook(module.get(), &obj_files), + }; + + // Definition generator to link with XLA:CPU host runtime symbols. + ExecutionEngine::DefinitionGenerator definition_generator = + [](const llvm::DataLayout& data_layout) { + return std::make_unique(data_layout); + }; + + // Options for orchestrating the JIT compilation process. 
+ JitCompiler::Options jit_compiler_options{ + /*num_dylibs=*/parallel_codegen_split_count, + /*definition_generator=*/std::move(definition_generator), + }; + + std::unique_ptr ir_compiler = IrCompiler::Create( + CompilerTargetOptions(module->config()), std::move(ir_compiler_options), + std::move(ir_compiler_hooks)); + + TF_ASSIGN_OR_RETURN( + JitCompiler jit_compiler, + JitCompiler::Create(std::move(jit_compiler_options), + std::move(ir_compiler), GetCompilationTaskRunner())); + + HloComputation* entry_computation = module->entry_computation(); + absl::flat_hash_map + instruction_to_profile_idx; + absl::flat_hash_map + computation_to_profile_idx; + std::unique_ptr hlo_profile_index_map; + std::unique_ptr hlo_profile_printer_data; + if (module->config().hlo_profiling_enabled()) { + TF_RETURN_IF_ERROR(CreateHloProfilingArtifacts( + *module, &instruction_to_profile_idx, &computation_to_profile_idx, + &hlo_profile_index_map, &hlo_profile_printer_data)); + } + + // Cache these flags here since we'll want to access them after the module's + // ownership is std::moved. + const bool embed_ir_in_executable = + debug_options.xla_embed_ir_in_executable(); + + TF_ASSIGN_OR_RETURN(HloSchedule schedule, CreateHloSchedule(*module)); + TF_RETURN_IF_ERROR(module->set_schedule(schedule)); + + TF_ASSIGN_OR_RETURN(std::unique_ptr assignment, + CreateBufferAssignment(*module)); + DumpHloModuleIfEnabled(*module, *assignment, + absl::StrCat("cpu_", kAfterOptimizationsDumpName)); + + // Dump computation proto state and buffer assignment for + // GetCompiledMemoryStats results. 
+ auto with_hlo_proto = [&](std::unique_ptr cpu_executable) { + auto hlo_proto = std::make_unique(); + *hlo_proto->mutable_hlo_module() = cpu_executable->module().ToProto(); + *hlo_proto->mutable_buffer_assignment() = + cpu_executable->buffer_assignment().ToProto(); + StripPayloadFromLiteralProto(*hlo_proto); + cpu_executable->set_hlo_proto(std::move(hlo_proto)); + return cpu_executable; + }; + + TargetMachineFeatures target_machine_features(jit_compiler.target_machine()); + + // TODO(ezhulenev): Once we fully migrate to Thunks current IrEmitter should + // be renamed to NestedIrEmitter and be used only for emitting nested (aka + // thread local or embedded) computations (reductions, maps, etc.). + + // (Nested) IrEmitter is responsible for building LLVM module with functions + // for all HLO computations. In thunk execution mode we only build LLVM + // functions for embedded computations (e.g. reduction computations) and all + // high-level operations (fusions, elementwise, etc.) are lowered to kernel + // functions (which are also LLVM functions, but use a HostKernel ABI). + IrEmitter nested_ir_emitter( + &mlir_context, *module, *assignment, llvm_module.get(), + std::move(instruction_to_profile_idx), + std::move(computation_to_profile_idx), + ModuleComputationsTransitivelyContainCustomCall(*module), + &target_machine_features, +#ifdef MEMORY_SANITIZER + /*emit_code_for_msan=*/true +#else + /*emit_code_for_msan=*/false +#endif + ); + + // If we use Thunk runtime then instead of emitting LLVM function for the + // entry computation we emit a sequence of thunks that implement the + // computation as a sequence of interpreted commands. + if (module->config().debug_options().xla_cpu_use_thunk_runtime()) { + // The thunk runtime manages large constants, therefore we only emit + // small ones. 
+ TF_RETURN_IF_ERROR(nested_ir_emitter.EmitSmallConstantGlobals()); + + // IR emitter is responsible for building LLVM module with host kernels for + // corresponding HLO instructions (fusions, elemental instructions, etc.). + IrEmitter2 ir_emitter2(*module, llvm_module.get(), &nested_ir_emitter); + + // Thunk emitter is responsible for building a Thunk sequence that will + // resolved kernels in the compiled LLVM module and execute them together + // with Thunks implemented as library calls (e.g. oneDNN or Eigen). + ThunkEmitter thunk_emitter(ir_emitter2, *assignment, + target_machine_features, module->config()); + TF_ASSIGN_OR_RETURN(ThunkSequence thunks, + thunk_emitter.EmitEntryComputation(*module)); + + std::string ir_module_string; + if (embed_ir_in_executable) { + std::string emitter2_ir = llvm_ir::DumpToString(llvm_module.get()); + + auto thunk_kernel_fmt = [](std::string* out, + const ThunkEmitter::EmittedKernel& kernel) { + absl::StrAppend( + out, llvm_ir::DumpToString(kernel.module.getModuleUnlocked())); + }; + std::string thunks_ir = + absl::StrJoin(thunk_emitter.kernels(), "\n", thunk_kernel_fmt); + + ir_module_string = absl::StrCat(emitter2_ir, "\n", thunks_ir); + } + + TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module)); + for (const auto& [name, module] : thunk_emitter.kernels()) { + TF_RETURN_IF_ERROR(VerifyLlvmModule(*module.getModuleUnlocked())); + } + + // Some kernels have to be compiled separately because they have + // extra backend options. 
+ int num_extra_functions = 0; + using BackendOptions = llvm::StringRef; + using Kernel = llvm::StringRef; + absl::flat_hash_map> + backend_extra_options_to_kernels; + for (const auto& k : ir_emitter2.kernels()) { + if (k.backend_extra_options.empty()) continue; + auto [_, inserted] = + backend_extra_options_to_kernels[k.backend_extra_options].insert( + k.name); + CHECK(inserted) << "Kernel " << k.name << " is not unique"; + num_extra_functions++; + } + const int num_extra_parts = backend_extra_options_to_kernels.size(); + // We assign one dylib to each set of kernels that have the same extra + // backend options. We do this because we work under the assumption that + // very few kernels will set extra options, and if they do, the options are + // likely to be identical. + if (num_extra_parts >= parallel_codegen_split_count) { + return Internal( + "Too many extra compilation parts due to non-default options (%d). " + "Consider reducing this number or increasing " + "parallel_codegen_split_count (%d)", + num_extra_parts, parallel_codegen_split_count); + } + + // We define the number of module parts based on the total number of + // compiled functions (kernels and comparators) that are called from thunks, + // and the maximum number of parts that we want to split the module into. + size_t num_compiled_functions = ir_emitter2.kernels().size() + + ir_emitter2.comparators().size() + + thunk_emitter.kernels().size(); + size_t num_default_parts = + std::min(num_compiled_functions - num_extra_functions, + parallel_codegen_split_count - num_extra_parts); + + // JIT compile the LLVM IR module to in-memory machine code. We split the + // module into `num_jit_dylibs` parts to allow parallel compilation. In + // practice, all of the kernel functions are independent and don't call each + // other, so we can compile each individual part in parallel. 
We split + // module preserving locals, which should guarantee that all thread local + // computations end up in the same module with the corresponding kernel. + + // Collect all compiled symbols grouped by LLVM module part, so that we can + // issue compile tasks in parallel without any interference. + std::vector compiled_parts; + + VLOG(2) << "Compile LLVM module with " << ir_emitter2.kernels().size() + << " kernels and " << ir_emitter2.comparators().size() + << " comparators"; + + int dylib_index = 0; + auto add_jit_module = [&](std::unique_ptr llvm_module_part) { + // Collect symbols that are compiled in this LLVM module part. + RemoveUnusedSymbols(*llvm_module_part); + compiled_parts.push_back( + CollectCompiledSymbolsPart(ir_emitter2, *llvm_module_part)); + + std::string dump = llvm_ir::DumpToString(llvm_module_part.get()); + VLOG(5) << "Adding compilation module:\n" << dump; + + // Clone LLVM module part into its own thread safe context. + auto tsm = + CloneAsThreadSafeModule(dylib_index, std::move(llvm_module_part)); + TF_CHECK_OK(jit_compiler.AddModule(std::move(tsm), dylib_index++)); + }; + + // If there are extra parts, compile them first, since we must + // remove the affected kernels from the LLVM module. 
+ if (num_extra_parts > 0) { + TraceMe trace([&] { + return TraceMeEncode("CompileExtraKernels", + {{"num_extra_parts", num_extra_parts}}); + }); + for (const auto& [backend_extra_options, kernels] : + backend_extra_options_to_kernels) { + TF_ASSIGN_OR_RETURN( + std::unique_ptr new_module, + ExtractKernelsFromModule(llvm_module.get(), kernels)); + AddXlaBackendExtraOptionsAsModuleFlag(new_module.get(), + backend_extra_options); + add_jit_module(std::move(new_module)); + } + } + + if (HasLargeConstants(*llvm_module)) { + VLOG(3) << "Skip parallel compilation due to large constants"; + num_default_parts = 1; + } + + if (num_default_parts > 1) { + VLOG(3) << "Split LLVM module into " << num_default_parts + << " parts before codegen to enable parallel compilation" + << " (max split count: " << parallel_codegen_split_count << ")"; + + TraceMe trace([&] { + return TraceMeEncode("SplitModule", + {{"num_default_parts", num_default_parts}}); + }); + + llvm::SplitModule(*llvm_module, num_default_parts, add_jit_module, + /*PreserveLocals=*/true, /*RoundRobin=*/true); + // Free resources used by the original LLVM module. + llvm_module.reset(); + llvm_context.reset(); + + } else { + VLOG(3) << "Compile LLVM module without splitting (max split count: " + << parallel_codegen_split_count << ")"; + compiled_parts.push_back( + CollectCompiledSymbolsPart(ir_emitter2, *llvm_module)); + TF_CHECK_OK(jit_compiler.AddModule(llvm::orc::ThreadSafeModule( + std::move(llvm_module), std::move(llvm_context)))); + } + + // Collect compiled symbols from all LLVM module parts. + std::vector compiled_symbols; + + absl::flat_hash_map + symbol_type_id_to_function_type_id; + + VLOG(3) << "Adding " << thunk_emitter.kernels().size() + << " kernels to the JIT compiler"; + // Make sure we use all the "default" modules for maximum parallelism. + int num_default_so_far = dylib_index - num_extra_parts; + int kernel_dylib_index = + num_default_so_far < num_default_parts ? 
num_default_so_far : 0; + for (auto& [name, module] : thunk_emitter.kernels()) { + compiled_symbols.push_back( + FunctionLibrary::Sym(name)); + symbol_type_id_to_function_type_id.emplace( + compiled_symbols.back().type_id, SymbolProto::KERNEL); + TF_CHECK_OK(jit_compiler.AddModule(std::move(module), + num_extra_parts + kernel_dylib_index)); + // Simply roundrobin the default kernel dylibs + kernel_dylib_index = (kernel_dylib_index + 1) % num_default_parts; + } + + for (const CompiledSymbolsPart& part : compiled_parts) { + for (const IrEmitter2::KernelInfo& kernel : part.kernels) { + compiled_symbols.push_back( + FunctionLibrary::Sym(kernel.name)); + symbol_type_id_to_function_type_id.emplace( + compiled_symbols.back().type_id, SymbolProto::KERNEL); + } + for (const IrEmitter2::ComparatorInfo& comparator : part.comparators) { + compiled_symbols.push_back( + FunctionLibrary::Sym(comparator.name)); + symbol_type_id_to_function_type_id.emplace( + compiled_symbols.back().type_id, SymbolProto::COMPARATOR); + } + } + + VLOG(3) << "Collected " << compiled_symbols.size() << " compiled symbols"; + + TraceMe trace_codegen([&] { + return TraceMeEncode( + "Codegen", {{"num_default_parts", num_default_parts}, + {"num_extra_parts", num_extra_parts}, + {"num_compiled_functions", num_compiled_functions}}); + }); + + TF_ASSIGN_OR_RETURN(std::unique_ptr function_library, + std::move(jit_compiler).Compile(compiled_symbols)); + + // Create constant allocations from the buffer assignment. + TF_ASSIGN_OR_RETURN(std::vector constants, + CreateConstantAllocations(*assignment)); + + TF_ASSIGN_OR_RETURN( + auto cpu_executable, + CpuExecutable::Create(std::move(function_library), + std::move(assignment), std::move(module), + std::move(thunks), std::move(constants), + std::move(hlo_profile_printer_data), + std::move(hlo_profile_index_map))); + + // Save object files to be able to export them to AOT compilation + // result. 
+ cpu_executable->set_obj_files(std::move(obj_files)); + + // Save compiled symbols to be able to export them to AOT compilation + // result. + cpu_executable->set_compiled_symbols(std::move(compiled_symbols)); + + // Save mapping between symbol type id and function type id to be able to + // export them to AOT compilation result. + cpu_executable->set_symbol_type_id_to_function_type_id( + symbol_type_id_to_function_type_id); + + if (embed_ir_in_executable) { + cpu_executable->set_ir_module_string(ir_module_string); + } + + return with_hlo_proto(std::move(cpu_executable)); + } + + TF_RETURN_IF_ERROR(nested_ir_emitter.EmitAllConstantGlobals()); + + // Each computation is a single function. Emit all embedded computations + // before the entry computation. The order of computations returned from + // SubcomputationEmissionOrder guarantees that a called computation occurs + // before a caller computation. + for (ComputationToEmit subcomputation : + SubcomputationEmissionOrder(entry_computation)) { + if (subcomputation.computation->IsFusionComputation()) { + continue; + } + TF_RETURN_IF_ERROR( + nested_ir_emitter + .EmitComputation( + subcomputation.computation, subcomputation.computation->name(), + /*is_top_level_computation=*/false, + schedule.sequence(subcomputation.computation).instructions(), + subcomputation.allow_reassociation) + .status()); + } + absl::string_view function_name_prefix = entry_computation->name().empty() + ? 
"__compute" + : entry_computation->name(); + TF_ASSIGN_OR_RETURN(llvm::Function * entry_function, + nested_ir_emitter.EmitComputation( + entry_computation, function_name_prefix, + /*is_top_level_computation=*/true, + schedule.sequence(entry_computation).instructions(), + /*allow_reassociation=*/false)); + + std::string ir_module_string; + if (embed_ir_in_executable) { + ir_module_string = llvm_ir::DumpToString(llvm_module.get()); + } + + TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module)); + + // Save entry function name before destroying LLVM module. + std::string entry_function_name = entry_function->getName().str(); + + // JIT compile the LLVM IR module to in-memory machine code. + llvm::orc::ThreadSafeModule thread_safe_module(std::move(llvm_module), + std::move(llvm_context)); + TF_RETURN_IF_ERROR(jit_compiler.AddModule(std::move(thread_safe_module))); + + using ComputeFn = std::remove_pointer_t; + TF_ASSIGN_OR_RETURN( + std::unique_ptr function_library, + std::move(jit_compiler) + .Compile({FunctionLibrary::Sym(entry_function_name)})); + + TF_ASSIGN_OR_RETURN( + auto cpu_executable, + CpuExecutable::Create(std::move(function_library), std::move(assignment), + std::move(module), entry_function_name, + std::move(hlo_profile_printer_data), + std::move(hlo_profile_index_map))); + + cpu_executable->set_obj_files(std::move(obj_files)); + + if (embed_ir_in_executable) { + cpu_executable->set_ir_module_string(ir_module_string); + } + + return with_hlo_proto(std::move(cpu_executable)); +} + +absl::StatusOr> CpuCompiler::RunBackend( + std::unique_ptr module, + [[maybe_unused]] se::StreamExecutor* stream_exec, + const CompileOptions& options) { + TraceMe trace([&] { + return TraceMeEncode("CpuCompiler::RunBackend", {{"name", module->name()}}); + }); + + VLOG(1) << "Compiling: " << module->name(); + RecordCpuCompilerStacktrace(); + XLA_SCOPED_LOGGING_TIMER( + absl::StrFormat("Compiling [%s] for CPU using JIT", module->name())); + std::string slow_compilation_msg = + 
absl::StrCat("Compiling module ", module->name()); + auto slow_compile_alarm = SlowCompilationAlarm(slow_compilation_msg); + auto llvm_options = llvm_ir::ExtractXlaBackendExtraOptions( + module->config().debug_options().xla_backend_extra_options()); + llvm_ir::LLVMCommandLineOptionsLock llvm_lock(llvm_options); + + std::unique_ptr cpu_executable; + TF_ASSIGN_OR_RETURN(cpu_executable, CompileCpuExecutable(std::move(module))); + + cpu_executable->set_debug_info( + cpu_executable->buffer_assignment().StatsString( + /*report_total_fragmentation=*/true)); + VLOG(1) << "Compilation finished"; + return std::unique_ptr(std::move(cpu_executable)); +} + +absl::StatusOr>> +CpuCompiler::CompileAheadOfTime(std::unique_ptr module_group, + const AotCompilationOptions& aot_options) { + TF_RET_CHECK(!module_group->empty()); + std::vector> modules = + module_group->ConsumeModules(); + + auto llvm_options = llvm_ir::ExtractXlaBackendExtraOptions( + modules[0]->config().debug_options().xla_backend_extra_options()); + VlogMaxIsa(modules[0]->config().debug_options().xla_cpu_max_isa()); + llvm_ir::LLVMCommandLineOptionsLock llvm_lock(llvm_options); + + // We can pass just one llvm::TargetOptions when we compile the LLVM module, + // so we bail if the configs have conflicting flags. At the moment, the only + // flags that need to be consistent are for fast-math. + for (const auto& fn_and_name : + {std::make_pair(&DebugOptions::xla_cpu_enable_fast_math, + "xla_cpu_enable_fast_math"), + std::make_pair(&DebugOptions::xla_cpu_fast_math_honor_infs, + "xla_cpu_fast_math_honor_infs"), + std::make_pair(&DebugOptions::xla_cpu_fast_math_honor_nans, + "xla_cpu_fast_math_honor_nans")}) { + // This only works because each of the method pointers above returns a + // bool. Otherwise we'd have to do some template magic. 
+ const auto& field_method_ptr = fn_and_name.first; + const auto& field_name = fn_and_name.second; + bool first_module_val = + (modules[0]->config().debug_options().*field_method_ptr)(); + for (int64_t i = 0; i < modules.size(); ++i) { + bool cur_module_val = + (modules[i]->config().debug_options().*field_method_ptr)(); + if (first_module_val != cur_module_val) { + return InvalidArgument( + "All HLO module configs must have the same value for %s, but " + "module 0 and %d have different values (%d vs %d).", + field_name, i, first_module_val, cur_module_val); + } + } + } + + if (aot_options.PlatformId() != se::host::kHostPlatformId) { + return InvalidArgument("Incompatible AOT compilation platform"); + } + const CpuAotCompilationOptions& options = + static_cast(aot_options); + llvm::Triple triple(llvm::Triple::normalize(options.triple())); + std::string error; + const llvm::Target* target = + llvm::TargetRegistry::lookupTarget(triple.getTriple(), error); + if (target == nullptr) { + return Internal("TargetRegistry::lookupTarget failed: %s", error); + } + + llvm::Reloc::Model reloc_model = llvm::Reloc::Static; + llvm::PICLevel::Level pic_level = llvm::PICLevel::NotPIC; + llvm::PIELevel::Level pie_level = llvm::PIELevel::Default; + switch (options.relocation_model()) { + case CpuAotCompilationOptions::RelocationModel::Static: + reloc_model = llvm::Reloc::Static; + pic_level = llvm::PICLevel::NotPIC; + pie_level = llvm::PIELevel::Default; + break; + case CpuAotCompilationOptions::RelocationModel::SmallPic: + reloc_model = llvm::Reloc::PIC_; + pic_level = llvm::PICLevel::SmallPIC; + pie_level = llvm::PIELevel::Default; + break; + case CpuAotCompilationOptions::RelocationModel::BigPic: + reloc_model = llvm::Reloc::PIC_; + pic_level = llvm::PICLevel::BigPIC; + pie_level = llvm::PIELevel::Default; + break; + case CpuAotCompilationOptions::RelocationModel::SmallPie: + reloc_model = llvm::Reloc::PIC_; + pic_level = llvm::PICLevel::SmallPIC; + pie_level = 
llvm::PIELevel::Small; + break; + case CpuAotCompilationOptions::RelocationModel::BigPie: + reloc_model = llvm::Reloc::PIC_; + pic_level = llvm::PICLevel::BigPIC; + pie_level = llvm::PIELevel::Large; + break; + } + llvm::CodeGenOptLevel opt_level = + IrCompiler::GetCodeGenOptLevel(modules[0]->config()); + llvm::TargetOptions target_options = + CompilerTargetOptions(modules[0]->config()); + auto target_machine_builder = [&]() { + return absl::WrapUnique(target->createTargetMachine( + triple.getTriple(), options.cpu_name(), options.features(), + target_options, reloc_model, std::nullopt, opt_level)); + }; + + std::unique_ptr target_machine = + target_machine_builder(); + + // Compile must be thread-safe so create a new LLVM context for the module. + mlir::MLIRContext mlir_context; + llvm::LLVMContext llvm_context; + + std::vector> results; + for (auto& hlo_module : modules) { + VLOG(1) << "Compiling ahead-of-time: " << hlo_module->name(); + if (hlo_module->has_schedule()) { + continue; + } + + TF_RETURN_IF_ERROR(RunHloPasses(hlo_module.get(), /*is_aot_compile=*/true, + target_machine.get(), + /*dummy*/ CompileOptions{})); + + if (hlo_module->config().debug_options().xla_cpu_use_thunk_runtime()) { + TF_ASSIGN_OR_RETURN(results.emplace_back(), + CompileAheadOfTimeThunks( + std::move(hlo_module), target_machine_builder, + options, triple, pic_level, pie_level)); + } else { + TF_ASSIGN_OR_RETURN(results.emplace_back(), + CompileAheadOfTimeLegacy( + std::move(hlo_module), target_machine_builder, + options, triple, pic_level, pie_level)); + } + } + + VLOG(1) << "Compilation finished"; + return std::move(results); +} + +absl::StatusOr> +CpuCompiler::CompileAheadOfTimeLegacy( + std::unique_ptr module, + IrCompiler::TargetMachineBuilder target_machine_builder, + const CpuAotCompilationOptions& aot_options, const llvm::Triple& triple, + const llvm::PICLevel::Level& pic_level, + const llvm::PIELevel::Level& pie_level) { + TF_ASSIGN_OR_RETURN(HloSchedule schedule, + 
ScheduleModule(module.get(), BufferSizeBytesFunction())); + + // Run buffer analysis on the HLO graph. This analysis figures out which + // temporary buffers are required to run the computation. + TF_ASSIGN_OR_RETURN( + std::unique_ptr assignment, + BufferAssigner::Run(module.get(), + std::make_unique(schedule), + BufferSizeBytesFunction(), memory_alignment, + /*allocate_buffers_for_constants=*/true)); + // BufferAssignment::ToString() includes a header, so no need for us to + // print one ourselves. + if (DumpingEnabledForHloModule(*module)) { + DumpToFileInDirOrStdout(*module, "", "buffer_assignment", + assignment->ToString()); + } + DumpHloModuleIfEnabled(*module, *assignment, + absl::StrCat("cpu_", kAfterOptimizationsDumpName)); + + absl::flat_hash_map + instruction_to_profile_idx; + absl::flat_hash_map + computation_to_profile_idx; + std::unique_ptr hlo_profile_index_map; + std::unique_ptr hlo_profile_printer_data; + + if (module->config().hlo_profiling_enabled()) { + TF_RETURN_IF_ERROR(CreateHloProfilingArtifacts( + *module, &instruction_to_profile_idx, &computation_to_profile_idx, + &hlo_profile_index_map, &hlo_profile_printer_data)); + } + + TF_ASSIGN_OR_RETURN(std::unique_ptr target_machine, + target_machine_builder()); + TargetMachineFeatures target_machine_features(target_machine.get()); + std::vector buffer_infos = + CreateBufferInfosFromBufferAssignment(*module, *assignment); + HloComputation* computation = module->entry_computation(); + + // Compile must be thread-safe so create a new LLVM context for the module. 
+ mlir::MLIRContext mlir_context; + auto llvm_context = std::make_unique(); + + // Set required information before emitting IR + auto llvm_module = + std::make_unique(kXlaModuleIdentifier, *llvm_context); + llvm_module->setDataLayout(target_machine->createDataLayout()); + llvm_module->setTargetTriple(triple); + if (pic_level != llvm::PICLevel::NotPIC) { + llvm_module->setPICLevel(pic_level); + } + if (pie_level != llvm::PIELevel::Default) { + llvm_module->setPIELevel(pie_level); + } + IrEmitter ir_emitter(&mlir_context, *module, *assignment, llvm_module.get(), + std::move(instruction_to_profile_idx), + std::move(computation_to_profile_idx), + ModuleComputationsTransitivelyContainCustomCall(*module), + &target_machine_features, + // TODO(b/66051036): Run full msan for AOT. + /*emit_code_for_msan=*/false); + + TF_RETURN_IF_ERROR(ir_emitter.EmitAllConstantGlobals()); + + for (ComputationToEmit subcomputation : + SubcomputationEmissionOrder(computation)) { + if (subcomputation.computation->IsFusionComputation()) { + continue; + } + TF_RETURN_IF_ERROR( + ir_emitter + .EmitComputation( + subcomputation.computation, subcomputation.computation->name(), + /*is_top_level_computation=*/false, + schedule.sequence(subcomputation.computation).instructions(), + subcomputation.allow_reassociation) + .status()); + } + const std::string& entry_point_name = aot_options.entry_point_name(); + TF_ASSIGN_OR_RETURN( + llvm::Function * entry_function, + ir_emitter.EmitComputation(computation, entry_point_name, + /*is_top_level_computation=*/true, + schedule.sequence(computation).instructions(), + /*allow_reassociation=*/false)); + + CHECK(entry_function->getName() == entry_point_name); + + ModuleHook pre_optimization_ir_hook; + ModuleHook post_optimization_ir_hook; + std::tie(pre_optimization_ir_hook, post_optimization_ir_hook) = + GetIRModuleHooks(*module, user_pre_optimization_hook_, + user_post_optimization_hook_); + + // Run the LLVM verifier over the unoptimized LLVM IR. 
If it fails, run + // the pre-optimization IR dump hook before returning. + { + absl::Status verify_status = VerifyLlvmModule(*llvm_module); + if (!verify_status.ok() && pre_optimization_ir_hook) { + pre_optimization_ir_hook(*llvm_module); + } + TF_RETURN_IF_ERROR(verify_status); + } + + auto post_codegen_hook = [&](const llvm::Module& llvm_module, + const llvm::object::ObjectFile& obj_file) { + if (!DumpingEnabledForHloModule(*module)) { + return; + } + DumpModuleToFile(llvm_module, obj_file, *module); + }; + + DebugOptions debug_options = module->config().debug_options(); + IrCompiler::Options ir_compiler_options = { + /*optimization_level=*/target_machine->getOptLevel(), + /*optimize_for_size=*/ + options::OptimizeForSizeRequested(module->config()), + /*max_cpu_isa=*/CpuFeatureFromString(debug_options.xla_cpu_max_isa()), + /*fast_math_flags=*/llvm_ir::GetCpuFastMathFlags(module->config()), + /*disable_expensive_passes=*/ + debug_options.xla_llvm_disable_expensive_passes(), + /*disable_slp_vectorizer=*/ + options::SlpVectorizerDisabled(module->config()), + /*disable_loop_unrolling=*/ + options::DisableLoopUnrolling(module->config()), + /*dfsan_enabled=*/aot_options.sanitize_dataflow(), + /*dfsan_abilists_enabled=*/aot_options.sanitize_abilists_dataflow()}; + + IrCompiler::CompilationHooks ir_compiler_hooks = { + pre_optimization_ir_hook, + post_optimization_ir_hook, + post_codegen_hook, + }; + + IrCompiler ir_compiler(std::move(target_machine_builder), + std::move(ir_compiler_options), + std::move(ir_compiler_hooks)); + + std::unique_ptr object_file = + cantFail(ir_compiler(*llvm_module)); + ObjectFileData object_file_data(object_file->getBufferStart(), + object_file->getBufferEnd()); + + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, + assignment->GetUniqueTopLevelOutputSlice()); + + return std::make_unique( + std::move(object_file_data), std::move(buffer_infos), + result_slice.index(), std::move(module), + std::move(hlo_profile_printer_data)); 
+} + +absl::StatusOr> +CpuCompiler::CompileAheadOfTimeThunks( + std::unique_ptr module, + IrCompiler::TargetMachineBuilder target_machine_builder, + const CpuAotCompilationOptions& aot_options, const llvm::Triple& triple, + const llvm::PICLevel::Level& pic_level, + const llvm::PIELevel::Level& pie_level) { + TraceMe trace([&] { + return TraceMeEncode("CpuCompiler::CompileAheadOfTimeThunks", + {{"name", module->name()}}); + }); + // Compile must be thread-safe so create a new LLVM context for the module. + mlir::MLIRContext mlir_context; + auto llvm_context = std::make_unique(); + + const DebugOptions& debug_options = module->config().debug_options(); + + TF_ASSIGN_OR_RETURN(HloSchedule schedule, CreateHloSchedule(*module)); + TF_RETURN_IF_ERROR(module->set_schedule(schedule)); + + TF_ASSIGN_OR_RETURN(std::unique_ptr assignment, + CreateBufferAssignment(*module)); + DumpHloModuleIfEnabled(*module, *assignment, + absl::StrCat("cpu_aot_", kAfterOptimizationsDumpName)); + + // TODO profiling related, probably delete this + absl::flat_hash_map + instruction_to_profile_idx; + absl::flat_hash_map + computation_to_profile_idx; + std::unique_ptr hlo_profile_index_map; + std::unique_ptr hlo_profile_printer_data; + if (module->config().hlo_profiling_enabled()) { + TF_RETURN_IF_ERROR(CreateHloProfilingArtifacts( + *module, &instruction_to_profile_idx, &computation_to_profile_idx, + &hlo_profile_index_map, &hlo_profile_printer_data)); + } + // probably delete this end + + TF_ASSIGN_OR_RETURN(std::unique_ptr target_machine, + target_machine_builder()); + TargetMachineFeatures target_machine_features(target_machine.get()); + + auto llvm_module = + std::make_unique(kXlaModuleIdentifier, *llvm_context); + + llvm_module->setDataLayout(target_machine->createDataLayout()); + llvm_module->setTargetTriple(triple); + if (pic_level != llvm::PICLevel::NotPIC) { + llvm_module->setPICLevel(pic_level); + } + if (pie_level != llvm::PIELevel::Default) { + llvm_module->setPIELevel(pie_level); + 
} + + // Emitting part + // TODO(ezhulenev): Once we fully migrate to Thunks current IrEmitter should + // be renamed to NestedIrEmitter and be used only for emitting nested (aka + // thread local or embedded) computations (reductions, maps, etc.). + + // (Nested) IrEmitter is responsible for building LLVM module with functions + // for all HLO computations. In thunk execution mode we only build LLVM + // functions for embedded computations (e.g. reduction computations) and all + // high-level operations (fusions, elementwise, etc.) are lowered to kernel + // functions (which are also LLVM functions, but use a HostKernel ABI). + IrEmitter nested_ir_emitter( + &mlir_context, *module, *assignment, llvm_module.get(), + std::move(instruction_to_profile_idx), + std::move(computation_to_profile_idx), + ModuleComputationsTransitivelyContainCustomCall(*module), + &target_machine_features, + // TODO(b/66051036): Run full msan for AOT. + /*emit_code_for_msan=*/false); + + // The thunk runtime manages large constants, therefore we only emit + // small ones. + TF_RETURN_IF_ERROR(nested_ir_emitter.EmitSmallConstantGlobals()); + + // IR emitter is responsible for building LLVM module with host kernels for + // corresponding HLO instructions (fusions, elemental instructions, etc.). + IrEmitter2 ir_emitter2(*module, llvm_module.get(), &nested_ir_emitter); + + // Thunk emitter is responsible for building a Thunk sequence that will + // resolved kernels in the compiled LLVM module and execute them together + // with Thunks implemented as library calls (e.g. oneDNN or Eigen). + ThunkEmitter thunk_emitter(ir_emitter2, *assignment, target_machine_features, + module->config()); + TF_ASSIGN_OR_RETURN(ThunkSequence thunks, + thunk_emitter.EmitEntryComputation(*module)); + + // Cache these flags here since we'll want to access them after the module's + // ownership is std::moved. 
+ const bool embed_ir_in_executable = + debug_options.xla_embed_ir_in_executable(); + + std::string ir_module_string; + if (embed_ir_in_executable) { + std::string emitter2_ir = llvm_ir::DumpToString(llvm_module.get()); + + auto thunk_kernel_fmt = [](std::string* out, + const ThunkEmitter::EmittedKernel& kernel) { + absl::StrAppend(out, + llvm_ir::DumpToString(kernel.module.getModuleUnlocked())); + }; + std::string thunks_ir = + absl::StrJoin(thunk_emitter.kernels(), "\n", thunk_kernel_fmt); + + ir_module_string = absl::StrCat(emitter2_ir, "\n", thunks_ir); + } + + TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module)); + for (const auto& [name, module] : thunk_emitter.kernels()) { + TF_RETURN_IF_ERROR(VerifyLlvmModule(*module.getModuleUnlocked())); + } + + // Compilation part + ModuleHook pre_optimization_ir_hook; + ModuleHook post_optimization_ir_hook; + std::tie(pre_optimization_ir_hook, post_optimization_ir_hook) = + GetIRModuleHooks(*module, user_pre_optimization_hook_, + user_post_optimization_hook_); + + std::vector obj_files; + auto post_codegen_hook = [&](const llvm::Module& llvm_module, + const llvm::object::ObjectFile& obj_file) { + obj_files.push_back(obj_file.getData().str()); + if (!DumpingEnabledForHloModule(*module)) { + return; + } + absl::string_view id = llvm_module.getModuleIdentifier(); + size_t pos = std::min(id.size(), 1 + kXlaModuleIdentifier.size()); + DumpToFileInDir( + *module, /*file_prefix=*/"", + /*file_suffix=*/absl::StrCat("obj-file.", id.substr(pos), ".o"), + absl::string_view(obj_file.getData().data(), + obj_file.getData().size())); + }; + + IrCompiler::Options ir_compiler_options = { + /*optimization_level=*/target_machine->getOptLevel(), + /*optimize_for_size=*/ + options::OptimizeForSizeRequested(module->config()), + /*max_cpu_isa=*/CpuFeatureFromString(debug_options.xla_cpu_max_isa()), + /*fast_math_flags=*/llvm_ir::GetCpuFastMathFlags(module->config()), + /*disable_expensive_passes=*/ + 
module->config().debug_options().xla_llvm_disable_expensive_passes(), + /*disable_slp_vectorizer=*/ + options::SlpVectorizerDisabled(module->config()), + /*disable_loop_unrolling=*/ + options::DisableLoopUnrolling(module->config()), + /*dfsan_enabled=*/aot_options.sanitize_dataflow(), + /*dfsan_abilists_enabled=*/aot_options.sanitize_abilists_dataflow()}; + + IrCompiler::CompilationHooks ir_compiler_hooks = { + pre_optimization_ir_hook, + post_optimization_ir_hook, + post_codegen_hook, + }; + + IrCompiler ir_compiler(std::move(target_machine_builder), + std::move(ir_compiler_options), + std::move(ir_compiler_hooks)); + + // For simplicity no parallel compilation is used. + std::vector compiled_parts; + compiled_parts.push_back( + CollectCompiledSymbolsPart(ir_emitter2, *llvm_module)); + + // Collect compiled symbols from all LLVM module parts. + std::vector compiled_symbols; + + absl::flat_hash_map + symbol_type_id_to_function_type_id; + + VLOG(3) << "Compiling " << thunk_emitter.kernels().size() + << " thunk kernels."; + + // We have to clone the LLVM module into a local context to be able to link + // it with the other modules. This enables us to have one object file for all + // the kernels. + auto copy_llvm_module_to_local_context = + [&llvm_context](llvm::Module& module) { + // There is no way to clone a module from one context to another, so we + // need to serialize the module to bitcode and parse it back into the + // new context. + llvm::SmallString<0> bc; + llvm::raw_svector_ostream bcos(bc); + llvm::WriteBitcodeToFile(module, bcos); + + // Parse module back into its own LLVM context. 
+ auto clone_module = llvm::parseBitcodeFile( + llvm::MemoryBufferRef(llvm::StringRef(bc.data(), bc.size()), + absl::StrFormat("%s_cloned_to_local_context", + kXlaModuleIdentifier)), + *llvm_context); + + return clone_module; + }; + + llvm::Linker linker(*llvm_module); + + for (auto& [name, module] : thunk_emitter.kernels()) { + compiled_symbols.push_back( + FunctionLibrary::Sym(name)); + symbol_type_id_to_function_type_id.emplace(compiled_symbols.back().type_id, + SymbolProto::KERNEL); + auto cloned_module = + copy_llvm_module_to_local_context(*module.getModuleUnlocked()); + if (!cloned_module) { + return Internal("Failed to clone LLVM module."); + } + // Match data layouts to avoid warning messages. + cloned_module->get()->setDataLayout(llvm_module->getDataLayout()); + linker.linkInModule(std::move(cloned_module.get())); + } + + cantFail(ir_compiler(*llvm_module)); + + for (const CompiledSymbolsPart& part : compiled_parts) { + for (const IrEmitter2::KernelInfo& kernel : part.kernels) { + compiled_symbols.push_back( + FunctionLibrary::Sym(kernel.name)); + symbol_type_id_to_function_type_id.emplace( + compiled_symbols.back().type_id, SymbolProto::KERNEL); + } + for (const IrEmitter2::ComparatorInfo& comparator : part.comparators) { + compiled_symbols.push_back( + FunctionLibrary::Sym(comparator.name)); + symbol_type_id_to_function_type_id.emplace( + compiled_symbols.back().type_id, SymbolProto::COMPARATOR); + } + } + + VLOG(3) << "Collected " << compiled_symbols.size() << " compiled symbols"; + + // Create constant allocations from the buffer assignment. + TF_ASSIGN_OR_RETURN(std::vector constants, + CreateConstantAllocations(*assignment)); + + TF_ASSIGN_OR_RETURN( + auto cpu_executable, + CpuExecutable::Create( + /*function_library=*/nullptr, // NOTE: We don't need to generate a + // function library as the only purpose + // of this executable is to get + // exported. 
+ std::move(assignment), std::move(module), std::move(thunks), + std::move(constants), std::move(hlo_profile_printer_data), + std::move(hlo_profile_index_map))); + + // Save compiled symbols to be able to export them to AOT compilation + // result. + cpu_executable->set_compiled_symbols(std::move(compiled_symbols)); + + // Save mapping between symbol type id and function type id to be able to + // export them to AOT compilation result. + cpu_executable->set_symbol_type_id_to_function_type_id( + symbol_type_id_to_function_type_id); + + if (embed_ir_in_executable) { + cpu_executable->set_ir_module_string(ir_module_string); + } + + // Dump computation proto state and buffer assignment for + // GetCompiledMemoryStats results. + auto with_hlo_proto = [&](std::unique_ptr cpu_executable) { + auto hlo_proto = std::make_unique(); + *hlo_proto->mutable_hlo_module() = cpu_executable->module().ToProto(); + *hlo_proto->mutable_buffer_assignment() = + cpu_executable->buffer_assignment().ToProto(); + StripPayloadFromLiteralProto(*hlo_proto); + cpu_executable->set_hlo_proto(std::move(hlo_proto)); + return cpu_executable; + }; + + cpu_executable = with_hlo_proto(std::move(cpu_executable)); + + const ThunkSequence& thunk_sequence = + cpu_executable->thunks().thunk_sequence(); + + std::unique_ptr executable_hlo_profile_printer_data = + cpu_executable->module().config().hlo_profiling_enabled() + ? 
std::make_unique( + cpu_executable->hlo_profile_printer_data()) + : nullptr; + + return CpuAotCompilationResultThunks::Create( + &cpu_executable->module(), &cpu_executable->buffer_assignment(), + cpu_executable->module_name(), std::move(obj_files), + cpu_executable->get_compiled_symbols_proto(), thunk_sequence, + std::move(*cpu_executable).consume_function_library().release(), + std::move(executable_hlo_profile_printer_data)); +} + +se::Platform::Id CpuCompiler::PlatformId() const { + return se::host::kHostPlatformId; +} + +HloCostAnalysis::ShapeSizeFunction CpuCompiler::ShapeSizeBytesFunction() const { + return CpuExecutable::ShapeSizeBytes; +} + +namespace { + +// TODO(basioli): This should be removed once new runtime is implemented, and +// CpuAotCompilationResult will be the only implementation of +// AotCompilationResult. This is still used as it allows us to `Export` and +// subsequently load both runtimes. + +// This is a result of exporting JIT compiled +// CpuExecutable to AOT compilation result that can be saved on disk and shipped +// over the wire. 
+class CpuExecutableAotCompilationResult : public AotCompilationResult { + public: + static absl::StatusOr> + Create(const HloModule* hlo_module, const BufferAssignment* buffer_assignment, + absl::string_view function_name, std::vector obj_files, + std::vector symbols, const ThunkSequence* thunks, + CompilationResultProto::ObjFileKind obj_file_kind) { + std::optional thunk_proto; + + if (thunks != nullptr) { + ThunkSequenceSerDesProtobuf thunk_sequence_serdes( + &buffer_assignment->Allocations()); + TF_ASSIGN_OR_RETURN(thunk_proto, thunk_sequence_serdes.ToProto(*thunks)); + } + + return absl::WrapUnique(new CpuExecutableAotCompilationResult( + hlo_module, buffer_assignment, function_name, std::move(obj_files), + std::move(symbols), thunk_proto, obj_file_kind)); + } + + absl::StatusOr SerializeAsString() const override { + return proto_.SerializeAsString(); + } + + static absl::StatusOr> + FromString(const std::string& serialized) { + CompilationResultProto proto; + if (!proto.ParseFromString(serialized)) { + return Internal( + "Failed to parse serialized CpuExecutableAotCompilationResult."); + } + + TF_ASSIGN_OR_RETURN( + std::unique_ptr module, + HloModule::CreateFromProtoWithConfig(proto.hlo_module())); + + return std::unique_ptr( + new CpuExecutableAotCompilationResult(proto, std::move(module))); + } + + absl::StatusOr> LoadExecutable( + Compiler* compiler, + const se::StreamExecutor* stream_exec) const&& override; + + const HloModule* optimized_module() const override { return module_.get(); } + + std::unique_ptr consume_optimized_module() override { + return std::move(module_); + } + + private: + CpuExecutableAotCompilationResult( + const HloModule* hlo_module, const BufferAssignment* buffer_assignment, + absl::string_view function_name, std::vector obj_files, + std::vector symbols, + const std::optional& thunks, + CompilationResultProto::ObjFileKind obj_file_kind) { + *proto_.mutable_hlo_module()->mutable_hlo_module() = hlo_module->ToProto(); + 
*proto_.mutable_hlo_module()->mutable_config() = + hlo_module->config().ToProto(); + *proto_.mutable_buffer_assignment() = buffer_assignment->ToProto(); + proto_.set_entry_function_name(std::string(function_name)); + for (std::string& obj_file : obj_files) { + proto_.add_obj_files(std::move(obj_file)); + } + + for (const auto& symbol : symbols) { + auto* symbol_proto = proto_.add_compiled_symbols(); + *symbol_proto = symbol; + } + proto_.set_obj_files_kind(obj_file_kind); + module_ = hlo_module->Clone(); + + if (thunks.has_value()) { + ThunkSequenceSerDesProtobuf thunk_sequence_serdes( + &buffer_assignment->Allocations()); + *proto_.mutable_thunk_sequence() = *thunks; + } + } + + explicit CpuExecutableAotCompilationResult(CompilationResultProto proto, + std::unique_ptr module) + : proto_(std::move(proto)), module_(std::move(module)) {} + + CompilationResultProto proto_; + std::unique_ptr module_; +}; + +} // namespace + +absl::StatusOr> +CpuExecutableAotCompilationResult::LoadExecutable( + Compiler* compiler, const se::StreamExecutor* stream_exec) const&& { + // Recreate HloModule from proto. + TF_ASSIGN_OR_RETURN( + std::unique_ptr module, + HloModule::CreateFromProtoWithConfig(proto_.hlo_module())); + + VLOG(2) << "Load XLA:CPU executable for module: " << module->name(); + + // Recreate BufferAssignment from proto. + TF_ASSIGN_OR_RETURN( + std::unique_ptr buffer_assignment, + BufferAssignment::FromProto(proto_.buffer_assignment(), module.get(), + compiler->BufferSizeBytesFunction(), + /*can_share_buffer=*/nullptr)); + + const DebugOptions& debug_options = module->config().debug_options(); + VlogMaxIsa(debug_options.xla_cpu_max_isa()); + const HloModuleConfig& config = module->config(); + + // Infer target machine from the current host CPU. 
+  TF_ASSIGN_OR_RETURN(
+      std::unique_ptr<llvm::TargetMachine> target_machine,
+      IrCompiler::InferTargetMachine(
+          std::move(CompilerTargetOptions(module->config())),
+          IrCompiler::GetCodeGenOptLevel(config),
+          CpuFeatureFromString(debug_options.xla_cpu_max_isa())));
+
+  // Definition generator to link with XLA:CPU host runtime symbols.
+  ExecutionEngine::DefinitionGenerator definition_generator =
+      [](const llvm::DataLayout& data_layout) {
+        return std::make_unique<RuntimeSymbolGenerator>(data_layout);
+      };
+
+  ObjectLoader object_loader(/*num_dylibs=*/1,
+                             target_machine->createDataLayout(),
+                             definition_generator);
+
+  for (size_t i = 0; i < object_loader.num_dylibs(); ++i) {
+    object_loader.dylib(i).value()->addGenerator(
+        std::make_unique<RuntimeSymbolGenerator>(
+            target_machine->createDataLayout()));
+  }
+
+  // We might have an XLA:CPU executable that has only runtime thunks and
+  // doesn't have any corresponding object files, and it's absolutely fine.
+  VLOG(2) << "Load XLA:CPU executable from " << proto_.obj_files_size()
+          << " object files; entry_function_name="
+          << proto_.entry_function_name();
+
+  size_t obj_file_index = 0;
+  for (auto& obj_file : proto_.obj_files()) {
+    llvm::StringRef data(obj_file.data(), obj_file.size());
+    TF_RETURN_IF_ERROR(
+        object_loader.AddObjFile(llvm::MemoryBuffer::getMemBuffer(
+            data, absl::StrCat(proto_.entry_function_name(), "_",
+                               obj_file_index++))));
+  }
+
+  std::unique_ptr<CpuExecutable> cpu_executable;
+
+  if (proto_.obj_files_kind() == CompilationResultProto::KERNELS) {
+    ThunkSequenceSerDesProtobuf thunk_sequence_serdes(
+        &buffer_assignment->Allocations());
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<ThunkSequence> thunks,
+        thunk_sequence_serdes.FromProto(proto_.thunk_sequence()));
+
+    VLOG(3) << "Loaded " << thunks->size() << " thunks.";
+
+    std::vector<FunctionLibrary::Symbol> compiled_symbols;
+
+    for (const auto& symbol_proto : proto_.compiled_symbols()) {
+      switch (symbol_proto.function_type_id()) {
+        case SymbolProto::KERNEL:
+          compiled_symbols.push_back(
+              FunctionLibrary::Sym<FunctionLibrary::Kernel>(
+                  symbol_proto.name()));
+          break;
+        case SymbolProto::COMPARATOR:
+          compiled_symbols.push_back(
+              FunctionLibrary::Sym<FunctionLibrary::Comparator>(
+                  symbol_proto.name()));
+          break;
+        default:
+          return Internal(
+              "Unknown function type id %s",
+              SymbolProto_FunctionTypeId_Name(symbol_proto.function_type_id()));
+      }
+    }
+
+    VLOG(3) << "Collected " << compiled_symbols.size() << " compiled symbols";
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<FunctionLibrary> function_library,
+                        std::move(object_loader).Load(compiled_symbols));
+
+    // Create constant allocations from the buffer assignment.
+    TF_ASSIGN_OR_RETURN(std::vector<ConstantAllocation> constants,
+                        CreateConstantAllocations(*buffer_assignment));
+
+    TF_ASSIGN_OR_RETURN(
+        cpu_executable,
+        CpuExecutable::Create(std::move(function_library),
+                              std::move(buffer_assignment), std::move(module),
+                              std::move(*thunks), std::move(constants), nullptr,
+                              nullptr));
+
+  } else if (proto_.obj_files_kind() == CompilationResultProto::CLASSIC) {
+    // Create a "classic" CPU executable.
+    using ComputeFn = std::remove_pointer_t<CpuExecutable::ComputeFunctionType>;
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<FunctionLibrary> function_library,
+                        std::move(object_loader)
+                            .Load({FunctionLibrary::Sym<ComputeFn>(
+                                proto_.entry_function_name())}));
+
+    TF_ASSIGN_OR_RETURN(
+        cpu_executable,
+        CpuExecutable::Create(std::move(function_library),
+                              std::move(buffer_assignment), std::move(module),
+                              proto_.entry_function_name(), nullptr, nullptr));
+
+  } else {
+    return Internal("Unknown obj file kind");
+  }
+
+  // Dump computation proto state and buffer assignment for
+  // GetCompiledMemoryStats results.
+ auto hlo_proto = std::make_unique(); + *hlo_proto->mutable_hlo_module() = cpu_executable->module().ToProto(); + *hlo_proto->mutable_buffer_assignment() = + cpu_executable->buffer_assignment().ToProto(); + cpu_executable->set_hlo_proto(std::move(hlo_proto)); + + return cpu_executable; +} + +absl::StatusOr> CpuCompiler::Export( + Executable* executable) const { + auto* cpu_executable = tensorflow::down_cast(executable); + if (!cpu_executable) + return Internal("Could not downcast Executable to CpuExecutable"); + + // Export object files for all dylibs. + std::vector obj_files; + for (const auto& obj_file : cpu_executable->obj_files()) { + obj_files.push_back(std::string(obj_file)); + } + + auto kind = cpu_executable->has_thunks() ? CompilationResultProto::KERNELS + : CompilationResultProto::CLASSIC; + const ThunkSequence* thunk_sequence = + cpu_executable->has_thunks() ? &cpu_executable->thunks().thunk_sequence() + : nullptr; + + std::vector compiled_symbols = + cpu_executable->get_compiled_symbols_proto(); + + return CpuExecutableAotCompilationResult::Create( + &cpu_executable->module(), &cpu_executable->buffer_assignment(), + cpu_executable->module_name(), std::move(obj_files), + std::move(compiled_symbols), thunk_sequence, kind); +} + +absl::StatusOr> +CpuCompiler::LoadAotCompilationResult( + const std::string& serialized_aot_result) { + return CpuExecutableAotCompilationResult::FromString(serialized_aot_result); +} + +absl::StatusOr CpuCompiler::CreateHloSchedule( + const HloModule& hlo_module) const { + // Select a memory scheduler optimized for concurrency vs minimal memory. + auto scheduler = + hlo_module.config() + .debug_options() + .xla_cpu_enable_concurrency_optimized_scheduler() + ? std::unique_ptr( + std::make_unique(BufferSizeBytesFunction())) + : std::make_unique(BufferSizeBytesFunction()); + + // Select an order for emitting the HLO instructions for each + // computation. 
Using this sequence enables tighter buffer liveness analysis + // and reduced memory usage (as compared to using `DependencyHloOrdering`). + return ScheduleModule(&hlo_module, *scheduler); +} + +absl::StatusOr> +CpuCompiler::CreateBufferAssignment(const HloModule& module) const { + // Run buffer allocation on the HLO graph. + return BufferAssigner::Run( + &module, std::make_unique(module.schedule()), + BufferSizeBytesFunction(), memory_alignment, + /*allocate_buffers_for_constants=*/true); +} + +} // namespace cpu +} // namespace xla diff --git a/third_party/xla/xla/service/cpu/cpu_runtime.cc b/third_party/xla/xla/service/cpu/cpu_runtime.cc index 7caf9c43b1119b..1f8e9291f84c32 100644 --- a/third_party/xla/xla/service/cpu/cpu_runtime.cc +++ b/third_party/xla/xla/service/cpu/cpu_runtime.cc @@ -197,6 +197,8 @@ extern const char* const kOneDnnMatMulReorderSymbolName = "__xla_cpu_runtime_OneDnnMatMulReorder"; extern const char* const kHandleFfiCallSymbolName = "__xla_cpu_runtime_HandleFfiCall"; +extern const char* const kXnnPackSoftMaxNDSymbolName = + "__xla_cpu_runtime_XnnPackSoftMaxND"; namespace { diff --git a/third_party/xla/xla/service/cpu/cpu_runtime.h b/third_party/xla/xla/service/cpu/cpu_runtime.h index 71e27ea600ee28..31c7f9d0d86ef5 100644 --- a/third_party/xla/xla/service/cpu/cpu_runtime.h +++ b/third_party/xla/xla/service/cpu/cpu_runtime.h @@ -97,6 +97,7 @@ extern const char* const kOneDnnLayerNormSymbolName; extern const char* const kOneDnnConvolutionSymbolName; extern const char* const kOneDnnMatMulReorderSymbolName; extern const char* const kHandleFfiCallSymbolName; +extern const char* const kXnnPackSoftMaxNDSymbolName; // All symbol names for XLA CPU runtime functions need to start with this // prefix. 
diff --git a/third_party/xla/xla/service/cpu/ir_emitter.cc b/third_party/xla/xla/service/cpu/ir_emitter.cc index feca6552d243f8..2bd5d7278b07c5 100644 --- a/third_party/xla/xla/service/cpu/ir_emitter.cc +++ b/third_party/xla/xla/service/cpu/ir_emitter.cc @@ -110,6 +110,9 @@ limitations under the License. #include "xla/util.h" #include "xla/xla_data.pb.h" +#include "xnnpack_ops.h" +#include "xnnpack_ops_rewriter.h" + #if defined(INTEL_MKL) #include "xla/service/cpu/onednn_memory_util.h" #endif @@ -2463,6 +2466,39 @@ absl::Status IrEmitter::HandleTopK(HloInstruction* hlo) { return absl::OkStatus(); } +absl::Status IrEmitter::HandleXnnPackSoftMax(HloInstruction* hlo) { + const HloInstruction* input = hlo->operand(0); + Shape shape = input->shape(); + + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(hlo)); + TF_RET_CHECK(input->shape().element_type() == F32); + TF_RET_CHECK(shape.dimensions().size() >= 2); + + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice input_values_slice, + assignment_.GetUniqueSlice(hlo->operand(0), {})); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice out_values_slice, + assignment_.GetUniqueSlice(hlo, {})); + + llvm::Value* values_ptr = EmitBufferPointer(input_values_slice, shape); + llvm::Value* out_values_ptr = EmitBufferPointer(out_values_slice, shape); + + // Flatten the batches into a single dimension. + int channels = shape.dimensions(shape.dimensions().size() - 1); + int batch_size = 1; + for (int i = 0; i < shape.dimensions().size() - 1; i++) + batch_size = batch_size * shape.dimensions(i); + + EmitCallToFunc(runtime::kXnnPackSoftMaxNDSymbolName, + {/*run_options=*/GetExecutableRunOptionsArgument(), + /*input*/ values_ptr, + /*output*/ out_values_ptr, + /*batch_size*/ b()->getInt64(batch_size), + /*channels*/ b()->getInt64(channels)}, + b()->getVoidTy()); + + return absl::OkStatus(); +} + #if defined(INTEL_MKL) // Emits operands alloca vector for oneDNN custom calls. 
@@ -2815,6 +2851,9 @@ absl::Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) { if (custom_call->custom_call_target() == "TopK") { return HandleTopK(custom_call); } + if (custom_call->custom_call_target() == kCustomCallXnnPackSoftMax) { + return HandleXnnPackSoftMax(custom_call); + } #if defined(INTEL_MKL) if (custom_call->custom_call_target() == "__onednn$matmul") { return HandleOneDnnMatMulCalls(custom_call, diff --git a/third_party/xla/xla/service/cpu/ir_emitter.h b/third_party/xla/xla/service/cpu/ir_emitter.h index 40f54d2f4bff97..b3d47d41c6ca69 100644 --- a/third_party/xla/xla/service/cpu/ir_emitter.h +++ b/third_party/xla/xla/service/cpu/ir_emitter.h @@ -336,6 +336,7 @@ class IrEmitter : public DfsHloVisitorWithDefault, absl::Status HandleTopK(HloInstruction* hlo) override; absl::Status HandleAllReduceSingleReplica(HloInstruction* crs); absl::Status HandleAllReduceMultipleReplica(HloInstruction* crs); + absl::Status HandleXnnPackSoftMax(HloInstruction* hlo); #if defined(INTEL_MKL) std::vector EmitOneDnnOperandsAlloca(HloInstruction* custom_call, llvm::Value*& args_val, diff --git a/third_party/xla/xla/service/cpu/runtime_symbol_generator.cc b/third_party/xla/xla/service/cpu/runtime_symbol_generator.cc index 87aca6c386751a..64e5970c8f04a4 100644 --- a/third_party/xla/xla/service/cpu/runtime_symbol_generator.cc +++ b/third_party/xla/xla/service/cpu/runtime_symbol_generator.cc @@ -56,6 +56,7 @@ limitations under the License. 
#include "xla/service/cpu/runtime_single_threaded_matmul.h" #include "xla/service/cpu/runtime_topk.h" #include "xla/service/cpu/windows_compatibility.h" +#include "xla/service/cpu/xnnpack_ops.h" #include "xla/service/custom_call_target_registry.h" #include "tsl/platform/logging.h" @@ -209,6 +210,7 @@ static bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(TracingStart); REGISTER_CPU_RUNTIME_SYMBOL(TracingEnd); REGISTER_CPU_RUNTIME_SYMBOL(HandleFfiCall); + REGISTER_CPU_RUNTIME_SYMBOL(XnnPackSoftMaxND); #if defined(INTEL_MKL) REGISTER_CPU_RUNTIME_SYMBOL(OneDnnMatMul); REGISTER_CPU_RUNTIME_SYMBOL(OneDnnSoftmax); diff --git a/third_party/xla/xla/service/cpu/xnnpack_ops.cc b/third_party/xla/xla/service/cpu/xnnpack_ops.cc new file mode 100644 index 00000000000000..902086924f0fdf --- /dev/null +++ b/third_party/xla/xla/service/cpu/xnnpack_ops.cc @@ -0,0 +1,76 @@ +/* Original Copyright: Copyright (c) Facebook, Inc. and its affiliates. +This source code is licensed under the BSD-style license found in the +LICENSE file in the root directory of this source tree. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#define XNN_LOG_LEVEL 4
+#include <iostream>
+#include "xnnpack.h"
+#include "absl/base/attributes.h"
+
+namespace xla {
+namespace cpu {
+
+extern "C" {
+ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_XnnPackSoftMaxND(
+    const void* run_options_ptr, void* in, void* out, int64_t batch_size,
+    int64_t channels) {
+  // NB: run_options_ptr is ignored.
+  float* input = (float*)in;
+  float* output = (float*)out;
+
+  xnn_status status = xnn_initialize(nullptr /* allocator */);
+  if (status != xnn_status_success) {
+    std::cout << "failed to initialize XNNPACK";
+    return;
+  }
+
+  xnn_operator_t softmax_op = nullptr;
+  status = xnn_create_softmax_nc_f32(0 /* flags */, &softmax_op);
+  if (status != xnn_status_success || softmax_op == nullptr) {
+    std::cout << "failed to create SoftMax operator\n";
+    return;
+  }
+
+  status = xnn_reshape_softmax_nc_f32(softmax_op, channels, /* channels */
+                                      channels /* input stride */,
+                                      channels /* output stride */, batch_size,
+                                      /*threadpool=*/nullptr);
+  if (status != xnn_status_success) {
+    std::cout << "failed to reshape SoftMax operator";
+    return;
+  }
+
+  status = xnn_setup_softmax_nc_f32(softmax_op, input, output);
+  if (status != xnn_status_success) {
+    std::cout << "failed to setup SoftMax operator";
+    return;
+  }
+
+  status = xnn_run_operator(softmax_op, /*threadpool=*/nullptr);
+  if (status != xnn_status_success) {
+    std::cout << "failed to run SoftMax operator";
+    return;
+  }
+
+  xnn_delete_operator(softmax_op);
+
+  xnn_deinitialize();
+}
+
+}  // extern "C"
+
+}  // namespace cpu
+}  // namespace xla
diff --git a/third_party/xla/xla/service/cpu/xnnpack_ops.h b/third_party/xla/xla/service/cpu/xnnpack_ops.h
new file mode 100644
index 00000000000000..c3811f641a9f4c
--- /dev/null
+++ b/third_party/xla/xla/service/cpu/xnnpack_ops.h
@@ -0,0 +1,36 @@
+/* Referenced & Modified External Open Source Code:
+Source URL:
https://github.com/openxla/xla/pull/7540/files +Original Copyright: 2023 The TensorFlow Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef XLA_SERVICE_CPU_XNNPACK_OPS_H_ +#define XLA_SERVICE_CPU_XNNPACK_OPS_H_ + +namespace xla { +namespace cpu { + +extern "C" { + +extern void __xla_cpu_runtime_XnnPackSoftMaxND(const void* run_options_ptr, + void* in, void* out, + int64_t batch_size, + int64_t channels); + +} // extern "C" + +} // namespace cpu +} // namespace xla + +#endif // XLA_SERVICE_CPU_XNNPACK_OPS_H_ diff --git a/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.cc b/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.cc new file mode 100644 index 00000000000000..a3a5f1827d0da8 --- /dev/null +++ b/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.cc @@ -0,0 +1,228 @@ +/* +Referenced & Modified External Open Source Code: +Original Copyright: 2023 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "xnnpack_ops_rewriter.h" + +#include "xla/hlo/ir/dfs_hlo_visitor_with_default.h" +#include "xla/literal_comparison.h" +#include "xla/literal_util.h" +#include "xnnpack_pattern_utils.h" +#include "xla/status_macros.h" + +namespace xla { +namespace cpu { + +extern const char* const kCustomCallXnnPackSoftMax = "__xnnpack$softmax"; + +namespace { +namespace m = match; +namespace pu = ::xla::cpu::xnnpack_pattern_utils_internal; + +bool IsNegInfConstScalar(const HloInstruction* const_instr) { + if (const_instr->opcode() != HloOpcode::kConstant) { + return false; + } + if (!ShapeUtil::IsEffectiveScalar(const_instr->shape())) { + return false; + } + auto value = LiteralUtil::GetFirstScalarLiteral(const_instr->literal()); + return literal_comparison::Equal( + value, LiteralUtil::MinValue(const_instr->shape().element_type())) + .ok(); +} + +bool IsMaxReducerComputation(const HloComputation* comp) { + if (comp->root_instruction()->opcode() != HloOpcode::kMaximum) { + return false; + } + auto max_instr = comp->root_instruction(); + const HloInstruction* p0 = comp->parameter_instruction(0); + const HloInstruction* p1 = comp->parameter_instruction(1); + const HloInstruction* max_p0 = max_instr->operand(0); + const HloInstruction* max_p1 = max_instr->operand(1); + return (max_p0 == p0 && max_p1 == p1) || (max_p1 == p0 && max_p0 == p1); +} + +// Pattern to match any of Maximum(Reduce_max(...), -inf) or Reduce_max(...). 
+auto MaxReduce(HloInstruction** instr) {
+  auto is_valid_reduce_max = [](const HloInstruction* reduce) {
+    HloComputation* reducer = reduce->to_apply();
+    return IsMaxReducerComputation(reducer) &&
+           (reduce->dimensions().size() == 1) &&
+           (reduce->operand(1)->opcode() == HloOpcode::kConstant) &&
+           IsNegInfConstScalar(reduce->operand(1));
+  };
+
+  return m::AnyOf<HloInstruction>(
+      m::Maximum().WithBinaryOperandsAnyOrder(
+          m::Reduce(instr).WithPredicate(is_valid_reduce_max).WithOneUse(),
+          pu::OptionalBroadcast(
+              m::Constant().WithPredicate(IsNegInfConstScalar))),
+      m::Reduce(instr).WithPredicate(is_valid_reduce_max).WithOneUse());
+}
+
+// Matches the softmax pattern with divide instruction as root node.
+// Here we pass 'instr' as root node and return the producer HloInstruction.
+// The axis on which softmax is applied is stored in 'axis'.
+std::optional<HloInstruction*> MatchSoftmax(HloInstruction* instr, int* axis) {
+  //
+  //      producer
+  //      |   \
+  //      |  reduce_max or max(reduce_max)
+  //      |     |
+  //      |  reshape
+  //      |     |
+  //      |  broadcast
+  //      |     |
+  //      |  reshape
+  //      |     |
+  //      |  broadcast
+  //      |   /
+  //      subtract
+  //      |
+  //      exponential
+  //      |   \
+  //      |  Convert(optional)
+  //      |     |
+  //      |  reduce_sum
+  //      |     |
+  //      |  Convert(optional)
+  //      |     |
+  //      |  reshape
+  //      |     |
+  //      |  Convert(optional)
+  //      |     |
+  //      |  broadcast
+  //      |     |
+  //      |  reshape
+  //      |     |
+  //      |  broadcast
+  //      |   /
+  //      divide  // (instr parameter)
+  //
+
+  // This matcher covers the most common SoftMax patterns we have encountered
+  // in real-life models.
+ HloInstruction* left_exponential; + HloInstruction* right_exponential; + HloInstruction* left_producer; + HloInstruction* reduce_sum; + HloInstruction* reduce_max; + HloInstruction* reduce_instr; + + // Lower diamond + if (!Match(instr, + m::Divide( + m::Exp(&left_exponential, m::Op()), + m::Broadcast(m::Reshape(m::Broadcast( + pu::OptionalConvert(m::Reshape(pu::OptionalConvert( + m::Reduce(&reduce_sum, + pu::OptionalConvert( + m::Exp(&right_exponential, m::Op())), + m::ConstantScalar(0)) + .WithPredicate([](const HloInstruction* reduce) { + HloComputation* reducer = reduce->to_apply(); + return (reducer->root_instruction()->opcode() == + HloOpcode::kAdd && + reduce->dimensions().size() == 1); + }) + .WithOneUse()))))))))) { + return std::nullopt; + } + + if (left_exponential != right_exponential || + left_exponential->user_count() != 2) { + return std::nullopt; + } + + // Upper diamond + if (!Match(left_exponential->mutable_operand(0), + m::Subtract(m::Op(&left_producer), + m::Broadcast(m::Reshape(m::Broadcast( + m::Reshape(m::Op(&reduce_instr))))) + .WithOneUse()) + .WithOneUse())) { + return std::nullopt; + } + + // Match the reduce max. 
+  if (!Match(reduce_instr, MaxReduce(&reduce_max))) {
+    return std::nullopt;
+  }
+
+  if (left_producer != reduce_max->operand(0) ||
+      left_producer->user_count() != 2) {
+    return std::nullopt;
+  }
+
+  if (reduce_sum->dimensions()[0] != reduce_max->dimensions()[0]) {
+    return std::nullopt;
+  }
+
+  *axis = reduce_sum->dimensions()[0];
+
+  return left_producer;
+}
+
+}  // namespace
+
+class XnnPackOpsRewriterVisitor : public DfsHloRewriteVisitor {
+ public:
+  absl::Status HandleDivide(HloInstruction* divide_instr) override {
+    if (divide_instr->HasControlDependencies()) {
+      return absl::OkStatus();
+    }
+    if (!pu::IsSupportedType(divide_instr->shape().element_type())) {
+      return absl::OkStatus();
+    }
+    int axis = -1;
+    std::optional<HloInstruction*> producer = MatchSoftmax(divide_instr, &axis);
+    if (producer == std::nullopt) {
+      return absl::OkStatus();
+    }
+
+    const Shape& output_shape = divide_instr->shape();
+    int softmax_dims = output_shape.dimensions().size();
+    if (softmax_dims < 2) {
+      XLA_VLOG_LINES(3, "Found SoftMax with " + std::to_string(softmax_dims) +
+                            " dims, which is not supported\n");
+      return absl::OkStatus();
+    }
+
+    HloInstruction* softmax_call =
+        divide_instr->AddInstruction(HloInstruction::CreateCustomCall(
+            output_shape, {producer.value()}, kCustomCallXnnPackSoftMax));
+    TF_RETURN_IF_ERROR(ReplaceInstruction(divide_instr, softmax_call));
+
+    return absl::OkStatus();
+  }
+};
+
+absl::StatusOr<bool> XnnPackOpsRewriter::Run(
+    HloModule* module,
+    const absl::flat_hash_set<absl::string_view>& execution_threads) {
+  XLA_VLOG_LINES(3,
+                 "XnnPackOpsRewriter::Run(), before:\n" + module->ToString());
+  XnnPackOpsRewriterVisitor visitor;
+  TF_ASSIGN_OR_RETURN(auto result,
+                      visitor.RunOnModule(module, execution_threads));
+  XLA_VLOG_LINES(3, "XnnPackOpsRewriter::Run(), after:\n" + module->ToString());
+  return result;
+}
+
+}  // namespace cpu
+}  // namespace xla
diff --git a/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.h b/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.h new file
mode 100644 index 00000000000000..2bdc58965c96dc --- /dev/null +++ b/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.h @@ -0,0 +1,45 @@ +/* Referenced & Modified External Open Source Code: +Original Copyright: 2023 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef XLA_SERVICE_CPU_XNNPACK_OPS_REWRITER_H_ +#define XLA_SERVICE_CPU_XNNPACK_OPS_REWRITER_H_ + +#include + +#include "absl/algorithm/container.h" +#include "xla/hlo/ir/hlo_instructions.h" +#include "xla/hlo/ir/hlo_module.h" +#include "xla/hlo/pass/hlo_pass_interface.h" + +namespace xla { +namespace cpu { + +extern const char* const kCustomCallXnnPackSoftMax; + +class XnnPackOpsRewriter : public HloModulePass { + public: + absl::string_view name() const override { return "xnnpack-ops-rewriter"; } + + using HloPassInterface::Run; + absl::StatusOr Run( + HloModule* module, + const absl::flat_hash_set& execution_threads) override; +}; + +} // namespace cpu +} // namespace xla + +#endif // XLA_SERVICE_CPU_XNNPACK_OPS_REWRITER_H_ diff --git a/third_party/xla/xla/service/cpu/xnnpack_pattern_utils.h b/third_party/xla/xla/service/cpu/xnnpack_pattern_utils.h new file mode 100644 index 00000000000000..1ea52de3695def --- /dev/null +++ b/third_party/xla/xla/service/cpu/xnnpack_pattern_utils.h @@ -0,0 +1,65 @@ +/* +Referenced & Modified External Open Source Code: +Original Copyright: 2024 The OpenXLA Authors. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_SERVICE_CPU_XNNPACK_PATTERN_UTILS_H_
+#define XLA_SERVICE_CPU_XNNPACK_PATTERN_UTILS_H_
+
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/service/pattern_matcher.h"
+
+namespace xla {
+namespace cpu {
+
+namespace xnnpack_pattern_utils_internal {
+namespace m = match;
+
+template <typename Pattern>
+auto OptionalConvert(Pattern pattern) {
+  return m::AnyOf<HloInstruction>(m::Convert(pattern), std::move(pattern));
+}
+
+template <typename Pattern>
+auto OptionalBroadcast(Pattern pattern) {
+  return m::AnyOf<HloInstruction>(m::Broadcast(pattern), std::move(pattern));
+}
+
+// Simplified from upstream XLA.
+inline bool IsSupportedType(xla::PrimitiveType dtype) { return dtype == F32; }
+
+template <typename Pattern>
+inline auto SupportedConvert(Pattern pattern) {
+  auto supported_convert = [](const HloInstruction* instr) -> bool {
+    return IsSupportedType(instr->shape().element_type()) &&
+           IsSupportedType(instr->operand(0)->shape().element_type());
+  };
+  return m::Convert(pattern).WithPredicate(supported_convert);
+}
+
+template <typename Pattern>
+inline auto SupportedConvert(HloInstruction** convert, Pattern pattern) {
+  auto supported_convert = [](const HloInstruction* instr) -> bool {
+    return IsSupportedType(instr->shape().element_type()) &&
+           IsSupportedType(instr->operand(0)->shape().element_type());
+  };
+  return m::Convert(convert, pattern).WithPredicate(supported_convert);
+}
+}  // namespace xnnpack_pattern_utils_internal
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // XLA_SERVICE_CPU_XNNPACK_PATTERN_UTILS_H_
diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto
index ca8ba0553bd56a..854eed7235720a 100644
--- a/third_party/xla/xla/xla.proto
+++ b/third_party/xla/xla/xla.proto
@@ -222,6 +222,8 @@ message DebugOptions {
   // When true, XLA:CPU uses XNNPACK to execute supported operations.
   bool xla_cpu_use_xnnpack = 359;
 
+  bool xla_cpu_enable_xnnpack = 389;
+
   // Enabling this will enable optimizations that ignore the possibility of NaN.
bool xla_enable_fast_math = 335; From 912c9afbf83ab83575ab68a25b1c833e025e358c Mon Sep 17 00:00:00 2001 From: Wen Di Date: Mon, 12 Jan 2026 17:33:37 +0800 Subject: [PATCH 2/3] add kernel selector --- tensorflow/workspace2.bzl | 2 + third_party/xla/third_party/openblas/BUILD | 0 .../xla/third_party/openblas/openblas.BUILD | 17 + .../xla/third_party/openblas/workspace.bzl | 10 + third_party/xla/workspace2.bzl | 2 + third_party/xla/xla/debug_options_flags.cc | 10 +- third_party/xla/xla/service/cpu/BUILD | 82 +- third_party/xla/xla/service/cpu/BUILD.orig | 35 + .../xla/xla/service/cpu/cpu_compiler.cc | 11 +- .../xla/xla/service/cpu/cpu_compiler.cc.orig | 8 + .../xla/xla/service/cpu/cpu_runtime.cc | 46 + third_party/xla/xla/service/cpu/cpu_runtime.h | 29 + third_party/xla/xla/service/cpu/ir_emitter.cc | 186 +- third_party/xla/xla/service/cpu/ir_emitter.h | 3 + .../xla/xla/service/cpu/kernel_selector.cc | 423 ++ .../xla/xla/service/cpu/kernel_selector.h | 191 + .../cpu/kernel_selector_ops_rewriter.cc | 658 +++ .../cpu/kernel_selector_ops_rewriter.h | 42 + .../service/cpu/runtime_symbol_generator.cc | 22 + .../xla/service/cpu/xnnpack_ops_rewriter.cc | 4 +- .../xla/service/cpu/xnnpack_ops_rewriter.h | 2 - third_party/xla/xla/service/libs/BUILD | 17 + .../xla/service/libs/libblas_mlir/Makefile | 52 + .../libs/libblas_mlir/include/MemrefHelpers.h | 10 + .../service/libs/libblas_mlir/include/cblas.h | 11 + .../kernels/sbatch_matmul_3d_nn_mlir.s | 4079 ++++++++++++++++ .../kernels/sbatch_matmul_3d_nt_mlir.s | 2987 ++++++++++++ .../kernels/sbatch_matmul_4d_nn_mlir.s | 4171 +++++++++++++++++ .../kernels/sbatch_matmul_4d_nt_mlir.s | 3208 +++++++++++++ .../kernels/sgemm_nn_alpha1_beta1_mlir.s | 4104 ++++++++++++++++ .../kernels/sgemv_n_alpha1_beta1_mlir.s | 709 +++ .../libblas_mlir/src/sbatch_matmul_3d.cpp | 46 + .../libblas_mlir/src/sbatch_matmul_4d.cpp | 49 + .../service/libs/libblas_mlir/src/sgemm.cpp | 43 + .../service/libs/libblas_mlir/src/sgemv.cpp | 43 + 
third_party/xla/xla/xla.proto | 3 +- 36 files changed, 21295 insertions(+), 20 deletions(-) create mode 100644 third_party/xla/third_party/openblas/BUILD create mode 100644 third_party/xla/third_party/openblas/openblas.BUILD create mode 100644 third_party/xla/third_party/openblas/workspace.bzl create mode 100644 third_party/xla/xla/service/cpu/kernel_selector.cc create mode 100644 third_party/xla/xla/service/cpu/kernel_selector.h create mode 100644 third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.cc create mode 100644 third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.h create mode 100644 third_party/xla/xla/service/libs/BUILD create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/Makefile create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/include/MemrefHelpers.h create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/include/cblas.h create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nn_mlir.s create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nt_mlir.s create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nn_mlir.s create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nt_mlir.s create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemm_nn_alpha1_beta1_mlir.s create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemv_n_alpha1_beta1_mlir.s create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_3d.cpp create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_4d.cpp create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/src/sgemm.cpp create mode 100644 third_party/xla/xla/service/libs/libblas_mlir/src/sgemv.cpp diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl index 85edecae8c67a6..2b1aa738a475cd 100644 --- a/tensorflow/workspace2.bzl +++ 
b/tensorflow/workspace2.bzl @@ -29,6 +29,7 @@ load("@local_xla//third_party/nvshmem:workspace.bzl", nvshmem = "repo") load("@local_xla//third_party/pybind11_abseil:workspace.bzl", pybind11_abseil = "repo") load("@local_xla//third_party/pybind11_bazel:workspace.bzl", pybind11_bazel = "repo") load("@local_xla//third_party/robin_map:workspace.bzl", robin_map = "repo") +load("@local_xla//third_party/openblas:workspace.bzl", openblas = "repo") load("@rules_jvm_external//:defs.bzl", "maven_install") load("@tf_runtime//:dependencies.bzl", "tfrt_dependencies") load("//tensorflow/tools/def_file_filter:def_file_filter_configure.bzl", "def_file_filter_configure") @@ -100,6 +101,7 @@ def _initialize_third_party(): tensorrt() nvshmem() triton() + openblas() # copybara: tsl vendor diff --git a/third_party/xla/third_party/openblas/BUILD b/third_party/xla/third_party/openblas/BUILD new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/third_party/xla/third_party/openblas/openblas.BUILD b/third_party/xla/third_party/openblas/openblas.BUILD new file mode 100644 index 00000000000000..6d36eec9e6b0d7 --- /dev/null +++ b/third_party/xla/third_party/openblas/openblas.BUILD @@ -0,0 +1,17 @@ +genrule( + name = "build_openblas", + srcs = glob(["**"], exclude = ["*.a"]), + outs = ["libopenblas.a"], + cmd = """ + cd $$(dirname $(location //:README.md)) && \ + make NO_SHARED=1 ONLY_CBLAS=1 TARGET=ARMV8 ARCH=arm64 && \ + cd - && \ + cp $$(dirname $(location //:README.md))/libopenblas_*.a $@ + """, +) + +cc_import( + name = "openblas", + static_library = "libopenblas.a", + visibility = ["//visibility:public"], +) diff --git a/third_party/xla/third_party/openblas/workspace.bzl b/third_party/xla/third_party/openblas/workspace.bzl new file mode 100644 index 00000000000000..6728207dbfe58f --- /dev/null +++ b/third_party/xla/third_party/openblas/workspace.bzl @@ -0,0 +1,10 @@ +load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") + +def repo(): + tf_http_archive( + name 
= "openblas",
+        strip_prefix = "OpenBLAS-8795fc7985635de1ecf674b87e2008a15097ffab",
+        sha256 = "38240eee1b29e2bde47ebb5d61160207dc68668a54cac62c076bb5032013b1eb",
+        urls = tf_mirror_urls("https://github.com/OpenMathLib/OpenBLAS/archive/8795fc7985635de1ecf674b87e2008a15097ffab.tar.gz"),
+        build_file = "//third_party/openblas:openblas.BUILD",
+    )
diff --git a/third_party/xla/workspace2.bzl b/third_party/xla/workspace2.bzl
index 345f1931c68e47..cc2013365b40c8 100644
--- a/third_party/xla/workspace2.bzl
+++ b/third_party/xla/workspace2.bzl
@@ -18,6 +18,7 @@ load("//third_party/shardy:workspace.bzl", shardy = "repo")
 load("//third_party/stablehlo:workspace.bzl", stablehlo = "repo")
 load("//third_party/triton:workspace.bzl", triton = "repo")
 load("//third_party/uv:workspace.bzl", uv = "repo")
+load("//third_party/openblas:workspace.bzl", openblas = "repo")
 
 def _initialize_third_party():
     """ Load third party repositories.  See above load() statements. """
@@ -31,6 +32,7 @@ def _initialize_third_party():
     stablehlo()
     triton()
     uv()
+    openblas()
 
 # Define all external repositories required by TensorFlow
 def _tf_repositories():
diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index 7ab70838950d98..7792ab22f7f929 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -100,10 +100,11 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
 #ifdef XLA_CPU_USE_ACL
   opts.set_xla_cpu_use_acl(true);
 #endif
-  opts.set_xla_cpu_use_fusion_emitters(true);
-  opts.set_xla_cpu_use_thunk_runtime(true);
+  opts.set_xla_cpu_use_fusion_emitters(false);
+  opts.set_xla_cpu_use_thunk_runtime(false);
   opts.set_xla_cpu_use_xnnpack(false);
   opts.set_xla_cpu_enable_xnnpack(false);  // For softmax
+  opts.set_xla_cpu_use_kernel_selector(false);
   opts.set_xla_cpu_experimental_xnn_graph_fusion_mode(
       DebugOptions::XNN_GRAPH_FUSION_MODE_DISABLED);
   opts.set_xla_cpu_parallel_codegen_split_count(32);
@@ -1000,6 +1001,11 @@ void
MakeDebugOptionsFlags(std::vector* flag_list, bool_setter_for(&DebugOptions::set_xla_cpu_enable_xnnpack), debug_options->xla_cpu_enable_xnnpack(), "Enable XNNPACK ops rewriter.")); + flag_list->push_back(tsl::Flag( + "xla_cpu_use_kernel_selector", + bool_setter_for(&DebugOptions::set_xla_cpu_use_kernel_selector), + debug_options->xla_cpu_use_kernel_selector(), + "Replace dot operations with custom calls to BLAS libraries.")); flag_list->push_back(tsl::Flag( "xla_cpu_experimental_xnn_graph_fusion_mode", setter_for_xla_cpu_experimental_xnn_graph_fusion_mode, diff --git a/third_party/xla/xla/service/cpu/BUILD b/third_party/xla/xla/service/cpu/BUILD index f951a6ac93b626..bc46d88d626fa0 100644 --- a/third_party/xla/xla/service/cpu/BUILD +++ b/third_party/xla/xla/service/cpu/BUILD @@ -89,6 +89,7 @@ filegroup( "runtime_matmul_f64.cc", "runtime_matmul_s32.cc", "runtime_fork_join.cc", + "kernel_selector.cc", "//xla/backends/cpu/runtime:runtime_srcs", #"runtime_handle_ffi_call.cc", # TODO(b/338344732): Add "runtime_handle_ffi_call.cc".
], @@ -118,6 +119,7 @@ filegroup( "runtime_fork_join.h", "runtime_lightweight_check.h", "runtime_matmul.h", + "kernel_selector.h", "//xla/backends/cpu/runtime:runtime_hdrs", #"runtime_handle_ffi_call.h", # TODO(b/338344732): Add "runtime_handle_ffi_call.h" ], @@ -195,7 +197,11 @@ cc_library( name = "cpu_compiler_pure", srcs = ["cpu_compiler.cc"], hdrs = ["cpu_compiler.h"], - copts = tsl_copts(), + copts = tsl_copts() + select({ + ":enable_blas_mlir": ["-DENABLE_BLAS_MLIR"], + ":disable_blas_mlir": [], + "//conditions:default": [], + }), deps = [ ":buffer_info_util", ":conv_canonicalization", @@ -221,6 +227,7 @@ cc_library( ":thunk_emitter", ":xla_framework", ":xnnpack_ops_rewriter", + ":kernel_selector_ops_rewriter", "//xla:cpu_function_runtime", "//xla:debug_options_flags", "//xla:literal", @@ -420,7 +427,21 @@ cc_library( "@llvm-project//llvm:SystemZCodeGen", # fixdeps: keep ]) + if_llvm_x86_available([ "@llvm-project//llvm:X86CodeGen", # fixdeps: keep - ]), + ]) + select({ + ":enable_blas_mlir": [":libmlir"], + ":disable_blas_mlir": [], + "//conditions:default": [], + }), +) + +config_setting( + name = "enable_blas_mlir", + define_values = {"ENABLE_BLAS_MLIR": "true"}, +) + +config_setting( + name = "disable_blas_mlir", + define_values = {"ENABLE_BLAS_MLIR": "false"}, ) cc_library( @@ -595,7 +616,11 @@ cc_library( "windows_compatibility.h", ], hdrs = ["runtime_symbol_generator.h"], - copts = if_enable_acl(["-DXLA_CPU_USE_ACL=1"]) + tsl_copts(), + copts = if_enable_acl(["-DXLA_CPU_USE_ACL=1"]) + tsl_copts() + select({ + ":enable_blas_mlir": ["-DENABLE_BLAS_MLIR"], + ":disable_blas_mlir": [], + "//conditions:default": [], + }), deps = [ ":cpu_runtime", ":onednn_convolution", @@ -621,6 +646,7 @@ cc_library( ":runtime_single_threaded_matmul", ":runtime_topk", ":xnnpack_ops", + ":kernel_selector", "//xla/service:custom_call_target_registry", "@com_google_absl//absl/functional:any_invocable", "@com_google_absl//absl/strings:string_view", @@ -842,8 +868,6 @@ 
cc_library( ":onednn_config_proto_cc", ":onednn_memory_util", ":parallel_loop_emitter", - ":xnnpack_ops_rewriter", - ":xnnpack_ops", "//xla:literal", "//xla:literal_util", "//xla:shape_util", @@ -1108,7 +1132,11 @@ cc_library( "cpu_runtime.h", "xfeed_manager.h", ], - copts = runtime_copts(), + copts = runtime_copts() + select({ + ":enable_blas_mlir": ["-DENABLE_BLAS_MLIR"], + ":disable_blas_mlir": [], + "//conditions:default": [], + }), deps = [ ":cpu_executable_run_options", "//xla:executable_run_options", @@ -2201,6 +2229,7 @@ cc_library( "xnnpack_ops_rewriter.h", "xnnpack_pattern_utils.h", ], + copts = ["-O3"], visibility = ["//visibility:public"], deps = [ "//xla/hlo/ir:hlo", @@ -2216,9 +2245,50 @@ cc_library( name = "xnnpack_ops", srcs = ["xnnpack_ops.cc"], hdrs = ["xnnpack_ops.h"], + copts = ["-O3"], visibility = ["//visibility:public"], deps = [ "@XNNPACK", "@com_google_absl//absl/base", ], ) + +cc_library( + name = "kernel_selector", + srcs = ["kernel_selector.cc"], + hdrs = ["kernel_selector.h"], + copts = ["-O3"] + select({ + ":enable_blas_mlir": ["-DENABLE_BLAS_MLIR"], + ":disable_blas_mlir": [], + "//conditions:default": [], + }), + visibility = ["//visibility:public"], + deps = [ + ":runtime_lightweight_check", + "//xla:executable_run_options", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:blocking_counter", + "@openblas//:openblas", + ], +) + +cc_library( + name = "kernel_selector_ops_rewriter", + srcs = ["kernel_selector_ops_rewriter.cc"], + hdrs = ["kernel_selector_ops_rewriter.h"], + copts = ["-O3"], + visibility = ["//visibility:public"], + deps = [ + ":cpu_runtime", + "//xla/hlo/ir:hlo", + "//xla:literal_util", + "//xla/hlo/pass:hlo_pass", + ], +) + +cc_import( + name = "libmlir", + visibility = ["//visibility:public"], + shared_library = "//xla/service/libs:libblas_mlir.so", + system_provided = 0 +) diff --git a/third_party/xla/xla/service/cpu/BUILD.orig b/third_party/xla/xla/service/cpu/BUILD.orig index 90388079ca2fcf..f951a6ac93b626 
100644 --- a/third_party/xla/xla/service/cpu/BUILD.orig +++ b/third_party/xla/xla/service/cpu/BUILD.orig @@ -76,6 +76,7 @@ filegroup( "runtime_single_threaded_matmul_s32.cc", "runtime_single_threaded_matmul_u8.cc", "runtime_topk.cc", + "xnnpack_ops.cc", # Multi-threaded support. "runtime_conv2d.cc", "runtime_conv3d.cc", @@ -109,6 +110,7 @@ filegroup( "runtime_single_threaded_fft.h", "runtime_single_threaded_matmul.h", "runtime_topk.h", + "xnnpack_ops.h", # Multi-threaded support. "runtime_conv2d.h", "runtime_conv3d.h", @@ -218,6 +220,7 @@ cc_library( ":small_while_loop_hoisting_pass", ":thunk_emitter", ":xla_framework", + ":xnnpack_ops_rewriter", "//xla:cpu_function_runtime", "//xla:debug_options_flags", "//xla:literal", @@ -617,6 +620,7 @@ cc_library( ":runtime_single_threaded_fft", ":runtime_single_threaded_matmul", ":runtime_topk", + ":xnnpack_ops", "//xla/service:custom_call_target_registry", "@com_google_absl//absl/functional:any_invocable", "@com_google_absl//absl/strings:string_view", @@ -838,6 +842,8 @@ cc_library( ":onednn_config_proto_cc", ":onednn_memory_util", ":parallel_loop_emitter", + ":xnnpack_ops_rewriter", + ":xnnpack_ops", "//xla:literal", "//xla:literal_util", "//xla:shape_util", @@ -2187,3 +2193,32 @@ xla_cc_test( "@local_tsl//tsl/platform:test", ], ) + +cc_library( + name = "xnnpack_ops_rewriter", + srcs = ["xnnpack_ops_rewriter.cc"], + hdrs = [ + "xnnpack_ops_rewriter.h", + "xnnpack_pattern_utils.h", + ], + visibility = ["//visibility:public"], + deps = [ + "//xla/hlo/ir:hlo", + "//xla:literal_comparison", + "//xla:literal_util", + "//xla:status_macros", + "//xla/hlo/pass:hlo_pass", + "//xla/service:pattern_matcher", + ], +) + +cc_library( + name = "xnnpack_ops", + srcs = ["xnnpack_ops.cc"], + hdrs = ["xnnpack_ops.h"], + visibility = ["//visibility:public"], + deps = [ + "@XNNPACK", + "@com_google_absl//absl/base", + ], +) diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc b/third_party/xla/xla/service/cpu/cpu_compiler.cc index 
4a1402c6934cba..c6d02568dfb9e4 100644 --- a/third_party/xla/xla/service/cpu/cpu_compiler.cc +++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc @@ -183,6 +183,8 @@ limitations under the License. #include "xla/service/cpu/runtime_symbol_generator.h" #include "xla/service/cpu/small_while_loop_hoisting_pass.h" #include "xla/service/cpu/thunk_emitter.h" +#include "xla/service/cpu/xnnpack_ops_rewriter.h" +#include "xla/service/cpu/kernel_selector_ops_rewriter.h" #include "xla/service/cpu_gpu_shape_verifier.h" #include "xla/service/dump.h" #include "xla/service/dynamic_dimension_inference.h" @@ -236,8 +238,6 @@ limitations under the License. #include "tsl/profiler/lib/traceme.h" #include "tsl/profiler/lib/traceme_encode.h" -#include "xnnpack_ops_rewriter.h" - #ifdef TF_LLVM_X86_AVAILABLE #include "llvm/TargetParser/X86TargetParser.h" #endif @@ -599,6 +599,13 @@ absl::Status CpuCompiler::RunHloPassesThroughLayoutAssn( if (enable_xnnpack) pipeline.AddPass(); + bool use_kernel_selector = + xla::GetDebugOptionsFromFlags().xla_cpu_use_kernel_selector(); + if (use_kernel_selector) { + // This pass rewrites hlo.dot into custom calls. + pipeline.AddPass(); + } + // Expand random number generation. pipeline.AddPass(); pipeline.AddPass(RandomAlgorithm::RNG_PHILOX); diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc.orig b/third_party/xla/xla/service/cpu/cpu_compiler.cc.orig index 9ba0085b24d372..4a1402c6934cba 100644 --- a/third_party/xla/xla/service/cpu/cpu_compiler.cc.orig +++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc.orig @@ -236,6 +236,8 @@ limitations under the License. #include "tsl/profiler/lib/traceme.h" #include "tsl/profiler/lib/traceme_encode.h" +#include "xnnpack_ops_rewriter.h" + #ifdef TF_LLVM_X86_AVAILABLE #include "llvm/TargetParser/X86TargetParser.h" #endif @@ -591,6 +593,12 @@ absl::Status CpuCompiler::RunHloPassesThroughLayoutAssn( }; pipeline.AddPass(upcaster_filter); + // For softmax, rewrite to custom calls with XNNPACK targets. 
+ bool enable_xnnpack = + xla::GetDebugOptionsFromFlags().xla_cpu_enable_xnnpack(); + if (enable_xnnpack) + pipeline.AddPass(); + // Expand random number generation. pipeline.AddPass(); pipeline.AddPass(RandomAlgorithm::RNG_PHILOX); diff --git a/third_party/xla/xla/service/cpu/cpu_runtime.cc b/third_party/xla/xla/service/cpu/cpu_runtime.cc index 1f8e9291f84c32..5b66495798d800 100644 --- a/third_party/xla/xla/service/cpu/cpu_runtime.cc +++ b/third_party/xla/xla/service/cpu/cpu_runtime.cc @@ -199,6 +199,52 @@ extern const char* const kHandleFfiCallSymbolName = "__xla_cpu_runtime_HandleFfiCall"; extern const char* const kXnnPackSoftMaxNDSymbolName = "__xla_cpu_runtime_XnnPackSoftMaxND"; +extern const char* const kArgMax3DParallelSymbolName = + "__xla_cpu_runtime_ArgMax3DParallel"; +extern const char* const kArgMax3DSequentialSymbolName = + "__xla_cpu_runtime_ArgMax3DSequential"; +extern const char* const kKernelSelectorGEMVSymbolName = + "__xla_cpu_runtime_KernelSelectorGEMV"; +extern const char* const kKernelSelectorGEMMSequentialSymbolName = + "__xla_cpu_runtime_KernelSelectorGEMMSequential"; +extern const char* const kKernelSelectorGEMMParallelSymbolName = + "__xla_cpu_runtime_KernelSelectorGEMMParallel"; +extern const char* const kKernelSelectorBatch3DSequentialSymbolName = + "__xla_cpu_runtime_KernelSelectorBatch3DSequential"; +extern const char* const kKernelSelectorBatch3DParallelSymbolName = + "__xla_cpu_runtime_KernelSelectorBatch3DParallel"; +#ifdef ENABLE_BLAS_MLIR +extern const char* const kKernelSelectorGEMVMLIRSymbolName = + "__xla_cpu_runtime_KernelSelectorGEMVMLIR"; +#endif // ENABLE_BLAS_MLIR +extern const char* const kKernelSelectorBatch4DSequentialSymbolName = + "__xla_cpu_runtime_KernelSelectorBatch4DSequential"; +extern const char* const kKernelSelectorBatch4DParallelSymbolName = + "__xla_cpu_runtime_KernelSelectorBatch4DParallel"; +#ifdef ENABLE_BLAS_MLIR +extern const char* const kKernelSelectorGEMMMLIRSymbolName = + 
"__xla_cpu_runtime_KernelSelectorGEMMMLIR"; +extern const char* const kKernelSelectorBatch3DMLIRSymbolName = + "__xla_cpu_runtime_KernelSelectorBatch3DMLIR"; +extern const char* const kKernelSelectorBatch4DMLIRSymbolName = + "__xla_cpu_runtime_KernelSelectorBatch4DMLIR"; +#endif // ENABLE_BLAS_MLIR +extern const char* const kKernelSelectorGEMVEmptySymbolName = + "__xla_cpu_runtime_KernelSelectorGEMVEmpty"; +extern const char* const kKernelSelectorGEMMEmptySymbolName = + "__xla_cpu_runtime_KernelSelectorGEMMEmpty"; +extern const char* const kKernelSelectorBatch3DEmptySymbolName = + "__xla_cpu_runtime_KernelSelectorBatch3DEmpty"; +extern const char* const kKernelSelectorBatch4DEmptySymbolName = + "__xla_cpu_runtime_KernelSelectorBatch4DEmpty"; +extern const char* const kArgMax3DEmptySymbolName = + "__xla_cpu_runtime_ArgMax3DEmpty"; +extern const char* const kKernelSelectorOperationGEMV = "GEMV"; +extern const char* const kKernelSelectorOperationGEMM = "GEMM"; +extern const char* const kKernelSelectorOperationBATCH3D = "BATCH3D"; +extern const char* const kKernelSelectorOperationBATCH4D = "BATCH4D"; +extern const char* const kKernelSelectorOperationARGMAX = "ARGMAX"; +extern const char* const kCustomCallKernelSelector = "KernelSelector"; namespace { diff --git a/third_party/xla/xla/service/cpu/cpu_runtime.h b/third_party/xla/xla/service/cpu/cpu_runtime.h index 31c7f9d0d86ef5..4469a468a2ff5c 100644 --- a/third_party/xla/xla/service/cpu/cpu_runtime.h +++ b/third_party/xla/xla/service/cpu/cpu_runtime.h @@ -98,6 +98,35 @@ extern const char* const kOneDnnConvolutionSymbolName; extern const char* const kOneDnnMatMulReorderSymbolName; extern const char* const kHandleFfiCallSymbolName; extern const char* const kXnnPackSoftMaxNDSymbolName; +extern const char* const kArgMax3DParallelSymbolName; +extern const char* const kArgMax3DSequentialSymbolName; +extern const char* const kKernelSelectorGEMVSymbolName; +extern const char* const kKernelSelectorGEMMSequentialSymbolName; 
+extern const char* const kKernelSelectorGEMMParallelSymbolName; +extern const char* const kKernelSelectorBatch3DSequentialSymbolName; +extern const char* const kKernelSelectorBatch3DParallelSymbolName; +extern const char* const kKernelSelectorBatch4DSequentialSymbolName; +extern const char* const kKernelSelectorBatch4DParallelSymbolName; +#ifdef ENABLE_BLAS_MLIR +extern const char* const kKernelSelectorGEMVMLIRSymbolName; +extern const char* const kKernelSelectorGEMMMLIRSymbolName; +extern const char* const kKernelSelectorBatch3DMLIRSymbolName; +extern const char* const kKernelSelectorBatch4DMLIRSymbolName; +#endif // ENABLE_BLAS_MLIR +extern const char* const kKernelSelectorGEMVEmptySymbolName; +extern const char* const kKernelSelectorGEMMEmptySymbolName; +extern const char* const kKernelSelectorBatch3DEmptySymbolName; +extern const char* const kKernelSelectorBatch4DEmptySymbolName; +extern const char* const kArgMax3DEmptySymbolName; + +// Kernel selector operation names. +extern const char* const kKernelSelectorOperationGEMV; +extern const char* const kKernelSelectorOperationGEMM; +extern const char* const kKernelSelectorOperationBATCH3D; +extern const char* const kKernelSelectorOperationBATCH4D; +extern const char* const kKernelSelectorOperationARGMAX; + +extern const char* const kCustomCallKernelSelector; // All symbol names for XLA CPU runtime functions need to start with this // prefix. diff --git a/third_party/xla/xla/service/cpu/ir_emitter.cc b/third_party/xla/xla/service/cpu/ir_emitter.cc index 2bd5d7278b07c5..f99308bcd6104f 100644 --- a/third_party/xla/xla/service/cpu/ir_emitter.cc +++ b/third_party/xla/xla/service/cpu/ir_emitter.cc @@ -110,9 +110,6 @@ limitations under the License. 
#include "xla/util.h" #include "xla/xla_data.pb.h" -#include "xnnpack_ops.h" -#include "xnnpack_ops_rewriter.h" - #if defined(INTEL_MKL) #include "xla/service/cpu/onednn_memory_util.h" #endif @@ -2499,6 +2496,184 @@ absl::Status IrEmitter::HandleXnnPackSoftMax(HloInstruction* hlo) { return absl::OkStatus(); } +absl::Status IrEmitter::HandleKernelSelectorArgMax(HloInstruction* hlo) { + OpMetadata metadata = hlo->metadata(); + + const HloInstruction* in1 = hlo->operand(0); + const HloInstruction* in2 = hlo->operand(1); + const HloInstruction* in3 = hlo->operand(2); + const HloInstruction* in4 = hlo->operand(3); + + Shape shape = in1->shape(); + TF_RET_CHECK(shape.dimensions().size() == 3); + + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(hlo)); + + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice input1_slice, + assignment_.GetUniqueSlice(in1, {})); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice input2_slice, + assignment_.GetUniqueSlice(in2, {})); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice out_values_slice, + assignment_.GetUniqueSlice(hlo, {0})); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice out_indices_slice, + assignment_.GetUniqueSlice(hlo, {1})); + + llvm::Value* values1_ptr = EmitBufferPointer(input1_slice, in1->shape()); + llvm::Value* values2_ptr = EmitBufferPointer(input2_slice, in2->shape()); + llvm::Value* out_values_ptr = + EmitBufferPointer(out_values_slice, hlo->shape().tuple_shapes(0)); + llvm::Value* out_indices_ptr = + EmitBufferPointer(out_indices_slice, hlo->shape().tuple_shapes(1)); + + float cst1_val = in3->literal().Get({}); + llvm::Constant* cst1 = llvm::ConstantFP::get(b()->getFloatTy(), cst1_val); + + EmitCallToFunc( + metadata.op_name(), + {/*run_options=*/GetExecutableRunOptionsArgument(), + /*B*/ b()->getInt64(shape.dimensions(0)), + /*M*/ b()->getInt64(shape.dimensions(1)), + /*N*/ b()->getInt64(shape.dimensions(2)), + /*invals*/ BitCast(values1_ptr, b()->getInt32Ty()->getPointerTo()), + /*inidxs*/ BitCast(values2_ptr, 
b()->getInt32Ty()->getPointerTo()), + /*init_value*/ cst1, + /*init_idx*/ b()->getInt32(in4->literal().Get({})), + /*outvals*/ BitCast(out_values_ptr, b()->getFloatTy()->getPointerTo()), + /*outidxs*/ BitCast(out_indices_ptr, b()->getInt32Ty()->getPointerTo())}, + b()->getVoidTy()); + + llvm_ir::EmitTuple(GetIrArrayFor(hlo), {out_values_ptr, out_indices_ptr}, + b()); + return absl::OkStatus(); +} + +absl::Status IrEmitter::HandleKernelSelectorBlas(HloInstruction* custom_call) { + OpMetadata metadata = custom_call->metadata(); + + bool isGEMV = (metadata.op_type() == runtime::kKernelSelectorOperationGEMV); + bool isGEMM = (metadata.op_type() == runtime::kKernelSelectorOperationGEMM); + bool isBATCHMATMUL3D = + (metadata.op_type() == runtime::kKernelSelectorOperationBATCH3D); + bool isBATCHMATMUL4D = + (metadata.op_type() == runtime::kKernelSelectorOperationBATCH4D); + bool isBATCHMATMUL = isBATCHMATMUL3D | isBATCHMATMUL4D; + + int operand = 0; + std::vector arguments; + + // | arguments | + // | gemm | batch3d | batch4d | gemv | + // ----------------------------------------- + // | trA | trA | trA | trA | + // | trB | trB | trB | | + // | A | A | A | A | + // | B | B | B | X | + // | | | Q | | + // | | P | P | | + // | M | M | M | M | + // | N | N | N | N | + // | K | K | K | | + // | alpha | | | alpha | + // | beta | | | beta | + + arguments.push_back(/*run_options=*/GetExecutableRunOptionsArgument()); + + // trA + HloInstruction const* trA = custom_call->operand(operand++); + bool tranA = trA->literal().Get({}); + arguments.push_back(b()->getInt1(tranA)); + + if (isGEMM || isBATCHMATMUL) { + // trB + HloInstruction const* trB = custom_call->operand(operand++); + bool tranB = trB->literal().Get({}); + arguments.push_back(b()->getInt1(tranB)); + } + + // A + HloInstruction const* A = custom_call->operand(operand++); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice a_slice, + assignment_.GetUniqueSlice(A, {})); + llvm::Value* A_ptr = EmitBufferPointer(a_slice, 
A->shape()); + arguments.push_back(A_ptr); + + // B (or X in GEMV) + HloInstruction const* B = custom_call->operand(operand++); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice b_slice, + assignment_.GetUniqueSlice(B, {})); + llvm::Value* B_ptr = EmitBufferPointer(b_slice, B->shape()); + arguments.push_back(B_ptr); + + if (isBATCHMATMUL) { + // Q + if (isBATCHMATMUL4D) { + HloInstruction const* Q = custom_call->operand(operand++); + int q = Q->literal().Get({}); + arguments.push_back(b()->getInt32(q)); + } + + // P + HloInstruction const* P = custom_call->operand(operand++); + int p = P->literal().Get({}); + arguments.push_back(b()->getInt32(p)); + } + + // M + HloInstruction const* M = custom_call->operand(operand++); + int m = M->literal().Get({}); + arguments.push_back(b()->getInt32(m)); + + // N + HloInstruction const* N = custom_call->operand(operand++); + int n = N->literal().Get({}); + arguments.push_back(b()->getInt32(n)); + + if (isGEMM || isBATCHMATMUL) { + // K + HloInstruction const* K = custom_call->operand(operand++); + int k = K->literal().Get({}); + arguments.push_back(b()->getInt32(k)); + } + + float beta = 0.0; + if (isGEMM || isGEMV) { + // Alpha + HloInstruction const* Alpha = custom_call->operand(operand++); + float alpha = Alpha->literal().Get({}); + llvm::Constant* alphaConst = llvm::ConstantFP::get(b()->getFloatTy(), alpha); + arguments.push_back(alphaConst); + + // Beta + HloInstruction const* Beta = custom_call->operand(operand++); + beta = Beta->literal().Get({}); + llvm::Constant* betaConst = llvm::ConstantFP::get(b()->getFloatTy(), beta); + arguments.push_back(betaConst); + } + + // C (or Y in GEMV) + HloInstruction const* C = custom_call; + + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice c_slice, + assignment_.GetUniqueSlice(C, {})); + llvm::Value* C_ptr = EmitBufferPointer(c_slice, C->shape()); + arguments.push_back(C_ptr); + + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(custom_call)); + + EmitCallToFunc(metadata.op_name(), 
arguments, b()->getVoidTy()); + + return absl::OkStatus(); +} + +absl::Status IrEmitter::HandleKernelSelector(HloInstruction* custom_call) { + OpMetadata metadata = custom_call->metadata(); + + if (metadata.op_type() == runtime::kKernelSelectorOperationARGMAX) + return HandleKernelSelectorArgMax(custom_call); + else + return HandleKernelSelectorBlas(custom_call); +} + #if defined(INTEL_MKL) // Emits operands alloca vector for oneDNN custom calls. @@ -2851,9 +3026,12 @@ absl::Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) { if (custom_call->custom_call_target() == "TopK") { return HandleTopK(custom_call); } - if (custom_call->custom_call_target() == kCustomCallXnnPackSoftMax) { + if (custom_call->custom_call_target() == "__xnnpack$softmax") { return HandleXnnPackSoftMax(custom_call); } + if (custom_call->custom_call_target() == runtime::kCustomCallKernelSelector) { + return HandleKernelSelector(custom_call); + } #if defined(INTEL_MKL) if (custom_call->custom_call_target() == "__onednn$matmul") { return HandleOneDnnMatMulCalls(custom_call, diff --git a/third_party/xla/xla/service/cpu/ir_emitter.h b/third_party/xla/xla/service/cpu/ir_emitter.h index b3d47d41c6ca69..9d668325d1618b 100644 --- a/third_party/xla/xla/service/cpu/ir_emitter.h +++ b/third_party/xla/xla/service/cpu/ir_emitter.h @@ -337,6 +337,9 @@ class IrEmitter : public DfsHloVisitorWithDefault, absl::Status HandleAllReduceSingleReplica(HloInstruction* crs); absl::Status HandleAllReduceMultipleReplica(HloInstruction* crs); absl::Status HandleXnnPackSoftMax(HloInstruction* hlo); + absl::Status HandleKernelSelector(HloInstruction* hlo); + absl::Status HandleKernelSelectorBlas(HloInstruction* hlo); + absl::Status HandleKernelSelectorArgMax(HloInstruction* hlo); #if defined(INTEL_MKL) std::vector EmitOneDnnOperandsAlloca(HloInstruction* custom_call, llvm::Value*& args_val, diff --git a/third_party/xla/xla/service/cpu/kernel_selector.cc b/third_party/xla/xla/service/cpu/kernel_selector.cc new 
file mode 100644 index 00000000000000..0ba46ab5989c44 --- /dev/null +++ b/third_party/xla/xla/service/cpu/kernel_selector.cc @@ -0,0 +1,423 @@ +/* Copyright 2025 Huawei. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "kernel_selector.h" + +#define EIGEN_USE_THREADS + +#include + +#include "tsl/platform/blocking_counter.h" +#include "unsupported/Eigen/CXX11/Tensor" +#include "xla/executable_run_options.h" +#include "xla/service/cpu/runtime_lightweight_check.h" + +namespace xla { +namespace cpu { + +// TODO: Need to test handling trA, trB +void __xla_cpu_runtime_KernelSelectorGEMMSequential( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int M, int N, int K, float alpha, float beta, float* C) { + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans; + int lda = trA ? M : K; + int ldb = trB ? 
K : N; + int ldc = N; + + cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +// TODO: Need to test handling trA, trB +void __xla_cpu_runtime_KernelSelectorGEMMParallel( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int M, int N, int K, float alpha, float beta, float* C) { + const xla::ExecutableRunOptions* run_options = + static_cast(run_options_ptr); + XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr); + const Eigen::ThreadPoolDevice* thread_pool = + (Eigen::ThreadPoolDevice*)(run_options->intra_op_thread_pool()); + Eigen::ThreadPoolInterface* eigen_interface_ = thread_pool->getPool(); + + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans; + int lda = trA ? M : K; + int ldb = trB ? K : N; + int ldc = N; + + float beta_v = beta; + if (beta == 0.0) { + beta_v = 1.0; + memset(C, 0.0, M * N * sizeof(float)); + } + + int njobs = eigen_interface_->NumThreads(); + + int sqrt_jobs = (int)sqrt(njobs); + + tsl::BlockingCounter bc(njobs); + + // TODO: Look at a more flexible way to distribute computation amongst + // threads. + for (int i = 0; i < sqrt_jobs; i++) { + for (int j = 0; j < sqrt_jobs; j++) { + int M_tile = M / sqrt_jobs; + int N_tile = N / sqrt_jobs; + + int M_start = i * M_tile; + int N_start = j * N_tile; + + int M_len = (i == sqrt_jobs - 1) ? (M - M_start) : M_tile; + int N_len = (j == sqrt_jobs - 1) ? 
(N - N_start) : N_tile; + + eigen_interface_->Schedule([=, &bc]() { + cblas_sgemm(Order, TransA, TransB, M_len, N_len, K, alpha, + &A[M_start * lda], lda, &B[N_start], ldb, beta_v, + &C[M_start * ldc + N_start], ldc); + bc.DecrementCount(); + }); + } + } + bc.Wait(); +} + +void __xla_cpu_runtime_KernelSelectorBatch3DSequential( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int P, int M, int N, int K, float* C) { + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans; + int lda = trA ? M : K; + int ldb = trB ? K : N; + int ldc = N; + + float alpha = 1.0; + float beta = 0.0; + + for (int i = 0; i < P; ++i) { + cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, &A[i * M * K], lda, + &B[i * K * N], ldb, beta, &C[i * M * N], ldc); + } +} + +void __xla_cpu_runtime_KernelSelectorBatch3DParallel( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int P, int M, int N, int K, float* C) { + const xla::ExecutableRunOptions* run_options = + static_cast(run_options_ptr); + XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr); + const Eigen::ThreadPoolDevice* thread_pool = + (Eigen::ThreadPoolDevice*)(run_options->intra_op_thread_pool()); + Eigen::ThreadPoolInterface* eigen_interface_ = thread_pool->getPool(); + + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans; + int lda = trA ? M : K; + int ldb = trB ? K : N; + int ldc = N; + + float alpha = 1.0; + float beta = 0.0; + + int njobs = eigen_interface_->NumThreads(); + + int num_batches = P; + + tsl::BlockingCounter bc(num_batches < njobs ? 
num_batches : njobs); + + // parallelize batches + int PB = (num_batches) / njobs; + int rem = (num_batches) % njobs; + + // TODO: Need to test handling trA + for (int batchIdx = 0, threadIdx = 0; batchIdx < num_batches; threadIdx++) { + int adjPB = PB + (threadIdx < rem ? 1 : 0); + + eigen_interface_->Schedule([=, &bc]() { + for (int i = 0; i < adjPB; i++) { + const float* AA = &A[(batchIdx + i) * M * K]; + const float* BB = &B[(batchIdx + i) * K * N]; + float* CC = &C[(batchIdx + i) * M * N]; + cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, AA, lda, BB, ldb, + beta, CC, ldc); + } + bc.DecrementCount(); + }); + + batchIdx += adjPB; + } + bc.Wait(); +} + +void __xla_cpu_runtime_KernelSelectorGEMV(const void* run_options_ptr, bool trA, + const float* A, const float* X, int M, + int N, float alpha, float beta, + float* Y) { + int lda = trA ? M : N; + int incX = 1; + int incY = 1; + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans; + cblas_sgemv(Order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY); +} + +#ifdef ENABLE_BLAS_MLIR +void __xla_cpu_runtime_KernelSelectorGEMMMLIR(const void* run_options_ptr, + bool trA, bool trB, + const float* A, const float* B, + int M, int N, int K, float alpha, + float beta, float* C) { + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans; + int lda = trA ? M : K; + int ldb = trB ? K : N; + int ldc = N; + + float beta_v = beta; + if (beta == 0.0) { + beta_v = 1.0; + memset(C, 0.0, M * N * sizeof(float)); + } + + cblas_sgemm_mlir(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta_v, C, ldc); +} + +void __xla_cpu_runtime_KernelSelectorBatch3DMLIR(const void* run_options_ptr, + bool trA, bool trB, + const float* A, const float* B, + int P, int M, int N, int K, + float* C) { + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? 
CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans; + int lda = trA ? M : K; + int ldb = trB ? K : N; + int ldc = N; + + cblas_sbatch_matmul_mlir(Order, TransA, TransB, P, M, N, K, A, lda, B, ldb, C, + ldc); +} + +void __xla_cpu_runtime_KernelSelectorBatch4DMLIR(const void* run_options_ptr, + bool trA, bool trB, + const float* A, const float* B, + int Q, int P, int M, int N, + int K, float* C) { + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans; + int lda = trA ? M : K; + int ldb = trB ? K : N; + int ldc = N; + + cblas_sbatch_matmul_4d_mlir(Order, TransA, TransB, Q, P, M, N, K, A, lda, B, + ldb, C, ldc); +} +#endif // ENABLE_BLAS_MLIR + +void __xla_cpu_runtime_KernelSelectorBatch4DSequential( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int Q, int P, int M, int N, int K, float* C) { + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans; + int lda = trA ? M : K; + int ldb = trB ? K : N; + int ldc = N; + + float alpha = 1.0; + float beta = 0.0; + + for (int i = 0; i < Q * P; ++i) { + cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, &A[i * M * K], lda, + &B[i * K * N], ldb, beta, &C[i * M * N], ldc); + } +} + +void __xla_cpu_runtime_KernelSelectorBatch4DParallel( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int Q, int P, int M, int N, int K, float* C) { + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans; + int lda = trA ? M : K; + int ldb = trB ? 
K : N; + int ldc = N; + + float alpha = 1.0; + float beta = 0.0; + + const xla::ExecutableRunOptions* run_options = + static_cast(run_options_ptr); + XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr); + const Eigen::ThreadPoolDevice* thread_pool = + (Eigen::ThreadPoolDevice*)(run_options->intra_op_thread_pool()); + Eigen::ThreadPoolInterface* eigen_interface_ = thread_pool->getPool(); + + int njobs = eigen_interface_->NumThreads(); + + int num_batches = P * Q; + + tsl::BlockingCounter bc(num_batches < njobs ? num_batches : njobs); + + // parallelize batches + int PB = (num_batches) / njobs; + int rem = (num_batches) % njobs; + + // TODO: Need to test handling trA + for (int batchIdx = 0, threadIdx = 0; batchIdx < num_batches; threadIdx++) { + int adjPB = PB + (threadIdx < rem ? 1 : 0); + + eigen_interface_->Schedule([=, &bc]() { + for (int i = 0; i < adjPB; i++) { + const float* AA = &A[(batchIdx + i) * M * K]; + const float* BB = &B[(batchIdx + i) * K * N]; + float* CC = &C[(batchIdx + i) * M * N]; + cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, AA, lda, BB, ldb, + beta, CC, ldc); + } + bc.DecrementCount(); + }); + + batchIdx += adjPB; + } + bc.Wait(); +} + +#ifdef ENABLE_BLAS_MLIR +void __xla_cpu_runtime_KernelSelectorGEMVMLIR(const void* run_options_ptr, + bool trA, const float* A, + const float* X, int M, int N, + float alpha, float beta, + float* Y) { + int lda = trA ? M : N; + int incX = 1; + int incY = 1; + CBLAS_LAYOUT Order = CblasRowMajor; + CBLAS_TRANSPOSE TransA = (trA) ? 
CblasTrans : CblasNoTrans; + + cblas_sgemv_mlir(Order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY); +} +#endif // ENABLE_BLAS_MLIR + +void __xla_cpu_runtime_ArgMaxTask(size_t out_idx, int N, float* invals, + int32_t* inidxs, float init_value, + int32_t init_idx, float* outvals, + int32_t* outidxs) { + float maxval = init_value; + int32_t maxidx = init_idx; + size_t idx = (out_idx)*N; + + for (int i = 0; i < N; i++) { + float val = invals[idx]; + int32_t idx_val = inidxs[idx]; + + if (val >= maxval) { + maxval = val; + maxidx = idx_val; + } + + idx++; + } + + outvals[out_idx] = maxval; + outidxs[out_idx] = maxidx; +} + +void __xla_cpu_runtime_ArgMax3DParallel(const void* run_options_ptr, int B, + int M, int N, float* invals, + int32_t* inidxs, float init_value, + int32_t init_idx, float* outvals, + int32_t* outidxs) { + const xla::ExecutableRunOptions* run_options = + static_cast(run_options_ptr); + XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr); + const Eigen::ThreadPoolDevice* thread_pool = + (Eigen::ThreadPoolDevice*)(run_options->intra_op_thread_pool()); + Eigen::ThreadPoolInterface* eigen_interface_ = thread_pool->getPool(); + + int BM = B * M; + int num_threads = eigen_interface_->NumThreads(); + const int block_size = (BM + num_threads - 1) / num_threads; + tsl::BlockingCounter bc(num_threads); + + for (size_t t = 0; t < num_threads; t++) { + size_t start = t * block_size; + size_t end = std::min((t + 1) * block_size, BM); + + eigen_interface_->ScheduleWithHint( + [=, &bc]() { + for (size_t bm = start; bm < end; bm++) { + __xla_cpu_runtime_ArgMaxTask(bm, N, invals, inidxs, init_value, + init_idx, outvals, outidxs); + } + bc.DecrementCount(); + }, + t, t + 1); + } + + bc.Wait(); +} + +void __xla_cpu_runtime_ArgMax3DSequential(const void* run_options_ptr, int B, + int M, int N, float* invals, + int32_t* inidxs, float init_value, + int32_t init_idx, float* outvals, + int32_t* outidxs) { + // NB: run_options_ptr is ignored in the 
sequential version. + for (int b = 0; b < B; b++) { + for (int m = 0; m < M; m++) { + size_t out_idx = b * M + m; + __xla_cpu_runtime_ArgMaxTask(out_idx, N, invals, inidxs, init_value, + init_idx, outvals, outidxs); + } + } +} + +void __xla_cpu_runtime_ArgMax3DEmpty(const void* run_options_ptr, int B, int M, + int N, float* invals, int32_t* inidxs, + float init_value, int32_t init_idx, + float* outvals, int32_t* outidxs) {} + +void __xla_cpu_runtime_KernelSelectorGEMVEmpty(const void* run_options_ptr, + bool trA, const float* A, + const float* X, int M, int N, + float alpha, float beta, + float* Y) {} + +void __xla_cpu_runtime_KernelSelectorGEMMEmpty(const void* run_options_ptr, + bool trA, bool trB, + const float* A, const float* B, + int m, int n, int k, float alpha, + float beta, float* C) {} + +void __xla_cpu_runtime_KernelSelectorBatch3DEmpty(const void* run_options_ptr, + bool trA, bool trB, + const float* A, + const float* B, int P, int M, + int N, int K, float* C) {} + +void __xla_cpu_runtime_KernelSelectorBatch4DEmpty( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int Q, int P, int M, int N, int K, float* C) {} + +} // namespace cpu +} // namespace xla diff --git a/third_party/xla/xla/service/cpu/kernel_selector.h b/third_party/xla/xla/service/cpu/kernel_selector.h new file mode 100644 index 00000000000000..beb64d033f6b99 --- /dev/null +++ b/third_party/xla/xla/service/cpu/kernel_selector.h @@ -0,0 +1,191 @@ +/* Copyright 2025 Huawei. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef XLA_SERVICE_CPU_KERNEL_SELECTOR_H_ +#define XLA_SERVICE_CPU_KERNEL_SELECTOR_H_ +#include + +namespace xla { +namespace cpu { + +#ifndef OPENBLAS_CONST +#define OPENBLAS_CONST const +#endif + +typedef enum CBLAS_ORDER { + CblasRowMajor = 101, + CblasColMajor = 102 +} CBLAS_ORDER; + +typedef enum CBLAS_TRANSPOSE { + CblasNoTrans = 111, + CblasTrans = 112, + CblasConjTrans = 113, + CblasConjNoTrans = 114 +} CBLAS_TRANSPOSE; + +typedef int blasint; +typedef CBLAS_ORDER CBLAS_LAYOUT; + +extern "C" { + +// BLAS interface +extern void cblas_sgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, + OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, + OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, + OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, + OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, + OPENBLAS_CONST float* A, OPENBLAS_CONST blasint lda, + OPENBLAS_CONST float* B, OPENBLAS_CONST blasint ldb, + OPENBLAS_CONST float beta, float* C, + OPENBLAS_CONST blasint ldc); + +extern void cblas_sgemv(OPENBLAS_CONST enum CBLAS_ORDER order, + OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, + OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, + OPENBLAS_CONST float alpha, OPENBLAS_CONST float* a, + OPENBLAS_CONST blasint lda, OPENBLAS_CONST float* x, + OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, + float* y, OPENBLAS_CONST blasint incy); + +#ifdef ENABLE_BLAS_MLIR +// MLIR LIB +extern void cblas_sbatch_matmul_mlir( + const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const blasint P, const blasint M, + const blasint N, const blasint K, const float* A, const blasint lda, + const float* B, const blasint ldb, float* C, const blasint ldc); + +extern void cblas_sbatch_matmul_4d_mlir( + const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, 
+ const enum CBLAS_TRANSPOSE TransB, const blasint Q, const blasint P, + const blasint M, const blasint N, const blasint K, const float* A, + const blasint lda, const float* B, const blasint ldb, float* C, + const blasint ldc); + +extern void cblas_sgemm_mlir(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const blasint M, + const blasint N, const blasint K, + const float alpha, const float* A, + const blasint lda, const float* B, + const blasint ldb, const float beta, float* C, + const blasint ldc); + +extern void cblas_sgemv_mlir(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, const blasint M, + const blasint N, const float alpha, const float* A, + const blasint lda, const float* X, + const blasint incX, const float beta, float* Y, + const blasint incY); +#endif // ENABLE_BLAS_MLIR +} // extern "C" + +void __xla_cpu_runtime_KernelSelectorGEMMSequential( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int M, int N, int K, float alpha, float beta, float* C); + +void __xla_cpu_runtime_KernelSelectorGEMMParallel( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int m, int n, int k, float alpha, float beta, float* C); + +void __xla_cpu_runtime_KernelSelectorBatch3DSequential( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int P, int M, int N, int K, float* C); + +void __xla_cpu_runtime_KernelSelectorBatch3DParallel( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int P, int M, int N, int K, float* C); + +void __xla_cpu_runtime_KernelSelectorBatch4DSequential( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int Q, int P, int M, int N, int K, float* C); + +void __xla_cpu_runtime_KernelSelectorBatch4DParallel( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int Q, int P, int M, 
int N, int K, float* C); + +void __xla_cpu_runtime_KernelSelectorGEMV(const void* run_options_ptr, bool trA, + const float* A, const float* X, int M, + int N, float alpha, float beta, + float* Y); + +#ifdef ENABLE_BLAS_MLIR +void __xla_cpu_runtime_KernelSelectorGEMMMLIR(const void* run_options_ptr, + bool trA, bool trB, + const float* A, const float* B, + int m, int n, int k, float alpha, + float beta, float* C); + +void __xla_cpu_runtime_KernelSelectorBatch3DMLIR(const void* run_options_ptr, + bool trA, bool trB, + const float* A, const float* B, + int P, int M, int N, int K, + float* C); + +void __xla_cpu_runtime_KernelSelectorBatch4DMLIR(const void* run_options_ptr, + bool trA, bool trB, + const float* A, const float* B, + int Q, int P, int M, int N, + int K, float* C); + +void __xla_cpu_runtime_KernelSelectorGEMVMLIR(const void* run_options_ptr, + bool trA, const float* A, + const float* X, int M, int N, + float alpha, float beta, + float* Y); +#endif // ENABLE_BLAS_MLIR + +void __xla_cpu_runtime_ArgMax3DParallel(const void* run_options_ptr, int B, + int M, int N, float* invals, + int32_t* inidxs, float init_value, + int32_t init_idx, float* outvals, + int32_t* outidxs); +void __xla_cpu_runtime_ArgMax3DSequential(const void* run_options_ptr, int B, + int M, int N, float* invals, + int32_t* inidxs, float init_value, + int32_t init_idx, float* outvals, + int32_t* outidxs); + +void __xla_cpu_runtime_ArgMax3DEmpty(const void* run_options_ptr, int B, int M, + int N, float* invals, int32_t* inidxs, + float init_value, int32_t init_idx, + float* outvals, int32_t* outidxs); + +void __xla_cpu_runtime_KernelSelectorGEMVEmpty(const void* run_options_ptr, + bool trA, const float* A, + const float* X, int M, int N, + float alpha, float beta, + float* Y); + +void __xla_cpu_runtime_KernelSelectorGEMMEmpty(const void* run_options_ptr, + bool trA, bool trB, + const float* A, const float* B, + int m, int n, int k, float alpha, + float beta, float* C); + +void 
__xla_cpu_runtime_KernelSelectorBatch3DEmpty(const void* run_options_ptr, + bool trA, bool trB, + const float* A, + const float* B, int P, int M, + int N, int K, float* C); + +void __xla_cpu_runtime_KernelSelectorBatch4DEmpty( + const void* run_options_ptr, bool trA, bool trB, const float* A, + const float* B, int Q, int P, int M, int N, int K, float* C); + +} // namespace cpu +} // namespace xla + +#endif // XLA_SERVICE_CPU_KERNEL_SELECTOR_H_ diff --git a/third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.cc b/third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.cc new file mode 100644 index 00000000000000..79868054c13ed1 --- /dev/null +++ b/third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.cc @@ -0,0 +1,658 @@ +/* Copyright 2025 Huawei. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "kernel_selector_ops_rewriter.h" + +#include +#include +#include +#include + +#include "xla/hlo/ir/dfs_hlo_visitor_with_default.h" +#include "xla/hlo/ir/hlo_casting_utils.h" +#include "xla/literal_util.h" +#include "xla/service/cpu/cpu_runtime.h" + +namespace xla { +namespace cpu { + +// Uncomment to get printed information about the sizes and the call selected. 
+#define PRINT_DEBUG + +#ifdef PRINT_DEBUG +#include +#define DEBUG(x) std::cerr << x << "\n"; +#else +#define DEBUG(x) \ + do { \ + } while (0); +#endif + +enum Operation { NONE, GEMV, GEMM, BATCH_MATMUL_3D, BATCH_MATMUL_4D }; +enum KernelType { kGEMV, kGEMM, kBATCH3D, kBATCH4D, kARGMAX }; + +using Range = std::pair; +using RangeSet = std::vector; + +Range maxRange = {0, INT_MAX}; + +class IntervalMap { + using TypedRange = std::pair; + std::map m_map; + + public: + void insert(KernelType kTy, RangeSet& ranges, std::string& value) { + m_map[{kTy, ranges}] = value; + } + + bool lookup(KernelType kTy, std::vector& keys, std::string& outValue, + bool& fallback) const { + fallback = false; + for (const auto& entry : m_map) { + TypedRange typedRange = entry.first; + std::string value = entry.second; + if (typedRange.first != kTy) continue; + + const RangeSet& ranges = typedRange.second; + if (ranges.size() != keys.size()) continue; + + bool match = true; + for (size_t i = 0; i < ranges.size(); ++i) { + if (keys[i] < ranges[i].first || keys[i] > ranges[i].second) { + match = false; + break; + } + if (ranges[i] == maxRange) { + fallback = true; + } + } + + if (match) { + outValue = value; + return true; + } + } + return false; + } + + void print() const { + for (const auto& entry : m_map) { + TypedRange typedRange = entry.first; + std::string value = entry.second; + int kTy = typedRange.first; + const RangeSet& ranges = typedRange.second; + + DEBUG("[" << kTy << "]("); + for (const auto& range : ranges) { + DEBUG("[" << range.first << ":" << range.second << "] "); + } + DEBUG(") -> " << value << "\n"); + } + } + + void clear() { m_map.clear(); } +}; + +struct ParsedData { + std::string kernelName; + RangeSet sizes; + std::string functionName; + bool isValid; +}; + +std::map kernelStringToType = {{"gemv", kGEMV}, + {"gemm", kGEMM}, + {"batch3d", kBATCH3D}, + {"batch4d", kBATCH4D}, + {"argmax", kARGMAX}}; +std::map kernelTypeToString; // filled automatically. 
+ +std::map kernelTypeToSizeRank = { + {kGEMV, 2}, {kGEMM, 3}, {kARGMAX, 3}, {kBATCH3D, 4}, {kBATCH4D, 5}}; + +int parseInt(const std::string& str) { + if (str == "*") return maxRange.second; + + int size = std::stoi(str); + if (size < 0) { + LOG(ERROR) << "Found invalid size: " << size; + return -1; + } + + return size; +} + +Range parseRange(const std::string& str) { + size_t colonPos = str.find(':'); + + if (str == "*") { + return maxRange; + } + + // For non-range strings like "1" we create a range {1,1} + if (colonPos == std::string::npos) { + int value = parseInt(str); + return {value, value}; + } + + auto left = str.substr(0, colonPos); + auto right = str.substr(colonPos + 1); + + int start = parseInt(left); + int end = parseInt(right); + + assert(start <= end); + + return {start, end}; +} + +// Parses line from the mapping file which look like [kernel](size1,size2,...) +// -> symbol +ParsedData parseLine(std::string& line) { + // Remove all whitespace from the line first. + line.erase(std::remove_if(line.begin(), line.end(), ::isspace), line.end()); + // A range looks like 23:29 or 12:* + std::string range = R"(\d+:(?:\d+|\*))"; + // An element is either a number, a *, or a range + std::string element = R"((?:\d+|\*|)" + range + R"())"; + // Sizes is a list of elements in parentheses + std::string sizes = R"(\(((?:)" + element + R"(,)*)" + element + R"()\))"; + std::regex pattern(R"(^\[(.+)\])" + sizes + R"(->(.+))"); + + std::smatch matches; + + ParsedData data; + data.isValid = false; + + if (std::regex_match(line, matches, pattern)) { + data.kernelName = matches[1]; + std::stringstream ss(matches[2]); + std::string token; + + while (std::getline(ss, token, ',')) { + auto range = parseRange(token); + if (range.first == -1 || range.second == -1) return data; + data.sizes.push_back(range); + } + data.functionName = matches[3]; + data.isValid = true; + } else { + XLA_VLOG_LINES(3, "KernelSelectorOpsRewriter::parseLine() : No match.\n"); + } + + return data; 
+} + +IntervalMap sizesToSymbol; + +const char* kernel_map_file = std::getenv("KERNEL_MAP_FILE"); + +void fill_map_from_file(const char* map_file, IntervalMap& map) { + if (!map_file) { + XLA_VLOG_LINES(3, "NO MAP FILE\n"); + return; + } + + std::ifstream file(map_file); + if (!file.is_open()) { + std::string file_name(map_file); + XLA_VLOG_LINES(3, + "KernelSelectorOpsRewriter::fill_map_from_file() : Cannot " + "open file. \n"); + return; + } + + // Clear the map to prevent conflicts and unexpected + // behaviour due to default pre-filled values. + map.clear(); + + std::string line; + int lineno = 1; + while (std::getline(file, line)) { + // If the file we are reading has Windows line endings, make sure + // we remove the `\r` before processing the regex, otherwise it will + // not match. + if (!line.empty() && line.back() == '\r') { + line.pop_back(); + } + + ParsedData data = parseLine(line); + if (!data.isValid) { + LOG(ERROR) << "Regex did not match on line " << lineno; + } else { + if (kernelStringToType.find(data.kernelName) == + kernelStringToType.end()) { + LOG(ERROR) << data.kernelName << " is not a valid kernel type"; + return; + } + + KernelType kTy = kernelStringToType[data.kernelName]; + int expectedRank = kernelTypeToSizeRank[kTy]; + + // Fallback case (i.e. 
lines like [gemm](*) -> symbol): store in the map + // the correct amount of "infinite" ranges: + if (data.sizes.size() == 1 && data.sizes[0] == maxRange) { + data.sizes.assign(expectedRank, maxRange); + } + + if (data.sizes.size() != expectedRank) { + LOG(ERROR) << data.kernelName + << " expected to have an input size of rank " << expectedRank + << ", but got " << data.sizes.size() << "(line " << lineno + << ")"; + } else { + map.insert(kTy, data.sizes, data.functionName); + } + } + lineno++; + } + + return; +} + +class KernelSelectorOpsRewriterVisitor : public DfsHloRewriteVisitor { + private: + void printDebugMessage(KernelType kTy, std::vector sizes) { + std::string debug_msg = "{"; + for (size_t i = 0; i < sizes.size(); ++i) { + debug_msg += std::to_string(sizes[i]); + if (i != sizes.size() - 1) { + debug_msg += ", "; + } + } + debug_msg += + "} -> Is not on the map and a fallback was not specified. The " + + kernelTypeToString[kTy] + " will not be replaced."; + + DEBUG(debug_msg); + } + + std::string GetKernelSelectorFunction(KernelType kTy, std::vector sizes, + bool& fallback) { + std::string fun_name; + bool found = sizesToSymbol.lookup(kTy, sizes, fun_name, fallback); + fallback = false; + + if (!found) { +#ifdef PRINT_DEBUG + printDebugMessage(kTy, sizes); +#endif + } + return fun_name; + } + + Operation getOperation(HloInstruction* instr) { + if (auto* dot = DynCast(instr)) { + auto batch_dims = dot->dot_dimension_numbers().lhs_batch_dimensions(); + auto dims = dot->shape().dimensions(); + if (batch_dims.size() == 1) { + return Operation::BATCH_MATMUL_3D; + } + if (batch_dims.size() == 2) { + return Operation::BATCH_MATMUL_4D; + } + if (dims.size() == 1) { + return Operation::GEMV; + } + if (batch_dims.empty()) { + return Operation::GEMM; + } + } + return Operation::NONE; + } + + template + HloInstruction* makeConstant(HloInstruction* op, T value) { + auto litteral = LiteralUtil::CreateR0(value); + return op->AddInstruction( + 
HloInstruction::CreateConstant(std::move(litteral))); + } + +#ifdef PRINT_DEBUG + std::map, std::string> AllocatedGemmSizes; + std::map, std::string> AllocatedGemvSizes; + std::map, std::string> AllocatedBatchMatmul3DSizes; + std::map, std::string> AllocatedBatchMatmul4DSizes; + std::map, std::string> AllocatedArgMax3DSizes; +#endif + + public: + absl::Status HandleDot(HloInstruction* dot) override { + Operation operation = getOperation(dot); + if (operation == Operation::NONE) { + return absl::OkStatus(); + } + bool fallbackSelected; + + // Collect all the operands for the CustomCall + switch (operation) { + case GEMM: { + KernelType kTy = kGEMM; + auto dnums = dot->dot_dimension_numbers(); + auto lhs_contracting_dims = dnums.lhs_contracting_dimensions(); + auto rhs_contracting_dims = dnums.rhs_contracting_dimensions(); + + assert(lhs_contracting_dims.size() == 1); + assert(rhs_contracting_dims.size() == 1); + + HloInstruction* trA = makeConstant(dot, lhs_contracting_dims[0] == 0); + HloInstruction* trB = makeConstant(dot, rhs_contracting_dims[0] == 1); + + HloInstruction* alpha = makeConstant(dot, (float)1.0); + HloInstruction* beta = makeConstant(dot, (float)0.0); + + HloInstruction* A = dot->operands()[0]; + HloInstruction* B = dot->operands()[1]; + + int m = dot->shape().dimensions(0); + HloInstruction* M = makeConstant(dot, m); + + int n = dot->shape().dimensions(1); + HloInstruction* N = makeConstant(dot, n); + + int k = A->shape().dimensions(lhs_contracting_dims[0]); + HloInstruction* K = makeConstant(dot, k); + + std::string fun_name = + GetKernelSelectorFunction(kTy, {m, n, k}, fallbackSelected); + if (fun_name.empty()) return absl::OkStatus(); + +#ifdef PRINT_DEBUG + if (AllocatedGemmSizes.find({m, n, k}) == AllocatedGemmSizes.end()) { + AllocatedGemmSizes[{m, n, k}] = fun_name; + DEBUG("{m: " << m << ", n: " << n << ", k: " << k << "} -> " + << fun_name << (fallbackSelected ? 
" (fallback)" : "")); + } +#endif + + std::vector operands = {trA, trB, A, B, M, + N, K, alpha, beta}; + + HloInstruction* kernel_selector_call = + dot->AddInstruction(HloInstruction::CreateCustomCall( + dot->shape(), operands, runtime::kCustomCallKernelSelector)); + + // Add metadata + OpMetadata metadata = dot->metadata(); + metadata.set_op_name(fun_name); + metadata.set_op_type(runtime::kKernelSelectorOperationGEMM); + kernel_selector_call->set_metadata(metadata); + TF_RETURN_IF_ERROR(ReplaceInstruction(dot, kernel_selector_call)); + + break; + } + case GEMV: { + KernelType kTy = kGEMV; + auto dnums = dot->dot_dimension_numbers(); + auto lhs_contracting_dims = dnums.lhs_contracting_dimensions(); + + assert(lhs_contracting_dims.size() == 1); + + bool is_trA = lhs_contracting_dims[0] == 0; + HloInstruction* trA = makeConstant(dot, is_trA); + + HloInstruction* alpha = makeConstant(dot, (float)1.0); + HloInstruction* beta = makeConstant(dot, (float)0.0); + + HloInstruction* A = dot->operands()[0]; + HloInstruction* X = dot->operands()[1]; + + int m = A->shape().dimensions(is_trA ? 1 : 0); + HloInstruction* M = makeConstant(dot, m); + + int n = A->shape().dimensions(is_trA ? 0 : 1); + HloInstruction* N = makeConstant(dot, n); + + std::string fun_name = + GetKernelSelectorFunction(kTy, {m, n}, fallbackSelected); + if (fun_name.empty()) return absl::OkStatus(); + +#ifdef PRINT_DEBUG + if (AllocatedGemvSizes.find({m, n}) == AllocatedGemvSizes.end()) { + AllocatedGemvSizes[{m, n}] = fun_name; + DEBUG("{m: " << m << ", n: " << n << "} -> " << fun_name + << (fallbackSelected ? 
" (fallback)" : "")); + } +#endif + + std::vector operands = {trA, A, X, M, N, alpha, beta}; + + HloInstruction* kernel_selector_call = + dot->AddInstruction(HloInstruction::CreateCustomCall( + dot->shape(), operands, runtime::kCustomCallKernelSelector)); + + // Add metadata + OpMetadata metadata = dot->metadata(); + metadata.set_op_name(fun_name); + metadata.set_op_type(runtime::kKernelSelectorOperationGEMV); + kernel_selector_call->set_metadata(metadata); + TF_RETURN_IF_ERROR(ReplaceInstruction(dot, kernel_selector_call)); + + break; + } + case BATCH_MATMUL_3D: { + KernelType kTy = kBATCH3D; + auto dnums = dot->dot_dimension_numbers(); + auto lhs_contracting_dims = dnums.lhs_contracting_dimensions(); + auto rhs_contracting_dims = dnums.rhs_contracting_dimensions(); + + assert(lhs_contracting_dims.size() == 1); + assert(rhs_contracting_dims.size() == 1); + + HloInstruction* trA = makeConstant(dot, lhs_contracting_dims[0] == 1); + HloInstruction* trB = makeConstant(dot, rhs_contracting_dims[0] == 2); + + HloInstruction* A = dot->operands()[0]; + HloInstruction* B = dot->operands()[1]; + + int p = dot->shape().dimensions(0); + HloInstruction* P = makeConstant(dot, p); + + int num_batch_dims = dnums.lhs_batch_dimensions_size(); + + int m = dot->shape().dimensions(num_batch_dims); + HloInstruction* M = makeConstant(dot, m); + + int n = dot->shape().dimensions(num_batch_dims + 1); + HloInstruction* N = makeConstant(dot, n); + + int k = A->shape().dimensions(lhs_contracting_dims[0]); + HloInstruction* K = makeConstant(dot, k); + + std::string fun_name = + GetKernelSelectorFunction(kTy, {p, m, n, k}, fallbackSelected); + if (fun_name.empty()) return absl::OkStatus(); + +#ifdef PRINT_DEBUG + if (AllocatedBatchMatmul3DSizes.find({p, m, n, k}) == + AllocatedBatchMatmul3DSizes.end()) { + AllocatedBatchMatmul3DSizes[{p, m, n, k}] = fun_name; + DEBUG("{p: " << p << ", m: " << m << ", n: " << n << ", k: " << k + << "} -> " << fun_name + << (fallbackSelected ? 
" (fallback)" : "")); + } +#endif + + std::vector operands = {trA, trB, A, B, P, M, N, K}; + + HloInstruction* kernel_selector_call = + dot->AddInstruction(HloInstruction::CreateCustomCall( + dot->shape(), operands, runtime::kCustomCallKernelSelector)); + + // Add metadata + OpMetadata metadata = dot->metadata(); + metadata.set_op_name(fun_name); + metadata.set_op_type(runtime::kKernelSelectorOperationBATCH3D); + kernel_selector_call->set_metadata(metadata); + TF_RETURN_IF_ERROR(ReplaceInstruction(dot, kernel_selector_call)); + + break; + } + case BATCH_MATMUL_4D: { + KernelType kTy = kBATCH4D; + auto dnums = dot->dot_dimension_numbers(); + auto lhs_contracting_dims = dnums.lhs_contracting_dimensions(); + auto rhs_contracting_dims = dnums.rhs_contracting_dimensions(); + + assert(lhs_contracting_dims.size() == 1); + assert(rhs_contracting_dims.size() == 1); + + HloInstruction* trA = makeConstant(dot, lhs_contracting_dims[0] == 2); + HloInstruction* trB = makeConstant(dot, rhs_contracting_dims[0] == 3); + + HloInstruction* A = dot->operands()[0]; + HloInstruction* B = dot->operands()[1]; + + int q = dot->shape().dimensions(0); + HloInstruction* Q = makeConstant(dot, q); + + int p = dot->shape().dimensions(1); + HloInstruction* P = makeConstant(dot, p); + + int num_batch_dims = dnums.lhs_batch_dimensions_size(); + + int m = dot->shape().dimensions(num_batch_dims); + HloInstruction* M = makeConstant(dot, m); + + int n = dot->shape().dimensions(num_batch_dims + 1); + HloInstruction* N = makeConstant(dot, n); + + int k = A->shape().dimensions(lhs_contracting_dims[0]); + HloInstruction* K = makeConstant(dot, k); + + std::string fun_name = GetKernelSelectorFunction(kTy, {q, p, m, n, k}, fallbackSelected); + + if (fun_name.empty()) return absl::OkStatus(); + +#ifdef PRINT_DEBUG + if (AllocatedBatchMatmul4DSizes.find({q, p, m, n, k}) == + AllocatedBatchMatmul4DSizes.end()) { + AllocatedBatchMatmul4DSizes[{q, p, m, n, k}] = fun_name; + DEBUG("{q: " << q << ", p: " << p << ", 
m: " << m << ", n: " << n + << ", k: " << k << "} -> " << fun_name + << (fallbackSelected ? " (fallback)" : "")); + } +#endif + + std::vector operands = {trA, trB, A, B, Q, P, M, N, K}; + + HloInstruction* kernel_selector_call = + dot->AddInstruction(HloInstruction::CreateCustomCall( + dot->shape(), operands, runtime::kCustomCallKernelSelector)); + + // Add metadata + OpMetadata metadata = dot->metadata(); + metadata.set_op_name(fun_name); + metadata.set_op_type(runtime::kKernelSelectorOperationBATCH4D); + kernel_selector_call->set_metadata(metadata); + TF_RETURN_IF_ERROR(ReplaceInstruction(dot, kernel_selector_call)); + + break; + } + default: + DEBUG("No library funcion was selected."); + return absl::OkStatus(); + } + + return absl::OkStatus(); + } + + absl::Status HandleReduce(HloInstruction* reduce) override { + bool fallbackSelected; + std::string op_type = reduce->metadata().op_type(); + // TODO: Is this reliable way to check for ArgMax? + // Works for BERT but its unclear if this is the proper way. + if (op_type != "ArgMax") { + return absl::OkStatus(); + } + + auto reduceOpr = reduce->operands(); + // The ArgMax pattern we support has exactly 4 operands. + if (reduceOpr.size() != 4) { + return absl::OkStatus(); + } + + // We currently only support 3D ArgMax. + auto dims = reduceOpr[0]->shape().dimensions(); + if (dims.size() != 3) { + return absl::OkStatus(); + } + + KernelType kTy = kARGMAX; + int b = dims[0]; + int m = dims[1]; + int n = dims[2]; + + std::string fun_name = GetKernelSelectorFunction(kTy, {b, m, n}, fallbackSelected); + + if (fun_name.empty()) return absl::OkStatus(); + +#ifdef PRINT_DEBUG + if (AllocatedArgMax3DSizes.find({b, m, n}) == + AllocatedArgMax3DSizes.end()) { + AllocatedArgMax3DSizes[{b, m, n}] = fun_name; + DEBUG("{b: " << b << ", m: " << m << ", n: " << n << "} -> " << fun_name + << (fallbackSelected ? 
" (fallback)" : "")); + } +#endif + + std::vector operands; + for (int i = 0; i < 4; i++) operands.push_back(reduceOpr[i]); + + HloInstruction* kernel_selector_call = + reduce->AddInstruction(HloInstruction::CreateCustomCall( + reduce->shape(), operands, runtime::kCustomCallKernelSelector)); + + // Add metadata + OpMetadata metadata = reduce->metadata(); + metadata.set_op_name(fun_name); + metadata.set_op_type(runtime::kKernelSelectorOperationARGMAX); + kernel_selector_call->set_metadata(metadata); + TF_RETURN_IF_ERROR(ReplaceInstruction(reduce, kernel_selector_call)); + + return absl::OkStatus(); + } +}; // namespace cpu + +absl::StatusOr KernelSelectorOpsRewriter::Run( + HloModule* module, + const absl::flat_hash_set& execution_threads) { + XLA_VLOG_LINES( + 3, "KernelSelectorOpsRewriter::Run(), before:\n" + module->ToString()); + + if (!kernel_map_file) { + LOG(INFO) << "KERNEL_MAP_FILE is not set. The kernel selector will not " + "run.\n Check xla/service/cpu/example_kernel_map.txt for an " + "example of kernel map file"; + return absl::OkStatus(); + } + + // Build the reverse map. + for (const auto& pair : kernelStringToType) { + kernelTypeToString[pair.second] = pair.first; + } + + fill_map_from_file(kernel_map_file, sizesToSymbol); + + KernelSelectorOpsRewriterVisitor visitor; + TF_ASSIGN_OR_RETURN(auto result, + visitor.RunOnModule(module, execution_threads)); + XLA_VLOG_LINES( + 3, "KernelSelectorOpsRewriter::Run(), after:\n" + module->ToString()); + return result; +} + +} // namespace cpu +} // namespace xla diff --git a/third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.h b/third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.h new file mode 100644 index 00000000000000..36714cfdf315b3 --- /dev/null +++ b/third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.h @@ -0,0 +1,42 @@ +/* Copyright 2025 Huawei. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef XLA_SERVICE_CPU_KERNEL_SELECTOR_OPS_REWRITER_H_ +#define XLA_SERVICE_CPU_KERNEL_SELECTOR_OPS_REWRITER_H_ + +#include "xla/hlo/ir/hlo_instructions.h" +#include "xla/hlo/ir/hlo_module.h" +#include "xla/hlo/pass/hlo_pass_interface.h" + +namespace xla { +namespace cpu { + +// This pass rewrites hlo.dot into custom calls. +class KernelSelectorOpsRewriter : public HloModulePass { + public: + absl::string_view name() const override { + return "kernel-selector-ops-rewriter"; + } + + using HloPassInterface::Run; + absl::StatusOr Run( + HloModule* module, + const absl::flat_hash_set& execution_threads) override; +}; + +} // namespace cpu +} // namespace xla + +#endif // XLA_SERVICE_CPU_KERNEL_SELECTOR_OPS_REWRITER_H_ diff --git a/third_party/xla/xla/service/cpu/runtime_symbol_generator.cc b/third_party/xla/xla/service/cpu/runtime_symbol_generator.cc index 64e5970c8f04a4..fd9479f35fff82 100644 --- a/third_party/xla/xla/service/cpu/runtime_symbol_generator.cc +++ b/third_party/xla/xla/service/cpu/runtime_symbol_generator.cc @@ -23,6 +23,7 @@ limitations under the License. #include #include #include +#include #include "absl/functional/any_invocable.h" #include "absl/strings/string_view.h" @@ -57,6 +58,7 @@ limitations under the License. 
#include "xla/service/cpu/runtime_topk.h" #include "xla/service/cpu/windows_compatibility.h" #include "xla/service/cpu/xnnpack_ops.h" +#include "xla/service/cpu/kernel_selector.h" #include "xla/service/custom_call_target_registry.h" #include "tsl/platform/logging.h" @@ -211,6 +213,26 @@ static bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(TracingEnd); REGISTER_CPU_RUNTIME_SYMBOL(HandleFfiCall); REGISTER_CPU_RUNTIME_SYMBOL(XnnPackSoftMaxND); + REGISTER_CPU_RUNTIME_SYMBOL(ArgMax3DParallel); + REGISTER_CPU_RUNTIME_SYMBOL(ArgMax3DSequential); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMMSequential); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMMParallel); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch3DSequential); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch3DParallel); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch4DSequential); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch4DParallel); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMV); +#ifdef ENABLE_BLAS_MLIR + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMMMLIR); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch3DMLIR); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch4DMLIR); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMVMLIR); +#endif // ENABLE_BLAS_MLIR + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMVEmpty); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMMEmpty); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch3DEmpty); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch4DEmpty); + REGISTER_CPU_RUNTIME_SYMBOL(ArgMax3DEmpty); #if defined(INTEL_MKL) REGISTER_CPU_RUNTIME_SYMBOL(OneDnnMatMul); REGISTER_CPU_RUNTIME_SYMBOL(OneDnnSoftmax); diff --git a/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.cc b/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.cc index a3a5f1827d0da8..4687473caf3ac7 100644 --- a/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.cc +++ b/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.cc @@ -26,8 +26,6 @@ limitations under the License. 
namespace xla { namespace cpu { -extern const char* const kCustomCallXnnPackSoftMax = "__xnnpack$softmax"; - namespace { namespace m = match; namespace pu = ::xla::cpu::xnnpack_pattern_utils_internal; @@ -205,7 +203,7 @@ class XnnPackOpsRewriterVisitor : public DfsHloRewriteVisitor { HloInstruction* softmax_call = divide_instr->AddInstruction(HloInstruction::CreateCustomCall( - output_shape, {producer.value()}, kCustomCallXnnPackSoftMax)); + output_shape, {producer.value()}, "__xnnpack$softmax")); TF_RETURN_IF_ERROR(ReplaceInstruction(divide_instr, softmax_call)); return absl::OkStatus(); diff --git a/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.h b/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.h index 2bdc58965c96dc..f1cd18769d1704 100644 --- a/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.h +++ b/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.h @@ -27,8 +27,6 @@ limitations under the License. namespace xla { namespace cpu { -extern const char* const kCustomCallXnnPackSoftMax; - class XnnPackOpsRewriter : public HloModulePass { public: absl::string_view name() const override { return "xnnpack-ops-rewriter"; } diff --git a/third_party/xla/xla/service/libs/BUILD b/third_party/xla/xla/service/libs/BUILD new file mode 100644 index 00000000000000..c9435fb4686cf4 --- /dev/null +++ b/third_party/xla/xla/service/libs/BUILD @@ -0,0 +1,17 @@ +cc_binary( + name = "libblas_mlir.so", + srcs = ["libblas_mlir/src/sgemm.cpp", + "libblas_mlir/src/sgemv.cpp", + "libblas_mlir/src/sbatch_matmul_3d.cpp", + "libblas_mlir/src/sbatch_matmul_4d.cpp", + "libblas_mlir/kernels/sbatch_matmul_3d_nn_mlir.s", + "libblas_mlir/kernels/sbatch_matmul_3d_nt_mlir.s", + "libblas_mlir/kernels/sbatch_matmul_4d_nn_mlir.s", + "libblas_mlir/kernels/sbatch_matmul_4d_nt_mlir.s", + "libblas_mlir/kernels/sgemm_nn_alpha1_beta1_mlir.s", + "libblas_mlir/kernels/sgemv_n_alpha1_beta1_mlir.s"], + linkshared = True, + linkstatic = False, + includes = ["libblas_mlir/include"], + visibility = 
["//visibility:public"], +) diff --git a/third_party/xla/xla/service/libs/libblas_mlir/Makefile b/third_party/xla/xla/service/libs/libblas_mlir/Makefile new file mode 100644 index 00000000000000..941f9062f20211 --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/Makefile @@ -0,0 +1,52 @@ +# List of source files +SRCS := sgemm.cpp sgemv.cpp sbatch_matmul_3d.cpp sbatch_matmul_4d.cpp +KERNELS_DIR := kernels +KERNEL_SRCS := $(wildcard $(KERNELS_DIR)/*.s) + +# Source directory +SRC_DIR := src + +# Output directory +BUILD := build + +# Compiler and flags +CC := gcc +CFLAGS := -S -I include -O3 +ASFLAGS := -c -O3 +LDFLAGS := -shared + +# Full paths +SRC_PATHS := $(SRCS:%=$(SRC_DIR)/%) +ASM := $(SRCS:%.cpp=$(BUILD)/%.s) +OBJS := $(SRCS:%.cpp=$(BUILD)/%.o) +KERNEL_OBJS := $(KERNEL_SRCS:$(KERNELS_DIR)/%.s=$(BUILD)/%.o) + +# All object files +ALL_OBJS := $(OBJS) $(KERNEL_OBJS) + +# Default target +all: $(BUILD) $(ASM) $(ALL_OBJS) $(BUILD)/libblas_mlir.so + +# Create build directory +$(BUILD): + @mkdir -p $(BUILD) + +# Compile each .cpp file to .s in build/ +$(BUILD)/%.s: $(SRC_DIR)/%.cpp + @$(CC) $(CFLAGS) $< -o $@ + +# Assemble .s to .o +$(BUILD)/%.o: $(BUILD)/%.s + @$(CC) $(ASFLAGS) $< -o $@ + +# Assemble kernels .s to .o +$(BUILD)/%.o: $(KERNELS_DIR)/%.s | $(BUILD) + @$(CC) $(ASFLAGS) $< -o $@ + +# Link .o files into lib.so +$(BUILD)/libblas_mlir.so: $(ALL_OBJS) + @$(CC) $(LDFLAGS) -o $@ $^ + +# Clean target +clean: + @rm -rf $(BUILD) \ No newline at end of file diff --git a/third_party/xla/xla/service/libs/libblas_mlir/include/MemrefHelpers.h b/third_party/xla/xla/service/libs/libblas_mlir/include/MemrefHelpers.h new file mode 100644 index 00000000000000..6d4fab5e34f49c --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/include/MemrefHelpers.h @@ -0,0 +1,10 @@ +#ifndef MEMREF_HELPERS_H_ +#define MEMREF_HELPERS_H_ + +#define Memref_1D_Args(NAME, M, S) NAME, NAME, 0, M, S +#define Memref_2D_Args(NAME, M, N, LD) NAME, NAME, 0, M, N, LD, 1 
+#define Memref_3D_Args(NAME, B, M, N, LD) NAME, NAME, 0, B, M, N, M *LD, LD, 1 +#define Memref_4D_Args(NAME, B1, B2, M, N, LD) \ + NAME, NAME, 0, B1, B2, M, N, B2 *M *LD, M *LD, LD, 1 + +#endif \ No newline at end of file diff --git a/third_party/xla/xla/service/libs/libblas_mlir/include/cblas.h b/third_party/xla/xla/service/libs/libblas_mlir/include/cblas.h new file mode 100644 index 00000000000000..4f7c410ec9bb3b --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/include/cblas.h @@ -0,0 +1,11 @@ +typedef int BLASINT; + +typedef enum CBLAS_ORDER { + CblasRowMajor = 101, + CblasColMajor = 102 +} CBLAS_ORDER; + +typedef enum CBLAS_TRANSPOSE { + CblasNoTrans = 111, + CblasTrans = 112, +} CBLAS_TRANSPOSE; diff --git a/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nn_mlir.s b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nn_mlir.s new file mode 100644 index 00000000000000..38d54d0f69c54c --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nn_mlir.s @@ -0,0 +1,4079 @@ + .text + .file "LLVMDialectModule" + .globl sbatch_matmul_3d_nn_mlir // -- Begin function sbatch_matmul_3d_nn_mlir + .p2align 4 + .type sbatch_matmul_3d_nn_mlir,@function +sbatch_matmul_3d_nn_mlir: // @sbatch_matmul_3d_nn_mlir + .cfi_startproc +// %bb.0: + stp d15, d14, [sp, #-160]! 
// 16-byte Folded Spill + stp d13, d12, [sp, #16] // 16-byte Folded Spill + stp x29, x30, [sp, #64] // 16-byte Folded Spill + stp x28, x27, [sp, #80] // 16-byte Folded Spill + stp x26, x25, [sp, #96] // 16-byte Folded Spill + stp x24, x23, [sp, #112] // 16-byte Folded Spill + stp x22, x21, [sp, #128] // 16-byte Folded Spill + stp x20, x19, [sp, #144] // 16-byte Folded Spill + stp d11, d10, [sp, #32] // 16-byte Folded Spill + stp d9, d8, [sp, #48] // 16-byte Folded Spill + sub sp, sp, #1040 + .cfi_def_cfa_offset 1200 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -136 + .cfi_offset b13, -144 + .cfi_offset b14, -152 + .cfi_offset b15, -160 + cmp x4, #0 + ldr x13, [sp, #1248] + ldr x29, [sp, #1336] + lsl x23, x5, #6 + cinv x8, x4, lt + ldr x20, [sp, #1264] + ldr x26, [sp, #1216] + add x0, x23, #64 + add x9, x8, x8, lsr #63 + add x10, x8, #3 + mov x19, x7 + str x6, [sp, #760] // 8-byte Folded Spill + mov x21, x5 + stp x13, x3, [sp, #144] // 16-byte Folded Spill + mov x27, x2 + str x1, [sp, #720] // 8-byte Folded Spill + asr x9, x9, #1 + str x4, [sp, #744] // 8-byte Folded Spill + cinv x28, x9, lt + cmp x8, #0 + ldr x9, [sp, #1256] + csel x8, x10, x8, lt + cmp x4, #0 + ldr x10, [sp, #1328] + asr x8, x8, #2 + cinv x24, x8, lt + cmp x13, #0 + cinv x8, x13, lt + str x9, [sp, #752] // 8-byte Folded Spill + add x9, x8, x8, lsr #63 + str x10, [sp, #736] // 8-byte Folded Spill + add x10, x8, #15 + add x11, x8, #7 + add x12, x8, #3 + asr x9, x9, #1 + cinv x14, x9, lt + ldr x9, [sp, #1296] + cmp x8, #0 + str x14, [sp, #1000] // 8-byte Folded Spill + str x9, [sp, #696] // 8-byte Folded Spill + ldr x9, [sp, #1288] + str x9, [sp, #688] // 8-byte 
Folded Spill + csel x9, x10, x8, lt + csel x10, x11, x8, lt + csel x8, x12, x8, lt + cmp x13, #0 + asr x9, x9, #4 + asr x8, x8, #2 + asr x10, x10, #3 + cinv x11, x9, lt + ldr x9, [sp, #1224] + cinv x25, x8, lt + cinv x10, x10, lt + lsl x8, x25, #2 + str x11, [sp, #1016] // 8-byte Folded Spill + str x10, [sp, #1008] // 8-byte Folded Spill + str x8, [sp, #600] // 8-byte Folded Spill + lsl x8, x14, #1 + str x8, [sp, #648] // 8-byte Folded Spill + str x9, [sp, #712] // 8-byte Folded Spill + lsl x9, x11, #4 + str x9, [sp, #832] // 8-byte Folded Spill + lsl x9, x10, #3 + str x9, [sp, #768] // 8-byte Folded Spill + bl malloc + lsl x8, x24, #2 + negs x9, x21 + add x10, x19, x19, lsl #1 + mov w12, #1 // =0x1 + str x8, [sp, #1024] // 8-byte Folded Spill + lsl x8, x28, #1 + and x9, x9, #0x3 + str x27, [sp, #704] // 8-byte Folded Spill + str x8, [sp, #920] // 8-byte Folded Spill + add x8, x0, #63 + lsl x27, x27, #2 + lsl x5, x21, #2 + and x22, x8, #0xffffffffffffffc0 + and x8, x21, #0x3 + bfi x12, x24, #2, #62 + mul x17, x19, x12 + csneg x6, x8, x9, mi + lsl x8, x10, #2 + mul x18, x28, x19 + add x12, x5, x27 + lsl x15, x6, #2 + str x8, [sp, #1032] // 8-byte Folded Spill + mul x16, x24, x19 + lsl x2, x16, #4 + sub x8, x5, x15 + lsl x3, x17, #2 + stp x5, x8, [sp, #96] // 16-byte Folded Spill + lsl x4, x20, #2 + sub x8, x12, x15 + sub x12, x22, x6, lsl #6 + mov x13, x20 + str x8, [sp, #904] // 8-byte Folded Spill + add x10, x4, x20 + lsl x11, x20, #5 + lsl x20, x19, #2 + add x8, x12, x23 + lsl x9, x13, #4 + sub x28, x11, x4 + str x0, [sp, #16] // 8-byte Folded Spill + str x8, [sp, #552] // 8-byte Folded Spill + add x8, x27, x18, lsl #3 + lsl x10, x10, #2 + str x13, [sp, #728] // 8-byte Folded Spill + str xzr, [sp, #184] // 8-byte Folded Spill + str xzr, [sp, #776] // 8-byte Folded Spill + add x12, x8, x5 + str x8, [sp, #888] // 8-byte Folded Spill + sub x8, x12, x15 + ldr x12, [sp, #104] // 8-byte Folded Reload + str x26, [sp, #680] // 8-byte Folded Spill + str x4, [sp, #824] // 
8-byte Folded Spill + str x8, [sp, #896] // 8-byte Folded Spill + add x8, x3, x27 + add x14, x8, x5 + str x8, [sp, #992] // 8-byte Folded Spill + sub x8, x14, x15 + sub x14, x21, x6 + str x8, [sp, #880] // 8-byte Folded Spill + add x8, x2, x27 + add x5, x8, x5 + str x8, [sp, #912] // 8-byte Folded Spill + sub x8, x5, x15 + add x15, x21, x20 + ldr x5, [sp, #720] // 8-byte Folded Reload + str x8, [sp, #872] // 8-byte Folded Spill + ldr x8, [sp, #1016] // 8-byte Folded Reload + sub x1, x15, x6 + add x15, x21, x16, lsl #2 + lsl x16, x25, #4 + str x9, [sp, #1016] // 8-byte Folded Spill + sub x15, x15, x6 + lsl x15, x15, #2 + lsl x7, x8, #6 + ldr x8, [sp, #1008] // 8-byte Folded Reload + str x15, [sp, #576] // 8-byte Folded Spill + add x15, x21, x17 + sub x15, x15, x6 + lsl x15, x15, #2 + str x15, [sp, #568] // 8-byte Folded Spill + add x15, x21, x18, lsl #1 + lsl x17, x8, #5 + ldr x8, [sp, #1000] // 8-byte Folded Reload + sub x15, x15, x6 + lsl x18, x15, #2 + lsl x15, x8, #3 + ldr x8, [sp, #712] // 8-byte Folded Reload + lsl x8, x8, #2 + add x23, x11, x8 + str x8, [sp, #864] // 8-byte Folded Spill + add x23, x26, x23 + str x23, [sp, #984] // 8-byte Folded Spill + add x23, x9, x8 + add x23, x26, x23 + str x23, [sp, #976] // 8-byte Folded Spill + add x23, x4, x8 + add x23, x26, x23 + str x23, [sp, #968] // 8-byte Folded Spill + lsl x23, x13, #3 + add x24, x23, x8 + add x24, x26, x24 + str x24, [sp, #960] // 8-byte Folded Spill + add x24, x13, x13, lsl #1 + lsl x25, x24, #3 + lsl x30, x24, #2 + add x24, x26, x8 + add x0, x24, x28 + str x0, [sp, #952] // 8-byte Folded Spill + add x0, x24, x25 + str x0, [sp, #944] // 8-byte Folded Spill + add x0, x24, x10 + str x0, [sp, #936] // 8-byte Folded Spill + add x0, x24, x30 + str x0, [sp, #928] // 8-byte Folded Spill + add x0, x12, #4 + ldr x12, [sp, #904] // 8-byte Folded Reload + str x0, [sp, #512] // 8-byte Folded Spill + madd x24, x13, x0, x8 + add x0, x12, #4 + str x0, [sp, #672] // 8-byte Folded Spill + mul x0, x13, x14 + add 
x24, x26, x24 + add x0, x8, x0, lsl #2 + lsl x8, x19, #4 + str x8, [sp, #1008] // 8-byte Folded Spill + add x12, x26, x0 + add x0, x8, x27 + add x0, x0, x5 + add x8, x0, #32 + add x0, x27, x1, lsl #2 + add x1, x26, x4 + str x8, [sp, #816] // 8-byte Folded Spill + add x0, x0, x5 + add x8, x0, #4 + str x8, [sp, #808] // 8-byte Folded Spill + add x8, x5, x3 + add x3, x26, x11 + add x11, x26, x23 + add x23, x24, x7 + str x8, [sp, #624] // 8-byte Folded Spill + add x8, x5, x2 + add x2, x26, x28 + add x0, x3, x7 + str x8, [sp, #616] // 8-byte Folded Spill + ldr x8, [sp, #888] // 8-byte Folded Reload + str x0, [sp, #504] // 8-byte Folded Spill + add x0, x2, x7 + str x0, [sp, #496] // 8-byte Folded Spill + add x13, x8, x5 + add x8, x13, #32 + add x13, x26, x10 + str x8, [sp, #640] // 8-byte Folded Spill + add x8, x18, #4 + add x18, x26, x25 + str x8, [sp, #560] // 8-byte Folded Spill + ldr x8, [sp, #896] // 8-byte Folded Reload + add x0, x18, x7 + str x0, [sp, #488] // 8-byte Folded Spill + add x0, x13, x7 + str x0, [sp, #480] // 8-byte Folded Spill + add x0, x26, x9 + add x9, x0, x7 + add x8, x5, x8 + str x9, [sp, #472] // 8-byte Folded Spill + add x9, x1, x7 + str x8, [sp, #632] // 8-byte Folded Spill + add x8, x26, x30 + str x9, [sp, #464] // 8-byte Folded Spill + add x9, x11, x7 + str x9, [sp, #456] // 8-byte Folded Spill + add x9, x8, x7 + str x9, [sp, #448] // 8-byte Folded Spill + add x9, x12, x7 + str x9, [sp, #440] // 8-byte Folded Spill + ldr x9, [sp, #880] // 8-byte Folded Reload + add x9, x5, x9 + str x9, [sp, #544] // 8-byte Folded Spill + ldr x9, [sp, #872] // 8-byte Folded Reload + add x9, x5, x9 + str x9, [sp, #656] // 8-byte Folded Spill + add x9, x3, x17 + str x9, [sp, #432] // 8-byte Folded Spill + add x9, x2, x17 + str x9, [sp, #424] // 8-byte Folded Spill + add x9, x18, x17 + str x9, [sp, #416] // 8-byte Folded Spill + add x9, x13, x17 + str x9, [sp, #408] // 8-byte Folded Spill + add x9, x0, x17 + str x9, [sp, #400] // 8-byte Folded Spill + add x9, 
x1, x17 + str x9, [sp, #392] // 8-byte Folded Spill + add x9, x11, x17 + str x9, [sp, #384] // 8-byte Folded Spill + add x9, x8, x17 + str x9, [sp, #376] // 8-byte Folded Spill + add x9, x24, x17 + str x9, [sp, #368] // 8-byte Folded Spill + add x9, x12, x17 + lsl x17, x21, #3 + str x9, [sp, #360] // 8-byte Folded Spill + add x9, x3, x16 + str x17, [sp, #72] // 8-byte Folded Spill + str x9, [sp, #352] // 8-byte Folded Spill + add x9, x2, x16 + str x9, [sp, #344] // 8-byte Folded Spill + add x9, x18, x16 + str x9, [sp, #336] // 8-byte Folded Spill + add x9, x13, x16 + str x9, [sp, #328] // 8-byte Folded Spill + add x9, x0, x16 + str x9, [sp, #320] // 8-byte Folded Spill + add x9, x1, x16 + str x9, [sp, #312] // 8-byte Folded Spill + add x9, x11, x16 + add x11, x11, x15 + str x9, [sp, #304] // 8-byte Folded Spill + add x9, x8, x16 + add x8, x8, x15 + str x9, [sp, #296] // 8-byte Folded Spill + add x9, x24, x16 + str x8, [sp, #216] // 8-byte Folded Spill + add x8, x24, x15 + str x9, [sp, #288] // 8-byte Folded Spill + add x9, x12, x16 + lsl x16, x21, #4 + str x8, [sp, #208] // 8-byte Folded Spill + str x9, [sp, #280] // 8-byte Folded Spill + lsl x9, x21, #5 + sub x7, x16, x6, lsl #4 + sub x10, x9, x6, lsl #5 + sub x6, x17, x6, lsl #3 + mov x17, x12 + add x12, x18, x15 + stp x16, x9, [sp, #80] // 16-byte Folded Spill + lsl x9, x19, #3 + add x8, x17, x15 + str x12, [sp, #256] // 8-byte Folded Spill + add x12, x13, x15 + ldr x13, [sp, #1032] // 8-byte Folded Reload + add x16, x5, x9 + add x9, x9, x27 + str x14, [sp, #1032] // 8-byte Folded Spill + str x8, [sp, #200] // 8-byte Folded Spill + ldr x8, [sp, #992] // 8-byte Folded Reload + str x12, [sp, #248] // 8-byte Folded Spill + str x16, [sp, #800] // 8-byte Folded Spill + add x16, x3, x15 + add x9, x5, x9 + ldr x3, [sp, #776] // 8-byte Folded Reload + stp x6, x10, [sp, #56] // 16-byte Folded Spill + str x16, [sp, #272] // 8-byte Folded Spill + add x16, x2, x15 + str x9, [sp, #592] // 8-byte Folded Spill + add x9, x20, 
x27 + mov x2, x23 + sub x23, x14, #4 + str x16, [sp, #264] // 8-byte Folded Spill + ldr x16, [sp, #184] // 8-byte Folded Reload + add x9, x5, x9 + add x12, x5, x13 + str x9, [sp, #584] // 8-byte Folded Spill + mov x9, x24 + add x8, x5, x8 + str x12, [sp, #792] // 8-byte Folded Spill + add x12, x5, x27 + str x8, [sp, #536] // 8-byte Folded Spill + ldr x8, [sp, #912] // 8-byte Folded Reload + add x13, x12, x13 + str x13, [sp, #608] // 8-byte Folded Spill + add x13, x0, x15 + str x13, [sp, #240] // 8-byte Folded Spill + add x13, x1, x15 + stp x11, x13, [sp, #224] // 16-byte Folded Spill + add x8, x5, x8 + str x8, [sp, #528] // 8-byte Folded Spill + sub x8, x14, #3 + str x8, [sp, #912] // 8-byte Folded Spill + sub x8, x14, #2 + str x8, [sp, #904] // 8-byte Folded Spill + sub x8, x14, #1 + str x8, [sp, #896] // 8-byte Folded Spill + ldr x8, [sp, #752] // 8-byte Folded Reload + lsl x11, x8, #2 + ldr x8, [sp, #760] // 8-byte Folded Reload + lsl x8, x8, #2 + stp x8, x11, [sp, #128] // 16-byte Folded Spill + add x8, x5, x20 + add x11, x10, #32 + str x8, [sp, #784] // 8-byte Folded Spill + add x8, x22, #128 + str x8, [sp, #664] // 8-byte Folded Spill + add x8, x22, #256 + str x8, [sp, #1000] // 8-byte Folded Spill + ldr x8, [sp, #552] // 8-byte Folded Reload + add x8, x8, #64 + str x8, [sp, #992] // 8-byte Folded Spill + add x8, x7, #16 + stp x8, x11, [sp, #40] // 16-byte Folded Spill + add x8, x6, #8 + stp x7, x8, [sp, #24] // 16-byte Folded Spill + b .LBB0_4 + .p2align 2 +.LBB0_1: // in Loop: Header=BB0_4 Depth=1 + str s0, [x24, x9, lsl #2] +.LBB0_2: // in Loop: Header=BB0_4 Depth=1 + ldr x0, [sp, #120] // 8-byte Folded Reload + bl free +.LBB0_3: // %.backedge53 + // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #800] // 8-byte Folded Reload + ldp x11, x10, [sp, #128] // 16-byte Folded Reload + add x8, x8, x11 + ldr x5, [sp, #872] // 8-byte Folded Reload + ldp x9, x16, [sp, #176] // 16-byte Folded Reload + ldp x3, x17, [sp, #160] // 16-byte Folded Reload + ldr x12, [sp, 
#880] // 8-byte Folded Reload + ldr x2, [sp, #192] // 8-byte Folded Reload + add x5, x5, x11 + add x16, x16, x10 + add x9, x9, x10 + add x17, x17, x10 + add x12, x12, x11 + add x2, x2, x10 + str x8, [sp, #800] // 8-byte Folded Spill + ldr x8, [sp, #784] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #784] // 8-byte Folded Spill + ldr x8, [sp, #792] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #792] // 8-byte Folded Spill + ldr x8, [sp, #816] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #816] // 8-byte Folded Spill + ldr x8, [sp, #808] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #808] // 8-byte Folded Spill + ldr x8, [sp, #624] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #624] // 8-byte Folded Spill + ldr x8, [sp, #616] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #616] // 8-byte Folded Spill + ldr x8, [sp, #640] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #640] // 8-byte Folded Spill + ldr x8, [sp, #632] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #632] // 8-byte Folded Spill + ldr x8, [sp, #504] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #504] // 8-byte Folded Spill + ldr x8, [sp, #496] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #496] // 8-byte Folded Spill + ldr x8, [sp, #488] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #488] // 8-byte Folded Spill + ldr x8, [sp, #480] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #480] // 8-byte Folded Spill + ldr x8, [sp, #472] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #472] // 8-byte Folded Spill + ldr x8, [sp, #464] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #464] // 8-byte Folded Spill + ldr x8, [sp, #456] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #456] // 8-byte Folded Spill + ldr x8, [sp, #448] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #448] // 8-byte Folded Spill + ldr x8, [sp, #440] // 8-byte Folded Reload + add x8, 
x8, x10 + str x8, [sp, #440] // 8-byte Folded Spill + ldr x8, [sp, #544] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #544] // 8-byte Folded Spill + ldr x8, [sp, #656] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #656] // 8-byte Folded Spill + ldr x8, [sp, #432] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #432] // 8-byte Folded Spill + ldr x8, [sp, #424] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #424] // 8-byte Folded Spill + ldr x8, [sp, #416] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #416] // 8-byte Folded Spill + ldr x8, [sp, #408] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #408] // 8-byte Folded Spill + ldr x8, [sp, #400] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #400] // 8-byte Folded Spill + ldr x8, [sp, #392] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #392] // 8-byte Folded Spill + ldr x8, [sp, #384] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #384] // 8-byte Folded Spill + ldr x8, [sp, #376] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #376] // 8-byte Folded Spill + ldr x8, [sp, #368] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #368] // 8-byte Folded Spill + ldr x8, [sp, #360] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #360] // 8-byte Folded Spill + ldr x8, [sp, #352] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #352] // 8-byte Folded Spill + ldr x8, [sp, #344] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #344] // 8-byte Folded Spill + ldr x8, [sp, #336] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #336] // 8-byte Folded Spill + ldr x8, [sp, #328] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #328] // 8-byte Folded Spill + ldr x8, [sp, #320] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #320] // 8-byte Folded Spill + ldr x8, [sp, #312] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #312] // 8-byte Folded Spill + ldr x8, [sp, #304] 
// 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #304] // 8-byte Folded Spill + ldr x8, [sp, #296] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #296] // 8-byte Folded Spill + ldr x8, [sp, #288] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #288] // 8-byte Folded Spill + ldr x8, [sp, #280] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #280] // 8-byte Folded Spill + ldr x8, [sp, #592] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #592] // 8-byte Folded Spill + ldr x8, [sp, #584] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #584] // 8-byte Folded Spill + ldr x8, [sp, #272] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #272] // 8-byte Folded Spill + ldr x8, [sp, #264] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #264] // 8-byte Folded Spill + ldr x8, [sp, #256] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #256] // 8-byte Folded Spill + ldr x8, [sp, #248] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #248] // 8-byte Folded Spill + ldr x8, [sp, #608] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #608] // 8-byte Folded Spill + ldr x8, [sp, #240] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #240] // 8-byte Folded Spill + ldr x8, [sp, #232] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #232] // 8-byte Folded Spill + ldr x8, [sp, #224] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #224] // 8-byte Folded Spill + ldr x8, [sp, #216] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #216] // 8-byte Folded Spill + ldr x8, [sp, #208] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #208] // 8-byte Folded Spill + ldr x8, [sp, #200] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #200] // 8-byte Folded Spill + ldr x8, [sp, #536] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #536] // 8-byte Folded Spill + ldr x8, [sp, #528] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #528] // 8-byte 
Folded Spill +.LBB0_4: // =>This Loop Header: Depth=1 + // Child Loop BB0_8 Depth 2 + // Child Loop BB0_10 Depth 3 + // Child Loop BB0_12 Depth 3 + // Child Loop BB0_15 Depth 3 + // Child Loop BB0_17 Depth 4 + // Child Loop BB0_19 Depth 4 + // Child Loop BB0_22 Depth 3 + // Child Loop BB0_24 Depth 3 + // Child Loop BB0_28 Depth 3 + // Child Loop BB0_30 Depth 3 + // Child Loop BB0_36 Depth 2 + // Child Loop BB0_39 Depth 2 + // Child Loop BB0_42 Depth 2 + // Child Loop BB0_44 Depth 3 + // Child Loop BB0_46 Depth 3 + // Child Loop BB0_49 Depth 2 + // Child Loop BB0_51 Depth 2 + // Child Loop BB0_55 Depth 2 + // Child Loop BB0_57 Depth 2 + // Child Loop BB0_61 Depth 2 + // Child Loop BB0_64 Depth 2 + // Child Loop BB0_67 Depth 2 + // Child Loop BB0_69 Depth 3 + // Child Loop BB0_71 Depth 3 + // Child Loop BB0_74 Depth 2 + // Child Loop BB0_76 Depth 2 + // Child Loop BB0_80 Depth 2 + // Child Loop BB0_82 Depth 2 + // Child Loop BB0_86 Depth 2 + // Child Loop BB0_89 Depth 2 + // Child Loop BB0_92 Depth 2 + // Child Loop BB0_94 Depth 3 + // Child Loop BB0_96 Depth 3 + // Child Loop BB0_99 Depth 2 + // Child Loop BB0_101 Depth 2 + // Child Loop BB0_105 Depth 2 + // Child Loop BB0_107 Depth 2 + // Child Loop BB0_111 Depth 2 + // Child Loop BB0_114 Depth 2 + // Child Loop BB0_117 Depth 2 + // Child Loop BB0_119 Depth 3 + // Child Loop BB0_121 Depth 3 + // Child Loop BB0_124 Depth 2 + // Child Loop BB0_126 Depth 2 + // Child Loop BB0_130 Depth 2 + // Child Loop BB0_132 Depth 2 + ldr x8, [sp, #152] // 8-byte Folded Reload + cmp x3, x8 + b.ge .LBB0_133 +// %bb.5: // in Loop: Header=BB0_4 Depth=1 + stp x16, x2, [sp, #184] // 16-byte Folded Spill + add x8, x3, #1 + ldr x2, [sp, #832] // 8-byte Folded Reload + mov x4, x16 + str x3, [sp, #776] // 8-byte Folded Spill + ldr x3, [sp, #768] // 8-byte Folded Reload + str x12, [sp, #880] // 8-byte Folded Spill + mov x12, xzr + stp x8, x17, [sp, #160] // 16-byte Folded Spill + mov x8, x17 + str x9, [sp, #176] // 8-byte Folded Spill + str 
x5, [sp, #872] // 8-byte Folded Spill + b .LBB0_8 + .p2align 2 +.LBB0_6: // in Loop: Header=BB0_8 Depth=2 + stp q3, q2, [x11] + stp q1, q0, [x11, #32] +.LBB0_7: // %.backedge + // in Loop: Header=BB0_8 Depth=2 + ldr x12, [sp, #856] // 8-byte Folded Reload + add x4, x4, #64 + add x9, x9, #64 + add x8, x8, #64 +.LBB0_8: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_10 Depth 3 + // Child Loop BB0_12 Depth 3 + // Child Loop BB0_15 Depth 3 + // Child Loop BB0_17 Depth 4 + // Child Loop BB0_19 Depth 4 + // Child Loop BB0_22 Depth 3 + // Child Loop BB0_24 Depth 3 + // Child Loop BB0_28 Depth 3 + // Child Loop BB0_30 Depth 3 + cmp x12, x2 + b.ge .LBB0_31 +// %bb.9: // in Loop: Header=BB0_8 Depth=2 + add x10, x12, #16 + ldr x11, [sp, #688] // 8-byte Folded Reload + ldr x2, [sp, #776] // 8-byte Folded Reload + mov x3, x4 + str x10, [sp, #856] // 8-byte Folded Spill + ldr x10, [sp, #696] // 8-byte Folded Reload + mov x17, xzr + add x13, x11, x10, lsl #2 + ldr x10, [sp, #736] // 8-byte Folded Reload + ldr x11, [sp, #752] // 8-byte Folded Reload + ldr x1, [sp, #680] // 8-byte Folded Reload + str x4, [sp, #888] // 8-byte Folded Spill + mov x4, x5 + ldr x5, [sp, #784] // 8-byte Folded Reload + ldr x6, [sp, #800] // 8-byte Folded Reload + mul x14, x2, x10 + lsl x10, x29, #1 + mul x16, x2, x11 + add x11, x10, x29 + add x15, x14, x12 + str x16, [sp, #848] // 8-byte Folded Spill + add x16, x16, x12 + add x0, x15, x29 + add x10, x15, x10 + add x18, x13, x15, lsl #2 + add x11, x15, x11 + add x15, x13, x0, lsl #2 + add x10, x13, x10, lsl #2 + ldr x0, [sp, #712] // 8-byte Folded Reload + add x11, x13, x11, lsl #2 + ldp q6, q4, [x18, #32] + ldp q1, q0, [x18] + ldr x18, [sp, #664] // 8-byte Folded Reload + add x0, x1, x0, lsl #2 + ldp q19, q17, [x10, #32] + ldp q22, q20, [x10] + ldr x10, [sp, #760] // 8-byte Folded Reload + ldp q3, q2, [x15, #32] + str x0, [sp, #840] // 8-byte Folded Spill + ldp q7, q5, [x15] + ldp q18, q16, [x11, #32] + ldp q23, q21, 
[x11] + ldr x11, [sp, #704] // 8-byte Folded Reload + ldr x15, [sp, #720] // 8-byte Folded Reload + add x0, x0, x16, lsl #2 + mul x16, x2, x10 + add x15, x15, x11, lsl #2 + ldr x2, [sp, #792] // 8-byte Folded Reload + ldp q29, q28, [x0, #32] + ldp q30, q31, [x0] + lsl x10, x16, #2 + ldr q26, [x15, x10] + add x10, x16, x19 + lsl x10, x10, #2 + ldr q25, [x15, x10] + add x10, x16, x19, lsl #1 + lsl x10, x10, #2 + ldr q24, [x15, x10] + add x10, x2, x27 + cmp xzr, x23 + prfm pldl1keep, [x10, #16] + ldr q27, [x10] + b.ge .LBB0_11 + .p2align 2 +.LBB0_10: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x1, [sp, #936] // 8-byte Folded Reload + ldr x24, [sp, #968] // 8-byte Folded Reload + fmla v1.4s, v30.4s, v26.s[0] + fmla v0.4s, v31.4s, v26.s[0] + ldr x0, [sp, #984] // 8-byte Folded Reload + fmla v6.4s, v29.4s, v26.s[0] + fmla v4.4s, v28.4s, v26.s[0] + add x11, x6, x27 + fmla v7.4s, v30.4s, v25.s[0] + fmla v5.4s, v31.4s, v25.s[0] + stp q30, q31, [x18, #-128] + fmla v3.4s, v29.4s, v25.s[0] + fmla v2.4s, v28.4s, v25.s[0] + stp q29, q28, [x18, #-96] + add x1, x1, x3 + add x24, x24, x3 + add x26, x0, x3 + ldr x0, [sp, #952] // 8-byte Folded Reload + fmla v17.4s, v28.4s, v24.s[0] + fmla v19.4s, v29.4s, v24.s[0] + fmla v22.4s, v30.4s, v24.s[0] + fmla v20.4s, v31.4s, v24.s[0] + fmla v16.4s, v28.4s, v27.s[0] + fmla v18.4s, v29.4s, v27.s[0] + prfm pldl1keep, [x1] + ldr x1, [sp, #944] // 8-byte Folded Reload + fmla v21.4s, v31.4s, v27.s[0] + fmla v23.4s, v30.4s, v27.s[0] + ldp q28, q29, [x24, #32] + ldp q30, q31, [x24] + ldr x24, [sp, #960] // 8-byte Folded Reload + add x0, x0, x3 + add x1, x1, x3 + add x24, x24, x3 + fmla v4.4s, v29.4s, v26.s[1] + fmla v0.4s, v31.4s, v26.s[1] + fmla v6.4s, v28.4s, v26.s[1] + fmla v1.4s, v30.4s, v26.s[1] + fmla v2.4s, v29.4s, v25.s[1] + fmla v3.4s, v28.4s, v25.s[1] + fmla v5.4s, v31.4s, v25.s[1] + stp q30, q31, [x18, #-64] + fmla v7.4s, v30.4s, v25.s[1] + stp q28, q29, [x18, #-32] + fmla 
v20.4s, v31.4s, v24.s[1] + fmla v22.4s, v30.4s, v24.s[1] + fmla v19.4s, v28.4s, v24.s[1] + fmla v23.4s, v30.4s, v27.s[1] + prfm pldl1keep, [x1] + ldr x1, [sp, #928] // 8-byte Folded Reload + fmla v21.4s, v31.4s, v27.s[1] + ldp q31, q30, [x24, #32] + fmla v17.4s, v29.4s, v24.s[1] + fmla v18.4s, v28.4s, v27.s[1] + fmla v16.4s, v29.4s, v27.s[1] + ldp q29, q28, [x24] + add x1, x1, x3 + fmla v4.4s, v30.4s, v26.s[2] + fmla v2.4s, v30.4s, v25.s[2] + add x10, x5, x27 + fmla v17.4s, v30.4s, v24.s[2] + stp q29, q28, [x18] + fmla v16.4s, v30.4s, v27.s[2] + stp q31, q30, [x18, #32] + prfm pldl1keep, [x0] + ldr x0, [sp, #976] // 8-byte Folded Reload + ldp q30, q9, [x1] + ldp q10, q8, [x1, #32] + fmla v6.4s, v31.4s, v26.s[2] + fmla v3.4s, v31.4s, v25.s[2] + fmla v19.4s, v31.4s, v24.s[2] + fmla v1.4s, v29.4s, v26.s[2] + fmla v0.4s, v28.4s, v26.s[2] + fmla v7.4s, v29.4s, v25.s[2] + fmla v5.4s, v28.4s, v25.s[2] + fmla v22.4s, v29.4s, v24.s[2] + fmla v20.4s, v28.4s, v24.s[2] + fmla v23.4s, v29.4s, v27.s[2] + add x28, x4, x27 + add x7, x11, #32 + add x30, x10, #32 + add x25, x28, #32 + add x0, x0, x3 + fmla v18.4s, v31.4s, v27.s[2] + fmla v21.4s, v28.4s, v27.s[2] + stp q30, q9, [x18, #64] + fmla v4.4s, v8.4s, v26.s[3] + stp q10, q8, [x18, #96] + prfm pldl1keep, [x26] + fmla v6.4s, v10.4s, v26.s[3] + fmla v0.4s, v9.4s, v26.s[3] + fmla v1.4s, v30.4s, v26.s[3] + fmla v2.4s, v8.4s, v25.s[3] + fmla v3.4s, v10.4s, v25.s[3] + fmla v5.4s, v9.4s, v25.s[3] + fmla v7.4s, v30.4s, v25.s[3] + ldp q29, q28, [x0, #32] + fmla v20.4s, v9.4s, v24.s[3] + fmla v22.4s, v30.4s, v24.s[3] + add x17, x17, #4 + add x6, x6, #16 + fmla v19.4s, v10.4s, v24.s[3] + fmla v17.4s, v8.4s, v24.s[3] + add x5, x5, #16 + add x4, x4, #16 + fmla v23.4s, v30.4s, v27.s[3] + ldp q30, q31, [x0] + prfm pldl1keep, [x25] + ldr q26, [x28, #16] + prfm pldl1keep, [x30] + ldr q25, [x10, #16] + prfm pldl1keep, [x7] + ldr q24, [x11, #16] + ldr x10, [sp, #1016] // 8-byte Folded Reload + fmla v21.4s, v9.4s, v27.s[3] + fmla v18.4s, v10.4s, 
v27.s[3] + fmla v16.4s, v8.4s, v27.s[3] + add x3, x3, x10 + add x2, x2, #16 + add x18, x18, #256 + add x10, x2, x27 + cmp x17, x23 + prfm pldl1keep, [x10, #16] + ldr q27, [x10] + b.lt .LBB0_10 +.LBB0_11: // in Loop: Header=BB0_8 Depth=2 + ldr x0, [sp, #728] // 8-byte Folded Reload + ldr x17, [sp, #912] // 8-byte Folded Reload + add x11, x22, x23, lsl #6 + fmla v1.4s, v30.4s, v26.s[0] + ldr x3, [sp, #848] // 8-byte Folded Reload + ldr x4, [sp, #840] // 8-byte Folded Reload + fmla v0.4s, v31.4s, v26.s[0] + fmla v6.4s, v29.4s, v26.s[0] + ldr x1, [sp, #904] // 8-byte Folded Reload + stp q30, q31, [x11] + fmla v4.4s, v28.4s, v26.s[0] + stp q29, q28, [x11, #32] + fmla v2.4s, v28.4s, v25.s[0] + fmla v7.4s, v30.4s, v25.s[0] + madd x10, x17, x0, x3 + madd x18, x1, x0, x3 + fmla v5.4s, v31.4s, v25.s[0] + fmla v3.4s, v29.4s, v25.s[0] + fmla v17.4s, v28.4s, v24.s[0] + fmla v19.4s, v29.4s, v24.s[0] + fmla v20.4s, v31.4s, v24.s[0] + fmla v22.4s, v30.4s, v24.s[0] + fmla v16.4s, v28.4s, v27.s[0] + fmla v18.4s, v29.4s, v27.s[0] + ldr x11, [sp, #896] // 8-byte Folded Reload + add x17, x22, x17, lsl #6 + fmla v21.4s, v31.4s, v27.s[0] + fmla v23.4s, v30.4s, v27.s[0] + ldr x5, [sp, #824] // 8-byte Folded Reload + ldr x6, [sp, #872] // 8-byte Folded Reload + add x10, x10, x12 + mov x2, xzr + add x10, x4, x10, lsl #2 + ldp q28, q29, [x10] + ldp q30, q31, [x10, #32] + add x10, x18, x12 + add x18, x22, x1, lsl #6 + add x10, x4, x10, lsl #2 + fmla v4.4s, v31.4s, v26.s[1] + fmla v0.4s, v29.4s, v26.s[1] + fmla v5.4s, v29.4s, v25.s[1] + fmla v2.4s, v31.4s, v25.s[1] + fmla v20.4s, v29.4s, v24.s[1] + fmla v17.4s, v31.4s, v24.s[1] + fmla v21.4s, v29.4s, v27.s[1] + fmla v16.4s, v31.4s, v27.s[1] + fmla v6.4s, v30.4s, v26.s[1] + stp q28, q29, [x17] + stp q30, q31, [x17, #32] + fmla v1.4s, v28.4s, v26.s[1] + fmla v3.4s, v30.4s, v25.s[1] + fmla v7.4s, v28.4s, v25.s[1] + fmla v22.4s, v28.4s, v24.s[1] + fmla v19.4s, v30.4s, v24.s[1] + fmla v23.4s, v28.4s, v27.s[1] + fmla v18.4s, v30.4s, v27.s[1] + ldp 
q29, q28, [x10, #32] + ldp q31, q30, [x10] + madd x10, x11, x0, x3 + add x0, x22, x11, lsl #6 + ldr x11, [sp, #1032] // 8-byte Folded Reload + add x10, x10, x12 + fmla v0.4s, v30.4s, v26.s[2] + fmla v4.4s, v28.4s, v26.s[2] + fmla v2.4s, v28.4s, v25.s[2] + fmla v5.4s, v30.4s, v25.s[2] + fmla v17.4s, v28.4s, v24.s[2] + fmla v20.4s, v30.4s, v24.s[2] + fmla v16.4s, v28.4s, v27.s[2] + fmla v21.4s, v30.4s, v27.s[2] + add x10, x4, x10, lsl #2 + stp q31, q30, [x18] + fmla v1.4s, v31.4s, v26.s[2] + stp q29, q28, [x18, #32] + fmla v6.4s, v29.4s, v26.s[2] + fmla v7.4s, v31.4s, v25.s[2] + fmla v3.4s, v29.4s, v25.s[2] + fmla v19.4s, v29.4s, v24.s[2] + fmla v22.4s, v31.4s, v24.s[2] + fmla v18.4s, v29.4s, v27.s[2] + fmla v23.4s, v31.4s, v27.s[2] + ldp q28, q29, [x10] + fmla v0.4s, v29.4s, v26.s[3] + ldp q30, q31, [x10, #32] + fmla v4.4s, v31.4s, v26.s[3] + fmla v5.4s, v29.4s, v25.s[3] + fmla v2.4s, v31.4s, v25.s[3] + fmla v20.4s, v29.4s, v24.s[3] + fmla v17.4s, v31.4s, v24.s[3] + fmla v21.4s, v29.4s, v27.s[3] + ldr x10, [sp, #672] // 8-byte Folded Reload + fmla v16.4s, v31.4s, v27.s[3] + fmla v6.4s, v30.4s, v26.s[3] + fmla v1.4s, v28.4s, v26.s[3] + fmla v3.4s, v30.4s, v25.s[3] + fmla v7.4s, v28.4s, v25.s[3] + stp q28, q29, [x0] + stp q30, q31, [x0, #32] + fmla v22.4s, v28.4s, v24.s[3] + fmla v19.4s, v30.4s, v24.s[3] + fmla v23.4s, v28.4s, v27.s[3] + fmla v18.4s, v30.4s, v27.s[3] + cmp x11, x21 + b.ge .LBB0_13 + .p2align 2 +.LBB0_12: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + add x1, x6, x10 + add x10, x10, #4 + add x3, x1, x20 + prfm pldl1keep, [x1] + ldur s24, [x1, #-4] + add x1, x9, x2 + add x4, x3, x20 + prfm pldl1keep, [x3] + ldur s25, [x3, #-4] + add x3, x8, x2 + add x2, x2, x5 + prfm pldl1keep, [x4] + ldur s26, [x4, #-4] + add x4, x4, x20 + prfm pldl1keep, [x4] + ldur s27, [x4, #-4] + prfm pldl1keep, [x1] + ldp q28, q29, [x3, #32] + add x1, x22, x11, lsl #6 + ldp q30, q31, [x3] + add x11, x11, #1 + fmla v4.4s, 
v29.4s, v24.s[0] + fmla v0.4s, v31.4s, v24.s[0] + fmla v5.4s, v31.4s, v25.s[0] + fmla v2.4s, v29.4s, v25.s[0] + fmla v20.4s, v31.4s, v26.s[0] + fmla v17.4s, v29.4s, v26.s[0] + fmla v6.4s, v28.4s, v24.s[0] + fmla v1.4s, v30.4s, v24.s[0] + fmla v3.4s, v28.4s, v25.s[0] + fmla v7.4s, v30.4s, v25.s[0] + fmla v22.4s, v30.4s, v26.s[0] + fmla v19.4s, v28.4s, v26.s[0] + fmla v23.4s, v30.4s, v27.s[0] + fmla v21.4s, v31.4s, v27.s[0] + fmla v18.4s, v28.4s, v27.s[0] + stp q30, q31, [x1] + stp q28, q29, [x1, #32] + fmla v16.4s, v29.4s, v27.s[0] + cmp x11, x21 + b.lt .LBB0_12 +.LBB0_13: // %.preheader + // in Loop: Header=BB0_8 Depth=2 + ldr x1, [sp, #808] // 8-byte Folded Reload + ldr x11, [sp, #816] // 8-byte Folded Reload + mov x10, xzr + mov w5, #1 // =0x1 + mov w6, #2 // =0x2 + mov w4, #3 // =0x3 + mov w3, #4 // =0x4 + b .LBB0_15 + .p2align 2 +.LBB0_14: // %.loopexit + // in Loop: Header=BB0_15 Depth=3 + ldr x10, [sp, #1008] // 8-byte Folded Reload + add x11, x11, x10 + add x1, x1, x10 + mov x10, x3 + mov x3, x7 +.LBB0_15: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_17 Depth 4 + // Child Loop BB0_19 Depth 4 + madd x10, x10, x29, x14 + add x10, x10, x12 + madd x2, x5, x29, x14 + madd x5, x6, x29, x14 + add x2, x2, x12 + add x5, x5, x12 + add x10, x13, x10, lsl #2 + stp q1, q0, [x10] + stp q6, q4, [x10, #32] + add x10, x13, x2, lsl #2 + add x2, x13, x5, lsl #2 + stp q7, q5, [x10] + stp q3, q2, [x10, #32] + madd x10, x4, x29, x14 + add x10, x10, x12 + stp q22, q20, [x2] + stp q19, q17, [x2, #32] + ldr x2, [sp, #1024] // 8-byte Folded Reload + cmp x3, x2 + add x10, x13, x10, lsl #2 + stp q23, q21, [x10] + stp q18, q16, [x10, #32] + b.ge .LBB0_20 +// %bb.16: // in Loop: Header=BB0_15 Depth=3 + madd x10, x3, x29, x14 + add x4, x3, #3 + add x5, x3, #1 + add x6, x3, #2 + madd x2, x5, x29, x14 + ldp q28, q29, [x22, #32] + mov x30, xzr + madd x24, x6, x29, x14 + ldp q30, q31, [x22] + add x7, x3, #4 + add x10, x10, 
x12 + add x10, x13, x10, lsl #2 + add x2, x2, x12 + add x2, x13, x2, lsl #2 + ldp q6, q4, [x10, #32] + ldp q1, q0, [x10] + madd x10, x4, x29, x14 + ldp q3, q2, [x2, #32] + add x10, x10, x12 + ldp q7, q5, [x2] + add x2, x24, x12 + add x2, x13, x2, lsl #2 + ldp q19, q17, [x2, #32] + ldp q22, q20, [x2] + mov x2, x11 + add x10, x13, x10, lsl #2 + ldp q18, q16, [x10, #32] + ldp q23, q21, [x10] + madd x10, x3, x19, x16 + lsl x10, x10, #2 + ldr q27, [x15, x10] + madd x10, x5, x19, x16 + lsl x10, x10, #2 + ldr q26, [x15, x10] + madd x10, x6, x19, x16 + lsl x10, x10, #2 + ldr q25, [x15, x10] + madd x10, x4, x19, x16 + lsl x10, x10, #2 + ldr q24, [x15, x10] + ldr x10, [sp, #1000] // 8-byte Folded Reload + fmla v4.4s, v29.4s, v27.s[0] + cmp xzr, x23 + b.ge .LBB0_18 + .p2align 2 +.LBB0_17: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // Parent Loop BB0_15 Depth=3 + // => This Inner Loop Header: Depth=4 + add x28, x10, #64 + fmla v6.4s, v28.4s, v27.s[0] + fmla v1.4s, v30.4s, v27.s[0] + add x24, x10, #128 + prfm pldl1keep, [x28] + ldp q9, q8, [x10, #-160] + fmla v0.4s, v31.4s, v27.s[0] + ldp q12, q15, [x10, #-192] + fmla v2.4s, v29.4s, v26.s[0] + fmla v3.4s, v28.4s, v26.s[0] + fmla v5.4s, v31.4s, v26.s[0] + fmla v7.4s, v30.4s, v26.s[0] + fmla v17.4s, v29.4s, v25.s[0] + prfm pldl1keep, [x24] + fmla v19.4s, v28.4s, v25.s[0] + fmla v20.4s, v31.4s, v25.s[0] + ldp q11, q10, [x10, #-128] + fmla v22.4s, v30.4s, v25.s[0] + fmla v16.4s, v29.4s, v24.s[0] + ldp q13, q14, [x10, #-96] + fmla v18.4s, v28.4s, v24.s[0] + fmla v21.4s, v31.4s, v24.s[0] + add x26, x10, #192 + prfm pldl1keep, [x26] + fmla v23.4s, v30.4s, v24.s[0] + fmla v0.4s, v15.4s, v27.s[1] + add x25, x10, #256 + add x30, x30, #4 + fmla v1.4s, v12.4s, v27.s[1] + fmla v6.4s, v9.4s, v27.s[1] + fmla v4.4s, v8.4s, v27.s[1] + fmla v7.4s, v12.4s, v26.s[1] + fmla v5.4s, v15.4s, v26.s[1] + fmla v3.4s, v9.4s, v26.s[1] + fmla v2.4s, v8.4s, v26.s[1] + fmla v22.4s, v12.4s, v25.s[1] + fmla v20.4s, v15.4s, v25.s[1] + fmla 
v19.4s, v9.4s, v25.s[1] + fmla v17.4s, v8.4s, v25.s[1] + fmla v23.4s, v12.4s, v24.s[1] + fmla v21.4s, v15.4s, v24.s[1] + ldp q15, q12, [x10, #-64] + fmla v18.4s, v9.4s, v24.s[1] + fmla v16.4s, v8.4s, v24.s[1] + ldp q9, q8, [x10, #-32] + prfm pldl1keep, [x25] + ldp q28, q29, [x10, #32] + ldp q30, q31, [x10] + add x10, x2, x20 + prfm pldl1keep, [x2] + fmla v4.4s, v14.4s, v27.s[2] + fmla v6.4s, v13.4s, v27.s[2] + fmla v1.4s, v11.4s, v27.s[2] + fmla v0.4s, v10.4s, v27.s[2] + fmla v2.4s, v14.4s, v26.s[2] + fmla v3.4s, v13.4s, v26.s[2] + fmla v5.4s, v10.4s, v26.s[2] + fmla v7.4s, v11.4s, v26.s[2] + fmla v17.4s, v14.4s, v25.s[2] + fmla v19.4s, v13.4s, v25.s[2] + fmla v20.4s, v10.4s, v25.s[2] + fmla v22.4s, v11.4s, v25.s[2] + fmla v16.4s, v14.4s, v24.s[2] + fmla v18.4s, v13.4s, v24.s[2] + fmla v21.4s, v10.4s, v24.s[2] + fmla v23.4s, v11.4s, v24.s[2] + fmla v0.4s, v12.4s, v27.s[3] + fmla v1.4s, v15.4s, v27.s[3] + fmla v6.4s, v9.4s, v27.s[3] + fmla v4.4s, v8.4s, v27.s[3] + ldur q27, [x2, #-16] + prfm pldl1keep, [x10] + add x2, x2, #16 + fmla v7.4s, v15.4s, v26.s[3] + fmla v5.4s, v12.4s, v26.s[3] + fmla v3.4s, v9.4s, v26.s[3] + fmla v2.4s, v8.4s, v26.s[3] + ldur q26, [x10, #-16] + add x10, x10, x20 + add x24, x10, x20 + prfm pldl1keep, [x10] + fmla v22.4s, v15.4s, v25.s[3] + fmla v20.4s, v12.4s, v25.s[3] + fmla v19.4s, v9.4s, v25.s[3] + fmla v17.4s, v8.4s, v25.s[3] + ldur q25, [x10, #-16] + prfm pldl1keep, [x24] + mov x10, x25 + fmla v23.4s, v15.4s, v24.s[3] + fmla v21.4s, v12.4s, v24.s[3] + fmla v18.4s, v9.4s, v24.s[3] + fmla v16.4s, v8.4s, v24.s[3] + ldur q24, [x24, #-16] + fmla v4.4s, v29.4s, v27.s[0] + cmp x30, x23 + b.lt .LBB0_17 +.LBB0_18: // in Loop: Header=BB0_15 Depth=3 + ldp q10, q8, [x17, #32] + ldp q12, q11, [x17] + fmla v6.4s, v28.4s, v27.s[0] + fmla v0.4s, v31.4s, v27.s[0] + fmla v1.4s, v30.4s, v27.s[0] + fmla v2.4s, v29.4s, v26.s[0] + fmla v3.4s, v28.4s, v26.s[0] + fmla v5.4s, v31.4s, v26.s[0] + ldp q9, q13, [x18, #32] + fmla v7.4s, v30.4s, v26.s[0] + fmla 
v17.4s, v29.4s, v25.s[0] + ldr x2, [sp, #992] // 8-byte Folded Reload + ldr x25, [sp, #1032] // 8-byte Folded Reload + fmla v19.4s, v28.4s, v25.s[0] + fmla v20.4s, v31.4s, v25.s[0] + mov x10, x1 + fmla v22.4s, v30.4s, v25.s[0] + fmla v16.4s, v29.4s, v24.s[0] + fmla v18.4s, v28.4s, v24.s[0] + fmla v21.4s, v31.4s, v24.s[0] + fmla v23.4s, v30.4s, v24.s[0] + ldp q29, q30, [x18] + ldp q31, q28, [x0, #32] + fmla v1.4s, v12.4s, v27.s[1] + fmla v0.4s, v11.4s, v27.s[1] + fmla v6.4s, v10.4s, v27.s[1] + fmla v4.4s, v8.4s, v27.s[1] + fmla v7.4s, v12.4s, v26.s[1] + fmla v5.4s, v11.4s, v26.s[1] + fmla v3.4s, v10.4s, v26.s[1] + fmla v2.4s, v8.4s, v26.s[1] + fmla v22.4s, v12.4s, v25.s[1] + fmla v20.4s, v11.4s, v25.s[1] + fmla v19.4s, v10.4s, v25.s[1] + fmla v17.4s, v8.4s, v25.s[1] + fmla v23.4s, v12.4s, v24.s[1] + fmla v21.4s, v11.4s, v24.s[1] + fmla v18.4s, v10.4s, v24.s[1] + fmla v16.4s, v8.4s, v24.s[1] + ldp q10, q8, [x0] + fmla v4.4s, v13.4s, v27.s[2] + fmla v6.4s, v9.4s, v27.s[2] + fmla v0.4s, v30.4s, v27.s[2] + fmla v1.4s, v29.4s, v27.s[2] + fmla v2.4s, v13.4s, v26.s[2] + fmla v3.4s, v9.4s, v26.s[2] + fmla v5.4s, v30.4s, v26.s[2] + fmla v7.4s, v29.4s, v26.s[2] + fmla v17.4s, v13.4s, v25.s[2] + fmla v19.4s, v9.4s, v25.s[2] + fmla v20.4s, v30.4s, v25.s[2] + fmla v22.4s, v29.4s, v25.s[2] + fmla v16.4s, v13.4s, v24.s[2] + fmla v18.4s, v9.4s, v24.s[2] + fmla v21.4s, v30.4s, v24.s[2] + fmla v23.4s, v29.4s, v24.s[2] + fmla v1.4s, v10.4s, v27.s[3] + fmla v0.4s, v8.4s, v27.s[3] + fmla v6.4s, v31.4s, v27.s[3] + fmla v4.4s, v28.4s, v27.s[3] + fmla v7.4s, v10.4s, v26.s[3] + fmla v5.4s, v8.4s, v26.s[3] + fmla v3.4s, v31.4s, v26.s[3] + fmla v2.4s, v28.4s, v26.s[3] + fmla v22.4s, v10.4s, v25.s[3] + fmla v20.4s, v8.4s, v25.s[3] + fmla v19.4s, v31.4s, v25.s[3] + fmla v17.4s, v28.4s, v25.s[3] + fmla v23.4s, v10.4s, v24.s[3] + fmla v21.4s, v8.4s, v24.s[3] + fmla v18.4s, v31.4s, v24.s[3] + fmla v16.4s, v28.4s, v24.s[3] + cmp x25, x21 + b.ge .LBB0_14 + .p2align 2 +.LBB0_19: // Parent Loop BB0_4 
Depth=1 + // Parent Loop BB0_8 Depth=2 + // Parent Loop BB0_15 Depth=3 + // => This Inner Loop Header: Depth=4 + add x24, x10, x20 + prfm pldl1keep, [x10] + ldur s24, [x10, #-4] + add x25, x25, #1 + prfm pldl1keep, [x24] + ldur s25, [x24, #-4] + add x24, x24, x20 + add x10, x10, #4 + prfm pldl1keep, [x24] + ldur s26, [x24, #-4] + add x24, x24, x20 + prfm pldl1keep, [x24] + ldur s27, [x24, #-4] + prfm pldl1keep, [x2] + ldp q28, q29, [x2, #-32] + fmla v4.4s, v29.4s, v24.s[0] + ldp q30, q31, [x2, #-64] + fmla v0.4s, v31.4s, v24.s[0] + fmla v5.4s, v31.4s, v25.s[0] + fmla v2.4s, v29.4s, v25.s[0] + fmla v20.4s, v31.4s, v26.s[0] + fmla v17.4s, v29.4s, v26.s[0] + add x2, x2, #64 + fmla v6.4s, v28.4s, v24.s[0] + fmla v1.4s, v30.4s, v24.s[0] + fmla v3.4s, v28.4s, v25.s[0] + fmla v7.4s, v30.4s, v25.s[0] + fmla v22.4s, v30.4s, v26.s[0] + fmla v19.4s, v28.4s, v26.s[0] + fmla v23.4s, v30.4s, v27.s[0] + fmla v21.4s, v31.4s, v27.s[0] + fmla v18.4s, v28.4s, v27.s[0] + fmla v16.4s, v29.4s, v27.s[0] + cmp x25, x21 + b.lt .LBB0_19 + b .LBB0_14 + .p2align 2 +.LBB0_20: // in Loop: Header=BB0_8 Depth=2 + ldr x10, [sp, #1024] // 8-byte Folded Reload + ldr x11, [sp, #920] // 8-byte Folded Reload + cmp x10, x11 + b.ge .LBB0_26 +// %bb.21: // in Loop: Header=BB0_8 Depth=2 + ldr x3, [sp, #1024] // 8-byte Folded Reload + ldp q18, q19, [x22, #32] + mov x10, xzr + ldp q20, q21, [x22] + ldr x4, [sp, #1000] // 8-byte Folded Reload + madd x11, x3, x29, x14 + add x11, x11, x12 + add x1, x13, x11, lsl #2 + add x11, x3, #1 + madd x3, x3, x19, x16 + madd x2, x11, x29, x14 + madd x11, x11, x19, x16 + ldp q1, q0, [x1, #32] + ldp q4, q2, [x1] + lsl x3, x3, #2 + add x2, x2, x12 + lsl x11, x11, #2 + ldr q17, [x15, x3] + ldr x3, [sp, #624] // 8-byte Folded Reload + add x2, x13, x2, lsl #2 + ldr q16, [x15, x11] + ldr x11, [sp, #616] // 8-byte Folded Reload + ldp q5, q3, [x2, #32] + ldp q7, q6, [x2] + cmp xzr, x23 + b.ge .LBB0_23 + .p2align 2 +.LBB0_22: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 
Depth=2 + // => This Inner Loop Header: Depth=3 + add x30, x4, #64 + fmla v0.4s, v19.4s, v17.s[0] + fmla v1.4s, v18.4s, v17.s[0] + add x28, x4, #128 + prfm pldl1keep, [x30] + ldp q23, q22, [x4, #-160] + fmla v4.4s, v20.4s, v17.s[0] + ldp q24, q25, [x4, #-192] + fmla v2.4s, v21.4s, v17.s[0] + fmla v3.4s, v19.4s, v16.s[0] + fmla v5.4s, v18.4s, v16.s[0] + fmla v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + prfm pldl1keep, [x28] + ldp q19, q18, [x4, #-128] + ldp q20, q21, [x4, #-96] + fmla v2.4s, v25.4s, v17.s[1] + fmla v0.4s, v22.4s, v17.s[1] + fmla v6.4s, v25.4s, v16.s[1] + fmla v3.4s, v22.4s, v16.s[1] + fmla v4.4s, v24.4s, v17.s[1] + fmla v1.4s, v23.4s, v17.s[1] + add x24, x4, #192 + prfm pldl1keep, [x24] + fmla v7.4s, v24.4s, v16.s[1] + fmla v5.4s, v23.4s, v16.s[1] + ldp q23, q22, [x4, #-32] + ldp q24, q25, [x4, #-64] + add x6, x3, x27 + add x25, x11, x27 + fmla v0.4s, v21.4s, v17.s[2] + fmla v2.4s, v18.4s, v17.s[2] + fmla v3.4s, v21.4s, v16.s[2] + fmla v6.4s, v18.4s, v16.s[2] + fmla v1.4s, v20.4s, v17.s[2] + fmla v4.4s, v19.4s, v17.s[2] + add x5, x4, #256 + add x7, x6, #32 + fmla v5.4s, v20.4s, v16.s[2] + fmla v7.4s, v19.4s, v16.s[2] + add x26, x25, #32 + prfm pldl1keep, [x26] + add x10, x10, #4 + add x3, x3, #16 + add x11, x11, #16 + fmla v2.4s, v25.4s, v17.s[3] + fmla v0.4s, v22.4s, v17.s[3] + fmla v6.4s, v25.4s, v16.s[3] + fmla v3.4s, v22.4s, v16.s[3] + fmla v4.4s, v24.4s, v17.s[3] + fmla v1.4s, v23.4s, v17.s[3] + ldr q17, [x25, #16] + prfm pldl1keep, [x7] + fmla v7.4s, v24.4s, v16.s[3] + fmla v5.4s, v23.4s, v16.s[3] + ldr q16, [x6, #16] + prfm pldl1keep, [x5] + ldp q18, q19, [x4, #32] + ldp q20, q21, [x4] + mov x4, x5 + cmp x10, x23 + b.lt .LBB0_22 +.LBB0_23: // in Loop: Header=BB0_8 Depth=2 + ldp q23, q22, [x17, #32] + ldp q25, q24, [x17] + fmla v0.4s, v19.4s, v17.s[0] + fmla v1.4s, v18.4s, v17.s[0] + fmla v2.4s, v21.4s, v17.s[0] + fmla v4.4s, v20.4s, v17.s[0] + fmla v3.4s, v19.4s, v16.s[0] + fmla v5.4s, v18.4s, v16.s[0] + ldp q18, q19, [x18] + fmla 
v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q20, q21, [x18, #32] + fmla v2.4s, v24.4s, v17.s[1] + fmla v0.4s, v22.4s, v17.s[1] + ldr x10, [sp, #880] // 8-byte Folded Reload + ldr x11, [sp, #992] // 8-byte Folded Reload + fmla v4.4s, v25.4s, v17.s[1] + fmla v1.4s, v23.4s, v17.s[1] + ldr x3, [sp, #1032] // 8-byte Folded Reload + ldr x6, [sp, #576] // 8-byte Folded Reload + fmla v7.4s, v25.4s, v16.s[1] + fmla v6.4s, v24.4s, v16.s[1] + ldp q25, q24, [x0] + fmla v5.4s, v23.4s, v16.s[1] + fmla v3.4s, v22.4s, v16.s[1] + ldp q23, q22, [x0, #32] + fmla v0.4s, v21.4s, v17.s[2] + fmla v2.4s, v19.4s, v17.s[2] + ldr x7, [sp, #568] // 8-byte Folded Reload + fmla v3.4s, v21.4s, v16.s[2] + fmla v1.4s, v20.4s, v17.s[2] + fmla v4.4s, v18.4s, v17.s[2] + fmla v5.4s, v20.4s, v16.s[2] + fmla v6.4s, v19.4s, v16.s[2] + fmla v7.4s, v18.4s, v16.s[2] + fmla v2.4s, v24.4s, v17.s[3] + fmla v0.4s, v22.4s, v17.s[3] + fmla v6.4s, v24.4s, v16.s[3] + fmla v3.4s, v22.4s, v16.s[3] + fmla v4.4s, v25.4s, v17.s[3] + fmla v1.4s, v23.4s, v17.s[3] + fmla v7.4s, v25.4s, v16.s[3] + fmla v5.4s, v23.4s, v16.s[3] + cmp x3, x21 + b.ge .LBB0_25 + .p2align 2 +.LBB0_24: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x10, x7 + add x5, x10, x6 + add x3, x3, #1 + add x5, x5, #4 + add x4, x4, #4 + prfm pldl1keep, [x5] + ldr s16, [x10, x6] + prfm pldl1keep, [x4] + ldr s17, [x10, x7] + prfm pldl1keep, [x11] + ldp q18, q19, [x11, #-64] + ldp q20, q21, [x11, #-32] + add x11, x11, #64 + add x10, x10, #4 + fmla v0.4s, v21.4s, v16.s[0] + fmla v1.4s, v20.4s, v16.s[0] + fmla v2.4s, v19.4s, v16.s[0] + fmla v4.4s, v18.4s, v16.s[0] + fmla v7.4s, v18.4s, v17.s[0] + fmla v6.4s, v19.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + fmla v3.4s, v21.4s, v17.s[0] + cmp x3, x21 + b.lt .LBB0_24 +.LBB0_25: // in Loop: Header=BB0_8 Depth=2 + stp q4, q2, [x1] + stp q1, q0, [x1, #32] + stp q7, q6, [x2] + stp q5, q3, [x2, #32] +.LBB0_26: // in Loop: Header=BB0_8 
Depth=2 + ldr x10, [sp, #744] // 8-byte Folded Reload + ldr x11, [sp, #920] // 8-byte Folded Reload + cmp x11, x10 + ldr x2, [sp, #832] // 8-byte Folded Reload + ldr x3, [sp, #768] // 8-byte Folded Reload + ldr x5, [sp, #872] // 8-byte Folded Reload + ldr x4, [sp, #888] // 8-byte Folded Reload + b.ge .LBB0_7 +// %bb.27: // in Loop: Header=BB0_8 Depth=2 + ldr x1, [sp, #920] // 8-byte Folded Reload + ldp q7, q16, [x22, #32] + mov x10, xzr + ldp q6, q5, [x22] + madd x11, x1, x29, x14 + add x11, x11, x12 + madd x12, x1, x19, x16 + add x11, x13, x11, lsl #2 + ldr x13, [sp, #1000] // 8-byte Folded Reload + lsl x12, x12, #2 + ldp q1, q0, [x11, #32] + ldp q3, q2, [x11] + ldr q4, [x15, x12] + ldr x12, [sp, #640] // 8-byte Folded Reload + cmp xzr, x23 + b.ge .LBB0_29 + .p2align 2 +.LBB0_28: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + add x1, x13, #64 + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + add x16, x13, #128 + prfm pldl1keep, [x1] + ldp q18, q17, [x13, #-160] + fmla v3.4s, v6.4s, v4.s[0] + ldp q19, q20, [x13, #-192] + fmla v2.4s, v5.4s, v4.s[0] + prfm pldl1keep, [x16] + ldp q6, q5, [x13, #-128] + ldp q7, q16, [x13, #-96] + add x15, x13, #192 + prfm pldl1keep, [x15] + add x14, x13, #256 + add x10, x10, #4 + fmla v2.4s, v20.4s, v4.s[1] + fmla v0.4s, v17.4s, v4.s[1] + fmla v3.4s, v19.4s, v4.s[1] + fmla v1.4s, v18.4s, v4.s[1] + ldp q18, q17, [x13, #-32] + ldp q19, q20, [x13, #-64] + prfm pldl1keep, [x12] + fmla v0.4s, v16.4s, v4.s[2] + fmla v2.4s, v5.4s, v4.s[2] + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v2.4s, v20.4s, v4.s[3] + fmla v0.4s, v17.4s, v4.s[3] + fmla v3.4s, v19.4s, v4.s[3] + fmla v1.4s, v18.4s, v4.s[3] + ldur q4, [x12, #-16] + prfm pldl1keep, [x14] + add x12, x12, #16 + ldp q7, q16, [x13, #32] + ldp q6, q5, [x13] + mov x13, x14 + cmp x10, x23 + b.lt .LBB0_28 +.LBB0_29: // in Loop: Header=BB0_8 Depth=2 + ldp q18, q17, [x17, #32] + ldp q20, q19, [x17] + fmla v0.4s, 
v16.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + fmla v2.4s, v5.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q5, q6, [x18] + ldp q7, q16, [x18, #32] + ldr x12, [sp, #560] // 8-byte Folded Reload + ldr x16, [sp, #552] // 8-byte Folded Reload + ldr x17, [sp, #632] // 8-byte Folded Reload + ldr x18, [sp, #880] // 8-byte Folded Reload + fmla v2.4s, v19.4s, v4.s[1] + fmla v0.4s, v17.4s, v4.s[1] + mov x10, xzr + mov w13, #64 // =0x40 + fmla v3.4s, v20.4s, v4.s[1] + fmla v1.4s, v18.4s, v4.s[1] + ldp q18, q17, [x0, #32] + ldp q20, q19, [x0] + fmla v0.4s, v16.4s, v4.s[2] + fmla v2.4s, v6.4s, v4.s[2] + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v5.4s, v4.s[2] + fmla v2.4s, v19.4s, v4.s[3] + fmla v0.4s, v17.4s, v4.s[3] + fmla v3.4s, v20.4s, v4.s[3] + fmla v1.4s, v18.4s, v4.s[3] + ldr x14, [sp, #1032] // 8-byte Folded Reload + add x14, x14, xzr + cmp x14, x21 + b.ge .LBB0_6 + .p2align 2 +.LBB0_30: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + add x15, x18, x12 + add x14, x16, x13 + add x13, x13, #64 + prfm pldl1keep, [x15] + add x15, x16, x10, lsl #6 + ldr s4, [x17, x10, lsl #2] + prfm pldl1keep, [x14] + add x10, x10, #1 + add x12, x12, #4 + ldp q5, q6, [x15] + ldp q7, q16, [x15, #32] + fmla v0.4s, v16.4s, v4.s[0] + fmla v2.4s, v6.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + fmla v3.4s, v5.4s, v4.s[0] + ldr x14, [sp, #1032] // 8-byte Folded Reload + add x14, x14, x10 + cmp x14, x21 + b.lt .LBB0_30 + b .LBB0_6 + .p2align 2 +.LBB0_31: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #696] // 8-byte Folded Reload + ldr x9, [sp, #688] // 8-byte Folded Reload + cmp x2, x3 + add x24, x9, x8, lsl #2 + lsl x8, x29, #1 + ldr x9, [sp, #680] // 8-byte Folded Reload + str x8, [sp, #520] // 8-byte Folded Spill + ldr x8, [sp, #712] // 8-byte Folded Reload + str x24, [sp, #856] // 8-byte Folded Spill + add x8, x9, x8, lsl #2 + ldr x9, [sp, #720] // 8-byte Folded Reload + str x8, [sp, #840] // 8-byte Folded Spill + ldr x8, [sp, #704] // 
8-byte Folded Reload + add x8, x9, x8, lsl #2 + str x8, [sp, #888] // 8-byte Folded Spill + b.lt .LBB0_35 +// %bb.32: // in Loop: Header=BB0_4 Depth=1 + ldr x4, [sp, #600] // 8-byte Folded Reload + cmp x3, x4 + b.lt .LBB0_60 +.LBB0_33: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #648] // 8-byte Folded Reload + cmp x4, x8 + b.lt .LBB0_85 +.LBB0_34: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #144] // 8-byte Folded Reload + ldr x9, [sp, #648] // 8-byte Folded Reload + cmp x9, x8 + b.ge .LBB0_3 + b .LBB0_110 + .p2align 2 +.LBB0_35: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #88] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #736] // 8-byte Folded Reload + ldr x13, [sp, #776] // 8-byte Folded Reload + mov x11, xzr + mul x9, x13, x8 + ldr x8, [sp, #760] // 8-byte Folded Reload + ldr x2, [sp, #520] // 8-byte Folded Reload + add x12, x2, x29 + ldr x6, [sp, #784] // 8-byte Folded Reload + ldp x3, x4, [sp, #496] // 16-byte Folded Reload + ldr x7, [sp, #800] // 8-byte Folded Reload + mul x10, x13, x8 + ldr x8, [sp, #752] // 8-byte Folded Reload + mul x13, x13, x8 + add x8, x0, #63 + add x17, x10, x19 + lsl x16, x10, #2 + str x10, [sp, #848] // 8-byte Folded Spill + and x25, x8, #0xffffffffffffffc0 + ldr x8, [sp, #832] // 8-byte Folded Reload + stp x13, x0, [sp, #112] // 16-byte Folded Spill + add x14, x9, x8 + add x15, x13, x8 + ldr x8, [sp, #888] // 8-byte Folded Reload + add x12, x14, x12 + add x18, x24, x14, lsl #2 + add x1, x14, x29 + add x2, x14, x2 + lsl x14, x17, #2 + add x12, x24, x12, lsl #2 + add x17, x24, x2, lsl #2 + ldp q1, q0, [x18] + ldr x18, [sp, #792] // 8-byte Folded Reload + ldp q7, q5, [x12] + add x12, x10, x19, lsl #1 + lsl x12, x12, #2 + ldr q16, [x8, x16] + ldr q17, [x8, x14] + add x16, x24, x1, lsl #2 + ldp q6, q3, [x16] + ldp q4, q2, [x17] + ldp x16, x17, [sp, #464] // 16-byte Folded Reload + ldp x1, x2, [sp, #480] // 16-byte Folded Reload + ldr q18, [x8, x12] + ldr x8, [sp, #840] // 8-byte Folded Reload + add x12, 
x25, #64 + add x5, x8, x15, lsl #2 + ldp x14, x15, [sp, #448] // 16-byte Folded Reload + ldp q21, q20, [x5] + ldr x5, [sp, #872] // 8-byte Folded Reload + .p2align 2 +.LBB0_36: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x24, x18, x27 + fmla v1.4s, v21.4s, v16.s[0] + fmla v0.4s, v20.4s, v16.s[0] + cmp x11, x23 + prfm pldl1keep, [x24, #16] + ldr q19, [x24] + b.ge .LBB0_38 +// %bb.37: // in Loop: Header=BB0_36 Depth=2 + ldr x8, [sp, #864] // 8-byte Folded Reload + mov x10, x25 + fmla v6.4s, v21.4s, v17.s[0] + fmla v3.4s, v20.4s, v17.s[0] + fmla v4.4s, v21.4s, v18.s[0] + fmla v2.4s, v20.4s, v18.s[0] + stp q21, q20, [x12, #-64] + fmla v7.4s, v21.4s, v19.s[0] + fmla v5.4s, v20.4s, v19.s[0] + add x26, x6, x27 + add x28, x5, x27 + add x0, x26, #32 + add x11, x11, #4 + add x6, x6, #16 + add x5, x5, #16 + add x24, x1, x8 + add x25, x16, x8 + add x13, x14, x8 + add x30, x4, x8 + prfm pldl1keep, [x24] + ldp q20, q21, [x25] + add x24, x2, x8 + add x25, x15, x8 + add x18, x18, #16 + fmla v0.4s, v21.4s, v16.s[1] + fmla v3.4s, v21.4s, v17.s[1] + fmla v2.4s, v21.4s, v18.s[1] + fmla v5.4s, v21.4s, v19.s[1] + fmla v1.4s, v20.4s, v16.s[1] + fmla v6.4s, v20.4s, v17.s[1] + fmla v4.4s, v20.4s, v18.s[1] + fmla v7.4s, v20.4s, v19.s[1] + stp q20, q21, [x12, #-32] + prfm pldl1keep, [x24] + ldp q21, q20, [x25] + add x24, x3, x8 + add x25, x7, x27 + add x7, x7, #16 + fmla v0.4s, v20.4s, v16.s[2] + fmla v3.4s, v20.4s, v17.s[2] + fmla v2.4s, v20.4s, v18.s[2] + fmla v5.4s, v20.4s, v19.s[2] + fmla v1.4s, v21.4s, v16.s[2] + fmla v6.4s, v21.4s, v17.s[2] + fmla v4.4s, v21.4s, v18.s[2] + fmla v7.4s, v21.4s, v19.s[2] + stp q21, q20, [x12] + prfm pldl1keep, [x24] + ldp q20, q21, [x13] + add x13, x17, x8 + add x24, x25, #32 + add x8, x28, #32 + fmla v0.4s, v21.4s, v16.s[3] + fmla v3.4s, v21.4s, v17.s[3] + fmla v2.4s, v21.4s, v18.s[3] + fmla v5.4s, v21.4s, v19.s[3] + fmla v1.4s, v20.4s, v16.s[3] + fmla v6.4s, v20.4s, v17.s[3] + fmla v4.4s, v20.4s, v18.s[3] + fmla v7.4s, 
v20.4s, v19.s[3] + stp q20, q21, [x12, #32] + prfm pldl1keep, [x30] + ldp q21, q20, [x13] + prfm pldl1keep, [x8] + ldr q16, [x28, #16] + prfm pldl1keep, [x0] + ldr q17, [x26, #16] + prfm pldl1keep, [x24] + ldr x8, [sp, #1016] // 8-byte Folded Reload + ldr q18, [x25, #16] + mov x25, x10 + add x12, x12, #128 + add x4, x4, x8 + add x3, x3, x8 + add x2, x2, x8 + add x1, x1, x8 + add x17, x17, x8 + add x16, x16, x8 + add x15, x15, x8 + add x14, x14, x8 + b .LBB0_36 + .p2align 2 +.LBB0_38: // in Loop: Header=BB0_4 Depth=1 + ldr x13, [sp, #728] // 8-byte Folded Reload + ldr x11, [sp, #912] // 8-byte Folded Reload + add x8, x25, x23, lsl #5 + fmla v6.4s, v21.4s, v17.s[0] + ldr x10, [sp, #112] // 8-byte Folded Reload + ldr x4, [sp, #832] // 8-byte Folded Reload + fmla v3.4s, v20.4s, v17.s[0] + fmla v4.4s, v21.4s, v18.s[0] + stp q21, q20, [x8] + fmla v2.4s, v20.4s, v18.s[0] + fmla v5.4s, v20.4s, v19.s[0] + ldr x16, [sp, #840] // 8-byte Folded Reload + ldr x12, [sp, #904] // 8-byte Folded Reload + fmla v7.4s, v21.4s, v19.s[0] + mov x14, xzr + madd x8, x11, x13, x10 + ldr x15, [sp, #896] // 8-byte Folded Reload + add x11, x25, x11, lsl #5 + ldr x0, [sp, #824] // 8-byte Folded Reload + ldr x1, [sp, #192] // 8-byte Folded Reload + ldr x2, [sp, #440] // 8-byte Folded Reload + add x8, x8, x4 + ldr x24, [sp, #856] // 8-byte Folded Reload + add x8, x16, x8, lsl #2 + ldp q20, q21, [x8] + madd x8, x12, x13, x10 + add x12, x25, x12, lsl #5 + add x8, x8, x4 + fmla v0.4s, v21.4s, v16.s[1] + fmla v3.4s, v21.4s, v17.s[1] + fmla v2.4s, v21.4s, v18.s[1] + fmla v5.4s, v21.4s, v19.s[1] + add x8, x16, x8, lsl #2 + fmla v1.4s, v20.4s, v16.s[1] + stp q20, q21, [x11] + fmla v6.4s, v20.4s, v17.s[1] + fmla v4.4s, v20.4s, v18.s[1] + fmla v7.4s, v20.4s, v19.s[1] + ldp q21, q20, [x8] + madd x8, x15, x13, x10 + add x13, x25, x15, lsl #5 + ldr x15, [sp, #512] // 8-byte Folded Reload + add x8, x8, x4 + fmla v0.4s, v20.4s, v16.s[2] + fmla v3.4s, v20.4s, v17.s[2] + fmla v2.4s, v20.4s, v18.s[2] + fmla v5.4s, 
v20.4s, v19.s[2] + add x8, x16, x8, lsl #2 + stp q21, q20, [x12] + fmla v1.4s, v21.4s, v16.s[2] + fmla v6.4s, v21.4s, v17.s[2] + fmla v4.4s, v21.4s, v18.s[2] + fmla v7.4s, v21.4s, v19.s[2] + ldr x16, [sp, #1032] // 8-byte Folded Reload + ldp q20, q21, [x8] + fmla v0.4s, v21.4s, v16.s[3] + fmla v3.4s, v21.4s, v17.s[3] + fmla v2.4s, v21.4s, v18.s[3] + fmla v5.4s, v21.4s, v19.s[3] + fmla v1.4s, v20.4s, v16.s[3] + fmla v6.4s, v20.4s, v17.s[3] + fmla v4.4s, v20.4s, v18.s[3] + fmla v7.4s, v20.4s, v19.s[3] + stp q20, q21, [x13] + ldr x17, [sp, #880] // 8-byte Folded Reload + cmp x16, x21 + b.ge .LBB0_40 + .p2align 2 +.LBB0_39: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x17, x17, x15 + add x8, x1, x14 + add x18, x25, x16, lsl #5 + add x16, x16, #1 + prfm pldl1keep, [x17] + ldur s16, [x17, #-4] + add x17, x17, x20 + add x15, x15, #4 + prfm pldl1keep, [x17] + ldur s17, [x17, #-4] + add x17, x17, x20 + prfm pldl1keep, [x17] + ldur s18, [x17, #-4] + add x17, x17, x20 + prfm pldl1keep, [x17] + ldur s19, [x17, #-4] + add x17, x2, x14 + prfm pldl1keep, [x8] + add x14, x14, x0 + ldp q20, q21, [x17] + fmla v0.4s, v21.4s, v16.s[0] + fmla v3.4s, v21.4s, v17.s[0] + fmla v2.4s, v21.4s, v18.s[0] + fmla v5.4s, v21.4s, v19.s[0] + fmla v1.4s, v20.4s, v16.s[0] + fmla v6.4s, v20.4s, v17.s[0] + fmla v4.4s, v20.4s, v18.s[0] + fmla v7.4s, v20.4s, v19.s[0] + stp q20, q21, [x18] + ldr x17, [sp, #880] // 8-byte Folded Reload + cmp x16, x21 + b.lt .LBB0_39 +.LBB0_40: // %.preheader52 + // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #48] // 8-byte Folded Reload + ldr x16, [sp, #808] // 8-byte Folded Reload + mov x5, xzr + add x14, x25, #128 + ldr x17, [sp, #816] // 8-byte Folded Reload + mov w2, #1 // =0x1 + mov w3, #2 // =0x2 + mov w1, #3 // =0x3 + mov w18, #4 // =0x4 + add x15, x25, x8 + b .LBB0_42 + .p2align 2 +.LBB0_41: // %.loopexit48 + // in Loop: Header=BB0_42 Depth=2 + ldr x8, [sp, #1008] // 8-byte Folded Reload + mov x5, x18 + mov x18, x4 + ldr x4, [sp, 
#832] // 8-byte Folded Reload + add x17, x17, x8 + add x16, x16, x8 +.LBB0_42: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_44 Depth 3 + // Child Loop BB0_46 Depth 3 + madd x8, x5, x29, x9 + add x8, x8, x4 + madd x0, x2, x29, x9 + madd x2, x3, x29, x9 + add x0, x0, x4 + add x8, x24, x8, lsl #2 + add x0, x24, x0, lsl #2 + stp q1, q0, [x8] + madd x8, x1, x29, x9 + add x8, x8, x4 + stp q6, q3, [x0] + add x0, x2, x4 + add x0, x24, x0, lsl #2 + stp q4, q2, [x0] + add x8, x24, x8, lsl #2 + stp q7, q5, [x8] + ldr x8, [sp, #1024] // 8-byte Folded Reload + cmp x18, x8 + b.ge .LBB0_47 +// %bb.43: // in Loop: Header=BB0_42 Depth=2 + madd x8, x18, x29, x9 + add x2, x18, #1 + add x1, x18, #3 + add x3, x18, #2 + madd x0, x2, x29, x9 + mov x7, x4 + ldr x10, [sp, #848] // 8-byte Folded Reload + mov x5, xzr + madd x6, x3, x29, x9 + ldp q20, q21, [x25] + add x8, x8, x4 + add x8, x24, x8, lsl #2 + add x0, x0, x4 + add x4, x18, #4 + add x0, x24, x0, lsl #2 + ldp q1, q0, [x8] + madd x8, x1, x29, x9 + add x8, x8, x7 + ldp q6, q3, [x0] + add x0, x6, x7 + add x0, x24, x0, lsl #2 + ldp q4, q2, [x0] + ldr x0, [sp, #888] // 8-byte Folded Reload + mov x6, x14 + mov x7, x17 + add x8, x24, x8, lsl #2 + ldp q7, q5, [x8] + madd x8, x18, x19, x10 + lsl x8, x8, #2 + ldr q19, [x0, x8] + madd x8, x2, x19, x10 + lsl x8, x8, #2 + ldr q18, [x0, x8] + madd x8, x3, x19, x10 + lsl x8, x8, #2 + ldr q17, [x0, x8] + madd x8, x1, x19, x10 + lsl x8, x8, #2 + ldr q16, [x0, x8] + cmp xzr, x23 + b.ge .LBB0_45 + .p2align 2 +.LBB0_44: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_42 Depth=2 + // => This Inner Loop Header: Depth=3 + add x8, x6, #32 + fmla v1.4s, v20.4s, v19.s[0] + fmla v0.4s, v21.4s, v19.s[0] + add x5, x5, #4 + prfm pldl1keep, [x8] + ldp q22, q23, [x6, #-96] + fmla v3.4s, v21.4s, v18.s[0] + fmla v6.4s, v20.4s, v18.s[0] + fmla v2.4s, v21.4s, v17.s[0] + fmla v4.4s, v20.4s, v17.s[0] + add x8, x6, #96 + fmla v5.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] 
+ ldp q21, q20, [x6, #-64] + prfm pldl1keep, [x8] + add x8, x7, x20 + add x0, x8, x20 + fmla v0.4s, v23.4s, v19.s[1] + fmla v3.4s, v23.4s, v18.s[1] + fmla v2.4s, v23.4s, v17.s[1] + fmla v5.4s, v23.4s, v16.s[1] + fmla v1.4s, v22.4s, v19.s[1] + fmla v6.4s, v22.4s, v18.s[1] + fmla v4.4s, v22.4s, v17.s[1] + fmla v7.4s, v22.4s, v16.s[1] + fmla v0.4s, v20.4s, v19.s[2] + ldp q22, q23, [x6, #-32] + fmla v3.4s, v20.4s, v18.s[2] + fmla v2.4s, v20.4s, v17.s[2] + fmla v5.4s, v20.4s, v16.s[2] + fmla v1.4s, v21.4s, v19.s[2] + fmla v6.4s, v21.4s, v18.s[2] + fmla v4.4s, v21.4s, v17.s[2] + fmla v7.4s, v21.4s, v16.s[2] + ldp q20, q21, [x6], #128 + prfm pldl1keep, [x7] + fmla v0.4s, v23.4s, v19.s[3] + fmla v3.4s, v23.4s, v18.s[3] + fmla v2.4s, v23.4s, v17.s[3] + fmla v5.4s, v23.4s, v16.s[3] + fmla v1.4s, v22.4s, v19.s[3] + ldur q19, [x7, #-16] + prfm pldl1keep, [x8] + fmla v6.4s, v22.4s, v18.s[3] + ldur q18, [x8, #-16] + add x8, x0, x20 + prfm pldl1keep, [x0] + add x7, x7, #16 + fmla v4.4s, v22.4s, v17.s[3] + ldur q17, [x0, #-16] + prfm pldl1keep, [x8] + fmla v7.4s, v22.4s, v16.s[3] + ldur q16, [x8, #-16] + cmp x5, x23 + b.lt .LBB0_44 +.LBB0_45: // in Loop: Header=BB0_42 Depth=2 + ldp q23, q22, [x11] + fmla v0.4s, v21.4s, v19.s[0] + fmla v1.4s, v20.4s, v19.s[0] + fmla v3.4s, v21.4s, v18.s[0] + fmla v6.4s, v20.4s, v18.s[0] + ldr x7, [sp, #1032] // 8-byte Folded Reload + mov x5, x16 + fmla v2.4s, v21.4s, v17.s[0] + fmla v4.4s, v20.4s, v17.s[0] + mov x6, x15 + fmla v5.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q20, q21, [x12] + fmla v0.4s, v22.4s, v19.s[1] + fmla v3.4s, v22.4s, v18.s[1] + fmla v2.4s, v22.4s, v17.s[1] + fmla v5.4s, v22.4s, v16.s[1] + fmla v1.4s, v23.4s, v19.s[1] + fmla v6.4s, v23.4s, v18.s[1] + fmla v4.4s, v23.4s, v17.s[1] + fmla v7.4s, v23.4s, v16.s[1] + fmla v0.4s, v21.4s, v19.s[2] + ldp q23, q22, [x13] + fmla v3.4s, v21.4s, v18.s[2] + fmla v2.4s, v21.4s, v17.s[2] + fmla v5.4s, v21.4s, v16.s[2] + fmla v1.4s, v20.4s, v19.s[2] + fmla v6.4s, v20.4s, 
v18.s[2] + fmla v4.4s, v20.4s, v17.s[2] + fmla v7.4s, v20.4s, v16.s[2] + fmla v0.4s, v22.4s, v19.s[3] + fmla v3.4s, v22.4s, v18.s[3] + fmla v2.4s, v22.4s, v17.s[3] + fmla v5.4s, v22.4s, v16.s[3] + fmla v1.4s, v23.4s, v19.s[3] + fmla v6.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v17.s[3] + fmla v7.4s, v23.4s, v16.s[3] + cmp x7, x21 + b.ge .LBB0_41 + .p2align 2 +.LBB0_46: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_42 Depth=2 + // => This Inner Loop Header: Depth=3 + add x8, x5, x20 + prfm pldl1keep, [x5] + ldur s16, [x5, #-4] + add x7, x7, #1 + prfm pldl1keep, [x8] + ldur s17, [x8, #-4] + add x8, x8, x20 + add x5, x5, #4 + prfm pldl1keep, [x8] + ldur s18, [x8, #-4] + add x8, x8, x20 + prfm pldl1keep, [x8] + ldur s19, [x8, #-4] + prfm pldl1keep, [x6] + ldp q20, q21, [x6, #-32] + add x6, x6, #32 + fmla v0.4s, v21.4s, v16.s[0] + fmla v3.4s, v21.4s, v17.s[0] + fmla v2.4s, v21.4s, v18.s[0] + fmla v1.4s, v20.4s, v16.s[0] + fmla v6.4s, v20.4s, v17.s[0] + fmla v4.4s, v20.4s, v18.s[0] + fmla v7.4s, v20.4s, v19.s[0] + fmla v5.4s, v21.4s, v19.s[0] + cmp x7, x21 + b.lt .LBB0_46 + b .LBB0_41 + .p2align 2 +.LBB0_47: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #1024] // 8-byte Folded Reload + ldr x15, [sp, #920] // 8-byte Folded Reload + cmp x8, x15 + b.ge .LBB0_53 +// %bb.48: // in Loop: Header=BB0_4 Depth=1 + ldr x18, [sp, #1024] // 8-byte Folded Reload + ldr x10, [sp, #848] // 8-byte Folded Reload + mov x17, xzr + madd x8, x18, x29, x9 + ldr x0, [sp, #888] // 8-byte Folded Reload + ldp q6, q7, [x25] + ldr x1, [sp, #616] // 8-byte Folded Reload + ldr x2, [sp, #624] // 8-byte Folded Reload + add x8, x8, x4 + add x15, x24, x8, lsl #2 + add x8, x18, #1 + madd x18, x18, x19, x10 + madd x16, x8, x29, x9 + madd x8, x8, x19, x10 + lsl x18, x18, #2 + ldp q1, q0, [x15] + add x16, x16, x4 + lsl x8, x8, #2 + ldr q5, [x0, x18] + mov x18, x14 + add x16, x24, x16, lsl #2 + ldr q4, [x0, x8] + ldp q3, q2, [x16] + cmp xzr, x23 + b.ge .LBB0_50 + .p2align 2 +.LBB0_49: // Parent Loop 
BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x6, x18, #32 + fmla v1.4s, v6.4s, v5.s[0] + fmla v0.4s, v7.4s, v5.s[0] + add x5, x18, #96 + prfm pldl1keep, [x6] + ldp q16, q17, [x18, #-96] + fmla v2.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q7, q6, [x18, #-64] + prfm pldl1keep, [x5] + add x8, x2, x27 + add x3, x1, x27 + add x0, x8, #32 + add x4, x3, #32 + add x17, x17, #4 + add x2, x2, #16 + add x1, x1, #16 + fmla v0.4s, v17.4s, v5.s[1] + fmla v2.4s, v17.4s, v4.s[1] + fmla v1.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + fmla v0.4s, v6.4s, v5.s[2] + ldp q16, q17, [x18, #-32] + fmla v2.4s, v6.4s, v4.s[2] + fmla v1.4s, v7.4s, v5.s[2] + fmla v3.4s, v7.4s, v4.s[2] + ldp q6, q7, [x18], #128 + prfm pldl1keep, [x4] + ldr x4, [sp, #832] // 8-byte Folded Reload + fmla v0.4s, v17.4s, v5.s[3] + fmla v2.4s, v17.4s, v4.s[3] + fmla v1.4s, v16.4s, v5.s[3] + ldr q5, [x3, #16] + prfm pldl1keep, [x0] + fmla v3.4s, v16.4s, v4.s[3] + ldr q4, [x8, #16] + cmp x17, x23 + b.lt .LBB0_49 +.LBB0_50: // in Loop: Header=BB0_4 Depth=1 + ldp q17, q16, [x11] + fmla v0.4s, v7.4s, v5.s[0] + fmla v1.4s, v6.4s, v5.s[0] + fmla v2.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q6, q7, [x12] + ldr x8, [sp, #64] // 8-byte Folded Reload + ldr x2, [sp, #1032] // 8-byte Folded Reload + mov x17, xzr + mov x18, xzr + fmla v0.4s, v16.4s, v5.s[1] + fmla v2.4s, v16.4s, v4.s[1] + add x1, x25, x8 + fmla v1.4s, v17.4s, v5.s[1] + fmla v3.4s, v17.4s, v4.s[1] + ldp q17, q16, [x13] + fmla v0.4s, v7.4s, v5.s[2] + fmla v2.4s, v7.4s, v4.s[2] + fmla v1.4s, v6.4s, v5.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v0.4s, v16.4s, v5.s[3] + fmla v2.4s, v16.4s, v4.s[3] + fmla v1.4s, v17.4s, v5.s[3] + fmla v3.4s, v17.4s, v4.s[3] + cmp x2, x21 + b.ge .LBB0_52 + .p2align 2 +.LBB0_51: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + ldr x5, [sp, #544] // 8-byte Folded Reload + ldr x6, [sp, #656] // 8-byte Folded Reload + add x8, x1, x18, lsl #3 + add x2, x2, #1 + add x8, x8, #32 
+ add x0, x5, x18 + add x3, x6, x18 + add x0, x0, #4 + add x3, x3, #4 + prfm pldl1keep, [x3] + ldr s4, [x6, x18] + prfm pldl1keep, [x0] + ldr s5, [x5, x18] + add x0, x1, x17 + prfm pldl1keep, [x8] + add x18, x18, #4 + add x17, x17, #32 + ldp q6, q7, [x0] + fmla v0.4s, v7.4s, v4.s[0] + fmla v1.4s, v6.4s, v4.s[0] + fmla v2.4s, v7.4s, v5.s[0] + fmla v3.4s, v6.4s, v5.s[0] + cmp x2, x21 + b.lt .LBB0_51 +.LBB0_52: // in Loop: Header=BB0_4 Depth=1 + stp q1, q0, [x15] + stp q3, q2, [x16] +.LBB0_53: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #744] // 8-byte Folded Reload + ldr x15, [sp, #920] // 8-byte Folded Reload + cmp x15, x8 + b.ge .LBB0_59 +// %bb.54: // in Loop: Header=BB0_4 Depth=1 + ldr x16, [sp, #920] // 8-byte Folded Reload + ldr x10, [sp, #888] // 8-byte Folded Reload + mov x15, xzr + madd x8, x16, x29, x9 + ldp q4, q3, [x25] + ldr x17, [sp, #632] // 8-byte Folded Reload + add x8, x8, x4 + add x9, x24, x8, lsl #2 + ldr x8, [sp, #848] // 8-byte Folded Reload + ldp q1, q0, [x9] + madd x8, x16, x19, x8 + lsl x8, x8, #2 + ldr q2, [x10, x8] + ldr x10, [sp, #640] // 8-byte Folded Reload + cmp xzr, x23 + b.ge .LBB0_56 + .p2align 2 +.LBB0_55: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x16, x14, #32 + fmla v1.4s, v4.4s, v2.s[0] + fmla v0.4s, v3.4s, v2.s[0] + add x8, x14, #96 + prfm pldl1keep, [x16] + ldp q5, q6, [x14, #-96] + add x15, x15, #4 + ldp q4, q3, [x14, #-64] + prfm pldl1keep, [x8] + fmla v0.4s, v6.4s, v2.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldp q5, q6, [x14, #-32] + prfm pldl1keep, [x10] + fmla v0.4s, v3.4s, v2.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v6.4s, v2.s[3] + fmla v1.4s, v5.4s, v2.s[3] + ldur q2, [x10, #-16] + ldp q4, q3, [x14], #128 + add x10, x10, #16 + cmp x15, x23 + b.lt .LBB0_55 +.LBB0_56: // in Loop: Header=BB0_4 Depth=1 + ldp q6, q5, [x11] + fmla v0.4s, v3.4s, v2.s[0] + fmla v1.4s, v4.4s, v2.s[0] + ldp q3, q4, [x12] + ldr x8, [sp, #64] // 8-byte Folded Reload + ldr x11, [sp, #1032] // 8-byte Folded 
Reload + mov x10, xzr + mov x14, xzr + fmla v0.4s, v5.4s, v2.s[1] + add x8, x25, x8 + fmla v1.4s, v6.4s, v2.s[1] + ldp q6, q5, [x13] + fmla v0.4s, v4.4s, v2.s[2] + fmla v1.4s, v3.4s, v2.s[2] + fmla v0.4s, v5.4s, v2.s[3] + fmla v1.4s, v6.4s, v2.s[3] + cmp x11, x21 + b.ge .LBB0_58 + .p2align 2 +.LBB0_57: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x8, x14, lsl #3 + add x13, x17, x14 + add x11, x11, #1 + add x13, x13, #4 + add x12, x12, #32 + prfm pldl1keep, [x13] + ldr s2, [x17, x14] + add x13, x8, x10 + add x14, x14, #4 + add x10, x10, #32 + prfm pldl1keep, [x12] + ldp q3, q4, [x13] + fmla v0.4s, v4.4s, v2.s[0] + fmla v1.4s, v3.4s, v2.s[0] + cmp x11, x21 + b.lt .LBB0_57 +.LBB0_58: // in Loop: Header=BB0_4 Depth=1 + stp q1, q0, [x9] +.LBB0_59: // in Loop: Header=BB0_4 Depth=1 + ldr x0, [sp, #120] // 8-byte Folded Reload + bl free + ldr x3, [sp, #768] // 8-byte Folded Reload + ldr x4, [sp, #600] // 8-byte Folded Reload + cmp x3, x4 + b.ge .LBB0_33 +.LBB0_60: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #80] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #736] // 8-byte Folded Reload + ldr x11, [sp, #776] // 8-byte Folded Reload + mov x12, xzr + mul x9, x11, x8 + ldr x8, [sp, #752] // 8-byte Folded Reload + ldr x10, [sp, #768] // 8-byte Folded Reload + ldr x18, [sp, #520] // 8-byte Folded Reload + add x13, x18, x29 + ldp x1, x2, [sp, #408] // 16-byte Folded Reload + ldp x3, x4, [sp, #424] // 16-byte Folded Reload + ldr x5, [sp, #872] // 8-byte Folded Reload + ldr x6, [sp, #784] // 8-byte Folded Reload + ldr x7, [sp, #800] // 8-byte Folded Reload + mul x15, x11, x8 + add x14, x9, x10 + add x8, x0, #63 + lsl x16, x14, #2 + add x17, x14, x29 + add x18, x14, x18 + add x13, x14, x13 + and x8, x8, #0xffffffffffffffc0 + lsl x13, x13, #2 + ldr q0, [x24, x16] + lsl x16, x18, #2 + ldr x18, [sp, #792] // 8-byte Folded Reload + ldr q3, [x24, x13] + ldr q1, [x24, x16] + stp x15, x0, [sp, #112] // 16-byte Folded Spill + 
add x15, x15, x10 + ldr x10, [sp, #760] // 8-byte Folded Reload + lsl x14, x15, #2 + lsl x15, x17, #2 + ldp x16, x17, [sp, #392] // 16-byte Folded Reload + mul x11, x11, x10 + ldr x10, [sp, #840] // 8-byte Folded Reload + ldr q2, [x24, x15] + lsl x13, x11, #2 + ldr q7, [x10, x14] + ldr x10, [sp, #888] // 8-byte Folded Reload + str x11, [sp, #848] // 8-byte Folded Spill + ldp x14, x15, [sp, #376] // 16-byte Folded Reload + ldr q4, [x10, x13] + add x13, x11, x19 + lsl x13, x13, #2 + ldr q5, [x10, x13] + add x13, x11, x19, lsl #1 + lsl x13, x13, #2 + ldr q6, [x10, x13] + orr x13, x8, #0x20 + .p2align 2 +.LBB0_61: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x24, x18, x27 + fmla v0.4s, v7.4s, v4.s[0] + fmla v2.4s, v7.4s, v5.s[0] + cmp x12, x23 + prfm pldl1keep, [x24, #16] + ldr q16, [x24] + b.ge .LBB0_63 +// %bb.62: // in Loop: Header=BB0_61 Depth=2 + ldr x10, [sp, #864] // 8-byte Folded Reload + stur q7, [x13, #-32] + fmla v1.4s, v7.4s, v6.s[0] + fmla v3.4s, v7.4s, v16.s[0] + add x25, x6, x27 + add x26, x5, x27 + add x11, x25, #32 + add x0, x26, #32 + add x12, x12, #4 + add x6, x6, #16 + add x5, x5, #16 + add x18, x18, #16 + add x24, x1, x10 + add x28, x4, x10 + prfm pldl1keep, [x24] + add x24, x2, x10 + ldr q17, [x16, x10] + stur q17, [x13, #-16] + prfm pldl1keep, [x24] + ldr q18, [x15, x10] + add x24, x3, x10 + fmla v0.4s, v17.4s, v4.s[1] + fmla v2.4s, v17.4s, v5.s[1] + fmla v1.4s, v17.4s, v6.s[1] + fmla v3.4s, v17.4s, v16.s[1] + str q18, [x13] + prfm pldl1keep, [x24] + ldr q19, [x14, x10] + fmla v0.4s, v18.4s, v4.s[2] + fmla v2.4s, v18.4s, v5.s[2] + fmla v1.4s, v18.4s, v6.s[2] + add x24, x7, x27 + fmla v3.4s, v18.4s, v16.s[2] + add x7, x7, #16 + add x30, x24, #32 + str q19, [x13, #16] + prfm pldl1keep, [x28] + ldr q7, [x17, x10] + fmla v0.4s, v19.4s, v4.s[3] + fmla v2.4s, v19.4s, v5.s[3] + fmla v1.4s, v19.4s, v6.s[3] + prfm pldl1keep, [x0] + ldr q4, [x26, #16] + prfm pldl1keep, [x11] + ldr q5, [x25, #16] + prfm pldl1keep, [x30] + ldr 
x10, [sp, #1016] // 8-byte Folded Reload + ldr q6, [x24, #16] + fmla v3.4s, v19.4s, v16.s[3] + add x13, x13, #64 + add x4, x4, x10 + add x3, x3, x10 + add x2, x2, x10 + add x1, x1, x10 + add x17, x17, x10 + add x16, x16, x10 + add x15, x15, x10 + add x14, x14, x10 + b .LBB0_61 + .p2align 2 +.LBB0_63: // in Loop: Header=BB0_4 Depth=1 + ldr x13, [sp, #728] // 8-byte Folded Reload + ldr x10, [sp, #912] // 8-byte Folded Reload + fmla v1.4s, v7.4s, v6.s[0] + fmla v3.4s, v7.4s, v16.s[0] + ldr x15, [sp, #112] // 8-byte Folded Reload + ldr x5, [sp, #768] // 8-byte Folded Reload + mov x12, xzr + madd x11, x10, x13, x15 + ldr x14, [sp, #840] // 8-byte Folded Reload + str q7, [x8, x23, lsl #4] + ldr x6, [sp, #576] // 8-byte Folded Reload + ldr x7, [sp, #568] // 8-byte Folded Reload + ldr x17, [sp, #880] // 8-byte Folded Reload + ldr x24, [sp, #856] // 8-byte Folded Reload + add x11, x11, x5 + lsl x11, x11, #2 + ldr q7, [x14, x11] + fmla v0.4s, v7.4s, v4.s[1] + str q7, [x8, x10, lsl #4] + ldr x10, [sp, #904] // 8-byte Folded Reload + fmla v2.4s, v7.4s, v5.s[1] + fmla v1.4s, v7.4s, v6.s[1] + fmla v3.4s, v7.4s, v16.s[1] + madd x11, x10, x13, x15 + add x11, x11, x5 + lsl x11, x11, #2 + ldr q17, [x14, x11] + fmla v0.4s, v17.4s, v4.s[2] + str q17, [x8, x10, lsl #4] + ldr x10, [sp, #896] // 8-byte Folded Reload + fmla v2.4s, v17.4s, v5.s[2] + fmla v1.4s, v17.4s, v6.s[2] + fmla v3.4s, v17.4s, v16.s[2] + madd x11, x10, x13, x15 + ldr x13, [sp, #1032] // 8-byte Folded Reload + add x11, x11, x5 + lsl x11, x11, #2 + ldr q7, [x14, x11] + ldr x11, [sp, #512] // 8-byte Folded Reload + fmla v0.4s, v7.4s, v4.s[3] + fmla v2.4s, v7.4s, v5.s[3] + fmla v1.4s, v7.4s, v6.s[3] + fmla v3.4s, v7.4s, v16.s[3] + str q7, [x8, x10, lsl #4] + ldp x16, x10, [sp, #360] // 16-byte Folded Reload + cmp x13, x21 + b.ge .LBB0_65 + .p2align 2 +.LBB0_64: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x15, x17, x11 + add x14, x10, x12 + add x11, x11, #4 + prfm pldl1keep, [x15] + ldur s4, 
[x15, #-4] + add x15, x15, x20 + prfm pldl1keep, [x15] + ldur s5, [x15, #-4] + add x15, x15, x20 + prfm pldl1keep, [x15] + ldur s6, [x15, #-4] + add x15, x15, x20 + prfm pldl1keep, [x15] + ldur s7, [x15, #-4] + prfm pldl1keep, [x14] + ldr x14, [sp, #824] // 8-byte Folded Reload + ldr q16, [x16, x12] + add x12, x12, x14 + fmla v0.4s, v16.4s, v4.s[0] + str q16, [x8, x13, lsl #4] + add x13, x13, #1 + fmla v2.4s, v16.4s, v5.s[0] + fmla v1.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v7.s[0] + cmp x13, x21 + b.lt .LBB0_64 +.LBB0_65: // %.preheader51 + // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #40] // 8-byte Folded Reload + ldr x13, [sp, #808] // 8-byte Folded Reload + mov x2, xzr + add x11, x8, #48 + ldr x14, [sp, #816] // 8-byte Folded Reload + mov w16, #1 // =0x1 + mov w17, #2 // =0x2 + mov w18, #3 // =0x3 + mov w15, #4 // =0x4 + add x12, x8, x10 + b .LBB0_67 + .p2align 2 +.LBB0_66: // %.loopexit47 + // in Loop: Header=BB0_67 Depth=2 + ldr x10, [sp, #1008] // 8-byte Folded Reload + mov x2, x15 + mov x15, x1 + add x14, x14, x10 + add x13, x13, x10 +.LBB0_67: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_69 Depth 3 + // Child Loop BB0_71 Depth 3 + madd x0, x2, x29, x9 + add x0, x0, x5 + madd x16, x16, x29, x9 + madd x17, x17, x29, x9 + madd x18, x18, x29, x9 + add x16, x16, x5 + add x17, x17, x5 + lsl x0, x0, #2 + lsl x16, x16, #2 + lsl x17, x17, #2 + str q0, [x24, x0] + str q2, [x24, x16] + add x16, x18, x5 + lsl x16, x16, #2 + str q1, [x24, x17] + str q3, [x24, x16] + ldr x16, [sp, #1024] // 8-byte Folded Reload + cmp x15, x16 + b.ge .LBB0_72 +// %bb.68: // in Loop: Header=BB0_67 Depth=2 + madd x0, x15, x29, x9 + add x17, x15, #2 + add x18, x15, #3 + ldr x10, [sp, #888] // 8-byte Folded Reload + madd x3, x17, x29, x9 + add x16, x15, #1 + ldr q16, [x8] + mov x2, xzr + madd x1, x16, x29, x9 + mov x4, x14 + add x0, x0, x5 + lsl x0, x0, #2 + add x3, x3, x5 + add x1, x1, x5 + ldr q0, [x24, x0] + madd x0, x18, x29, x9 + lsl x3, x3, 
#2 + lsl x1, x1, #2 + add x0, x0, x5 + ldr q1, [x24, x3] + ldr x3, [sp, #848] // 8-byte Folded Reload + lsl x0, x0, #2 + ldr q2, [x24, x1] + add x1, x15, #4 + ldr q3, [x24, x0] + madd x0, x15, x19, x3 + lsl x0, x0, #2 + ldr q7, [x10, x0] + madd x0, x16, x19, x3 + lsl x0, x0, #2 + ldr q6, [x10, x0] + madd x0, x17, x19, x3 + lsl x0, x0, #2 + ldr q5, [x10, x0] + madd x0, x18, x19, x3 + mov x3, x11 + lsl x0, x0, #2 + ldr q4, [x10, x0] + cmp xzr, x23 + b.ge .LBB0_70 + .p2align 2 +.LBB0_69: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_67 Depth=2 + // => This Inner Loop Header: Depth=3 + add x0, x3, #32 + fmla v0.4s, v16.4s, v7.s[0] + fmla v2.4s, v16.4s, v6.s[0] + add x2, x2, #4 + fmla v1.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, v4.s[0] + prfm pldl1keep, [x0] + add x0, x4, x20 + ldp q16, q17, [x3, #-32] + fmla v0.4s, v16.4s, v7.s[1] + fmla v2.4s, v16.4s, v6.s[1] + fmla v1.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + fmla v0.4s, v17.4s, v7.s[2] + fmla v2.4s, v17.4s, v6.s[2] + fmla v1.4s, v17.4s, v5.s[2] + fmla v3.4s, v17.4s, v4.s[2] + ldp q17, q16, [x3], #64 + prfm pldl1keep, [x4] + fmla v0.4s, v17.4s, v7.s[3] + ldur q7, [x4, #-16] + prfm pldl1keep, [x0] + fmla v2.4s, v17.4s, v6.s[3] + ldur q6, [x0, #-16] + add x0, x0, x20 + fmla v1.4s, v17.4s, v5.s[3] + fmla v3.4s, v17.4s, v4.s[3] + add x4, x4, #16 + prfm pldl1keep, [x0] + ldur q5, [x0, #-16] + add x0, x0, x20 + prfm pldl1keep, [x0] + ldur q4, [x0, #-16] + cmp x2, x23 + b.lt .LBB0_69 +.LBB0_70: // in Loop: Header=BB0_67 Depth=2 + ldr x10, [sp, #912] // 8-byte Folded Reload + fmla v0.4s, v16.4s, v7.s[0] + fmla v2.4s, v16.4s, v6.s[0] + ldr x4, [sp, #1032] // 8-byte Folded Reload + fmla v1.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, v4.s[0] + mov x2, x13 + mov x3, x12 + ldr q17, [x8, x10, lsl #4] + ldr x10, [sp, #904] // 8-byte Folded Reload + fmla v0.4s, v17.4s, v7.s[1] + ldr q16, [x8, x10, lsl #4] + ldr x10, [sp, #896] // 8-byte Folded Reload + fmla v2.4s, v17.4s, v6.s[1] + fmla v1.4s, v17.4s, v5.s[1] + fmla 
v3.4s, v17.4s, v4.s[1] + ldr q18, [x8, x10, lsl #4] + fmla v0.4s, v16.4s, v7.s[2] + fmla v2.4s, v16.4s, v6.s[2] + fmla v1.4s, v16.4s, v5.s[2] + fmla v3.4s, v16.4s, v4.s[2] + fmla v0.4s, v18.4s, v7.s[3] + fmla v2.4s, v18.4s, v6.s[3] + fmla v1.4s, v18.4s, v5.s[3] + fmla v3.4s, v18.4s, v4.s[3] + cmp x4, x21 + b.ge .LBB0_66 + .p2align 2 +.LBB0_71: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_67 Depth=2 + // => This Inner Loop Header: Depth=3 + add x0, x2, x20 + prfm pldl1keep, [x2] + ldur s4, [x2, #-4] + add x4, x4, #1 + prfm pldl1keep, [x0] + ldur s5, [x0, #-4] + add x0, x0, x20 + add x2, x2, #4 + prfm pldl1keep, [x0] + ldur s6, [x0, #-4] + add x0, x0, x20 + prfm pldl1keep, [x0] + ldur s7, [x0, #-4] + prfm pldl1keep, [x3] + ldur q16, [x3, #-16] + add x3, x3, #16 + fmla v0.4s, v16.4s, v4.s[0] + fmla v2.4s, v16.4s, v5.s[0] + fmla v1.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v7.s[0] + cmp x4, x21 + b.lt .LBB0_71 + b .LBB0_66 + .p2align 2 +.LBB0_72: // in Loop: Header=BB0_4 Depth=1 + ldr x13, [sp, #1024] // 8-byte Folded Reload + ldr x14, [sp, #920] // 8-byte Folded Reload + cmp x13, x14 + b.ge .LBB0_78 +// %bb.73: // in Loop: Header=BB0_4 Depth=1 + ldr x17, [sp, #1024] // 8-byte Folded Reload + ldr x18, [sp, #848] // 8-byte Folded Reload + mov x15, xzr + add x16, x17, #1 + madd x13, x17, x29, x9 + madd x17, x17, x19, x18 + ldr x10, [sp, #888] // 8-byte Folded Reload + ldr q4, [x8] + madd x14, x16, x29, x9 + madd x16, x16, x19, x18 + ldr x18, [sp, #624] // 8-byte Folded Reload + add x13, x13, x5 + lsl x17, x17, #2 + add x14, x14, x5 + add x13, x24, x13, lsl #2 + lsl x16, x16, #2 + ldr q3, [x10, x17] + ldr x17, [sp, #616] // 8-byte Folded Reload + add x14, x24, x14, lsl #2 + ldr q2, [x10, x16] + mov x16, x11 + ldr q0, [x13] + ldr q1, [x14] + cmp xzr, x23 + b.ge .LBB0_75 + .p2align 2 +.LBB0_74: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x4, x16, #32 + fmla v0.4s, v4.4s, v3.s[0] + fmla v1.4s, v4.4s, v2.s[0] + add x0, x18, x27 + prfm 
pldl1keep, [x4] + ldp q4, q5, [x16, #-32] + add x2, x17, x27 + add x1, x0, #32 + add x3, x2, #32 + add x15, x15, #4 + add x18, x18, #16 + add x17, x17, #16 + fmla v0.4s, v4.4s, v3.s[1] + fmla v1.4s, v4.4s, v2.s[1] + fmla v0.4s, v5.4s, v3.s[2] + fmla v1.4s, v5.4s, v2.s[2] + ldp q5, q4, [x16], #64 + prfm pldl1keep, [x3] + fmla v0.4s, v5.4s, v3.s[3] + ldr q3, [x2, #16] + prfm pldl1keep, [x1] + fmla v1.4s, v5.4s, v2.s[3] + ldr q2, [x0, #16] + cmp x15, x23 + b.lt .LBB0_74 +.LBB0_75: // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #912] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v3.s[0] + fmla v1.4s, v4.4s, v2.s[0] + ldr x15, [sp, #880] // 8-byte Folded Reload + ldr x16, [sp, #1032] // 8-byte Folded Reload + ldr q5, [x8, x10, lsl #4] + ldr x10, [sp, #904] // 8-byte Folded Reload + ldr q4, [x8, x10, lsl #4] + ldr x10, [sp, #896] // 8-byte Folded Reload + fmla v0.4s, v5.4s, v3.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldr q5, [x8, x10, lsl #4] + fmla v0.4s, v4.4s, v3.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v5.4s, v3.s[3] + fmla v1.4s, v5.4s, v2.s[3] + cmp x16, x21 + b.ge .LBB0_77 + .p2align 2 +.LBB0_76: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x17, x15, x7 + add x18, x15, x6 + add x16, x16, #1 + add x17, x17, #4 + add x18, x18, #4 + prfm pldl1keep, [x18] + ldr s2, [x15, x6] + prfm pldl1keep, [x17] + ldr s3, [x15, x7] + prfm pldl1keep, [x12] + ldur q4, [x12, #-16] + add x12, x12, #16 + add x15, x15, #4 + fmla v0.4s, v4.4s, v2.s[0] + fmla v1.4s, v4.4s, v3.s[0] + cmp x16, x21 + b.lt .LBB0_76 +.LBB0_77: // in Loop: Header=BB0_4 Depth=1 + str q0, [x13] + str q1, [x14] +.LBB0_78: // in Loop: Header=BB0_4 Depth=1 + ldr x12, [sp, #744] // 8-byte Folded Reload + ldr x13, [sp, #920] // 8-byte Folded Reload + cmp x13, x12 + b.ge .LBB0_84 +// %bb.79: // in Loop: Header=BB0_4 Depth=1 + ldr x13, [sp, #920] // 8-byte Folded Reload + ldr x10, [sp, #848] // 8-byte Folded Reload + mov x12, xzr + madd x9, x13, x29, x9 + madd x10, x13, x19, x10 + ldr 
x13, [sp, #888] // 8-byte Folded Reload + ldr q2, [x8] + ldr x14, [sp, #632] // 8-byte Folded Reload + ldr x15, [sp, #880] // 8-byte Folded Reload + add x9, x9, x5 + lsl x10, x10, #2 + add x9, x24, x9, lsl #2 + ldr q1, [x13, x10] + ldr x10, [sp, #640] // 8-byte Folded Reload + ldr q0, [x9] + cmp xzr, x23 + b.ge .LBB0_81 + .p2align 2 +.LBB0_80: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x11, #32 + fmla v0.4s, v2.4s, v1.s[0] + add x12, x12, #4 + prfm pldl1keep, [x13] + ldp q2, q3, [x11, #-32] + fmla v0.4s, v2.4s, v1.s[1] + fmla v0.4s, v3.4s, v1.s[2] + ldp q3, q2, [x11], #64 + prfm pldl1keep, [x10] + fmla v0.4s, v3.4s, v1.s[3] + ldur q1, [x10, #-16] + add x10, x10, #16 + cmp x12, x23 + b.lt .LBB0_80 +.LBB0_81: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #912] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[0] + mov x10, xzr + mov w12, #16 // =0x10 + ldr q3, [x8, x11, lsl #4] + ldr x11, [sp, #904] // 8-byte Folded Reload + fmla v0.4s, v3.4s, v1.s[1] + ldr q2, [x8, x11, lsl #4] + ldr x11, [sp, #896] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[2] + ldr q3, [x8, x11, lsl #4] + ldr x11, [sp, #24] // 8-byte Folded Reload + add x8, x8, x11 + ldr x11, [sp, #560] // 8-byte Folded Reload + fmla v0.4s, v3.4s, v1.s[3] + ldr x13, [sp, #1032] // 8-byte Folded Reload + add x13, x13, xzr + cmp x13, x21 + b.ge .LBB0_83 + .p2align 2 +.LBB0_82: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x15, x11 + add x11, x11, #4 + prfm pldl1keep, [x13] + add x13, x8, x12 + ldr s1, [x14, x10, lsl #2] + add x12, x12, #16 + prfm pldl1keep, [x13] + ldr q2, [x8, x10, lsl #4] + add x10, x10, #1 + fmla v0.4s, v2.4s, v1.s[0] + ldr x13, [sp, #1032] // 8-byte Folded Reload + add x13, x13, x10 + cmp x13, x21 + b.lt .LBB0_82 +.LBB0_83: // in Loop: Header=BB0_4 Depth=1 + str q0, [x9] +.LBB0_84: // in Loop: Header=BB0_4 Depth=1 + ldr x0, [sp, #120] // 8-byte Folded Reload + bl free + ldr x4, [sp, #600] // 8-byte Folded Reload + 
ldr x8, [sp, #648] // 8-byte Folded Reload + cmp x4, x8 + b.ge .LBB0_34 +.LBB0_85: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #72] // 8-byte Folded Reload + mov x24, x4 + add x0, x8, #64 + bl malloc + ldr x8, [sp, #736] // 8-byte Folded Reload + ldr x11, [sp, #776] // 8-byte Folded Reload + mov x12, xzr + mul x9, x11, x8 + ldr x8, [sp, #752] // 8-byte Folded Reload + ldr x18, [sp, #520] // 8-byte Folded Reload + str x0, [sp, #848] // 8-byte Folded Spill + add x13, x18, x29 + ldp x1, x2, [sp, #328] // 16-byte Folded Reload + ldp x3, x4, [sp, #344] // 16-byte Folded Reload + ldr x5, [sp, #872] // 8-byte Folded Reload + ldr x6, [sp, #784] // 8-byte Folded Reload + ldr x7, [sp, #800] // 8-byte Folded Reload + mul x10, x11, x8 + add x8, x0, #63 + add x14, x9, x24 + lsl x16, x14, #2 + add x17, x14, x29 + add x18, x14, x18 + add x13, x14, x13 + and x8, x8, #0xffffffffffffffc0 + lsl x13, x13, #2 + str x10, [sp, #112] // 8-byte Folded Spill + add x15, x10, x24 + ldr x10, [sp, #760] // 8-byte Folded Reload + lsl x14, x15, #2 + lsl x15, x17, #2 + mul x0, x11, x10 + ldr x10, [sp, #856] // 8-byte Folded Reload + ldr x11, [sp, #840] // 8-byte Folded Reload + ldr d0, [x10, x16] + lsl x16, x18, #2 + ldr x18, [sp, #792] // 8-byte Folded Reload + ldr d2, [x10, x15] + ldr d3, [x10, x13] + lsl x13, x0, #2 + ldr d7, [x11, x14] + str x0, [sp, #120] // 8-byte Folded Spill + ldp x14, x15, [sp, #296] // 16-byte Folded Reload + ldr d1, [x10, x16] + ldr x10, [sp, #888] // 8-byte Folded Reload + ldp x16, x17, [sp, #312] // 16-byte Folded Reload + ldr q4, [x10, x13] + add x13, x0, x19 + lsl x13, x13, #2 + ldr q5, [x10, x13] + add x13, x0, x19, lsl #1 + lsl x13, x13, #2 + ldr q6, [x10, x13] + orr x13, x8, #0x10 + .p2align 2 +.LBB0_86: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x24, x18, x27 + fmla v0.2s, v7.2s, v4.s[0] + fmla v2.2s, v7.2s, v5.s[0] + cmp x12, x23 + prfm pldl1keep, [x24, #16] + ldr q16, [x24] + b.ge .LBB0_88 +// %bb.87: // in Loop: 
Header=BB0_86 Depth=2 + ldr x10, [sp, #864] // 8-byte Folded Reload + stur d7, [x13, #-16] + fmla v1.2s, v7.2s, v6.s[0] + fmla v3.2s, v7.2s, v16.s[0] + add x25, x6, x27 + add x26, x5, x27 + add x11, x25, #32 + add x0, x26, #32 + add x12, x12, #4 + add x6, x6, #16 + add x5, x5, #16 + add x18, x18, #16 + add x24, x1, x10 + add x28, x4, x10 + prfm pldl1keep, [x24] + add x24, x2, x10 + ldr d17, [x16, x10] + stur d17, [x13, #-8] + prfm pldl1keep, [x24] + ldr d18, [x15, x10] + add x24, x3, x10 + fmla v0.2s, v17.2s, v4.s[1] + fmla v2.2s, v17.2s, v5.s[1] + fmla v1.2s, v17.2s, v6.s[1] + fmla v3.2s, v17.2s, v16.s[1] + str d18, [x13] + prfm pldl1keep, [x24] + ldr d19, [x14, x10] + fmla v0.2s, v18.2s, v4.s[2] + fmla v2.2s, v18.2s, v5.s[2] + fmla v1.2s, v18.2s, v6.s[2] + add x24, x7, x27 + fmla v3.2s, v18.2s, v16.s[2] + add x7, x7, #16 + add x30, x24, #32 + str d19, [x13, #8] + prfm pldl1keep, [x28] + ldr d7, [x17, x10] + fmla v0.2s, v19.2s, v4.s[3] + fmla v2.2s, v19.2s, v5.s[3] + fmla v1.2s, v19.2s, v6.s[3] + prfm pldl1keep, [x0] + ldr q4, [x26, #16] + prfm pldl1keep, [x11] + ldr q5, [x25, #16] + prfm pldl1keep, [x30] + ldr x10, [sp, #1016] // 8-byte Folded Reload + ldr q6, [x24, #16] + fmla v3.2s, v19.2s, v16.s[3] + add x13, x13, #32 + add x4, x4, x10 + add x3, x3, x10 + add x2, x2, x10 + add x1, x1, x10 + add x17, x17, x10 + add x16, x16, x10 + add x15, x15, x10 + add x14, x14, x10 + b .LBB0_86 + .p2align 2 +.LBB0_88: // in Loop: Header=BB0_4 Depth=1 + ldr x13, [sp, #728] // 8-byte Folded Reload + ldr x10, [sp, #912] // 8-byte Folded Reload + fmla v1.2s, v7.2s, v6.s[0] + fmla v3.2s, v7.2s, v16.s[0] + ldp x15, x6, [sp, #112] // 16-byte Folded Reload + ldr x4, [sp, #600] // 8-byte Folded Reload + mov x12, xzr + madd x11, x10, x13, x15 + ldr x14, [sp, #840] // 8-byte Folded Reload + str d7, [x8, x23, lsl #3] + ldr x16, [sp, #824] // 8-byte Folded Reload + ldr x18, [sp, #880] // 8-byte Folded Reload + ldr x5, [sp, #856] // 8-byte Folded Reload + add x11, x11, x4 + lsl x11, x11, 
#2 + ldr d7, [x14, x11] + fmla v0.2s, v7.2s, v4.s[1] + str d7, [x8, x10, lsl #3] + ldr x10, [sp, #904] // 8-byte Folded Reload + fmla v2.2s, v7.2s, v5.s[1] + fmla v1.2s, v7.2s, v6.s[1] + fmla v3.2s, v7.2s, v16.s[1] + madd x11, x10, x13, x15 + add x11, x11, x4 + lsl x11, x11, #2 + ldr d17, [x14, x11] + fmla v0.2s, v17.2s, v4.s[2] + str d17, [x8, x10, lsl #3] + ldr x10, [sp, #896] // 8-byte Folded Reload + fmla v2.2s, v17.2s, v5.s[2] + fmla v1.2s, v17.2s, v6.s[2] + fmla v3.2s, v17.2s, v16.s[2] + madd x11, x10, x13, x15 + ldr x13, [sp, #1032] // 8-byte Folded Reload + add x11, x11, x4 + lsl x11, x11, #2 + ldr d7, [x14, x11] + ldr x11, [sp, #512] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v4.s[3] + fmla v2.2s, v7.2s, v5.s[3] + fmla v1.2s, v7.2s, v6.s[3] + fmla v3.2s, v7.2s, v16.s[3] + str d7, [x8, x10, lsl #3] + ldp x17, x10, [sp, #280] // 16-byte Folded Reload + cmp x13, x21 + b.ge .LBB0_90 + .p2align 2 +.LBB0_89: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x15, x18, x11 + add x14, x10, x12 + add x11, x11, #4 + prfm pldl1keep, [x15] + ldur s4, [x15, #-4] + add x15, x15, x20 + prfm pldl1keep, [x15] + ldur s5, [x15, #-4] + add x15, x15, x20 + prfm pldl1keep, [x15] + ldur s6, [x15, #-4] + add x15, x15, x20 + prfm pldl1keep, [x15] + ldur s7, [x15, #-4] + prfm pldl1keep, [x14] + ldr d16, [x17, x12] + add x12, x12, x16 + fmla v0.2s, v16.2s, v4.s[0] + str d16, [x8, x13, lsl #3] + add x13, x13, #1 + fmla v2.2s, v16.2s, v5.s[0] + fmla v1.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v7.s[0] + cmp x13, x21 + b.lt .LBB0_89 +.LBB0_90: // %.preheader50 + // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #32] // 8-byte Folded Reload + ldr x13, [sp, #808] // 8-byte Folded Reload + mov x2, xzr + add x11, x8, #24 + ldr x14, [sp, #816] // 8-byte Folded Reload + mov w16, #1 // =0x1 + mov w17, #2 // =0x2 + mov w18, #3 // =0x3 + mov w15, #4 // =0x4 + add x12, x8, x10 + b .LBB0_92 + .p2align 2 +.LBB0_91: // %.loopexit46 + // in Loop: Header=BB0_92 Depth=2 + ldr 
x10, [sp, #1008] // 8-byte Folded Reload + ldr x4, [sp, #600] // 8-byte Folded Reload + mov x2, x15 + mov x15, x1 + add x14, x14, x10 + add x13, x13, x10 +.LBB0_92: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_94 Depth 3 + // Child Loop BB0_96 Depth 3 + madd x0, x2, x29, x9 + add x0, x0, x4 + madd x16, x16, x29, x9 + madd x17, x17, x29, x9 + madd x18, x18, x29, x9 + add x16, x16, x4 + add x17, x17, x4 + lsl x0, x0, #2 + lsl x16, x16, #2 + lsl x17, x17, #2 + str d0, [x5, x0] + str d2, [x5, x16] + add x16, x18, x4 + lsl x16, x16, #2 + str d1, [x5, x17] + str d3, [x5, x16] + ldr x16, [sp, #1024] // 8-byte Folded Reload + cmp x15, x16 + b.ge .LBB0_97 +// %bb.93: // in Loop: Header=BB0_92 Depth=2 + madd x0, x15, x29, x9 + add x18, x15, #3 + ldr x10, [sp, #888] // 8-byte Folded Reload + add x16, x15, #1 + add x17, x15, #2 + madd x1, x16, x29, x9 + ldr d16, [x8] + mov x2, xzr + madd x3, x17, x29, x9 + add x0, x0, x4 + lsl x0, x0, #2 + add x1, x1, x4 + add x3, x3, x4 + ldr d0, [x5, x0] + madd x0, x18, x29, x9 + lsl x1, x1, #2 + lsl x3, x3, #2 + ldr d2, [x5, x1] + ldr d1, [x5, x3] + add x1, x15, #4 + mov x3, x11 + add x0, x0, x4 + mov x4, x14 + lsl x0, x0, #2 + ldr d3, [x5, x0] + madd x0, x15, x19, x6 + lsl x0, x0, #2 + ldr q7, [x10, x0] + madd x0, x16, x19, x6 + lsl x0, x0, #2 + ldr q6, [x10, x0] + madd x0, x17, x19, x6 + lsl x0, x0, #2 + ldr q5, [x10, x0] + madd x0, x18, x19, x6 + lsl x0, x0, #2 + ldr q4, [x10, x0] + cmp xzr, x23 + b.ge .LBB0_95 + .p2align 2 +.LBB0_94: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_92 Depth=2 + // => This Inner Loop Header: Depth=3 + add x0, x3, #16 + fmla v0.2s, v16.2s, v7.s[0] + fmla v2.2s, v16.2s, v6.s[0] + add x2, x2, #4 + fmla v1.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + prfm pldl1keep, [x0] + add x0, x4, x20 + ldp d16, d17, [x3, #-16] + fmla v0.2s, v16.2s, v7.s[1] + fmla v2.2s, v16.2s, v6.s[1] + fmla v1.2s, v16.2s, v5.s[1] + fmla v3.2s, v16.2s, v4.s[1] + fmla v0.2s, v17.2s, v7.s[2] 
+ fmla v2.2s, v17.2s, v6.s[2] + fmla v1.2s, v17.2s, v5.s[2] + fmla v3.2s, v17.2s, v4.s[2] + ldp d17, d16, [x3], #32 + prfm pldl1keep, [x4] + fmla v0.2s, v17.2s, v7.s[3] + ldur q7, [x4, #-16] + prfm pldl1keep, [x0] + fmla v2.2s, v17.2s, v6.s[3] + ldur q6, [x0, #-16] + add x0, x0, x20 + fmla v1.2s, v17.2s, v5.s[3] + fmla v3.2s, v17.2s, v4.s[3] + add x4, x4, #16 + prfm pldl1keep, [x0] + ldur q5, [x0, #-16] + add x0, x0, x20 + prfm pldl1keep, [x0] + ldur q4, [x0, #-16] + cmp x2, x23 + b.lt .LBB0_94 +.LBB0_95: // in Loop: Header=BB0_92 Depth=2 + ldr x10, [sp, #912] // 8-byte Folded Reload + fmla v0.2s, v16.2s, v7.s[0] + fmla v2.2s, v16.2s, v6.s[0] + ldr x4, [sp, #1032] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + mov x2, x13 + mov x3, x12 + ldr d17, [x8, x10, lsl #3] + ldr x10, [sp, #904] // 8-byte Folded Reload + fmla v0.2s, v17.2s, v7.s[1] + ldr d16, [x8, x10, lsl #3] + ldr x10, [sp, #896] // 8-byte Folded Reload + fmla v2.2s, v17.2s, v6.s[1] + fmla v1.2s, v17.2s, v5.s[1] + fmla v3.2s, v17.2s, v4.s[1] + ldr d18, [x8, x10, lsl #3] + fmla v0.2s, v16.2s, v7.s[2] + fmla v2.2s, v16.2s, v6.s[2] + fmla v1.2s, v16.2s, v5.s[2] + fmla v3.2s, v16.2s, v4.s[2] + fmla v0.2s, v18.2s, v7.s[3] + fmla v2.2s, v18.2s, v6.s[3] + fmla v1.2s, v18.2s, v5.s[3] + fmla v3.2s, v18.2s, v4.s[3] + cmp x4, x21 + b.ge .LBB0_91 + .p2align 2 +.LBB0_96: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_92 Depth=2 + // => This Inner Loop Header: Depth=3 + add x0, x2, x20 + prfm pldl1keep, [x2] + ldur s4, [x2, #-4] + add x4, x4, #1 + prfm pldl1keep, [x0] + ldur s5, [x0, #-4] + add x0, x0, x20 + add x2, x2, #4 + prfm pldl1keep, [x0] + ldur s6, [x0, #-4] + add x0, x0, x20 + prfm pldl1keep, [x0] + ldur s7, [x0, #-4] + prfm pldl1keep, [x3] + ldur d16, [x3, #-8] + add x3, x3, #8 + fmla v0.2s, v16.2s, v4.s[0] + fmla v2.2s, v16.2s, v5.s[0] + fmla v1.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v7.s[0] + cmp x4, x21 + b.lt .LBB0_96 + b .LBB0_91 + .p2align 2 +.LBB0_97: // in 
Loop: Header=BB0_4 Depth=1 + ldr x12, [sp, #1024] // 8-byte Folded Reload + ldr x13, [sp, #920] // 8-byte Folded Reload + cmp x12, x13 + b.ge .LBB0_103 +// %bb.98: // in Loop: Header=BB0_4 Depth=1 + ldr x16, [sp, #1024] // 8-byte Folded Reload + ldr x10, [sp, #888] // 8-byte Folded Reload + mov x14, xzr + add x15, x16, #1 + madd x12, x16, x29, x9 + madd x16, x16, x19, x6 + ldr d4, [x8] + ldr x17, [sp, #624] // 8-byte Folded Reload + madd x13, x15, x29, x9 + madd x15, x15, x19, x6 + add x12, x12, x4 + lsl x16, x16, #2 + add x13, x13, x4 + add x12, x5, x12, lsl #2 + lsl x15, x15, #2 + add x13, x5, x13, lsl #2 + ldr q3, [x10, x16] + ldr q2, [x10, x15] + ldr x16, [sp, #616] // 8-byte Folded Reload + mov x15, x11 + ldr d0, [x12] + ldr d1, [x13] + cmp xzr, x23 + b.ge .LBB0_100 + .p2align 2 +.LBB0_99: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x3, x15, #16 + fmla v0.2s, v4.2s, v3.s[0] + fmla v1.2s, v4.2s, v2.s[0] + add x18, x17, x27 + prfm pldl1keep, [x3] + ldp d4, d5, [x15, #-16] + add x1, x16, x27 + add x0, x18, #32 + add x2, x1, #32 + add x14, x14, #4 + add x17, x17, #16 + add x16, x16, #16 + fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v5.2s, v3.s[2] + fmla v1.2s, v5.2s, v2.s[2] + ldp d5, d4, [x15], #32 + prfm pldl1keep, [x2] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x1, #16] + prfm pldl1keep, [x0] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x18, #16] + cmp x14, x23 + b.lt .LBB0_99 +.LBB0_100: // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #912] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[0] + fmla v1.2s, v4.2s, v2.s[0] + ldr x1, [sp, #544] // 8-byte Folded Reload + mov x14, xzr + mov x15, xzr + ldr d5, [x8, x10, lsl #3] + ldr x10, [sp, #904] // 8-byte Folded Reload + ldr d4, [x8, x10, lsl #3] + ldr x10, [sp, #896] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v3.s[1] + fmla v1.2s, v5.2s, v2.s[1] + ldr d5, [x8, x10, lsl #3] + ldr x10, [sp, #56] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[2] + fmla v1.2s, 
v4.2s, v2.s[2] + add x16, x8, x10 + fmla v0.2s, v5.2s, v3.s[3] + fmla v1.2s, v5.2s, v2.s[3] + ldr x10, [sp, #1032] // 8-byte Folded Reload + add x17, x10, xzr + cmp x17, x21 + b.ge .LBB0_102 + .p2align 2 +.LBB0_101: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + ldr x10, [sp, #656] // 8-byte Folded Reload + add x17, x16, x15, lsl #3 + add x18, x1, x14 + add x18, x18, #4 + add x17, x17, #8 + add x0, x10, x14 + add x14, x14, #4 + add x0, x0, #4 + prfm pldl1keep, [x0] + ldr s2, [x10, x15, lsl #2] + prfm pldl1keep, [x18] + ldr s3, [x1, x15, lsl #2] + prfm pldl1keep, [x17] + ldr d4, [x16, x15, lsl #3] + add x15, x15, #1 + fmla v0.2s, v4.2s, v2.s[0] + fmla v1.2s, v4.2s, v3.s[0] + ldr x10, [sp, #1032] // 8-byte Folded Reload + add x17, x10, x15 + cmp x17, x21 + b.lt .LBB0_101 +.LBB0_102: // in Loop: Header=BB0_4 Depth=1 + str d0, [x12] + str d1, [x13] +.LBB0_103: // in Loop: Header=BB0_4 Depth=1 + ldr x12, [sp, #744] // 8-byte Folded Reload + ldr x13, [sp, #920] // 8-byte Folded Reload + cmp x13, x12 + b.ge .LBB0_109 +// %bb.104: // in Loop: Header=BB0_4 Depth=1 + ldr x13, [sp, #920] // 8-byte Folded Reload + ldr d2, [x8] + mov x12, xzr + madd x9, x13, x29, x9 + madd x10, x13, x19, x6 + ldr x13, [sp, #888] // 8-byte Folded Reload + ldr x14, [sp, #632] // 8-byte Folded Reload + ldr x15, [sp, #880] // 8-byte Folded Reload + add x9, x9, x4 + lsl x10, x10, #2 + add x9, x5, x9, lsl #2 + ldr d0, [x9] + ldr q1, [x13, x10] + ldr x10, [sp, #640] // 8-byte Folded Reload + cmp xzr, x23 + b.ge .LBB0_106 + .p2align 2 +.LBB0_105: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x11, #16 + fmla v0.2s, v2.2s, v1.s[0] + add x12, x12, #4 + prfm pldl1keep, [x13] + ldp d2, d3, [x11, #-16] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v3.2s, v1.s[2] + ldp d3, d2, [x11], #32 + prfm pldl1keep, [x10] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x10, #-16] + add x10, x10, #16 + cmp x12, x23 + b.lt .LBB0_105 +.LBB0_106: // in Loop: Header=BB0_4 
Depth=1 + ldr x11, [sp, #912] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[0] + mov x10, xzr + ldr d3, [x8, x11, lsl #3] + ldr x11, [sp, #904] // 8-byte Folded Reload + fmla v0.2s, v3.2s, v1.s[1] + ldr d4, [x8, x11, lsl #3] + ldr x11, [sp, #896] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[2] + ldr d2, [x8, x11, lsl #3] + ldr x11, [sp, #56] // 8-byte Folded Reload + add x8, x8, x11 + ldr x11, [sp, #560] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[3] + ldr x12, [sp, #1032] // 8-byte Folded Reload + add x12, x12, xzr + cmp x12, x21 + b.ge .LBB0_108 + .p2align 2 +.LBB0_107: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x8, x10, lsl #3 + add x13, x15, x11 + add x11, x11, #4 + prfm pldl1keep, [x13] + ldr s1, [x14, x10, lsl #2] + add x12, x12, #8 + prfm pldl1keep, [x12] + ldr d2, [x8, x10, lsl #3] + add x10, x10, #1 + fmla v0.2s, v2.2s, v1.s[0] + ldr x12, [sp, #1032] // 8-byte Folded Reload + add x12, x12, x10 + cmp x12, x21 + b.lt .LBB0_107 +.LBB0_108: // in Loop: Header=BB0_4 Depth=1 + str d0, [x9] +.LBB0_109: // in Loop: Header=BB0_4 Depth=1 + ldr x0, [sp, #848] // 8-byte Folded Reload + bl free + ldr x8, [sp, #144] // 8-byte Folded Reload + ldr x9, [sp, #648] // 8-byte Folded Reload + cmp x9, x8 + b.ge .LBB0_3 +.LBB0_110: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #96] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #736] // 8-byte Folded Reload + ldr x17, [sp, #776] // 8-byte Folded Reload + add x10, x0, #63 + mov x12, xzr + ldr x15, [sp, #520] // 8-byte Folded Reload + ldr x16, [sp, #648] // 8-byte Folded Reload + mov x13, xzr + mul x9, x17, x8 + ldr x24, [sp, #856] // 8-byte Folded Reload + add x8, x15, x29 + ldp x1, x2, [sp, #248] // 16-byte Folded Reload + ldp x3, x4, [sp, #264] // 16-byte Folded Reload + str x0, [sp, #120] // 8-byte Folded Spill + add x11, x9, x16 + add x8, x11, x8 + add x14, x11, x29 + add x15, x11, x15 + ldr s2, [x24, x11, lsl #2] + ldr x11, [sp, #840] // 8-byte 
Folded Reload + ldr s0, [x24, x8, lsl #2] + and x8, x10, #0xffffffffffffffc0 + ldr x10, [sp, #752] // 8-byte Folded Reload + ldr s3, [x24, x14, lsl #2] + ldr s1, [x24, x15, lsl #2] + mul x10, x17, x10 + str x10, [sp, #520] // 8-byte Folded Spill + add x10, x10, x16 + ldp x15, x16, [sp, #216] // 16-byte Folded Reload + ldr s7, [x11, x10, lsl #2] + ldr x10, [sp, #760] // 8-byte Folded Reload + mul x11, x17, x10 + ldr x10, [sp, #888] // 8-byte Folded Reload + lsl x14, x11, #2 + ldp x17, x18, [sp, #232] // 16-byte Folded Reload + str x11, [sp, #776] // 8-byte Folded Spill + ldr q4, [x10, x14] + add x14, x11, x19 + lsl x14, x14, #2 + ldr q5, [x10, x14] + add x14, x11, x19, lsl #1 + lsl x14, x14, #2 + ldr q6, [x10, x14] + orr x10, x8, #0xc + str x10, [sp, #848] // 8-byte Folded Spill + .p2align 2 +.LBB0_111: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + ldr x10, [sp, #608] // 8-byte Folded Reload + ext v20.16b, v4.16b, v4.16b, #8 + cmp x13, x23 + ext v19.16b, v5.16b, v5.16b, #8 + add x5, x10, x12 + prfm pldl1keep, [x5, #16] + ldr q16, [x5] + ext v18.16b, v6.16b, v6.16b, #8 + ext v17.16b, v16.16b, v16.16b, #8 + b.ge .LBB0_113 +// %bb.112: // in Loop: Header=BB0_111 Depth=2 + ldr x10, [sp, #592] // 8-byte Folded Reload + ldr x14, [sp, #864] // 8-byte Folded Reload + fmla v2.2s, v7.2s, v4.2s + fmla v3.2s, v7.2s, v5.2s + fmla v1.2s, v7.2s, v6.2s + fmla v0.2s, v7.2s, v16.2s + add x13, x13, #4 + add x5, x10, x12 + ldr x10, [sp, #584] // 8-byte Folded Reload + add x0, x1, x14 + add x11, x2, x14 + add x24, x3, x14 + add x30, x4, x14 + add x6, x5, #32 + add x7, x10, x12 + ldr x10, [sp, #880] // 8-byte Folded Reload + add x25, x7, #32 + add x26, x10, x12 + ldr x10, [sp, #848] // 8-byte Folded Reload + add x28, x26, #32 + add x10, x10, x12 + add x12, x12, #16 + stur s7, [x10, #-12] + prfm pldl1keep, [x0] + ldr s7, [x17, x14] + fmla v2.2s, v7.2s, v4.s[1] + fmla v3.2s, v7.2s, v5.s[1] + fmla v1.2s, v7.2s, v6.s[1] + fmla v0.2s, v7.2s, v16.s[1] + stur s7, [x10, 
#-8] + prfm pldl1keep, [x11] + ldr s7, [x16, x14] + fmla v1.2s, v7.2s, v18.2s + stur s7, [x10, #-4] + prfm pldl1keep, [x24] + ldr s18, [x15, x14] + fmla v2.2s, v7.2s, v20.2s + fmla v3.2s, v7.2s, v19.2s + ldr x24, [sp, #856] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v17.2s + str s18, [x10] + prfm pldl1keep, [x30] + ldr s7, [x18, x14] + fmla v2.2s, v18.2s, v4.s[3] + fmla v3.2s, v18.2s, v5.s[3] + fmla v1.2s, v18.2s, v6.s[3] + prfm pldl1keep, [x28] + ldr q4, [x26, #16] + prfm pldl1keep, [x25] + ldr q5, [x7, #16] + prfm pldl1keep, [x6] + ldr x10, [sp, #1016] // 8-byte Folded Reload + ldr q6, [x5, #16] + fmla v0.2s, v18.2s, v16.s[3] + add x4, x4, x10 + add x3, x3, x10 + add x2, x2, x10 + add x1, x1, x10 + add x18, x18, x10 + add x17, x17, x10 + add x16, x16, x10 + add x15, x15, x10 + b .LBB0_111 + .p2align 2 +.LBB0_113: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #728] // 8-byte Folded Reload + ldr x13, [sp, #912] // 8-byte Folded Reload + fmla v2.2s, v7.2s, v4.2s + fmla v3.2s, v7.2s, v5.2s + ldr x15, [sp, #520] // 8-byte Folded Reload + ldr x4, [sp, #648] // 8-byte Folded Reload + fmla v1.2s, v7.2s, v6.2s + fmla v0.2s, v7.2s, v16.2s + ldr x14, [sp, #840] // 8-byte Folded Reload + str s7, [x8, x23, lsl #2] + mov x12, xzr + ldr x16, [sp, #880] // 8-byte Folded Reload + ldp x18, x17, [sp, #200] // 16-byte Folded Reload + madd x10, x13, x11, x15 + ldr x5, [sp, #776] // 8-byte Folded Reload + add x10, x10, x4 + ldr s7, [x14, x10, lsl #2] + str s7, [x8, x13, lsl #2] + ldr x13, [sp, #904] // 8-byte Folded Reload + fmla v2.2s, v7.2s, v4.s[1] + fmla v3.2s, v7.2s, v5.s[1] + fmla v1.2s, v7.2s, v6.s[1] + fmla v0.2s, v7.2s, v16.s[1] + madd x10, x13, x11, x15 + add x10, x10, x4 + ldr s7, [x14, x10, lsl #2] + fmla v2.2s, v7.2s, v20.2s + str s7, [x8, x13, lsl #2] + ldr x13, [sp, #896] // 8-byte Folded Reload + fmla v3.2s, v7.2s, v19.2s + fmla v1.2s, v7.2s, v18.2s + fmla v0.2s, v7.2s, v17.2s + madd x10, x13, x11, x15 + ldr x11, [sp, #512] // 8-byte Folded Reload + ldr x15, [sp, 
#824] // 8-byte Folded Reload + add x10, x10, x4 + ldr s7, [x14, x10, lsl #2] + fmla v2.2s, v7.2s, v4.s[3] + fmla v3.2s, v7.2s, v5.s[3] + fmla v1.2s, v7.2s, v6.s[3] + fmla v0.2s, v7.2s, v16.s[3] + str s7, [x8, x13, lsl #2] + ldr x13, [sp, #1032] // 8-byte Folded Reload + cmp x13, x21 + b.ge .LBB0_115 + .p2align 2 +.LBB0_114: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x14, x16, x11 + add x10, x17, x12 + add x11, x11, #4 + prfm pldl1keep, [x14] + ldur s4, [x14, #-4] + add x14, x14, x20 + prfm pldl1keep, [x14] + ldur s5, [x14, #-4] + add x14, x14, x20 + prfm pldl1keep, [x14] + ldur s6, [x14, #-4] + add x14, x14, x20 + prfm pldl1keep, [x14] + ldur s7, [x14, #-4] + prfm pldl1keep, [x10] + ldr s16, [x18, x12] + add x12, x12, x15 + fmla v2.2s, v16.2s, v4.2s + str s16, [x8, x13, lsl #2] + add x13, x13, #1 + fmla v3.2s, v16.2s, v5.2s + fmla v1.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + cmp x13, x21 + b.lt .LBB0_114 +.LBB0_115: // %.preheader49 + // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #512] // 8-byte Folded Reload + ldr x13, [sp, #808] // 8-byte Folded Reload + mov x2, xzr + add x11, x8, #12 + ldr x14, [sp, #816] // 8-byte Folded Reload + mov w17, #1 // =0x1 + mov w18, #2 // =0x2 + mov w16, #3 // =0x3 + mov w15, #4 // =0x4 + add x12, x8, x10 + b .LBB0_117 + .p2align 2 +.LBB0_116: // %.loopexit45 + // in Loop: Header=BB0_117 Depth=2 + ldr x10, [sp, #1008] // 8-byte Folded Reload + ldr x4, [sp, #648] // 8-byte Folded Reload + mov x2, x15 + mov x15, x1 + add x14, x14, x10 + add x13, x13, x10 +.LBB0_117: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_119 Depth 3 + // Child Loop BB0_121 Depth 3 + madd x10, x2, x29, x9 + add x10, x10, x4 + madd x17, x17, x29, x9 + madd x18, x18, x29, x9 + add x17, x17, x4 + str s2, [x24, x10, lsl #2] + madd x10, x16, x29, x9 + add x16, x18, x4 + str s3, [x24, x17, lsl #2] + add x10, x10, x4 + str s1, [x24, x16, lsl #2] + str s0, [x24, x10, lsl #2] + ldr x10, [sp, 
#1024] // 8-byte Folded Reload + cmp x15, x10 + b.ge .LBB0_122 +// %bb.118: // in Loop: Header=BB0_117 Depth=2 + add x17, x15, #1 + madd x10, x15, x29, x9 + add x18, x15, #2 + add x16, x15, #3 + madd x0, x17, x29, x9 + ldr s16, [x8] + mov x2, xzr + add x10, x10, x4 + madd x1, x18, x29, x9 + madd x3, x16, x29, x9 + add x0, x0, x4 + add x1, x1, x4 + ldr s2, [x24, x10, lsl #2] + madd x10, x15, x19, x5 + add x3, x3, x4 + mov x4, x14 + ldr s3, [x24, x0, lsl #2] + ldr x0, [sp, #888] // 8-byte Folded Reload + ldr s0, [x24, x3, lsl #2] + ldr s1, [x24, x1, lsl #2] + add x1, x15, #4 + mov x3, x11 + lsl x10, x10, #2 + ldr q7, [x0, x10] + madd x10, x17, x19, x5 + lsl x10, x10, #2 + ldr q6, [x0, x10] + madd x10, x18, x19, x5 + lsl x10, x10, #2 + ldr q5, [x0, x10] + madd x10, x16, x19, x5 + lsl x10, x10, #2 + ldr q4, [x0, x10] + ext v20.16b, v7.16b, v7.16b, #8 + cmp xzr, x23 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.ge .LBB0_120 + .p2align 2 +.LBB0_119: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_117 Depth=2 + // => This Inner Loop Header: Depth=3 + add x10, x3, #8 + fmla v2.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + add x2, x2, #4 + fmla v1.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + prfm pldl1keep, [x10] + add x10, x4, x20 + ldp s16, s21, [x3, #-8] + fmla v0.2s, v16.2s, v4.s[1] + fmla v2.2s, v16.2s, v7.s[1] + fmla v3.2s, v16.2s, v6.s[1] + fmla v1.2s, v16.2s, v5.s[1] + fmla v0.2s, v21.2s, v17.2s + fmla v2.2s, v21.2s, v20.2s + ldp s17, s16, [x3], #16 + fmla v3.2s, v21.2s, v19.2s + fmla v1.2s, v21.2s, v18.2s + prfm pldl1keep, [x4] + fmla v2.2s, v17.2s, v7.s[3] + ldur q7, [x4, #-16] + prfm pldl1keep, [x10] + fmla v3.2s, v17.2s, v6.s[3] + ldur q6, [x10, #-16] + add x10, x10, x20 + fmla v1.2s, v17.2s, v5.s[3] + fmla v0.2s, v17.2s, v4.s[3] + add x4, x4, #16 + prfm pldl1keep, [x10] + ldur q5, [x10, #-16] + add x10, x10, x20 + prfm pldl1keep, [x10] + ldur q4, [x10, #-16] + ext v20.16b, v7.16b, v7.16b, #8 + 
cmp x2, x23 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.lt .LBB0_119 +.LBB0_120: // in Loop: Header=BB0_117 Depth=2 + ldr x10, [sp, #912] // 8-byte Folded Reload + fmla v2.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + ldr x4, [sp, #1032] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + mov x2, x13 + mov x3, x12 + ldr s21, [x8, x10, lsl #2] + ldr x10, [sp, #904] // 8-byte Folded Reload + fmla v2.2s, v21.2s, v7.s[1] + ldr s16, [x8, x10, lsl #2] + ldr x10, [sp, #896] // 8-byte Folded Reload + fmla v3.2s, v21.2s, v6.s[1] + fmla v1.2s, v21.2s, v5.s[1] + fmla v0.2s, v21.2s, v4.s[1] + ldr s22, [x8, x10, lsl #2] + fmla v2.2s, v16.2s, v20.2s + fmla v3.2s, v16.2s, v19.2s + fmla v1.2s, v16.2s, v18.2s + fmla v0.2s, v16.2s, v17.2s + fmla v2.2s, v22.2s, v7.s[3] + fmla v3.2s, v22.2s, v6.s[3] + fmla v1.2s, v22.2s, v5.s[3] + fmla v0.2s, v22.2s, v4.s[3] + cmp x4, x21 + b.ge .LBB0_116 + .p2align 2 +.LBB0_121: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_117 Depth=2 + // => This Inner Loop Header: Depth=3 + add x10, x2, x20 + prfm pldl1keep, [x2] + ldur s4, [x2, #-4] + add x4, x4, #1 + prfm pldl1keep, [x10] + ldur s5, [x10, #-4] + add x10, x10, x20 + add x2, x2, #4 + prfm pldl1keep, [x10] + ldur s6, [x10, #-4] + add x10, x10, x20 + prfm pldl1keep, [x10] + ldur s7, [x10, #-4] + prfm pldl1keep, [x3] + ldur s16, [x3, #-4] + add x3, x3, #4 + fmla v2.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + fmla v1.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + cmp x4, x21 + b.lt .LBB0_121 + b .LBB0_116 + .p2align 2 +.LBB0_122: // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #1024] // 8-byte Folded Reload + ldr x12, [sp, #920] // 8-byte Folded Reload + cmp x10, x12 + ldr x2, [sp, #544] // 8-byte Folded Reload + b.ge .LBB0_128 +// %bb.123: // in Loop: Header=BB0_4 Depth=1 + ldr x13, [sp, #1024] // 8-byte Folded Reload + ldr x16, [sp, #888] // 8-byte Folded Reload + mov x14, xzr + mov x15, xzr + ldr 
s4, [x8] + madd x12, x13, x19, x5 + add x10, x13, #1 + lsl x12, x12, #2 + ldr q3, [x16, x12] + madd x12, x10, x19, x5 + madd x10, x10, x29, x9 + lsl x12, x12, #2 + ldr q2, [x16, x12] + madd x12, x13, x29, x9 + add x13, x10, x4 + ldr s0, [x24, x13, lsl #2] + add x12, x12, x4 + ldr s1, [x24, x12, lsl #2] + ext v6.16b, v3.16b, v3.16b, #8 + cmp xzr, x23 + ext v5.16b, v2.16b, v2.16b, #8 + b.ge .LBB0_125 + .p2align 2 +.LBB0_124: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x0, x8, x14 + ldr x10, [sp, #536] // 8-byte Folded Reload + ldr x17, [sp, #528] // 8-byte Folded Reload + fmla v1.2s, v4.2s, v3.2s + add x1, x0, #20 + fmla v0.2s, v4.2s, v2.2s + add x15, x15, #4 + prfm pldl1keep, [x1] + ldp s4, s7, [x0, #4] + add x10, x10, x14 + add x17, x17, x14 + add x14, x14, #16 + add x16, x10, #32 + add x18, x17, #32 + fmla v0.2s, v4.2s, v2.s[1] + fmla v1.2s, v4.2s, v3.s[1] + fmla v0.2s, v7.2s, v5.2s + ldp s5, s4, [x0, #12] + fmla v1.2s, v7.2s, v6.2s + prfm pldl1keep, [x18] + fmla v1.2s, v5.2s, v3.s[3] + ldr q3, [x17, #16] + prfm pldl1keep, [x16] + fmla v0.2s, v5.2s, v2.s[3] + ldr q2, [x10, #16] + ext v6.16b, v3.16b, v3.16b, #8 + cmp x15, x23 + ext v5.16b, v2.16b, v2.16b, #8 + b.lt .LBB0_124 +.LBB0_125: // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #912] // 8-byte Folded Reload + fmla v1.2s, v4.2s, v3.2s + fmla v0.2s, v4.2s, v2.2s + ldr x16, [sp, #1032] // 8-byte Folded Reload + mov x14, xzr + ldr s7, [x8, x10, lsl #2] + ldr x10, [sp, #904] // 8-byte Folded Reload + ldr s4, [x8, x10, lsl #2] + ldr x10, [sp, #896] // 8-byte Folded Reload + fmla v1.2s, v7.2s, v3.s[1] + fmla v0.2s, v7.2s, v2.s[1] + ldr s7, [x8, x10, lsl #2] + ldr x10, [sp, #104] // 8-byte Folded Reload + fmla v1.2s, v4.2s, v6.2s + fmla v0.2s, v4.2s, v5.2s + add x15, x8, x10 + fmla v1.2s, v7.2s, v3.s[3] + fmla v0.2s, v7.2s, v2.s[3] + cmp x16, x21 + b.ge .LBB0_127 + .p2align 2 +.LBB0_126: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + ldr x0, [sp, #656] // 8-byte 
Folded Reload + add x10, x15, x14 + add x17, x2, x14 + add x16, x16, #1 + add x10, x10, #4 + add x17, x17, #4 + add x18, x0, x14 + add x18, x18, #4 + prfm pldl1keep, [x18] + prfm pldl1keep, [x17] + ldr s2, [x0, x14] + prfm pldl1keep, [x10] + ldr s3, [x15, x14] + fmla v1.2s, v3.2s, v2.2s + ldr s2, [x2, x14] + add x14, x14, #4 + fmla v0.2s, v3.2s, v2.2s + cmp x16, x21 + b.lt .LBB0_126 +.LBB0_127: // in Loop: Header=BB0_4 Depth=1 + str s1, [x24, x12, lsl #2] + str s0, [x24, x13, lsl #2] +.LBB0_128: // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #744] // 8-byte Folded Reload + ldr x12, [sp, #920] // 8-byte Folded Reload + cmp x12, x10 + b.ge .LBB0_2 +// %bb.129: // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #920] // 8-byte Folded Reload + ldr x13, [sp, #888] // 8-byte Folded Reload + mov x12, xzr + madd x9, x10, x29, x9 + madd x10, x10, x19, x5 + ldr s2, [x8] + ldr x14, [sp, #632] // 8-byte Folded Reload + add x9, x9, x4 + lsl x10, x10, #2 + ldr s0, [x24, x9, lsl #2] + ldr q1, [x13, x10] + ldr x10, [sp, #640] // 8-byte Folded Reload + ext v3.16b, v1.16b, v1.16b, #8 + cmp xzr, x23 + b.ge .LBB0_131 + .p2align 2 +.LBB0_130: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x11, #8 + fmla v0.2s, v2.2s, v1.2s + add x12, x12, #4 + prfm pldl1keep, [x13] + ldp s2, s4, [x11, #-8] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v4.2s, v3.2s + ldp s3, s2, [x11], #16 + prfm pldl1keep, [x10] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x10, #-16] + add x10, x10, #16 + ext v3.16b, v1.16b, v1.16b, #8 + cmp x12, x23 + b.lt .LBB0_130 +.LBB0_131: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #912] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.2s + mov x10, xzr + ldr s4, [x8, x11, lsl #2] + ldr x11, [sp, #904] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[1] + ldr s5, [x8, x11, lsl #2] + ldr x11, [sp, #896] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v3.2s + ldr s2, [x8, x11, lsl #2] + ldr x11, [sp, #104] // 8-byte Folded Reload + add x8, x8, x11 + ldr 
x11, [sp, #1032] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[3] + cmp x11, x21 + b.ge .LBB0_1 + .p2align 2 +.LBB0_132: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x8, x10 + add x13, x14, x10 + add x11, x11, #1 + add x12, x12, #4 + add x13, x13, #4 + prfm pldl1keep, [x13] + prfm pldl1keep, [x12] + ldr s1, [x8, x10] + ldr s2, [x14, x10] + add x10, x10, #4 + fmla v0.2s, v1.2s, v2.2s + cmp x11, x21 + b.lt .LBB0_132 + b .LBB0_1 +.LBB0_133: + ldr x0, [sp, #16] // 8-byte Folded Reload + bl free + add sp, sp, #1040 + ldp d9, d8, [sp, #48] // 16-byte Folded Reload + ldp d11, d10, [sp, #32] // 16-byte Folded Reload + ldp d13, d12, [sp, #16] // 16-byte Folded Reload + ldp x20, x19, [sp, #144] // 16-byte Folded Reload + ldp x22, x21, [sp, #128] // 16-byte Folded Reload + ldp x24, x23, [sp, #112] // 16-byte Folded Reload + ldp x26, x25, [sp, #96] // 16-byte Folded Reload + ldp x28, x27, [sp, #80] // 16-byte Folded Reload + ldp x29, x30, [sp, #64] // 16-byte Folded Reload + ldp d15, d14, [sp], #160 // 16-byte Folded Reload + ret +.Lfunc_end0: + .size sbatch_matmul_3d_nn_mlir, .Lfunc_end0-sbatch_matmul_3d_nn_mlir + .cfi_endproc + // -- End function + .section ".note.GNU-stack","",@progbits diff --git a/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nt_mlir.s b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nt_mlir.s new file mode 100644 index 00000000000000..a70650bb6207e2 --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nt_mlir.s @@ -0,0 +1,2987 @@ + .text + .file "LLVMDialectModule" + .globl sbatch_matmul_3d_nt_mlir // -- Begin function sbatch_matmul_3d_nt_mlir + .p2align 4 + .type sbatch_matmul_3d_nt_mlir,@function +sbatch_matmul_3d_nt_mlir: // @sbatch_matmul_3d_nt_mlir + .cfi_startproc +// %bb.0: + stp d15, d14, [sp, #-160]! 
// 16-byte Folded Spill + stp d13, d12, [sp, #16] // 16-byte Folded Spill + stp x29, x30, [sp, #64] // 16-byte Folded Spill + stp x28, x27, [sp, #80] // 16-byte Folded Spill + stp x26, x25, [sp, #96] // 16-byte Folded Spill + stp x24, x23, [sp, #112] // 16-byte Folded Spill + stp x22, x21, [sp, #128] // 16-byte Folded Spill + stp x20, x19, [sp, #144] // 16-byte Folded Spill + stp d11, d10, [sp, #32] // 16-byte Folded Spill + stp d9, d8, [sp, #48] // 16-byte Folded Spill + sub sp, sp, #512 + .cfi_def_cfa_offset 672 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -136 + .cfi_offset b13, -144 + .cfi_offset b14, -152 + .cfi_offset b15, -160 + cmp x4, #0 + ldr x13, [sp, #712] + ldr x14, [sp, #768] + mov x19, x7 + cinv x8, x4, lt + ldr x12, [sp, #760] + ldr x28, [sp, #808] + mov x21, x5 + add x9, x8, x8, lsr #63 + add x10, x8, #3 + ldr x23, [sp, #728] + ldr x27, [sp, #736] + str x6, [sp, #448] // 8-byte Folded Spill + stp x13, x3, [sp, #136] // 16-byte Folded Spill + mov x26, x2 + mov x25, x1 + asr x9, x9, #1 + stp x12, x14, [sp, #328] // 16-byte Folded Spill + cinv x22, x9, lt + cmp x8, #0 + csel x8, x10, x8, lt + cmp x4, #0 + ldr x10, [sp, #800] + asr x8, x8, #2 + cinv x24, x8, lt + cmp x13, #0 + cinv x8, x13, lt + add x9, x8, x8, lsr #63 + stp x10, x4, [sp, #360] // 16-byte Folded Spill + add x10, x8, #15 + add x11, x8, #7 + add x12, x8, #3 + asr x9, x9, #1 + cinv x14, x9, lt + cmp x8, #0 + csel x9, x10, x8, lt + csel x10, x11, x8, lt + ldr x11, [sp, #696] + csel x8, x12, x8, lt + cmp x13, #0 + asr x9, x9, #4 + asr x10, x10, #3 + asr x8, x8, #2 + cinv x9, x9, lt + cinv x10, x10, lt + cinv x29, x8, lt + lsl x8, x14, #1 + stp x9, x10, 
[sp, #456] // 16-byte Folded Spill + lsl x9, x9, #4 + str x8, [sp, #168] // 8-byte Folded Spill + lsl x8, x5, #6 + lsl x20, x29, #2 + stp x11, x14, [sp, #488] // 16-byte Folded Spill + ldr x11, [sp, #688] + str x9, [sp, #416] // 8-byte Folded Spill + lsl x9, x10, #3 + add x0, x8, #64 + stp x8, x11, [sp, #472] // 16-byte Folded Spill + str x9, [sp, #280] // 8-byte Folded Spill + bl malloc + lsl x8, x24, #2 + mov x12, x22 + lsl x10, x23, #2 + mul x11, x24, x19 + str x8, [sp, #504] // 8-byte Folded Spill + lsl x8, x22, #1 + and x9, x21, #0x3 + lsl x22, x19, #2 + str x8, [sp, #440] // 8-byte Folded Spill + negs x8, x21 + str x10, [sp, #128] // 8-byte Folded Spill + lsl x10, x27, #6 + mul x13, x12, x19 + str x0, [sp, #16] // 8-byte Folded Spill + add x12, x0, #63 + and x8, x8, #0x3 + ldp x0, x18, [sp, #480] // 16-byte Folded Reload + str x10, [sp, #320] // 8-byte Folded Spill + mov w10, #1 // =0x1 + add x14, x21, x22 + csneg x8, x9, x8, mi + lsl x2, x26, #2 + bfi x10, x24, #2, #62 + sub x14, x14, x8 + and x23, x12, #0xffffffffffffffc0 + mul x12, x19, x10 + lsl x24, x19, #4 + add x14, x2, x14, lsl #2 + lsl x9, x11, #4 + add x11, x21, x11, lsl #2 + add x18, x0, x18, lsl #2 + add x0, x24, x2 + add x0, x0, x25 + add x14, x14, x25 + sub x11, x11, x8 + add x0, x0, #32 + add x14, x14, #4 + lsl x17, x12, #2 + add x12, x21, x12 + lsl x11, x11, #2 + stp x14, x0, [sp, #384] // 16-byte Folded Spill + ldr x0, [sp, #472] // 8-byte Folded Reload + add x16, x21, x13, lsl #1 + str x11, [sp, #264] // 8-byte Folded Spill + sub x11, x12, x8 + ldr x10, [sp, #456] // 8-byte Folded Reload + lsl x11, x11, #2 + sub x14, x23, x8, lsl #6 + str x11, [sp, #256] // 8-byte Folded Spill + sub x11, x16, x8 + mov w15, #1 // =0x1 + lsl x16, x11, #2 + add x13, x2, x13, lsl #3 + bfi x15, x29, #2, #62 + add x16, x16, #4 + add x0, x14, x0 + add x14, x25, x17 + lsl x4, x21, #2 + mul x10, x10, x27 + str x14, [sp, #296] // 8-byte Folded Spill + add x14, x25, x9 + mul x12, x29, x27 + mul x15, x27, x15 + str x14, 
[sp, #288] // 8-byte Folded Spill + add x14, x13, x25 + str x16, [sp, #272] // 8-byte Folded Spill + add x13, x13, x4 + lsl x16, x8, #2 + add x14, x14, #32 + sub x13, x13, x16 + ldr x11, [sp, #464] // 8-byte Folded Reload + str x14, [sp, #304] // 8-byte Folded Spill + ldr x14, [sp, #496] // 8-byte Folded Reload + add x13, x25, x13 + add x3, x18, #4 + str x13, [sp, #376] // 8-byte Folded Spill + add x13, x17, x2 + add x17, x18, x15, lsl #2 + add x18, x18, x12, lsl #4 + add x12, x13, x4 + add x10, x3, x10, lsl #6 + mul x11, x11, x27 + str x10, [sp, #232] // 8-byte Folded Spill + sub x12, x12, x16 + add x9, x9, x2 + mov x15, x0 + lsl x0, x21, #3 + mul x14, x14, x27 + add x10, x25, x12 + add x12, x25, x13 + add x13, x25, x9 + str x10, [sp, #240] // 8-byte Folded Spill + add x10, x9, x4 + lsl x9, x21, #5 + str x26, [sp, #352] // 8-byte Folded Spill + sub x10, x10, x16 + lsl x26, x27, #2 + sub x27, x21, x8 + str x0, [sp, #80] // 8-byte Folded Spill + add x10, x25, x10 + sub x0, x0, x8, lsl #3 + str x2, [sp, #456] // 8-byte Folded Spill + add x2, x25, x2 + str x10, [sp, #312] // 8-byte Folded Spill + add x10, x3, x11, lsl #5 + sub x11, x4, x16 + lsl x16, x21, #4 + add x14, x3, x14, lsl #3 + stp x16, x9, [sp, #88] // 16-byte Folded Spill + sub x9, x9, x8, lsl #5 + sub x16, x16, x8, lsl #4 + ldr x8, [sp, #448] // 8-byte Folded Reload + str x14, [sp, #224] // 8-byte Folded Spill + sub x14, x27, #3 + mov x1, x20 + str x14, [sp, #496] // 8-byte Folded Spill + sub x14, x27, #2 + stp x0, x9, [sp, #64] // 16-byte Folded Spill + add x9, x9, #32 + str x14, [sp, #488] // 8-byte Folded Spill + sub x14, x27, #1 + str x9, [sp, #56] // 8-byte Folded Spill + add x9, x16, #16 + str x14, [sp, #480] // 8-byte Folded Spill + lsl x14, x8, #2 + str x9, [sp, #48] // 8-byte Folded Spill + add x9, x0, #8 + stp x4, x14, [sp, #112] // 16-byte Folded Spill + add x14, x2, #4 + mov x20, xzr + sub x29, x27, #4 + str x14, [sp, #400] // 8-byte Folded Spill + add x14, x23, #256 + str x9, [sp, #40] // 
8-byte Folded Spill + add x9, x11, #4 + str x14, [sp, #472] // 8-byte Folded Spill + add x14, x15, #64 + str x25, [sp, #344] // 8-byte Folded Spill + str x14, [sp, #464] // 8-byte Folded Spill + str x11, [sp, #104] // 8-byte Folded Spill + stp x16, x9, [sp, #24] // 16-byte Folded Spill + str x1, [sp, #176] // 8-byte Folded Spill + str x15, [sp, #248] // 8-byte Folded Spill + b .LBB0_4 + .p2align 2 +.LBB0_1: // in Loop: Header=BB0_4 Depth=1 + str s0, [x15, x9, lsl #2] +.LBB0_2: // in Loop: Header=BB0_4 Depth=1 + bl free + ldr x8, [sp, #448] // 8-byte Folded Reload +.LBB0_3: // %.backedge28 + // in Loop: Header=BB0_4 Depth=1 + ldp x14, x9, [sp, #120] // 16-byte Folded Reload + ldp x10, x2, [sp, #400] // 16-byte Folded Reload + ldp x20, x3, [sp, #152] // 16-byte Folded Reload + ldr x17, [sp, #216] // 8-byte Folded Reload + add x10, x10, x14 + ldp x13, x12, [sp, #184] // 16-byte Folded Reload + add x3, x3, x9 + add x2, x2, x14 + add x17, x17, x9 + add x12, x12, x14 + add x13, x13, x14 + str x10, [sp, #400] // 8-byte Folded Spill + ldr x10, [sp, #392] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #392] // 8-byte Folded Spill + ldr x10, [sp, #384] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #384] // 8-byte Folded Spill + ldr x10, [sp, #296] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #296] // 8-byte Folded Spill + ldr x10, [sp, #288] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #288] // 8-byte Folded Spill + ldr x10, [sp, #304] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #304] // 8-byte Folded Spill + ldr x10, [sp, #376] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #376] // 8-byte Folded Spill + ldp x11, x10, [sp, #224] // 16-byte Folded Reload + add x10, x10, x9 + add x11, x11, x9 + stp x11, x10, [sp, #224] // 16-byte Folded Spill + ldr x10, [sp, #240] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #240] // 8-byte Folded Spill + ldr x10, [sp, #312] // 8-byte Folded 
Reload + add x10, x10, x14 + str x10, [sp, #312] // 8-byte Folded Spill + ldp x10, x18, [sp, #200] // 16-byte Folded Reload + add x10, x10, x9 + add x18, x18, x9 +.LBB0_4: // =>This Loop Header: Depth=1 + // Child Loop BB0_8 Depth 2 + // Child Loop BB0_10 Depth 3 + // Child Loop BB0_13 Depth 3 + // Child Loop BB0_15 Depth 4 + // Child Loop BB0_17 Depth 4 + // Child Loop BB0_20 Depth 3 + // Child Loop BB0_22 Depth 3 + // Child Loop BB0_26 Depth 3 + // Child Loop BB0_28 Depth 3 + // Child Loop BB0_34 Depth 2 + // Child Loop BB0_37 Depth 2 + // Child Loop BB0_39 Depth 3 + // Child Loop BB0_41 Depth 3 + // Child Loop BB0_44 Depth 2 + // Child Loop BB0_46 Depth 2 + // Child Loop BB0_50 Depth 2 + // Child Loop BB0_52 Depth 2 + // Child Loop BB0_56 Depth 2 + // Child Loop BB0_59 Depth 2 + // Child Loop BB0_61 Depth 3 + // Child Loop BB0_63 Depth 3 + // Child Loop BB0_66 Depth 2 + // Child Loop BB0_68 Depth 2 + // Child Loop BB0_72 Depth 2 + // Child Loop BB0_74 Depth 2 + // Child Loop BB0_78 Depth 2 + // Child Loop BB0_81 Depth 2 + // Child Loop BB0_83 Depth 3 + // Child Loop BB0_85 Depth 3 + // Child Loop BB0_88 Depth 2 + // Child Loop BB0_90 Depth 2 + // Child Loop BB0_94 Depth 2 + // Child Loop BB0_96 Depth 2 + // Child Loop BB0_100 Depth 2 + // Child Loop BB0_103 Depth 2 + // Child Loop BB0_105 Depth 3 + // Child Loop BB0_107 Depth 3 + // Child Loop BB0_110 Depth 2 + // Child Loop BB0_112 Depth 2 + // Child Loop BB0_116 Depth 2 + // Child Loop BB0_118 Depth 2 + ldr x9, [sp, #144] // 8-byte Folded Reload + cmp x20, x9 + b.ge .LBB0_119 +// %bb.5: // in Loop: Header=BB0_4 Depth=1 + ldr x0, [sp, #416] // 8-byte Folded Reload + ldr x30, [sp, #280] // 8-byte Folded Reload + add x9, x20, #1 + stp x10, x18, [sp, #200] // 16-byte Folded Spill + mov x10, xzr + str x2, [sp, #408] // 8-byte Folded Spill + stp x13, x12, [sp, #184] // 16-byte Folded Spill + str x17, [sp, #216] // 8-byte Folded Spill + stp x9, x3, [sp, #152] // 16-byte Folded Spill + b .LBB0_8 + .p2align 2 +.LBB0_6: 
// in Loop: Header=BB0_8 Depth=2 + ldr x8, [sp, #448] // 8-byte Folded Reload + stp q3, q2, [x10] + stp q1, q0, [x10, #32] +.LBB0_7: // %.backedge + // in Loop: Header=BB0_8 Depth=2 + ldr x9, [sp, #320] // 8-byte Folded Reload + ldp x10, x3, [sp, #424] // 16-byte Folded Reload + add x3, x3, x9 +.LBB0_8: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_10 Depth 3 + // Child Loop BB0_13 Depth 3 + // Child Loop BB0_15 Depth 4 + // Child Loop BB0_17 Depth 4 + // Child Loop BB0_20 Depth 3 + // Child Loop BB0_22 Depth 3 + // Child Loop BB0_26 Depth 3 + // Child Loop BB0_28 Depth 3 + ldp x11, x9, [sp, #344] // 16-byte Folded Reload + ldr x16, [sp, #400] // 8-byte Folded Reload + cmp x10, x0 + add x25, x11, x9, lsl #2 + b.ge .LBB0_29 +// %bb.9: // in Loop: Header=BB0_8 Depth=2 + ldr x9, [sp, #360] // 8-byte Folded Reload + mov x13, xzr + mul x12, x20, x9 + add x14, x12, x10 + ldp x11, x9, [sp, #328] // 16-byte Folded Reload + add x11, x11, x9, lsl #2 + add x15, x14, x28 + add x15, x11, x15, lsl #2 + add x9, x11, x14, lsl #2 + ldp q3, q1, [x15, #32] + ldp q5, q4, [x15] + lsl x15, x28, #1 + ldp q16, q6, [x9, #32] + ldp q2, q0, [x9] + add x9, x14, x15 + add x15, x15, x28 + add x14, x14, x15 + add x9, x11, x9, lsl #2 + mov x15, x3 + add x14, x11, x14, lsl #2 + ldp q17, q7, [x9, #32] + ldp q20, q18, [x9] + add x9, x10, #16 + str x9, [sp, #424] // 8-byte Folded Spill + ldp q21, q19, [x14, #32] + ldp q23, q22, [x14] + mov x14, x16 + cmp xzr, x21 + b.ge .LBB0_11 + .p2align 2 +.LBB0_10: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + add x16, x14, x22 + prfm pldl1keep, [x14] + ldur s27, [x14, #-4] + add x14, x14, #4 + add x17, x16, x22 + prfm pldl1keep, [x16] + ldur s28, [x16, #-4] + add x16, x15, x26 + add x18, x17, x22 + prfm pldl1keep, [x17] + ldur s26, [x17, #-4] + sub x17, x16, #4 + prfm pldl1keep, [x18] + ldur s25, [x18, #-4] + add x18, x16, x26 + prfm pldl1keep, [x15] + ldur s24, [x15, 
#-4] + add x15, x15, #4 + prfm pldl1keep, [x16] + sub x16, x18, #4 + prfm pldl1keep, [x18] + ld1 { v24.s }[1], [x17] + add x17, x18, x26 + prfm pldl1keep, [x17] + ld1 { v24.s }[2], [x16] + add x16, x17, x26 + sub x17, x17, #4 + prfm pldl1keep, [x16] + ldur s29, [x16, #-4] + add x16, x16, x26 + sub x18, x16, #4 + add x0, x16, x26 + ld1 { v24.s }[3], [x17] + prfm pldl1keep, [x16] + prfm pldl1keep, [x0] + ld1 { v29.s }[1], [x18] + sub x16, x0, #4 + add x17, x0, x26 + prfm pldl1keep, [x17] + fmla v2.4s, v24.4s, v27.s[0] + ld1 { v29.s }[2], [x16] + add x16, x17, x26 + sub x17, x17, #4 + fmla v5.4s, v24.4s, v28.s[0] + fmla v20.4s, v24.4s, v26.s[0] + fmla v23.4s, v24.4s, v25.s[0] + prfm pldl1keep, [x16] + ldur s30, [x16, #-4] + add x16, x16, x26 + sub x18, x16, #4 + add x0, x16, x26 + ld1 { v29.s }[3], [x17] + prfm pldl1keep, [x16] + prfm pldl1keep, [x0] + ld1 { v30.s }[1], [x18] + sub x16, x0, #4 + add x17, x0, x26 + prfm pldl1keep, [x17] + ld1 { v30.s }[2], [x16] + add x16, x17, x26 + sub x17, x17, #4 + fmla v0.4s, v29.4s, v27.s[0] + fmla v4.4s, v29.4s, v28.s[0] + fmla v18.4s, v29.4s, v26.s[0] + fmla v22.4s, v29.4s, v25.s[0] + prfm pldl1keep, [x16] + ldur s31, [x16, #-4] + add x16, x16, x26 + sub x18, x16, #4 + add x0, x16, x26 + ld1 { v30.s }[3], [x17] + prfm pldl1keep, [x16] + prfm pldl1keep, [x0] + ld1 { v31.s }[1], [x18] + sub x16, x0, #4 + add x17, x0, x26 + prfm pldl1keep, [x17] + fmla v16.4s, v30.4s, v27.s[0] + ld1 { v31.s }[2], [x16] + sub x16, x17, #4 + fmla v3.4s, v30.4s, v28.s[0] + fmla v17.4s, v30.4s, v26.s[0] + fmla v21.4s, v30.4s, v25.s[0] + ld1 { v31.s }[3], [x16] + add x16, x23, x13, lsl #6 + add x13, x13, #1 + stp q24, q29, [x16] + fmla v6.4s, v31.4s, v27.s[0] + fmla v1.4s, v31.4s, v28.s[0] + fmla v7.4s, v31.4s, v26.s[0] + fmla v19.4s, v31.4s, v25.s[0] + stp q30, q31, [x16, #32] + cmp x13, x21 + b.lt .LBB0_10 +.LBB0_11: // %.preheader + // in Loop: Header=BB0_8 Depth=2 + ldp x13, x14, [sp, #384] // 16-byte Folded Reload + str x3, [sp, #432] // 8-byte 
Folded Spill + mov x1, xzr + mov w17, #1 // =0x1 + mov w18, #2 // =0x2 + mov w16, #3 // =0x3 + mov w15, #4 // =0x4 + b .LBB0_13 + .p2align 2 +.LBB0_12: // %.loopexit + // in Loop: Header=BB0_13 Depth=3 + add x14, x14, x24 + add x13, x13, x24 + mov x1, x15 + mov x15, x0 +.LBB0_13: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_15 Depth 4 + // Child Loop BB0_17 Depth 4 + madd x0, x1, x28, x12 + ldr x9, [sp, #504] // 8-byte Folded Reload + add x0, x0, x10 + madd x17, x17, x28, x12 + madd x18, x18, x28, x12 + madd x16, x16, x28, x12 + add x17, x17, x10 + add x18, x18, x10 + add x16, x16, x10 + cmp x15, x9 + add x0, x11, x0, lsl #2 + add x17, x11, x17, lsl #2 + stp q2, q0, [x0] + add x18, x11, x18, lsl #2 + add x16, x11, x16, lsl #2 + stp q16, q6, [x0, #32] + stp q5, q4, [x17] + stp q3, q1, [x17, #32] + stp q20, q18, [x18] + stp q17, q7, [x18, #32] + stp q23, q22, [x16] + stp q21, q19, [x16, #32] + b.ge .LBB0_18 +// %bb.14: // in Loop: Header=BB0_13 Depth=3 + add x17, x15, #1 + add x16, x15, #3 + mul x2, x20, x8 + add x18, x15, #2 + madd x3, x17, x28, x12 + ldp q28, q29, [x23, #32] + mov x1, xzr + madd x0, x15, x28, x12 + ldp q30, q31, [x23] + add x0, x0, x10 + add x3, x3, x10 + add x0, x11, x0, lsl #2 + add x3, x11, x3, lsl #2 + ldp q16, q6, [x0, #32] + ldp q2, q0, [x0] + madd x0, x18, x28, x12 + ldp q3, q1, [x3, #32] + add x0, x0, x10 + ldp q5, q4, [x3] + madd x3, x16, x28, x12 + add x3, x3, x10 + add x0, x11, x0, lsl #2 + add x3, x11, x3, lsl #2 + ldp q17, q7, [x0, #32] + ldp q20, q18, [x0] + add x0, x15, #4 + ldp q21, q19, [x3, #32] + ldp q23, q22, [x3] + madd x3, x15, x19, x2 + lsl x3, x3, #2 + ldr q27, [x25, x3] + madd x3, x17, x19, x2 + lsl x3, x3, #2 + ldr q26, [x25, x3] + madd x3, x18, x19, x2 + madd x2, x16, x19, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q25, [x25, x3] + ldr q24, [x25, x2] + ldr x3, [sp, #472] // 8-byte Folded Reload + mov x2, x14 + fmla v6.4s, v29.4s, v27.s[0] + cmp xzr, x29 
+ b.ge .LBB0_16 + .p2align 2 +.LBB0_15: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // Parent Loop BB0_13 Depth=3 + // => This Inner Loop Header: Depth=4 + add x7, x3, #64 + fmla v16.4s, v28.4s, v27.s[0] + fmla v2.4s, v30.4s, v27.s[0] + add x6, x3, #128 + prfm pldl1keep, [x7] + ldp q9, q8, [x3, #-160] + fmla v0.4s, v31.4s, v27.s[0] + ldp q12, q15, [x3, #-192] + fmla v1.4s, v29.4s, v26.s[0] + fmla v3.4s, v28.4s, v26.s[0] + fmla v4.4s, v31.4s, v26.s[0] + fmla v5.4s, v30.4s, v26.s[0] + fmla v7.4s, v29.4s, v25.s[0] + prfm pldl1keep, [x6] + fmla v17.4s, v28.4s, v25.s[0] + fmla v18.4s, v31.4s, v25.s[0] + ldp q11, q10, [x3, #-128] + fmla v20.4s, v30.4s, v25.s[0] + fmla v19.4s, v29.4s, v24.s[0] + ldp q13, q14, [x3, #-96] + fmla v21.4s, v28.4s, v24.s[0] + fmla v22.4s, v31.4s, v24.s[0] + add x5, x3, #192 + prfm pldl1keep, [x5] + fmla v23.4s, v30.4s, v24.s[0] + fmla v0.4s, v15.4s, v27.s[1] + add x4, x3, #256 + add x1, x1, #4 + fmla v2.4s, v12.4s, v27.s[1] + fmla v16.4s, v9.4s, v27.s[1] + fmla v6.4s, v8.4s, v27.s[1] + fmla v5.4s, v12.4s, v26.s[1] + fmla v4.4s, v15.4s, v26.s[1] + fmla v3.4s, v9.4s, v26.s[1] + fmla v1.4s, v8.4s, v26.s[1] + fmla v20.4s, v12.4s, v25.s[1] + fmla v18.4s, v15.4s, v25.s[1] + fmla v17.4s, v9.4s, v25.s[1] + fmla v7.4s, v8.4s, v25.s[1] + fmla v23.4s, v12.4s, v24.s[1] + fmla v22.4s, v15.4s, v24.s[1] + ldp q15, q12, [x3, #-64] + fmla v21.4s, v9.4s, v24.s[1] + fmla v19.4s, v8.4s, v24.s[1] + ldp q9, q8, [x3, #-32] + prfm pldl1keep, [x4] + ldp q28, q29, [x3, #32] + ldp q30, q31, [x3] + add x3, x2, x22 + prfm pldl1keep, [x2] + fmla v6.4s, v14.4s, v27.s[2] + fmla v16.4s, v13.4s, v27.s[2] + fmla v2.4s, v11.4s, v27.s[2] + fmla v0.4s, v10.4s, v27.s[2] + fmla v1.4s, v14.4s, v26.s[2] + fmla v3.4s, v13.4s, v26.s[2] + fmla v4.4s, v10.4s, v26.s[2] + fmla v5.4s, v11.4s, v26.s[2] + fmla v7.4s, v14.4s, v25.s[2] + fmla v17.4s, v13.4s, v25.s[2] + fmla v18.4s, v10.4s, v25.s[2] + fmla v20.4s, v11.4s, v25.s[2] + fmla v19.4s, v14.4s, v24.s[2] + fmla v21.4s, 
v13.4s, v24.s[2] + fmla v22.4s, v10.4s, v24.s[2] + fmla v23.4s, v11.4s, v24.s[2] + fmla v0.4s, v12.4s, v27.s[3] + fmla v2.4s, v15.4s, v27.s[3] + fmla v16.4s, v9.4s, v27.s[3] + fmla v6.4s, v8.4s, v27.s[3] + ldur q27, [x2, #-16] + prfm pldl1keep, [x3] + add x2, x2, #16 + fmla v5.4s, v15.4s, v26.s[3] + fmla v4.4s, v12.4s, v26.s[3] + fmla v3.4s, v9.4s, v26.s[3] + fmla v1.4s, v8.4s, v26.s[3] + ldur q26, [x3, #-16] + add x3, x3, x22 + add x5, x3, x22 + prfm pldl1keep, [x3] + fmla v20.4s, v15.4s, v25.s[3] + fmla v18.4s, v12.4s, v25.s[3] + fmla v17.4s, v9.4s, v25.s[3] + fmla v7.4s, v8.4s, v25.s[3] + ldur q25, [x3, #-16] + prfm pldl1keep, [x5] + mov x3, x4 + fmla v23.4s, v15.4s, v24.s[3] + fmla v22.4s, v12.4s, v24.s[3] + fmla v21.4s, v9.4s, v24.s[3] + fmla v19.4s, v8.4s, v24.s[3] + ldur q24, [x5, #-16] + fmla v6.4s, v29.4s, v27.s[0] + cmp x1, x29 + b.lt .LBB0_15 +.LBB0_16: // in Loop: Header=BB0_13 Depth=3 + ldr x9, [sp, #496] // 8-byte Folded Reload + fmla v16.4s, v28.4s, v27.s[0] + fmla v2.4s, v30.4s, v27.s[0] + mov x2, x13 + fmla v0.4s, v31.4s, v27.s[0] + fmla v1.4s, v29.4s, v26.s[0] + mov x3, x27 + add x1, x23, x9, lsl #6 + ldr x9, [sp, #488] // 8-byte Folded Reload + fmla v3.4s, v28.4s, v26.s[0] + fmla v4.4s, v31.4s, v26.s[0] + fmla v5.4s, v30.4s, v26.s[0] + fmla v7.4s, v29.4s, v25.s[0] + fmla v17.4s, v28.4s, v25.s[0] + fmla v18.4s, v31.4s, v25.s[0] + fmla v20.4s, v30.4s, v25.s[0] + ldp q10, q9, [x1, #32] + ldp q11, q12, [x1] + fmla v19.4s, v29.4s, v24.s[0] + fmla v21.4s, v28.4s, v24.s[0] + fmla v22.4s, v31.4s, v24.s[0] + add x1, x23, x9, lsl #6 + fmla v23.4s, v30.4s, v24.s[0] + ldr x9, [sp, #480] // 8-byte Folded Reload + ldp q29, q30, [x1] + ldp q8, q13, [x1, #32] + fmla v0.4s, v12.4s, v27.s[1] + fmla v6.4s, v9.4s, v27.s[1] + fmla v4.4s, v12.4s, v26.s[1] + fmla v1.4s, v9.4s, v26.s[1] + fmla v18.4s, v12.4s, v25.s[1] + fmla v7.4s, v9.4s, v25.s[1] + fmla v22.4s, v12.4s, v24.s[1] + add x1, x23, x9, lsl #6 + fmla v2.4s, v11.4s, v27.s[1] + fmla v16.4s, v10.4s, v27.s[1] + 
fmla v5.4s, v11.4s, v26.s[1] + fmla v3.4s, v10.4s, v26.s[1] + ldp q31, q28, [x1, #32] + fmla v20.4s, v11.4s, v25.s[1] + fmla v17.4s, v10.4s, v25.s[1] + fmla v23.4s, v11.4s, v24.s[1] + fmla v21.4s, v10.4s, v24.s[1] + fmla v19.4s, v9.4s, v24.s[1] + ldp q9, q10, [x1] + ldr x1, [sp, #464] // 8-byte Folded Reload + fmla v6.4s, v13.4s, v27.s[2] + fmla v0.4s, v30.4s, v27.s[2] + fmla v1.4s, v13.4s, v26.s[2] + fmla v4.4s, v30.4s, v26.s[2] + fmla v7.4s, v13.4s, v25.s[2] + fmla v18.4s, v30.4s, v25.s[2] + fmla v19.4s, v13.4s, v24.s[2] + fmla v22.4s, v30.4s, v24.s[2] + fmla v16.4s, v8.4s, v27.s[2] + fmla v2.4s, v29.4s, v27.s[2] + fmla v3.4s, v8.4s, v26.s[2] + fmla v5.4s, v29.4s, v26.s[2] + fmla v17.4s, v8.4s, v25.s[2] + fmla v20.4s, v29.4s, v25.s[2] + fmla v21.4s, v8.4s, v24.s[2] + fmla v23.4s, v29.4s, v24.s[2] + fmla v0.4s, v10.4s, v27.s[3] + fmla v6.4s, v28.4s, v27.s[3] + fmla v4.4s, v10.4s, v26.s[3] + fmla v1.4s, v28.4s, v26.s[3] + fmla v18.4s, v10.4s, v25.s[3] + fmla v7.4s, v28.4s, v25.s[3] + fmla v22.4s, v10.4s, v24.s[3] + fmla v19.4s, v28.4s, v24.s[3] + fmla v2.4s, v9.4s, v27.s[3] + fmla v16.4s, v31.4s, v27.s[3] + fmla v5.4s, v9.4s, v26.s[3] + fmla v3.4s, v31.4s, v26.s[3] + fmla v20.4s, v9.4s, v25.s[3] + fmla v17.4s, v31.4s, v25.s[3] + fmla v23.4s, v9.4s, v24.s[3] + fmla v21.4s, v31.4s, v24.s[3] + cmp x27, x21 + b.ge .LBB0_12 + .p2align 2 +.LBB0_17: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // Parent Loop BB0_13 Depth=3 + // => This Inner Loop Header: Depth=4 + prfm pldl1keep, [x1] + ldp q24, q25, [x1, #-64] + add x4, x2, x22 + ldp q26, q27, [x1, #-32] + prfm pldl1keep, [x2] + add x3, x3, #1 + ldur s28, [x2, #-4] + prfm pldl1keep, [x4] + add x2, x2, #4 + add x1, x1, #64 + ldur s29, [x4, #-4] + add x4, x4, x22 + prfm pldl1keep, [x4] + fmla v6.4s, v27.4s, v28.s[0] + ldur s30, [x4, #-4] + add x4, x4, x22 + prfm pldl1keep, [x4] + fmla v16.4s, v26.4s, v28.s[0] + fmla v0.4s, v25.4s, v28.s[0] + fmla v2.4s, v24.4s, v28.s[0] + ldur s28, [x4, #-4] + fmla v4.4s, 
v25.4s, v29.s[0] + fmla v5.4s, v24.4s, v29.s[0] + fmla v3.4s, v26.4s, v29.s[0] + fmla v1.4s, v27.4s, v29.s[0] + fmla v20.4s, v24.4s, v30.s[0] + fmla v18.4s, v25.4s, v30.s[0] + fmla v17.4s, v26.4s, v30.s[0] + fmla v7.4s, v27.4s, v30.s[0] + fmla v23.4s, v24.4s, v28.s[0] + fmla v22.4s, v25.4s, v28.s[0] + fmla v21.4s, v26.4s, v28.s[0] + fmla v19.4s, v27.4s, v28.s[0] + cmp x3, x21 + b.lt .LBB0_17 + b .LBB0_12 + .p2align 2 +.LBB0_18: // in Loop: Header=BB0_8 Depth=2 + ldr x9, [sp, #504] // 8-byte Folded Reload + ldr x13, [sp, #440] // 8-byte Folded Reload + cmp x9, x13 + ldr x9, [sp, #496] // 8-byte Folded Reload + add x15, x23, x9, lsl #6 + ldr x9, [sp, #488] // 8-byte Folded Reload + add x14, x23, x9, lsl #6 + ldr x9, [sp, #480] // 8-byte Folded Reload + add x13, x23, x9, lsl #6 + b.ge .LBB0_24 +// %bb.19: // in Loop: Header=BB0_8 Depth=2 + ldr x9, [sp, #504] // 8-byte Folded Reload + add x17, x12, x10 + ldp q18, q19, [x23, #32] + ldp q20, q21, [x23] + mov x18, xzr + add x0, x9, #1 + mul x16, x9, x28 + mul x2, x9, x19 + madd x1, x0, x28, x12 + add x16, x17, x16 + add x17, x11, x16, lsl #2 + add x16, x1, x10 + mul x1, x20, x8 + madd x0, x0, x19, x1 + add x16, x11, x16, lsl #2 + ldp q2, q0, [x17, #32] + add x2, x1, x2 + ldp q6, q4, [x17] + ldp q3, q1, [x16, #32] + ldp q7, q5, [x16] + lsl x2, x2, #2 + lsl x0, x0, #2 + ldr q17, [x25, x2] + ldr q16, [x25, x0] + ldp x0, x1, [sp, #288] // 16-byte Folded Reload + ldr x2, [sp, #472] // 8-byte Folded Reload + cmp xzr, x29 + b.ge .LBB0_21 + .p2align 2 +.LBB0_20: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x8, [sp, #456] // 8-byte Folded Reload + fmla v0.4s, v19.4s, v17.s[0] + fmla v2.4s, v18.4s, v17.s[0] + add x9, x2, #128 + fmla v6.4s, v20.4s, v17.s[0] + fmla v4.4s, v21.4s, v17.s[0] + add x30, x2, #192 + add x3, x2, #256 + fmla v1.4s, v19.4s, v16.s[0] + fmla v3.4s, v18.4s, v16.s[0] + add x18, x18, #4 + add x4, x1, x8 + add x6, x0, x8 + add x8, x2, #64 + fmla v5.4s, 
v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + add x1, x1, #16 + add x0, x0, #16 + prfm pldl1keep, [x8] + add x5, x4, #32 + ldp q23, q22, [x2, #-160] + ldp q24, q25, [x2, #-192] + prfm pldl1keep, [x9] + ldp q19, q18, [x2, #-128] + add x7, x6, #32 + ldp q20, q21, [x2, #-96] + prfm pldl1keep, [x30] + fmla v4.4s, v25.4s, v17.s[1] + fmla v0.4s, v22.4s, v17.s[1] + fmla v5.4s, v25.4s, v16.s[1] + fmla v1.4s, v22.4s, v16.s[1] + fmla v6.4s, v24.4s, v17.s[1] + fmla v2.4s, v23.4s, v17.s[1] + fmla v7.4s, v24.4s, v16.s[1] + fmla v3.4s, v23.4s, v16.s[1] + fmla v0.4s, v21.4s, v17.s[2] + ldp q23, q22, [x2, #-32] + ldp q24, q25, [x2, #-64] + fmla v4.4s, v18.4s, v17.s[2] + fmla v1.4s, v21.4s, v16.s[2] + fmla v5.4s, v18.4s, v16.s[2] + prfm pldl1keep, [x7] + fmla v2.4s, v20.4s, v17.s[2] + fmla v6.4s, v19.4s, v17.s[2] + fmla v3.4s, v20.4s, v16.s[2] + fmla v7.4s, v19.4s, v16.s[2] + fmla v4.4s, v25.4s, v17.s[3] + fmla v0.4s, v22.4s, v17.s[3] + fmla v5.4s, v25.4s, v16.s[3] + fmla v1.4s, v22.4s, v16.s[3] + fmla v6.4s, v24.4s, v17.s[3] + fmla v2.4s, v23.4s, v17.s[3] + ldr q17, [x6, #16] + prfm pldl1keep, [x5] + fmla v7.4s, v24.4s, v16.s[3] + fmla v3.4s, v23.4s, v16.s[3] + ldr q16, [x4, #16] + prfm pldl1keep, [x3] + ldp q18, q19, [x2, #32] + ldp q20, q21, [x2] + mov x2, x3 + cmp x18, x29 + b.lt .LBB0_20 +.LBB0_21: // in Loop: Header=BB0_8 Depth=2 + ldp q23, q22, [x15, #32] + ldp q24, q25, [x15] + fmla v0.4s, v19.4s, v17.s[0] + fmla v2.4s, v18.4s, v17.s[0] + fmla v6.4s, v20.4s, v17.s[0] + fmla v4.4s, v21.4s, v17.s[0] + fmla v1.4s, v19.4s, v16.s[0] + fmla v3.4s, v18.4s, v16.s[0] + ldp q19, q18, [x14] + fmla v5.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q20, q21, [x14, #32] + fmla v4.4s, v25.4s, v17.s[1] + fmla v0.4s, v22.4s, v17.s[1] + ldr x18, [sp, #464] // 8-byte Folded Reload + ldr x0, [sp, #408] // 8-byte Folded Reload + fmla v6.4s, v24.4s, v17.s[1] + fmla v2.4s, v23.4s, v17.s[1] + ldr x30, [sp, #280] // 8-byte Folded Reload + mov x1, x27 + fmla v7.4s, v24.4s, v16.s[1] + 
fmla v5.4s, v25.4s, v16.s[1] + ldp q24, q25, [x13] + fmla v3.4s, v23.4s, v16.s[1] + fmla v1.4s, v22.4s, v16.s[1] + ldp q23, q22, [x13, #32] + fmla v0.4s, v21.4s, v17.s[2] + fmla v4.4s, v18.4s, v17.s[2] + ldp x3, x2, [sp, #256] // 16-byte Folded Reload + fmla v2.4s, v20.4s, v17.s[2] + fmla v6.4s, v19.4s, v17.s[2] + fmla v1.4s, v21.4s, v16.s[2] + fmla v3.4s, v20.4s, v16.s[2] + fmla v5.4s, v18.4s, v16.s[2] + fmla v7.4s, v19.4s, v16.s[2] + fmla v4.4s, v25.4s, v17.s[3] + fmla v0.4s, v22.4s, v17.s[3] + fmla v5.4s, v25.4s, v16.s[3] + fmla v1.4s, v22.4s, v16.s[3] + fmla v6.4s, v24.4s, v17.s[3] + fmla v2.4s, v23.4s, v17.s[3] + fmla v7.4s, v24.4s, v16.s[3] + fmla v3.4s, v23.4s, v16.s[3] + cmp x27, x21 + b.ge .LBB0_23 + .p2align 2 +.LBB0_22: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + add x8, x0, x3 + add x9, x0, x2 + prfm pldl1keep, [x18] + add x1, x1, #1 + add x8, x8, #4 + add x9, x9, #4 + ldp q16, q17, [x18, #-64] + ldp q18, q19, [x18, #-32] + prfm pldl1keep, [x9] + add x18, x18, #64 + ldr s20, [x0, x2] + prfm pldl1keep, [x8] + fmla v0.4s, v19.4s, v20.s[0] + ldr s21, [x0, x3] + fmla v2.4s, v18.4s, v20.s[0] + fmla v4.4s, v17.4s, v20.s[0] + fmla v6.4s, v16.4s, v20.s[0] + fmla v5.4s, v17.4s, v21.s[0] + fmla v7.4s, v16.4s, v21.s[0] + fmla v3.4s, v18.4s, v21.s[0] + fmla v1.4s, v19.4s, v21.s[0] + add x0, x0, #4 + cmp x1, x21 + b.lt .LBB0_22 +.LBB0_23: // in Loop: Header=BB0_8 Depth=2 + ldr x8, [sp, #448] // 8-byte Folded Reload + stp q6, q4, [x17] + stp q2, q0, [x17, #32] + stp q7, q5, [x16] + stp q3, q1, [x16, #32] +.LBB0_24: // in Loop: Header=BB0_8 Depth=2 + ldp x9, x1, [sp, #368] // 16-byte Folded Reload + ldr x16, [sp, #440] // 8-byte Folded Reload + cmp x16, x9 + ldr x0, [sp, #416] // 8-byte Folded Reload + b.ge .LBB0_7 +// %bb.25: // in Loop: Header=BB0_8 Depth=2 + mov x17, x8 + add x8, x12, x10 + ldr x12, [sp, #440] // 8-byte Folded Reload + ldr x2, [sp, #408] // 8-byte Folded Reload + ldp q7, q16, [x23, #32] + 
ldp q6, q5, [x23] + mov x16, xzr + mul x9, x12, x28 + add x8, x8, x9 + add x10, x11, x8, lsl #2 + mul x8, x12, x19 + ldr x11, [sp, #304] // 8-byte Folded Reload + ldr x12, [sp, #472] // 8-byte Folded Reload + madd x8, x20, x17, x8 + ldp q1, q0, [x10, #32] + ldp q3, q2, [x10] + lsl x8, x8, #2 + ldr q4, [x25, x8] + cmp xzr, x29 + b.ge .LBB0_27 + .p2align 2 +.LBB0_26: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + add x18, x12, #64 + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + add x9, x12, #128 + prfm pldl1keep, [x18] + ldp q18, q17, [x12, #-160] + fmla v3.4s, v6.4s, v4.s[0] + ldp q19, q20, [x12, #-192] + fmla v2.4s, v5.4s, v4.s[0] + prfm pldl1keep, [x9] + ldp q6, q5, [x12, #-128] + ldp q7, q16, [x12, #-96] + add x8, x12, #192 + prfm pldl1keep, [x8] + add x17, x12, #256 + add x16, x16, #4 + fmla v2.4s, v20.4s, v4.s[1] + fmla v0.4s, v17.4s, v4.s[1] + fmla v3.4s, v19.4s, v4.s[1] + fmla v1.4s, v18.4s, v4.s[1] + ldp q18, q17, [x12, #-32] + ldp q19, q20, [x12, #-64] + prfm pldl1keep, [x11] + fmla v0.4s, v16.4s, v4.s[2] + fmla v2.4s, v5.4s, v4.s[2] + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v2.4s, v20.4s, v4.s[3] + fmla v0.4s, v17.4s, v4.s[3] + fmla v3.4s, v19.4s, v4.s[3] + fmla v1.4s, v18.4s, v4.s[3] + ldur q4, [x11, #-16] + prfm pldl1keep, [x17] + add x11, x11, #16 + ldp q7, q16, [x12, #32] + ldp q6, q5, [x12] + mov x12, x17 + cmp x16, x29 + b.lt .LBB0_26 +.LBB0_27: // in Loop: Header=BB0_8 Depth=2 + ldp q18, q17, [x15, #32] + ldp q19, q20, [x15] + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + fmla v2.4s, v5.4s, v4.s[0] + ldp q6, q5, [x14] + ldp q7, q16, [x14, #32] + ldr x15, [sp, #248] // 8-byte Folded Reload + mov x11, xzr + mov w12, #64 // =0x40 + fmla v2.4s, v20.4s, v4.s[1] + fmla v0.4s, v17.4s, v4.s[1] + fmla v3.4s, v19.4s, v4.s[1] + fmla v1.4s, v18.4s, v4.s[1] + fmla v0.4s, v16.4s, v4.s[2] + ldp q18, q17, [x13, #32] + ldp q19, q20, 
[x13] + fmla v2.4s, v5.4s, v4.s[2] + ldr x13, [sp, #272] // 8-byte Folded Reload + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v2.4s, v20.4s, v4.s[3] + fmla v0.4s, v17.4s, v4.s[3] + fmla v3.4s, v19.4s, v4.s[3] + fmla v1.4s, v18.4s, v4.s[3] + add x8, x27, xzr + cmp x8, x21 + b.ge .LBB0_6 + .p2align 2 +.LBB0_28: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + add x14, x15, x11, lsl #6 + add x8, x2, x13 + add x9, x15, x12 + add x13, x13, #4 + prfm pldl1keep, [x9] + add x12, x12, #64 + ldp q4, q5, [x14] + ldp q6, q7, [x14, #32] + prfm pldl1keep, [x8] + ldr s16, [x1, x11, lsl #2] + add x11, x11, #1 + fmla v0.4s, v7.4s, v16.s[0] + fmla v1.4s, v6.4s, v16.s[0] + fmla v2.4s, v5.4s, v16.s[0] + fmla v3.4s, v4.4s, v16.s[0] + add x8, x27, x11 + cmp x8, x21 + b.lt .LBB0_28 + b .LBB0_6 + .p2align 2 +.LBB0_29: // in Loop: Header=BB0_4 Depth=1 + ldp x10, x9, [sp, #328] // 16-byte Folded Reload + cmp x0, x30 + add x11, x10, x9, lsl #2 + lsl x9, x28, #1 + stp x9, x11, [sp, #424] // 16-byte Folded Spill + b.lt .LBB0_33 +// %bb.30: // in Loop: Header=BB0_4 Depth=1 + ldr x1, [sp, #176] // 8-byte Folded Reload + cmp x30, x1 + b.lt .LBB0_55 +.LBB0_31: // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #168] // 8-byte Folded Reload + cmp x1, x10 + b.lt .LBB0_77 +.LBB0_32: // in Loop: Header=BB0_4 Depth=1 + ldr x9, [sp, #136] // 8-byte Folded Reload + cmp x10, x9 + b.ge .LBB0_3 + b .LBB0_99 + .p2align 2 +.LBB0_33: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #96] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #360] // 8-byte Folded Reload + ldr x6, [sp, #432] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldr x15, [sp, #232] // 8-byte Folded Reload + ldr x16, [sp, #400] // 8-byte Folded Reload + mul x9, x20, x8 + ldp x8, x13, [sp, #416] // 16-byte Folded Reload + add x8, x9, x8 + add x12, x6, x8, lsl #2 + ldp q3, q2, [x12] + add x12, x8, x28 + add x12, x6, x12, lsl #2 + ldp q1, q0, [x12] 
+ add x12, x8, x13 + add x12, x6, x12, lsl #2 + ldp q5, q4, [x12] + add x12, x13, x28 + add x8, x8, x12 + add x8, x6, x8, lsl #2 + ldp q7, q6, [x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + cmp xzr, x21 + b.ge .LBB0_35 + .p2align 2 +.LBB0_34: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x16, x10 + add x12, x15, x10 + add x10, x10, #4 + prfm pldl1keep, [x13] + ldur s16, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s17, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s18, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s20, [x13, #-4] + prfm pldl1keep, [x12] + ldur s19, [x12, #-4] + add x12, x12, x26 + sub x13, x12, #4 + prfm pldl1keep, [x12] + add x12, x12, x26 + prfm pldl1keep, [x12] + sub x14, x12, #4 + add x12, x12, x26 + ld1 { v19.s }[1], [x13] + prfm pldl1keep, [x12] + sub x13, x12, #4 + add x12, x12, x26 + ld1 { v19.s }[2], [x14] + prfm pldl1keep, [x12] + ldur s21, [x12, #-4] + add x12, x12, x26 + ld1 { v19.s }[3], [x13] + prfm pldl1keep, [x12] + sub x13, x12, #4 + add x12, x12, x26 + prfm pldl1keep, [x12] + ld1 { v21.s }[1], [x13] + sub x14, x12, #4 + add x12, x12, x26 + prfm pldl1keep, [x12] + sub x12, x12, #4 + fmla v3.4s, v19.4s, v16.s[0] + fmla v1.4s, v19.4s, v17.s[0] + fmla v5.4s, v19.4s, v18.s[0] + fmla v7.4s, v19.4s, v20.s[0] + ld1 { v21.s }[2], [x14] + ld1 { v21.s }[3], [x12] + add x12, x8, x11, lsl #5 + add x11, x11, #1 + fmla v2.4s, v21.4s, v16.s[0] + fmla v0.4s, v21.4s, v17.s[0] + fmla v4.4s, v21.4s, v18.s[0] + fmla v6.4s, v21.4s, v20.s[0] + stp q19, q21, [x12] + cmp x11, x21 + b.lt .LBB0_34 +.LBB0_35: // %.preheader27 + // in Loop: Header=BB0_4 Depth=1 + ldr x12, [sp, #56] // 8-byte Folded Reload + ldp x15, x16, [sp, #384] // 16-byte Folded Reload + mov x11, xzr + add x10, x8, #128 + mov w18, #1 // =0x1 + mov w2, #2 // =0x2 + mov w1, #3 // =0x3 + mov w17, #4 // =0x4 + add x14, x8, x12 + b .LBB0_37 + .p2align 2 +.LBB0_36: // %.loopexit23 + // in Loop: 
Header=BB0_37 Depth=2 + add x16, x16, x24 + add x15, x15, x24 + mov x11, x17 + mov x17, x3 +.LBB0_37: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_39 Depth 3 + // Child Loop BB0_41 Depth 3 + madd x11, x11, x28, x9 + ldr x7, [sp, #416] // 8-byte Folded Reload + add x11, x11, x7 + madd x12, x18, x28, x9 + madd x13, x2, x28, x9 + add x12, x12, x7 + add x13, x13, x7 + add x11, x6, x11, lsl #2 + add x12, x6, x12, lsl #2 + stp q3, q2, [x11] + madd x11, x1, x28, x9 + stp q1, q0, [x12] + add x12, x6, x13, lsl #2 + add x11, x11, x7 + stp q5, q4, [x12] + add x11, x6, x11, lsl #2 + stp q7, q6, [x11] + ldr x11, [sp, #504] // 8-byte Folded Reload + cmp x17, x11 + ldr x11, [sp, #496] // 8-byte Folded Reload + add x13, x8, x11, lsl #5 + ldr x11, [sp, #488] // 8-byte Folded Reload + add x12, x8, x11, lsl #5 + ldr x11, [sp, #480] // 8-byte Folded Reload + add x11, x8, x11, lsl #5 + b.ge .LBB0_42 +// %bb.38: // in Loop: Header=BB0_37 Depth=2 + madd x5, x17, x28, x9 + add x18, x17, #1 + mov x30, x6 + add x2, x17, #2 + madd x6, x18, x28, x9 + add x1, x17, #3 + ldp q20, q21, [x8] + mov x4, xzr + add x3, x17, #4 + add x5, x5, x7 + add x5, x30, x5, lsl #2 + add x6, x6, x7 + add x6, x30, x6, lsl #2 + ldp q3, q2, [x5] + madd x5, x2, x28, x9 + ldp q1, q0, [x6] + madd x6, x1, x28, x9 + add x5, x5, x7 + add x5, x30, x5, lsl #2 + add x6, x6, x7 + add x6, x30, x6, lsl #2 + ldp q5, q4, [x5] + ldr x5, [sp, #448] // 8-byte Folded Reload + mul x5, x20, x5 + ldp q7, q6, [x6] + madd x6, x17, x19, x5 + lsl x6, x6, #2 + ldr q19, [x25, x6] + madd x6, x18, x19, x5 + lsl x6, x6, #2 + ldr q18, [x25, x6] + madd x6, x2, x19, x5 + madd x5, x1, x19, x5 + lsl x6, x6, #2 + lsl x5, x5, #2 + ldr q17, [x25, x6] + ldr q16, [x25, x5] + mov x5, x10 + mov x6, x16 + cmp xzr, x29 + b.ge .LBB0_40 + .p2align 2 +.LBB0_39: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_37 Depth=2 + // => This Inner Loop Header: Depth=3 + add x7, x5, #32 + fmla v3.4s, v20.4s, v19.s[0] + fmla v2.4s, 
v21.4s, v19.s[0] + add x4, x4, #4 + prfm pldl1keep, [x7] + ldp q22, q23, [x5, #-96] + fmla v0.4s, v21.4s, v18.s[0] + fmla v1.4s, v20.4s, v18.s[0] + fmla v4.4s, v21.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + add x7, x5, #96 + fmla v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q21, q20, [x5, #-64] + prfm pldl1keep, [x7] + add x7, x6, x22 + add x30, x7, x22 + fmla v2.4s, v23.4s, v19.s[1] + fmla v0.4s, v23.4s, v18.s[1] + fmla v4.4s, v23.4s, v17.s[1] + fmla v6.4s, v23.4s, v16.s[1] + fmla v3.4s, v22.4s, v19.s[1] + fmla v1.4s, v22.4s, v18.s[1] + fmla v5.4s, v22.4s, v17.s[1] + fmla v7.4s, v22.4s, v16.s[1] + fmla v2.4s, v20.4s, v19.s[2] + ldp q22, q23, [x5, #-32] + fmla v0.4s, v20.4s, v18.s[2] + fmla v4.4s, v20.4s, v17.s[2] + fmla v6.4s, v20.4s, v16.s[2] + fmla v3.4s, v21.4s, v19.s[2] + fmla v1.4s, v21.4s, v18.s[2] + fmla v5.4s, v21.4s, v17.s[2] + fmla v7.4s, v21.4s, v16.s[2] + ldp q20, q21, [x5], #128 + prfm pldl1keep, [x6] + fmla v2.4s, v23.4s, v19.s[3] + fmla v0.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v17.s[3] + fmla v6.4s, v23.4s, v16.s[3] + fmla v3.4s, v22.4s, v19.s[3] + ldur q19, [x6, #-16] + prfm pldl1keep, [x7] + fmla v1.4s, v22.4s, v18.s[3] + ldur q18, [x7, #-16] + add x7, x30, x22 + prfm pldl1keep, [x30] + add x6, x6, #16 + fmla v5.4s, v22.4s, v17.s[3] + ldur q17, [x30, #-16] + prfm pldl1keep, [x7] + fmla v7.4s, v22.4s, v16.s[3] + ldur q16, [x7, #-16] + cmp x4, x29 + b.lt .LBB0_39 +.LBB0_40: // in Loop: Header=BB0_37 Depth=2 + ldp q22, q23, [x13] + fmla v3.4s, v20.4s, v19.s[0] + fmla v2.4s, v21.4s, v19.s[0] + fmla v0.4s, v21.4s, v18.s[0] + fmla v1.4s, v20.4s, v18.s[0] + ldr x6, [sp, #432] // 8-byte Folded Reload + mov x13, x27 + fmla v4.4s, v21.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + fmla v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q21, q20, [x12] + mov x12, x15 + fmla v2.4s, v23.4s, v19.s[1] + fmla v0.4s, v23.4s, v18.s[1] + fmla v4.4s, v23.4s, v17.s[1] + fmla v6.4s, v23.4s, v16.s[1] + fmla v3.4s, v22.4s, v19.s[1] + fmla 
v1.4s, v22.4s, v18.s[1] + fmla v5.4s, v22.4s, v17.s[1] + fmla v7.4s, v22.4s, v16.s[1] + fmla v2.4s, v20.4s, v19.s[2] + ldp q22, q23, [x11] + fmla v0.4s, v20.4s, v18.s[2] + mov x11, x14 + fmla v4.4s, v20.4s, v17.s[2] + fmla v6.4s, v20.4s, v16.s[2] + fmla v3.4s, v21.4s, v19.s[2] + fmla v1.4s, v21.4s, v18.s[2] + fmla v5.4s, v21.4s, v17.s[2] + fmla v7.4s, v21.4s, v16.s[2] + fmla v2.4s, v23.4s, v19.s[3] + fmla v0.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v17.s[3] + fmla v6.4s, v23.4s, v16.s[3] + fmla v3.4s, v22.4s, v19.s[3] + fmla v1.4s, v22.4s, v18.s[3] + fmla v5.4s, v22.4s, v17.s[3] + fmla v7.4s, v22.4s, v16.s[3] + cmp x27, x21 + b.ge .LBB0_36 + .p2align 2 +.LBB0_41: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_37 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x12, x22 + prfm pldl1keep, [x11] + ldp q16, q17, [x11, #-32] + prfm pldl1keep, [x12] + ldur s18, [x12, #-4] + add x13, x13, #1 + add x12, x12, #4 + prfm pldl1keep, [x4] + ldur s19, [x4, #-4] + add x4, x4, x22 + add x11, x11, #32 + prfm pldl1keep, [x4] + ldur s20, [x4, #-4] + add x4, x4, x22 + fmla v2.4s, v17.4s, v18.s[0] + prfm pldl1keep, [x4] + ldur s21, [x4, #-4] + fmla v3.4s, v16.4s, v18.s[0] + fmla v0.4s, v17.4s, v19.s[0] + fmla v1.4s, v16.4s, v19.s[0] + fmla v4.4s, v17.4s, v20.s[0] + fmla v5.4s, v16.4s, v20.s[0] + fmla v6.4s, v17.4s, v21.s[0] + fmla v7.4s, v16.4s, v21.s[0] + cmp x13, x21 + b.lt .LBB0_41 + b .LBB0_36 + .p2align 2 +.LBB0_42: // in Loop: Header=BB0_4 Depth=1 + ldr x14, [sp, #504] // 8-byte Folded Reload + ldr x15, [sp, #440] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_48 +// %bb.43: // in Loop: Header=BB0_4 Depth=1 + ldr x1, [sp, #504] // 8-byte Folded Reload + ldr x18, [sp, #416] // 8-byte Folded Reload + mov x16, xzr + mul x14, x1, x28 + add x17, x1, #1 + mul x1, x1, x19 + ldp q6, q7, [x8] + madd x15, x17, x28, x9 + add x14, x9, x14 + add x14, x14, x18 + add x15, x15, x18 + ldr x18, [sp, #448] // 8-byte Folded Reload + add x14, x6, x14, lsl #2 + add x15, x6, x15, lsl 
#2 + mul x18, x20, x18 + ldp q1, q0, [x14] + ldp q3, q2, [x15] + madd x17, x17, x19, x18 + add x1, x18, x1 + lsl x1, x1, #2 + lsl x17, x17, #2 + ldr q5, [x25, x1] + ldr q4, [x25, x17] + ldp x18, x1, [sp, #288] // 16-byte Folded Reload + mov x17, x10 + cmp xzr, x29 + b.ge .LBB0_45 + .p2align 2 +.LBB0_44: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x7, x17, #32 + ldr x4, [sp, #456] // 8-byte Folded Reload + fmla v1.4s, v6.4s, v5.s[0] + fmla v0.4s, v7.4s, v5.s[0] + prfm pldl1keep, [x7] + ldp q16, q17, [x17, #-96] + fmla v2.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q7, q6, [x17, #-64] + add x6, x17, #96 + prfm pldl1keep, [x6] + add x16, x16, #4 + add x2, x1, x4 + add x4, x18, x4 + add x1, x1, #16 + add x18, x18, #16 + fmla v0.4s, v17.4s, v5.s[1] + fmla v2.4s, v17.4s, v4.s[1] + add x3, x2, #32 + add x5, x4, #32 + fmla v1.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + ldp q16, q17, [x17, #-32] + fmla v0.4s, v6.4s, v5.s[2] + fmla v2.4s, v6.4s, v4.s[2] + fmla v1.4s, v7.4s, v5.s[2] + fmla v3.4s, v7.4s, v4.s[2] + fmla v0.4s, v17.4s, v5.s[3] + fmla v2.4s, v17.4s, v4.s[3] + ldp q6, q7, [x17], #128 + prfm pldl1keep, [x5] + fmla v1.4s, v16.4s, v5.s[3] + ldr q5, [x4, #16] + prfm pldl1keep, [x3] + fmla v3.4s, v16.4s, v4.s[3] + ldr q4, [x2, #16] + cmp x16, x29 + b.lt .LBB0_44 +.LBB0_45: // in Loop: Header=BB0_4 Depth=1 + ldp q16, q17, [x13] + fmla v1.4s, v6.4s, v5.s[0] + fmla v0.4s, v7.4s, v5.s[0] + fmla v2.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q7, q6, [x12] + ldr x18, [sp, #72] // 8-byte Folded Reload + mov x16, xzr + mov x17, xzr + mov x1, x27 + fmla v0.4s, v17.4s, v5.s[1] + fmla v2.4s, v17.4s, v4.s[1] + add x18, x8, x18 + fmla v1.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + ldp q16, q17, [x11] + fmla v0.4s, v6.4s, v5.s[2] + fmla v2.4s, v6.4s, v4.s[2] + fmla v1.4s, v7.4s, v5.s[2] + fmla v3.4s, v7.4s, v4.s[2] + fmla v0.4s, v17.4s, v5.s[3] + fmla v2.4s, v17.4s, v4.s[3] + fmla v1.4s, v16.4s, v5.s[3] + fmla v3.4s, 
v16.4s, v4.s[3] + cmp x27, x21 + b.ge .LBB0_47 + .p2align 2 +.LBB0_46: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + ldr x6, [sp, #240] // 8-byte Folded Reload + ldr x7, [sp, #312] // 8-byte Folded Reload + add x4, x18, x17, lsl #3 + add x5, x18, x16 + add x1, x1, #1 + add x16, x16, #32 + add x4, x4, #32 + prfm pldl1keep, [x4] + ldp q4, q5, [x5] + add x2, x6, x17 + add x3, x7, x17 + add x2, x2, #4 + add x3, x3, #4 + prfm pldl1keep, [x3] + ldr s6, [x7, x17] + prfm pldl1keep, [x2] + fmla v0.4s, v5.4s, v6.s[0] + ldr s7, [x6, x17] + fmla v1.4s, v4.4s, v6.s[0] + fmla v2.4s, v5.4s, v7.s[0] + fmla v3.4s, v4.4s, v7.s[0] + add x17, x17, #4 + cmp x1, x21 + b.lt .LBB0_46 +.LBB0_47: // in Loop: Header=BB0_4 Depth=1 + ldr x6, [sp, #432] // 8-byte Folded Reload + stp q1, q0, [x14] + stp q3, q2, [x15] +.LBB0_48: // in Loop: Header=BB0_4 Depth=1 + ldr x14, [sp, #368] // 8-byte Folded Reload + ldr x15, [sp, #440] // 8-byte Folded Reload + cmp x15, x14 + b.ge .LBB0_54 +// %bb.49: // in Loop: Header=BB0_4 Depth=1 + ldp x17, x16, [sp, #440] // 16-byte Folded Reload + ldp q4, q3, [x8] + ldr x18, [sp, #376] // 8-byte Folded Reload + mov x14, xzr + mul x15, x17, x28 + add x9, x9, x15 + ldr x15, [sp, #416] // 8-byte Folded Reload + add x9, x9, x15 + mul x15, x17, x19 + madd x15, x20, x16, x15 + add x9, x6, x9, lsl #2 + ldp q1, q0, [x9] + lsl x15, x15, #2 + ldr q2, [x25, x15] + ldr x15, [sp, #304] // 8-byte Folded Reload + cmp xzr, x29 + b.ge .LBB0_51 + .p2align 2 +.LBB0_50: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x17, x10, #32 + fmla v1.4s, v4.4s, v2.s[0] + fmla v0.4s, v3.4s, v2.s[0] + add x16, x10, #96 + prfm pldl1keep, [x17] + ldp q5, q6, [x10, #-96] + add x14, x14, #4 + ldp q4, q3, [x10, #-64] + prfm pldl1keep, [x16] + fmla v0.4s, v6.4s, v2.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldp q5, q6, [x10, #-32] + prfm pldl1keep, [x15] + fmla v0.4s, v3.4s, v2.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v6.4s, v2.s[3] + fmla v1.4s, 
v5.4s, v2.s[3] + ldur q2, [x15, #-16] + ldp q4, q3, [x10], #128 + add x15, x15, #16 + cmp x14, x29 + b.lt .LBB0_50 +.LBB0_51: // in Loop: Header=BB0_4 Depth=1 + ldp q5, q6, [x13] + fmla v1.4s, v4.4s, v2.s[0] + fmla v0.4s, v3.4s, v2.s[0] + ldp q4, q3, [x12] + mov x10, xzr + mov x14, xzr + fmla v0.4s, v6.4s, v2.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldp q5, q6, [x11] + ldr x11, [sp, #72] // 8-byte Folded Reload + fmla v0.4s, v3.4s, v2.s[2] + fmla v1.4s, v4.4s, v2.s[2] + add x8, x8, x11 + mov x11, x27 + fmla v0.4s, v6.4s, v2.s[3] + fmla v1.4s, v5.4s, v2.s[3] + cmp x27, x21 + b.ge .LBB0_53 + .p2align 2 +.LBB0_52: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x8, x14, lsl #3 + add x12, x18, x14 + add x15, x8, x10 + add x11, x11, #1 + add x12, x12, #4 + add x10, x10, #32 + add x13, x13, #32 + prfm pldl1keep, [x13] + ldp q2, q3, [x15] + prfm pldl1keep, [x12] + ldr s4, [x18, x14] + add x14, x14, #4 + fmla v0.4s, v3.4s, v4.s[0] + fmla v1.4s, v2.4s, v4.s[0] + cmp x11, x21 + b.lt .LBB0_52 +.LBB0_53: // in Loop: Header=BB0_4 Depth=1 + stp q1, q0, [x9] +.LBB0_54: // in Loop: Header=BB0_4 Depth=1 + bl free + ldr x8, [sp, #448] // 8-byte Folded Reload + ldr x30, [sp, #280] // 8-byte Folded Reload + ldr x1, [sp, #176] // 8-byte Folded Reload + cmp x30, x1 + b.ge .LBB0_31 +.LBB0_55: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #88] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #360] // 8-byte Folded Reload + ldr x5, [sp, #280] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldp x13, x6, [sp, #424] // 16-byte Folded Reload + ldr x15, [sp, #200] // 8-byte Folded Reload + mul x9, x20, x8 + ldr x16, [sp, #400] // 8-byte Folded Reload + add x8, x9, x5 + lsl x12, x8, #2 + ldr q0, [x6, x12] + add x12, x8, x28 + lsl x12, x12, #2 + ldr q1, [x6, x12] + add x12, x8, x13 + lsl x12, x12, #2 + ldr q2, [x6, x12] + add x12, x13, x28 + add x8, x8, x12 + lsl x8, x8, #2 + ldr q3, [x6, x8] + add x8, x0, #63 + and x8, x8, 
#0xffffffffffffffc0 + cmp xzr, x21 + b.ge .LBB0_57 + .p2align 2 +.LBB0_56: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x16, x10 + add x12, x15, x10 + add x10, x10, #4 + prfm pldl1keep, [x13] + ldur s4, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s5, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s6, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s7, [x13, #-4] + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x12, x12, x26 + sub x13, x12, #4 + prfm pldl1keep, [x12] + add x12, x12, x26 + prfm pldl1keep, [x12] + sub x14, x12, #4 + add x12, x12, x26 + ld1 { v16.s }[1], [x13] + prfm pldl1keep, [x12] + sub x12, x12, #4 + ld1 { v16.s }[2], [x14] + ld1 { v16.s }[3], [x12] + str q16, [x8, x11, lsl #4] + add x11, x11, #1 + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v16.4s, v5.s[0] + fmla v2.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v7.s[0] + cmp x11, x21 + b.lt .LBB0_56 +.LBB0_57: // %.preheader26 + // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #48] // 8-byte Folded Reload + ldp x12, x13, [sp, #384] // 16-byte Folded Reload + mov x1, xzr + add x10, x8, #48 + mov w15, #1 // =0x1 + mov w16, #2 // =0x2 + mov w17, #3 // =0x3 + mov w14, #4 // =0x4 + add x11, x8, x11 + b .LBB0_59 + .p2align 2 +.LBB0_58: // %.loopexit22 + // in Loop: Header=BB0_59 Depth=2 + add x13, x13, x24 + add x12, x12, x24 + mov x1, x14 + mov x14, x18 +.LBB0_59: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_61 Depth 3 + // Child Loop BB0_63 Depth 3 + madd x18, x1, x28, x9 + add x18, x18, x5 + madd x15, x15, x28, x9 + madd x16, x16, x28, x9 + madd x17, x17, x28, x9 + add x15, x15, x5 + add x16, x16, x5 + lsl x18, x18, #2 + lsl x15, x15, #2 + lsl x16, x16, #2 + str q0, [x6, x18] + str q1, [x6, x15] + add x15, x17, x5 + lsl x15, x15, #2 + str q2, [x6, x16] + str q3, [x6, x15] + ldr x15, [sp, #504] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_64 +// %bb.60: // in Loop: Header=BB0_59 
Depth=2 + madd x2, x14, x28, x9 + add x15, x14, #1 + add x16, x14, #2 + add x17, x14, #3 + madd x3, x15, x28, x9 + ldr q16, [x8] + mov x1, xzr + add x18, x14, #4 + add x2, x2, x5 + lsl x2, x2, #2 + add x3, x3, x5 + lsl x3, x3, #2 + ldr q0, [x6, x2] + madd x2, x16, x28, x9 + add x2, x2, x5 + ldr q1, [x6, x3] + madd x3, x17, x28, x9 + lsl x2, x2, #2 + ldr q2, [x6, x2] + add x2, x3, x5 + lsl x2, x2, #2 + ldr q3, [x6, x2] + ldr x2, [sp, #448] // 8-byte Folded Reload + mul x2, x20, x2 + madd x3, x14, x19, x2 + lsl x3, x3, #2 + ldr q7, [x25, x3] + madd x3, x15, x19, x2 + lsl x3, x3, #2 + ldr q6, [x25, x3] + madd x3, x16, x19, x2 + madd x2, x17, x19, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q5, [x25, x3] + ldr q4, [x25, x2] + mov x2, x10 + mov x3, x13 + cmp xzr, x29 + b.ge .LBB0_62 + .p2align 2 +.LBB0_61: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_59 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x2, #32 + fmla v0.4s, v16.4s, v7.s[0] + fmla v1.4s, v16.4s, v6.s[0] + add x1, x1, #4 + fmla v2.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, v4.s[0] + prfm pldl1keep, [x4] + add x4, x3, x22 + ldp q16, q17, [x2, #-32] + fmla v0.4s, v16.4s, v7.s[1] + fmla v1.4s, v16.4s, v6.s[1] + fmla v2.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + fmla v0.4s, v17.4s, v7.s[2] + fmla v1.4s, v17.4s, v6.s[2] + fmla v2.4s, v17.4s, v5.s[2] + fmla v3.4s, v17.4s, v4.s[2] + ldp q17, q16, [x2], #64 + prfm pldl1keep, [x3] + fmla v0.4s, v17.4s, v7.s[3] + ldur q7, [x3, #-16] + prfm pldl1keep, [x4] + fmla v1.4s, v17.4s, v6.s[3] + ldur q6, [x4, #-16] + add x4, x4, x22 + fmla v2.4s, v17.4s, v5.s[3] + fmla v3.4s, v17.4s, v4.s[3] + add x3, x3, #16 + prfm pldl1keep, [x4] + ldur q5, [x4, #-16] + add x4, x4, x22 + prfm pldl1keep, [x4] + ldur q4, [x4, #-16] + cmp x1, x29 + b.lt .LBB0_61 +.LBB0_62: // in Loop: Header=BB0_59 Depth=2 + ldp x1, x2, [sp, #488] // 16-byte Folded Reload + fmla v0.4s, v16.4s, v7.s[0] + fmla v1.4s, v16.4s, v6.s[0] + fmla v2.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, 
v4.s[0] + mov x3, x27 + ldr q17, [x8, x2, lsl #4] + ldr q16, [x8, x1, lsl #4] + ldr x1, [sp, #480] // 8-byte Folded Reload + mov x2, x12 + ldr q18, [x8, x1, lsl #4] + mov x1, x11 + fmla v0.4s, v17.4s, v7.s[1] + fmla v1.4s, v17.4s, v6.s[1] + fmla v2.4s, v17.4s, v5.s[1] + fmla v3.4s, v17.4s, v4.s[1] + fmla v0.4s, v16.4s, v7.s[2] + fmla v1.4s, v16.4s, v6.s[2] + fmla v2.4s, v16.4s, v5.s[2] + fmla v3.4s, v16.4s, v4.s[2] + fmla v0.4s, v18.4s, v7.s[3] + fmla v1.4s, v18.4s, v6.s[3] + fmla v2.4s, v18.4s, v5.s[3] + fmla v3.4s, v18.4s, v4.s[3] + cmp x27, x21 + b.ge .LBB0_58 + .p2align 2 +.LBB0_63: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_59 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x2, x22 + prfm pldl1keep, [x1] + ldur q4, [x1, #-16] + add x3, x3, #1 + prfm pldl1keep, [x2] + ldur s5, [x2, #-4] + add x2, x2, #4 + add x1, x1, #16 + prfm pldl1keep, [x4] + ldur s6, [x4, #-4] + add x4, x4, x22 + fmla v0.4s, v4.4s, v5.s[0] + prfm pldl1keep, [x4] + ldur s7, [x4, #-4] + add x4, x4, x22 + prfm pldl1keep, [x4] + ldur s16, [x4, #-4] + fmla v1.4s, v4.4s, v6.s[0] + fmla v2.4s, v4.4s, v7.s[0] + fmla v3.4s, v4.4s, v16.s[0] + cmp x3, x21 + b.lt .LBB0_63 + b .LBB0_58 + .p2align 2 +.LBB0_64: // in Loop: Header=BB0_4 Depth=1 + ldr x12, [sp, #504] // 8-byte Folded Reload + ldr x13, [sp, #440] // 8-byte Folded Reload + cmp x12, x13 + b.ge .LBB0_70 +// %bb.65: // in Loop: Header=BB0_4 Depth=1 + ldr x17, [sp, #504] // 8-byte Folded Reload + ldr x16, [sp, #448] // 8-byte Folded Reload + mov x14, xzr + mul x12, x17, x28 + add x15, x17, #1 + mul x16, x20, x16 + mul x17, x17, x19 + ldr q4, [x8] + madd x13, x15, x28, x9 + madd x15, x15, x19, x16 + add x12, x9, x12 + add x17, x16, x17 + add x12, x12, x5 + add x13, x13, x5 + lsl x17, x17, #2 + lsl x15, x15, #2 + add x12, x6, x12, lsl #2 + add x13, x6, x13, lsl #2 + ldr q3, [x25, x17] + ldr q2, [x25, x15] + ldp x16, x17, [sp, #288] // 16-byte Folded Reload + mov x15, x10 + ldr q0, [x12] + ldr q1, [x13] + cmp xzr, x29 + b.ge 
.LBB0_67 + .p2align 2 +.LBB0_66: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x4, x15, #32 + ldr x2, [sp, #456] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v3.s[0] + fmla v1.4s, v4.4s, v2.s[0] + prfm pldl1keep, [x4] + ldp q4, q5, [x15, #-32] + add x14, x14, #4 + add x18, x17, x2 + add x2, x16, x2 + add x17, x17, #16 + add x16, x16, #16 + add x1, x18, #32 + add x3, x2, #32 + fmla v0.4s, v4.4s, v3.s[1] + fmla v1.4s, v4.4s, v2.s[1] + fmla v0.4s, v5.4s, v3.s[2] + fmla v1.4s, v5.4s, v2.s[2] + ldp q5, q4, [x15], #64 + prfm pldl1keep, [x3] + fmla v0.4s, v5.4s, v3.s[3] + ldr q3, [x2, #16] + prfm pldl1keep, [x1] + fmla v1.4s, v5.4s, v2.s[3] + ldr q2, [x18, #16] + cmp x14, x29 + b.lt .LBB0_66 +.LBB0_67: // in Loop: Header=BB0_4 Depth=1 + ldp x14, x15, [sp, #488] // 16-byte Folded Reload + fmla v0.4s, v4.4s, v3.s[0] + fmla v1.4s, v4.4s, v2.s[0] + ldp x1, x18, [sp, #256] // 16-byte Folded Reload + ldr q5, [x8, x15, lsl #4] + ldr q4, [x8, x14, lsl #4] + ldr x14, [sp, #480] // 8-byte Folded Reload + mov x15, x27 + fmla v0.4s, v5.4s, v3.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldr q5, [x8, x14, lsl #4] + ldr x14, [sp, #408] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v3.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v5.4s, v3.s[3] + fmla v1.4s, v5.4s, v2.s[3] + cmp x27, x21 + b.ge .LBB0_69 + .p2align 2 +.LBB0_68: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x16, x14, x1 + add x17, x14, x18 + prfm pldl1keep, [x11] + ldur q2, [x11, #-16] + add x16, x16, #4 + add x17, x17, #4 + add x15, x15, #1 + add x11, x11, #16 + prfm pldl1keep, [x17] + ldr s3, [x14, x18] + prfm pldl1keep, [x16] + ldr s4, [x14, x1] + add x14, x14, #4 + fmla v0.4s, v2.4s, v3.s[0] + fmla v1.4s, v2.4s, v4.s[0] + cmp x15, x21 + b.lt .LBB0_68 +.LBB0_69: // in Loop: Header=BB0_4 Depth=1 + str q0, [x12] + str q1, [x13] +.LBB0_70: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #368] // 8-byte Folded Reload + ldr x12, [sp, #440] // 8-byte Folded Reload + cmp x12, 
x11 + b.ge .LBB0_76 +// %bb.71: // in Loop: Header=BB0_4 Depth=1 + ldp x14, x13, [sp, #440] // 16-byte Folded Reload + ldr q2, [x8] + mov x11, xzr + mul x12, x14, x28 + add x9, x9, x12 + mul x12, x14, x19 + ldr x14, [sp, #376] // 8-byte Folded Reload + madd x12, x20, x13, x12 + add x9, x9, x5 + add x9, x6, x9, lsl #2 + ldr q0, [x9] + lsl x12, x12, #2 + ldr q1, [x25, x12] + ldr x12, [sp, #304] // 8-byte Folded Reload + cmp xzr, x29 + b.ge .LBB0_73 + .p2align 2 +.LBB0_72: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x10, #32 + fmla v0.4s, v2.4s, v1.s[0] + add x11, x11, #4 + prfm pldl1keep, [x13] + ldp q2, q3, [x10, #-32] + fmla v0.4s, v2.4s, v1.s[1] + fmla v0.4s, v3.4s, v1.s[2] + ldp q3, q2, [x10], #64 + prfm pldl1keep, [x12] + fmla v0.4s, v3.4s, v1.s[3] + ldur q1, [x12, #-16] + add x12, x12, #16 + cmp x11, x29 + b.lt .LBB0_72 +.LBB0_73: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #496] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[0] + ldr x12, [sp, #272] // 8-byte Folded Reload + mov x10, xzr + ldr q3, [x8, x11, lsl #4] + ldr x11, [sp, #488] // 8-byte Folded Reload + fmla v0.4s, v3.4s, v1.s[1] + ldr q2, [x8, x11, lsl #4] + ldr x11, [sp, #480] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[2] + ldr q3, [x8, x11, lsl #4] + ldr x11, [sp, #24] // 8-byte Folded Reload + add x8, x8, x11 + mov w11, #16 // =0x10 + fmla v0.4s, v3.4s, v1.s[3] + add x13, x27, xzr + cmp x13, x21 + b.ge .LBB0_75 + .p2align 2 +.LBB0_74: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x8, x11 + add x11, x11, #16 + prfm pldl1keep, [x13] + ldr x13, [sp, #408] // 8-byte Folded Reload + ldr q1, [x8, x10, lsl #4] + add x13, x13, x12 + add x12, x12, #4 + prfm pldl1keep, [x13] + ldr s2, [x14, x10, lsl #2] + add x10, x10, #1 + fmla v0.4s, v1.4s, v2.s[0] + add x13, x27, x10 + cmp x13, x21 + b.lt .LBB0_74 +.LBB0_75: // in Loop: Header=BB0_4 Depth=1 + str q0, [x9] +.LBB0_76: // in Loop: Header=BB0_4 Depth=1 + bl free + ldr x8, 
[sp, #448] // 8-byte Folded Reload + ldp x10, x1, [sp, #168] // 16-byte Folded Reload + cmp x1, x10 + b.ge .LBB0_32 +.LBB0_77: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #80] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #360] // 8-byte Folded Reload + ldr x5, [sp, #176] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldp x13, x6, [sp, #424] // 16-byte Folded Reload + ldp x17, x16, [sp, #208] // 16-byte Folded Reload + ldr x18, [sp, #400] // 8-byte Folded Reload + mul x9, x20, x8 + add x8, x9, x5 + lsl x12, x8, #2 + ldr d0, [x6, x12] + add x12, x8, x28 + lsl x12, x12, #2 + ldr d1, [x6, x12] + add x12, x8, x13 + lsl x12, x12, #2 + ldr d2, [x6, x12] + add x12, x13, x28 + add x8, x8, x12 + lsl x8, x8, #2 + ldr d3, [x6, x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + cmp xzr, x21 + b.ge .LBB0_79 + .p2align 2 +.LBB0_78: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x15, x18, x10 + add x12, x16, x10 + add x14, x17, x10 + prfm pldl1keep, [x15] + ldur s4, [x15, #-4] + add x15, x15, x22 + add x13, x12, #4 + add x14, x14, #4 + prfm pldl1keep, [x15] + ldur s5, [x15, #-4] + add x15, x15, x22 + prfm pldl1keep, [x15] + ldur s6, [x15, #-4] + add x15, x15, x22 + prfm pldl1keep, [x15] + ldur s7, [x15, #-4] + prfm pldl1keep, [x14] + prfm pldl1keep, [x13] + ldr s16, [x17, x10] + add x10, x10, #4 + ld1 { v16.s }[1], [x12] + fmla v0.2s, v16.2s, v4.s[0] + str d16, [x8, x11, lsl #3] + add x11, x11, #1 + fmla v1.2s, v16.2s, v5.s[0] + fmla v2.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v7.s[0] + cmp x11, x21 + b.lt .LBB0_78 +.LBB0_79: // %.preheader25 + // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #40] // 8-byte Folded Reload + ldp x12, x13, [sp, #384] // 16-byte Folded Reload + mov x1, xzr + add x10, x8, #24 + mov w15, #1 // =0x1 + mov w16, #2 // =0x2 + mov w17, #3 // =0x3 + mov w14, #4 // =0x4 + add x11, x8, x11 + b .LBB0_81 + .p2align 2 +.LBB0_80: // %.loopexit21 + // in Loop: Header=BB0_81 Depth=2 + add x13, 
x13, x24 + add x12, x12, x24 + mov x1, x14 + mov x14, x18 +.LBB0_81: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_83 Depth 3 + // Child Loop BB0_85 Depth 3 + madd x18, x1, x28, x9 + add x18, x18, x5 + madd x15, x15, x28, x9 + madd x16, x16, x28, x9 + madd x17, x17, x28, x9 + add x15, x15, x5 + add x16, x16, x5 + lsl x18, x18, #2 + lsl x15, x15, #2 + lsl x16, x16, #2 + str d0, [x6, x18] + str d1, [x6, x15] + add x15, x17, x5 + lsl x15, x15, #2 + str d2, [x6, x16] + str d3, [x6, x15] + ldr x15, [sp, #504] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_86 +// %bb.82: // in Loop: Header=BB0_81 Depth=2 + madd x2, x14, x28, x9 + add x15, x14, #1 + add x16, x14, #2 + add x17, x14, #3 + madd x3, x15, x28, x9 + ldr d16, [x8] + mov x1, xzr + add x18, x14, #4 + add x2, x2, x5 + lsl x2, x2, #2 + add x3, x3, x5 + lsl x3, x3, #2 + ldr d0, [x6, x2] + madd x2, x16, x28, x9 + add x2, x2, x5 + ldr d1, [x6, x3] + madd x3, x17, x28, x9 + lsl x2, x2, #2 + ldr d2, [x6, x2] + add x2, x3, x5 + lsl x2, x2, #2 + ldr d3, [x6, x2] + ldr x2, [sp, #448] // 8-byte Folded Reload + mul x2, x20, x2 + madd x3, x14, x19, x2 + lsl x3, x3, #2 + ldr q7, [x25, x3] + madd x3, x15, x19, x2 + lsl x3, x3, #2 + ldr q6, [x25, x3] + madd x3, x16, x19, x2 + madd x2, x17, x19, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q5, [x25, x3] + ldr q4, [x25, x2] + mov x2, x10 + mov x3, x13 + cmp xzr, x29 + b.ge .LBB0_84 + .p2align 2 +.LBB0_83: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_81 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x2, #16 + fmla v0.2s, v16.2s, v7.s[0] + fmla v1.2s, v16.2s, v6.s[0] + add x1, x1, #4 + fmla v2.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + prfm pldl1keep, [x4] + add x4, x3, x22 + ldp d16, d17, [x2, #-16] + fmla v0.2s, v16.2s, v7.s[1] + fmla v1.2s, v16.2s, v6.s[1] + fmla v2.2s, v16.2s, v5.s[1] + fmla v3.2s, v16.2s, v4.s[1] + fmla v0.2s, v17.2s, v7.s[2] + fmla v1.2s, v17.2s, v6.s[2] + fmla v2.2s, v17.2s, v5.s[2] + fmla v3.2s, 
v17.2s, v4.s[2] + ldp d17, d16, [x2], #32 + prfm pldl1keep, [x3] + fmla v0.2s, v17.2s, v7.s[3] + ldur q7, [x3, #-16] + prfm pldl1keep, [x4] + fmla v1.2s, v17.2s, v6.s[3] + ldur q6, [x4, #-16] + add x4, x4, x22 + fmla v2.2s, v17.2s, v5.s[3] + fmla v3.2s, v17.2s, v4.s[3] + add x3, x3, #16 + prfm pldl1keep, [x4] + ldur q5, [x4, #-16] + add x4, x4, x22 + prfm pldl1keep, [x4] + ldur q4, [x4, #-16] + cmp x1, x29 + b.lt .LBB0_83 +.LBB0_84: // in Loop: Header=BB0_81 Depth=2 + ldp x1, x2, [sp, #488] // 16-byte Folded Reload + fmla v0.2s, v16.2s, v7.s[0] + fmla v1.2s, v16.2s, v6.s[0] + fmla v2.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + mov x3, x27 + ldr d17, [x8, x2, lsl #3] + ldr d16, [x8, x1, lsl #3] + ldr x1, [sp, #480] // 8-byte Folded Reload + mov x2, x12 + ldr d18, [x8, x1, lsl #3] + mov x1, x11 + fmla v0.2s, v17.2s, v7.s[1] + fmla v1.2s, v17.2s, v6.s[1] + fmla v2.2s, v17.2s, v5.s[1] + fmla v3.2s, v17.2s, v4.s[1] + fmla v0.2s, v16.2s, v7.s[2] + fmla v1.2s, v16.2s, v6.s[2] + fmla v2.2s, v16.2s, v5.s[2] + fmla v3.2s, v16.2s, v4.s[2] + fmla v0.2s, v18.2s, v7.s[3] + fmla v1.2s, v18.2s, v6.s[3] + fmla v2.2s, v18.2s, v5.s[3] + fmla v3.2s, v18.2s, v4.s[3] + cmp x27, x21 + b.ge .LBB0_80 + .p2align 2 +.LBB0_85: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_81 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x2, x22 + prfm pldl1keep, [x1] + ldur d4, [x1, #-8] + add x3, x3, #1 + prfm pldl1keep, [x2] + ldur s5, [x2, #-4] + add x2, x2, #4 + add x1, x1, #8 + prfm pldl1keep, [x4] + ldur s6, [x4, #-4] + add x4, x4, x22 + fmla v0.2s, v4.2s, v5.s[0] + prfm pldl1keep, [x4] + ldur s7, [x4, #-4] + add x4, x4, x22 + prfm pldl1keep, [x4] + ldur s16, [x4, #-4] + fmla v1.2s, v4.2s, v6.s[0] + fmla v2.2s, v4.2s, v7.s[0] + fmla v3.2s, v4.2s, v16.s[0] + cmp x3, x21 + b.lt .LBB0_85 + b .LBB0_80 + .p2align 2 +.LBB0_86: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #504] // 8-byte Folded Reload + ldr x12, [sp, #440] // 8-byte Folded Reload + cmp x11, x12 + b.ge .LBB0_92 +// 
%bb.87: // in Loop: Header=BB0_4 Depth=1 + ldr x16, [sp, #504] // 8-byte Folded Reload + ldr x15, [sp, #448] // 8-byte Folded Reload + mov x13, xzr + mul x11, x16, x28 + add x14, x16, #1 + mul x15, x20, x15 + mul x16, x16, x19 + ldr d4, [x8] + madd x12, x14, x28, x9 + madd x14, x14, x19, x15 + add x11, x9, x11 + add x16, x15, x16 + add x11, x11, x5 + add x12, x12, x5 + lsl x16, x16, #2 + lsl x14, x14, #2 + add x11, x6, x11, lsl #2 + add x12, x6, x12, lsl #2 + ldr q3, [x25, x16] + ldr q2, [x25, x14] + ldp x15, x16, [sp, #288] // 16-byte Folded Reload + mov x14, x10 + ldr d0, [x11] + ldr d1, [x12] + cmp xzr, x29 + b.ge .LBB0_89 + .p2align 2 +.LBB0_88: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x3, x14, #16 + ldr x1, [sp, #456] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[0] + fmla v1.2s, v4.2s, v2.s[0] + prfm pldl1keep, [x3] + ldp d4, d5, [x14, #-16] + add x13, x13, #4 + add x17, x16, x1 + add x1, x15, x1 + add x16, x16, #16 + add x15, x15, #16 + add x18, x17, #32 + add x2, x1, #32 + fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v5.2s, v3.s[2] + fmla v1.2s, v5.2s, v2.s[2] + ldp d5, d4, [x14], #32 + prfm pldl1keep, [x2] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x1, #16] + prfm pldl1keep, [x18] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x17, #16] + cmp x13, x29 + b.lt .LBB0_88 +.LBB0_89: // in Loop: Header=BB0_4 Depth=1 + ldr x15, [sp, #496] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[0] + fmla v1.2s, v4.2s, v2.s[0] + ldr x1, [sp, #240] // 8-byte Folded Reload + mov x13, xzr + mov x14, xzr + ldr d5, [x8, x15, lsl #3] + ldr x15, [sp, #488] // 8-byte Folded Reload + ldr d4, [x8, x15, lsl #3] + ldr x15, [sp, #480] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v3.s[1] + fmla v1.2s, v5.2s, v2.s[1] + ldr d5, [x8, x15, lsl #3] + ldr x15, [sp, #64] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[2] + fmla v1.2s, v4.2s, v2.s[2] + add x15, x8, x15 + fmla v0.2s, v5.2s, v3.s[3] + fmla v1.2s, v5.2s, v2.s[3] + add x16, 
x27, xzr + cmp x16, x21 + b.ge .LBB0_91 + .p2align 2 +.LBB0_90: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + ldr x2, [sp, #312] // 8-byte Folded Reload + add x18, x15, x14, lsl #3 + add x16, x1, x13 + add x16, x16, #4 + add x18, x18, #8 + prfm pldl1keep, [x18] + ldr d2, [x15, x14, lsl #3] + add x17, x2, x13 + add x13, x13, #4 + add x17, x17, #4 + prfm pldl1keep, [x17] + ldr s3, [x2, x14, lsl #2] + prfm pldl1keep, [x16] + fmla v0.2s, v2.2s, v3.s[0] + ldr s4, [x1, x14, lsl #2] + fmla v1.2s, v2.2s, v4.s[0] + add x14, x14, #1 + add x16, x27, x14 + cmp x16, x21 + b.lt .LBB0_90 +.LBB0_91: // in Loop: Header=BB0_4 Depth=1 + str d0, [x11] + str d1, [x12] +.LBB0_92: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #368] // 8-byte Folded Reload + ldr x12, [sp, #440] // 8-byte Folded Reload + cmp x12, x11 + b.ge .LBB0_98 +// %bb.93: // in Loop: Header=BB0_4 Depth=1 + ldp x14, x13, [sp, #440] // 16-byte Folded Reload + ldr d2, [x8] + mov x11, xzr + mul x12, x14, x28 + add x9, x9, x12 + mul x12, x14, x19 + ldr x14, [sp, #376] // 8-byte Folded Reload + madd x12, x20, x13, x12 + add x9, x9, x5 + add x9, x6, x9, lsl #2 + ldr d0, [x9] + lsl x12, x12, #2 + ldr q1, [x25, x12] + ldr x12, [sp, #304] // 8-byte Folded Reload + cmp xzr, x29 + b.ge .LBB0_95 + .p2align 2 +.LBB0_94: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x10, #16 + fmla v0.2s, v2.2s, v1.s[0] + add x11, x11, #4 + prfm pldl1keep, [x13] + ldp d2, d3, [x10, #-16] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v3.2s, v1.s[2] + ldp d3, d2, [x10], #32 + prfm pldl1keep, [x12] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x12, #-16] + add x12, x12, #16 + cmp x11, x29 + b.lt .LBB0_94 +.LBB0_95: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #496] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[0] + mov x10, xzr + ldr d3, [x8, x11, lsl #3] + ldr x11, [sp, #488] // 8-byte Folded Reload + fmla v0.2s, v3.2s, v1.s[1] + ldr d4, [x8, x11, lsl #3] + ldr x11, [sp, #480] // 8-byte 
Folded Reload + fmla v0.2s, v4.2s, v1.s[2] + ldr d2, [x8, x11, lsl #3] + ldr x11, [sp, #64] // 8-byte Folded Reload + add x8, x8, x11 + ldr x11, [sp, #272] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[3] + add x12, x27, xzr + cmp x12, x21 + b.ge .LBB0_97 + .p2align 2 +.LBB0_96: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x8, x10, lsl #3 + add x12, x12, #8 + prfm pldl1keep, [x12] + ldr x12, [sp, #408] // 8-byte Folded Reload + ldr d1, [x8, x10, lsl #3] + add x12, x12, x11 + add x11, x11, #4 + prfm pldl1keep, [x12] + ldr s2, [x14, x10, lsl #2] + add x10, x10, #1 + fmla v0.2s, v1.2s, v2.s[0] + add x12, x27, x10 + cmp x12, x21 + b.lt .LBB0_96 +.LBB0_97: // in Loop: Header=BB0_4 Depth=1 + str d0, [x9] +.LBB0_98: // in Loop: Header=BB0_4 Depth=1 + bl free + ldr x8, [sp, #448] // 8-byte Folded Reload + ldr x10, [sp, #168] // 8-byte Folded Reload + ldr x9, [sp, #136] // 8-byte Folded Reload + cmp x10, x9 + b.ge .LBB0_3 +.LBB0_99: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #112] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #360] // 8-byte Folded Reload + ldr x5, [sp, #168] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldp x13, x14, [sp, #424] // 16-byte Folded Reload + mul x9, x20, x8 + add x12, x9, x5 + add x8, x12, x13 + add x13, x13, x28 + ldr s2, [x14, x12, lsl #2] + add x13, x12, x13 + ldr s1, [x14, x8, lsl #2] + add x8, x0, #63 + ldr s0, [x14, x13, lsl #2] + add x13, x12, x28 + ldr x12, [sp, #224] // 8-byte Folded Reload + and x8, x8, #0xffffffffffffffc0 + ldr s3, [x14, x13, lsl #2] + ldr x14, [sp, #400] // 8-byte Folded Reload + cmp xzr, x21 + b.ge .LBB0_101 + .p2align 2 +.LBB0_100: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x14, x10 + add x11, x11, #1 + prfm pldl1keep, [x13] + ldur s4, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s5, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s6, [x13, #-4] + add x13, x13, x22 + 
prfm pldl1keep, [x13] + ldur s7, [x13, #-4] + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x12, x12, #4 + fmla v2.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + fmla v1.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + str s16, [x8, x10] + add x10, x10, #4 + cmp x11, x21 + b.lt .LBB0_100 +.LBB0_101: // %.preheader24 + // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #32] // 8-byte Folded Reload + ldp x12, x13, [sp, #384] // 16-byte Folded Reload + mov x1, xzr + ldp x7, x6, [sp, #184] // 16-byte Folded Reload + add x10, x8, #12 + mov w16, #1 // =0x1 + mov w17, #2 // =0x2 + mov w15, #3 // =0x3 + mov w14, #4 // =0x4 + add x11, x8, x11 + b .LBB0_103 + .p2align 2 +.LBB0_102: // %.loopexit20 + // in Loop: Header=BB0_103 Depth=2 + add x13, x13, x24 + add x12, x12, x24 + mov x1, x14 + mov x14, x18 +.LBB0_103: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_105 Depth 3 + // Child Loop BB0_107 Depth 3 + madd x18, x1, x28, x9 + ldr x30, [sp, #432] // 8-byte Folded Reload + add x18, x18, x5 + madd x16, x16, x28, x9 + madd x17, x17, x28, x9 + madd x15, x15, x28, x9 + add x16, x16, x5 + add x15, x15, x5 + str s2, [x30, x18, lsl #2] + str s3, [x30, x16, lsl #2] + add x16, x17, x5 + str s1, [x30, x16, lsl #2] + str s0, [x30, x15, lsl #2] + ldr x15, [sp, #504] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_108 +// %bb.104: // in Loop: Header=BB0_103 Depth=2 + madd x2, x14, x28, x9 + add x15, x14, #3 + add x16, x14, #1 + add x17, x14, #2 + madd x3, x16, x28, x9 + ldr s16, [x8] + mov x1, xzr + add x18, x14, #4 + madd x4, x17, x28, x9 + add x2, x2, x5 + ldr s2, [x30, x2, lsl #2] + madd x2, x15, x28, x9 + add x3, x3, x5 + add x4, x4, x5 + add x2, x2, x5 + ldr s3, [x30, x3, lsl #2] + ldr s1, [x30, x4, lsl #2] + ldr s0, [x30, x2, lsl #2] + ldr x2, [sp, #448] // 8-byte Folded Reload + mul x2, x20, x2 + madd x3, x14, x19, x2 + lsl x3, x3, #2 + ldr q7, [x25, x3] + madd x3, x16, x19, x2 + lsl x3, x3, #2 + ldr q6, [x25, x3] + madd x3, x17, x19, x2 
+ madd x2, x15, x19, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q5, [x25, x3] + ldr q4, [x25, x2] + mov x2, x10 + mov x3, x13 + ext v20.16b, v7.16b, v7.16b, #8 + cmp xzr, x29 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.ge .LBB0_106 + .p2align 2 +.LBB0_105: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_103 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x2, #8 + fmla v2.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + add x1, x1, #4 + fmla v1.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + prfm pldl1keep, [x4] + add x4, x3, x22 + ldp s16, s21, [x2, #-8] + fmla v0.2s, v16.2s, v4.s[1] + fmla v2.2s, v16.2s, v7.s[1] + fmla v3.2s, v16.2s, v6.s[1] + fmla v1.2s, v16.2s, v5.s[1] + fmla v0.2s, v21.2s, v17.2s + fmla v2.2s, v21.2s, v20.2s + ldp s17, s16, [x2], #16 + fmla v3.2s, v21.2s, v19.2s + fmla v1.2s, v21.2s, v18.2s + prfm pldl1keep, [x3] + fmla v2.2s, v17.2s, v7.s[3] + ldur q7, [x3, #-16] + prfm pldl1keep, [x4] + fmla v3.2s, v17.2s, v6.s[3] + ldur q6, [x4, #-16] + add x4, x4, x22 + fmla v1.2s, v17.2s, v5.s[3] + fmla v0.2s, v17.2s, v4.s[3] + add x3, x3, #16 + prfm pldl1keep, [x4] + ldur q5, [x4, #-16] + add x4, x4, x22 + prfm pldl1keep, [x4] + ldur q4, [x4, #-16] + ext v20.16b, v7.16b, v7.16b, #8 + cmp x1, x29 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.lt .LBB0_105 +.LBB0_106: // in Loop: Header=BB0_103 Depth=2 + ldp x1, x2, [sp, #488] // 16-byte Folded Reload + fmla v2.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + fmla v1.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + mov x3, x27 + ldr s21, [x8, x2, lsl #2] + ldr s16, [x8, x1, lsl #2] + ldr x1, [sp, #480] // 8-byte Folded Reload + mov x2, x12 + ldr s22, [x8, x1, lsl #2] + mov x1, x11 + fmla v2.2s, v21.2s, v7.s[1] + fmla v3.2s, v21.2s, v6.s[1] + fmla v1.2s, v21.2s, v5.s[1] + fmla v0.2s, v21.2s, v4.s[1] + fmla v2.2s, v16.2s, v20.2s + fmla v3.2s, v16.2s, v19.2s + fmla v1.2s, v16.2s, 
v18.2s + fmla v0.2s, v16.2s, v17.2s + fmla v2.2s, v22.2s, v7.s[3] + fmla v3.2s, v22.2s, v6.s[3] + fmla v1.2s, v22.2s, v5.s[3] + fmla v0.2s, v22.2s, v4.s[3] + cmp x27, x21 + b.ge .LBB0_102 + .p2align 2 +.LBB0_107: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_103 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x2, x22 + prfm pldl1keep, [x1] + ldur s4, [x1, #-4] + add x3, x3, #1 + prfm pldl1keep, [x2] + ldur s5, [x2, #-4] + add x2, x2, #4 + add x1, x1, #4 + prfm pldl1keep, [x4] + ldur s6, [x4, #-4] + add x4, x4, x22 + fmla v2.2s, v4.2s, v5.2s + prfm pldl1keep, [x4] + ldur s7, [x4, #-4] + add x4, x4, x22 + prfm pldl1keep, [x4] + ldur s16, [x4, #-4] + fmla v3.2s, v4.2s, v6.2s + fmla v1.2s, v4.2s, v7.2s + fmla v0.2s, v4.2s, v16.2s + cmp x3, x21 + b.lt .LBB0_107 + b .LBB0_102 + .p2align 2 +.LBB0_108: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #504] // 8-byte Folded Reload + ldr x12, [sp, #440] // 8-byte Folded Reload + cmp x11, x12 + b.ge .LBB0_114 +// %bb.109: // in Loop: Header=BB0_4 Depth=1 + ldr x16, [sp, #504] // 8-byte Folded Reload + ldr x12, [sp, #448] // 8-byte Folded Reload + mov x13, xzr + mov x14, xzr + ldr s4, [x8] + mul x12, x20, x12 + mul x15, x16, x19 + mul x11, x16, x28 + add x15, x12, x15 + add x11, x9, x11 + lsl x15, x15, #2 + add x11, x11, x5 + ldr q2, [x25, x15] + add x15, x16, #1 + madd x16, x15, x19, x12 + madd x12, x15, x28, x9 + ldr x15, [sp, #432] // 8-byte Folded Reload + add x12, x12, x5 + ldr s1, [x15, x11, lsl #2] + ldr s0, [x15, x12, lsl #2] + lsl x15, x16, #2 + ldr q3, [x25, x15] + ext v6.16b, v2.16b, v2.16b, #8 + cmp xzr, x29 + ext v5.16b, v3.16b, v3.16b, #8 + b.ge .LBB0_111 + .p2align 2 +.LBB0_110: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x1, x8, x13 + fmla v1.2s, v4.2s, v2.2s + fmla v0.2s, v4.2s, v3.2s + add x15, x6, x13 + add x2, x1, #20 + add x17, x7, x13 + add x16, x15, #32 + add x18, x17, #32 + prfm pldl1keep, [x2] + ldp s4, s7, [x1, #4] + add x14, x14, #4 + add x13, x13, #16 + 
fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v7.2s, v5.2s + ldp s5, s4, [x1, #12] + fmla v1.2s, v7.2s, v6.2s + prfm pldl1keep, [x18] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x17, #16] + prfm pldl1keep, [x16] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x15, #16] + ext v6.16b, v2.16b, v2.16b, #8 + cmp x14, x29 + ext v5.16b, v3.16b, v3.16b, #8 + b.lt .LBB0_110 +.LBB0_111: // in Loop: Header=BB0_4 Depth=1 + ldp x14, x15, [sp, #488] // 16-byte Folded Reload + fmla v1.2s, v4.2s, v2.2s + fmla v0.2s, v4.2s, v3.2s + ldr x1, [sp, #240] // 8-byte Folded Reload + ldr x3, [sp, #432] // 8-byte Folded Reload + mov x13, xzr + ldr s7, [x8, x15, lsl #2] + ldr s4, [x8, x14, lsl #2] + ldr x14, [sp, #480] // 8-byte Folded Reload + mov x15, x27 + fmla v1.2s, v7.2s, v2.s[1] + fmla v0.2s, v7.2s, v3.s[1] + ldr s7, [x8, x14, lsl #2] + ldr x14, [sp, #104] // 8-byte Folded Reload + fmla v1.2s, v4.2s, v6.2s + fmla v0.2s, v4.2s, v5.2s + add x14, x8, x14 + fmla v1.2s, v7.2s, v2.s[3] + fmla v0.2s, v7.2s, v3.s[3] + cmp x27, x21 + b.ge .LBB0_113 + .p2align 2 +.LBB0_112: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + ldr x2, [sp, #312] // 8-byte Folded Reload + add x16, x1, x13 + add x18, x14, x13 + add x15, x15, #1 + add x16, x16, #4 + add x18, x18, #4 + prfm pldl1keep, [x18] + add x17, x2, x13 + ldr s2, [x14, x13] + add x17, x17, #4 + prfm pldl1keep, [x17] + prfm pldl1keep, [x16] + ldr s3, [x2, x13] + fmla v1.2s, v2.2s, v3.2s + ldr s3, [x1, x13] + add x13, x13, #4 + fmla v0.2s, v2.2s, v3.2s + cmp x15, x21 + b.lt .LBB0_112 +.LBB0_113: // in Loop: Header=BB0_4 Depth=1 + str s1, [x3, x11, lsl #2] + str s0, [x3, x12, lsl #2] +.LBB0_114: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #368] // 8-byte Folded Reload + ldr x12, [sp, #440] // 8-byte Folded Reload + cmp x12, x11 + b.ge .LBB0_2 +// %bb.115: // in Loop: Header=BB0_4 Depth=1 + ldp x15, x13, [sp, #432] // 16-byte Folded Reload + ldr s2, [x8] + mov x11, xzr + ldr x14, [sp, #376] // 8-byte Folded 
Reload + mul x12, x13, x28 + add x9, x9, x12 + mul x12, x13, x19 + ldr x13, [sp, #448] // 8-byte Folded Reload + add x9, x9, x5 + ldr s0, [x15, x9, lsl #2] + madd x12, x20, x13, x12 + lsl x12, x12, #2 + ldr q1, [x25, x12] + ldr x12, [sp, #304] // 8-byte Folded Reload + ext v3.16b, v1.16b, v1.16b, #8 + cmp xzr, x29 + b.ge .LBB0_117 + .p2align 2 +.LBB0_116: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x10, #8 + fmla v0.2s, v2.2s, v1.2s + add x11, x11, #4 + prfm pldl1keep, [x13] + ldp s2, s4, [x10, #-8] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v4.2s, v3.2s + ldp s3, s2, [x10], #16 + prfm pldl1keep, [x12] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x12, #-16] + add x12, x12, #16 + ext v3.16b, v1.16b, v1.16b, #8 + cmp x11, x29 + b.lt .LBB0_116 +.LBB0_117: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #496] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.2s + mov x10, xzr + ldr s4, [x8, x11, lsl #2] + ldr x11, [sp, #488] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[1] + ldr s5, [x8, x11, lsl #2] + ldr x11, [sp, #480] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v3.2s + ldr s2, [x8, x11, lsl #2] + ldr x11, [sp, #104] // 8-byte Folded Reload + add x8, x8, x11 + mov x11, x27 + fmla v0.2s, v2.2s, v1.s[3] + cmp x27, x21 + b.ge .LBB0_1 + .p2align 2 +.LBB0_118: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x14, x10 + add x13, x8, x10 + add x11, x11, #1 + add x12, x12, #4 + add x13, x13, #4 + prfm pldl1keep, [x13] + ldr s1, [x8, x10] + prfm pldl1keep, [x12] + ldr s2, [x14, x10] + add x10, x10, #4 + fmla v0.2s, v1.2s, v2.2s + cmp x11, x21 + b.lt .LBB0_118 + b .LBB0_1 +.LBB0_119: + ldr x0, [sp, #16] // 8-byte Folded Reload + bl free + add sp, sp, #512 + ldp d9, d8, [sp, #48] // 16-byte Folded Reload + ldp d11, d10, [sp, #32] // 16-byte Folded Reload + ldp d13, d12, [sp, #16] // 16-byte Folded Reload + ldp x20, x19, [sp, #144] // 16-byte Folded Reload + ldp x22, x21, [sp, #128] // 16-byte Folded Reload + 
ldp x24, x23, [sp, #112] // 16-byte Folded Reload + ldp x26, x25, [sp, #96] // 16-byte Folded Reload + ldp x28, x27, [sp, #80] // 16-byte Folded Reload + ldp x29, x30, [sp, #64] // 16-byte Folded Reload + ldp d15, d14, [sp], #160 // 16-byte Folded Reload + ret +.Lfunc_end0: + .size sbatch_matmul_3d_nt_mlir, .Lfunc_end0-sbatch_matmul_3d_nt_mlir + .cfi_endproc + // -- End function + .section ".note.GNU-stack","",@progbits diff --git a/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nn_mlir.s b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nn_mlir.s new file mode 100644 index 00000000000000..96e02991c200d9 --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nn_mlir.s @@ -0,0 +1,4171 @@ + .text + .file "LLVMDialectModule" + .globl sbatch_matmul_4d_nn_mlir // -- Begin function sbatch_matmul_4d_nn_mlir + .p2align 4 + .type sbatch_matmul_4d_nn_mlir,@function +sbatch_matmul_4d_nn_mlir: // @sbatch_matmul_4d_nn_mlir + .cfi_startproc +// %bb.0: + stp d15, d14, [sp, #-160]! 
// 16-byte Folded Spill + stp d13, d12, [sp, #16] // 16-byte Folded Spill + stp x29, x30, [sp, #64] // 16-byte Folded Spill + stp x28, x27, [sp, #80] // 16-byte Folded Spill + stp x26, x25, [sp, #96] // 16-byte Folded Spill + stp x24, x23, [sp, #112] // 16-byte Folded Spill + stp x22, x21, [sp, #128] // 16-byte Folded Spill + stp x20, x19, [sp, #144] // 16-byte Folded Spill + stp d11, d10, [sp, #32] // 16-byte Folded Spill + stp d9, d8, [sp, #48] // 16-byte Folded Spill + sub sp, sp, #1312 + .cfi_def_cfa_offset 1472 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -136 + .cfi_offset b13, -144 + .cfi_offset b14, -152 + .cfi_offset b15, -160 + cmp x5, #0 + ldr x13, [sp, #1544] + ldr x29, [sp, #1656] + mov x20, x6 + cinv x8, x5, lt + ldr x23, [sp, #1568] + ldr x27, [sp, #1512] + mov x21, x1 + add x9, x8, x8, lsr #63 + add x10, x8, #3 + ldr x24, [sp, #1504] + ldr x28, [sp, #1480] + str x7, [sp, #1056] // 8-byte Folded Spill + str x4, [sp, #520] // 8-byte Folded Spill + asr x9, x9, #1 + str x3, [sp, #40] // 8-byte Folded Spill + str x2, [sp, #960] // 8-byte Folded Spill + cinv x9, x9, lt + cmp x8, #0 + str x5, [sp, #1024] // 8-byte Folded Spill + str x13, [sp, #504] // 8-byte Folded Spill + csel x8, x10, x8, lt + str x9, [sp, #1280] // 8-byte Folded Spill + ldr x9, [sp, #1552] + cmp x5, #0 + ldr x10, [sp, #1600] + asr x8, x8, #2 + cinv x22, x8, lt + cmp x13, #0 + cinv x8, x13, lt + str x9, [sp, #1048] // 8-byte Folded Spill + ldr x9, [sp, #1648] + str x10, [sp, #944] // 8-byte Folded Spill + ldr x10, [sp, #1592] + add x11, x8, #7 + add x12, x8, #3 + str x9, [sp, #1016] // 8-byte Folded Spill + ldr x9, [sp, #1640] + str x10, [sp, #936] // 
8-byte Folded Spill + add x10, x8, #15 + str x9, [sp, #1008] // 8-byte Folded Spill + add x9, x8, x8, lsr #63 + asr x9, x9, #1 + cinv x14, x9, lt + ldr x9, [sp, #1560] + cmp x8, #0 + str x14, [sp, #1272] // 8-byte Folded Spill + str x9, [sp, #1040] // 8-byte Folded Spill + csel x9, x10, x8, lt + csel x10, x11, x8, lt + csel x8, x12, x8, lt + cmp x13, #0 + asr x9, x9, #4 + asr x8, x8, #2 + asr x10, x10, #3 + cinv x19, x9, lt + cinv x9, x8, lt + cinv x25, x10, lt + lsl x8, x19, #4 + str x9, [sp, #1264] // 8-byte Folded Spill + lsl x26, x25, #3 + str x8, [sp, #1104] // 8-byte Folded Spill + ldr x8, [sp, #1472] + str x8, [sp, #1032] // 8-byte Folded Spill + lsl x8, x9, #2 + str x8, [sp, #592] // 8-byte Folded Spill + lsl x8, x14, #1 + str x8, [sp, #584] // 8-byte Folded Spill + lsl x8, x6, #6 + add x0, x8, #64 + str x8, [sp, #1288] // 8-byte Folded Spill + bl malloc + ldr x16, [sp, #1280] // 8-byte Folded Reload + lsl x8, x22, #2 + mov x30, x26 + str x27, [sp, #928] // 8-byte Folded Spill + str x8, [sp, #1296] // 8-byte Folded Spill + add x8, x0, #63 + lsl x12, x27, #2 + lsl x27, x28, #2 + and x26, x8, #0xffffffffffffffc0 + lsl x6, x20, #2 + str x0, [sp, #16] // 8-byte Folded Spill + mov w14, #20 // =0x14 + madd x14, x23, x14, x12 + mov w11, #12 // =0xc + str x6, [sp, #480] // 8-byte Folded Spill + mov w17, #28 // =0x1c + lsl x9, x16, #1 + mul x3, x16, x28 + madd x17, x23, x17, x12 + mov w15, #24 // =0x18 + str x9, [sp, #1128] // 8-byte Folded Spill + negs x9, x20 + madd x15, x23, x15, x12 + str x23, [sp, #1000] // 8-byte Folded Spill + and x8, x9, #0x3 + and x9, x20, #0x3 + str x24, [sp, #920] // 8-byte Folded Spill + lsl x25, x25, #5 + csneg x7, x9, x8, mi + ldr x8, [sp, #960] // 8-byte Folded Reload + add x9, x20, x27 + lsl x19, x19, #6 + lsl x18, x7, #2 + sub x4, x26, x7, lsl #6 + stp xzr, xzr, [sp, #264] // 16-byte Folded Spill + mov x13, xzr + str x21, [sp, #952] // 8-byte Folded Spill + stp x25, x19, [sp, #464] // 16-byte Folded Spill + lsl x10, x8, #2 + sub x8, 
x6, x18 + str x30, [sp, #1080] // 8-byte Folded Spill + str x8, [sp, #512] // 8-byte Folded Spill + sub x8, x9, x7 + mov w9, #1 // =0x1 + add x2, x10, x28, lsl #3 + bfi x9, x22, #2, #62 + str x8, [sp, #1304] // 8-byte Folded Spill + mul x8, x22, x28 + mul x1, x28, x9 + add x9, x20, x3, lsl #1 + sub x9, x9, x7 + str x9, [sp, #1280] // 8-byte Folded Spill + ldr x9, [sp, #1264] // 8-byte Folded Reload + add x22, x10, x8, lsl #4 + add x5, x22, x6 + lsl x0, x9, #4 + ldr x9, [sp, #1272] // 8-byte Folded Reload + str x0, [sp, #456] // 8-byte Folded Spill + lsl x16, x9, #3 + add x9, x21, x2 + add x2, x6, x10 + str x9, [sp, #1224] // 8-byte Folded Spill + sub x9, x2, x18 + add x2, x24, x14 + madd x14, x28, x11, x10 + madd x11, x23, x11, x12 + str x9, [sp, #1272] // 8-byte Folded Spill + ldr x9, [sp, #1288] // 8-byte Folded Reload + add x9, x4, x9 + str x9, [sp, #848] // 8-byte Folded Spill + add x9, x10, x1, lsl #2 + add x1, x10, x3, lsl #3 + add x3, x24, x15 + stp x2, x3, [sp, #232] // 16-byte Folded Spill + add x4, x9, x6 + add x6, x1, x6 + str x9, [sp, #1256] // 8-byte Folded Spill + sub x9, x20, x7 + sub x8, x4, x18 + add x4, x24, x17 + str x8, [sp, #1264] // 8-byte Folded Spill + sub x8, x5, x18 + str x8, [sp, #1248] // 8-byte Folded Spill + sub x8, x6, x18 + add x6, x12, x23, lsl #5 + str x8, [sp, #1216] // 8-byte Folded Spill + add x8, x21, x14 + lsl x14, x23, #4 + str x14, [sp, #1240] // 8-byte Folded Spill + add x14, x14, x12 + str x8, [sp, #1288] // 8-byte Folded Spill + ldr x8, [sp, #512] // 8-byte Folded Reload + add x6, x24, x6 + add x18, x24, x14 + lsl x14, x23, #2 + stp x4, x6, [sp, #248] // 16-byte Folded Spill + str x14, [sp, #992] // 8-byte Folded Spill + add x14, x14, x12 + add x17, x24, x14 + add x14, x12, x23, lsl #3 + add x8, x8, #4 + stp x17, x18, [sp, #216] // 16-byte Folded Spill + add x15, x24, x14 + add x14, x24, x11 + mul x11, x23, x9 + str x8, [sp, #448] // 8-byte Folded Spill + stp x14, x15, [sp, #200] // 16-byte Folded Spill + add x11, x12, 
x11, lsl #2 + madd x12, x23, x8, x12 + ldr x8, [sp, #1272] // 8-byte Folded Reload + add x23, x24, x11 + add x5, x24, x12 + add x12, x8, x21 + ldr x8, [sp, #1304] // 8-byte Folded Reload + add x24, x21, x10 + str x9, [sp, #1304] // 8-byte Folded Spill + add x12, x12, #4 + str x5, [sp, #192] // 8-byte Folded Spill + add x5, x5, x16 + stp x23, x12, [sp, #176] // 16-byte Folded Spill + add x11, x10, x8, lsl #2 + ldr x8, [sp, #1280] // 8-byte Folded Reload + add x12, x10, x8, lsl #2 + add x10, x10, x21 + lsl x8, x28, #4 + add x10, x8, x10 + str x8, [sp, #1232] // 8-byte Folded Spill + add x8, x10, #32 + add x10, x11, x21 + ldr x11, [sp, #848] // 8-byte Folded Reload + str x8, [sp, #168] // 8-byte Folded Spill + add x8, x10, #4 + ldr x10, [sp, #1264] // 8-byte Folded Reload + str x8, [sp, #160] // 8-byte Folded Spill + ldr x8, [sp, #1256] // 8-byte Folded Reload + add x10, x21, x10 + str x10, [sp, #1168] // 8-byte Folded Spill + ldr x10, [sp, #1248] // 8-byte Folded Reload + add x8, x21, x8 + str x8, [sp, #1184] // 8-byte Folded Spill + add x8, x21, x22 + sub x22, x9, #4 + str x8, [sp, #1176] // 8-byte Folded Spill + add x10, x21, x10 + str x10, [sp, #1160] // 8-byte Folded Spill + add x10, x1, x21 + lsl x1, x20, #3 + add x10, x10, #32 + str x1, [sp, #424] // 8-byte Folded Spill + sub x1, x1, x7, lsl #3 + str x10, [sp, #152] // 8-byte Folded Spill + add x10, x12, x21 + lsl x12, x20, #4 + add x10, x10, #4 + str x10, [sp, #144] // 8-byte Folded Spill + ldr x10, [sp, #1216] // 8-byte Folded Reload + str x24, [sp, #1216] // 8-byte Folded Spill + add x10, x21, x10 + str x10, [sp, #136] // 8-byte Folded Spill + lsl x10, x20, #5 + stp x12, x10, [sp, #432] // 16-byte Folded Spill + sub x10, x10, x7, lsl #5 + sub x12, x12, x7, lsl #4 + add x7, x6, x16 + stp x7, x1, [sp, #400] // 16-byte Folded Spill + add x7, x4, x16 + str x7, [sp, #392] // 8-byte Folded Spill + add x7, x3, x16 + str x10, [sp, #416] // 8-byte Folded Spill + str x12, [sp, #280] // 8-byte Folded Spill + str x7, 
[sp, #384] // 8-byte Folded Spill + add x7, x2, x16 + str x7, [sp, #840] // 8-byte Folded Spill + add x7, x18, x16 + str x7, [sp, #832] // 8-byte Folded Spill + add x7, x17, x16 + str x7, [sp, #824] // 8-byte Folded Spill + add x7, x15, x16 + str x7, [sp, #816] // 8-byte Folded Spill + add x7, x14, x16 + add x16, x23, x16 + stp x16, x5, [sp, #120] // 16-byte Folded Spill + sub x16, x9, #3 + str x7, [sp, #808] // 8-byte Folded Spill + str x16, [sp, #984] // 8-byte Folded Spill + sub x16, x9, #2 + sub x9, x9, #1 + str x16, [sp, #976] // 8-byte Folded Spill + add x16, x26, #128 + str x9, [sp, #968] // 8-byte Folded Spill + ldr x9, [sp, #1048] // 8-byte Folded Reload + str x16, [sp, #912] // 8-byte Folded Spill + add x16, x26, #256 + str x16, [sp, #1200] // 8-byte Folded Spill + add x16, x11, #64 + add x11, x10, #32 + add x10, x6, x25 + stp x10, x11, [sp, #328] // 16-byte Folded Spill + add x10, x4, x25 + add x11, x12, #16 + str x16, [sp, #1192] // 8-byte Folded Spill + add x16, x6, x19 + ldr x12, [sp, #1288] // 8-byte Folded Reload + str x10, [sp, #320] // 8-byte Folded Spill + add x10, x3, x25 + lsl x9, x9, #2 + str x16, [sp, #376] // 8-byte Folded Spill + add x16, x4, x19 + str x10, [sp, #776] // 8-byte Folded Spill + add x10, x2, x25 + str x9, [sp, #32] // 8-byte Folded Spill + ldr x9, [sp, #1040] // 8-byte Folded Reload + str x16, [sp, #368] // 8-byte Folded Spill + add x16, x3, x19 + str x10, [sp, #768] // 8-byte Folded Spill + add x10, x18, x25 + str x16, [sp, #360] // 8-byte Folded Spill + add x16, x2, x19 + str x10, [sp, #760] // 8-byte Folded Spill + add x10, x17, x25 + str x16, [sp, #352] // 8-byte Folded Spill + add x16, x18, x19 + stp x24, x12, [sp, #104] // 16-byte Folded Spill + str x10, [sp, #752] // 8-byte Folded Spill + add x10, x15, x25 + str x16, [sp, #344] // 8-byte Folded Spill + add x16, x17, x19 + str x10, [sp, #744] // 8-byte Folded Spill + add x10, x14, x25 + lsl x9, x9, #2 + str x16, [sp, #800] // 8-byte Folded Spill + add x16, x15, x19 + str 
x10, [sp, #736] // 8-byte Folded Spill + add x10, x6, x0 + str x9, [sp, #496] // 8-byte Folded Spill + ldr x9, [sp, #1056] // 8-byte Folded Reload + str x16, [sp, #792] // 8-byte Folded Spill + add x16, x14, x19 + stp x10, x11, [sp, #304] // 16-byte Folded Spill + add x10, x4, x0 + ldr x11, [sp, #1224] // 8-byte Folded Reload + str x16, [sp, #784] // 8-byte Folded Spill + str x10, [sp, #296] // 8-byte Folded Spill + add x10, x3, x0 + str x10, [sp, #728] // 8-byte Folded Spill + add x10, x2, x0 + str x10, [sp, #720] // 8-byte Folded Spill + add x10, x18, x0 + lsl x9, x9, #2 + str x10, [sp, #712] // 8-byte Folded Spill + add x10, x17, x0 + str x9, [sp, #24] // 8-byte Folded Spill + ldr x9, [sp, #1032] // 8-byte Folded Reload + str x10, [sp, #704] // 8-byte Folded Spill + add x10, x15, x0 + str x10, [sp, #696] // 8-byte Folded Spill + add x10, x14, x0 + str x10, [sp, #688] // 8-byte Folded Spill + add x10, x1, #8 + str x10, [sp, #288] // 8-byte Folded Spill + mov x10, x8 + ldr x8, [sp, #1184] // 8-byte Folded Reload + lsl x9, x9, #2 + str x9, [sp, #488] // 8-byte Folded Spill + add x9, x24, x27 + str x9, [sp, #1208] // 8-byte Folded Spill + str x9, [sp, #96] // 8-byte Folded Spill + mov x9, x8 + ldr x8, [sp, #1160] // 8-byte Folded Reload + stp x8, x11, [sp, #80] // 16-byte Folded Spill + ldr x8, [sp, #1168] // 8-byte Folded Reload + str x8, [sp, #72] // 8-byte Folded Spill + b .LBB0_2 + .p2align 2 +.LBB0_1: // %.loopexit68 + // in Loop: Header=BB0_2 Depth=1 + ldr x8, [sp, #264] // 8-byte Folded Reload + ldp x10, x9, [sp, #24] // 16-byte Folded Reload + add x8, x8, x10 + ldr x13, [sp, #48] // 8-byte Folded Reload + str x8, [sp, #264] // 8-byte Folded Spill + ldr x8, [sp, #256] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #256] // 8-byte Folded Spill + ldr x8, [sp, #248] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #248] // 8-byte Folded Spill + ldr x8, [sp, #240] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #240] // 8-byte Folded Spill 
+ ldr x8, [sp, #232] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #232] // 8-byte Folded Spill + ldr x8, [sp, #224] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #224] // 8-byte Folded Spill + ldr x8, [sp, #216] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #216] // 8-byte Folded Spill + ldr x8, [sp, #208] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #208] // 8-byte Folded Spill + ldr x8, [sp, #200] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #200] // 8-byte Folded Spill + ldr x8, [sp, #192] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #192] // 8-byte Folded Spill + ldr x8, [sp, #184] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #184] // 8-byte Folded Spill + ldr x8, [sp, #176] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #176] // 8-byte Folded Spill + ldr x8, [sp, #168] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #168] // 8-byte Folded Spill + ldr x8, [sp, #160] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #160] // 8-byte Folded Spill + ldr x8, [sp, #152] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #152] // 8-byte Folded Spill + ldr x8, [sp, #144] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #144] // 8-byte Folded Spill + ldr x8, [sp, #136] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #136] // 8-byte Folded Spill + ldr x8, [sp, #272] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #272] // 8-byte Folded Spill + ldr x8, [sp, #72] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #72] // 8-byte Folded Spill + ldr x8, [sp, #80] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #80] // 8-byte Folded Spill + ldr x8, [sp, #88] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #88] // 8-byte Folded Spill + ldr x8, [sp, #96] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #96] // 8-byte Folded Spill + ldr x8, [sp, #104] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #104] // 
8-byte Folded Spill + ldr x8, [sp, #112] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #112] // 8-byte Folded Spill + ldr x8, [sp, #128] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #128] // 8-byte Folded Spill + ldr x8, [sp, #120] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #120] // 8-byte Folded Spill + ldp x9, x8, [sp, #56] // 16-byte Folded Reload + add x9, x9, x10 + add x10, x8, x10 +.LBB0_2: // =>This Loop Header: Depth=1 + // Child Loop BB0_7 Depth 2 + // Child Loop BB0_11 Depth 3 + // Child Loop BB0_13 Depth 4 + // Child Loop BB0_16 Depth 4 + // Child Loop BB0_19 Depth 4 + // Child Loop BB0_21 Depth 5 + // Child Loop BB0_23 Depth 5 + // Child Loop BB0_26 Depth 4 + // Child Loop BB0_28 Depth 4 + // Child Loop BB0_32 Depth 4 + // Child Loop BB0_34 Depth 4 + // Child Loop BB0_40 Depth 3 + // Child Loop BB0_43 Depth 3 + // Child Loop BB0_46 Depth 3 + // Child Loop BB0_48 Depth 4 + // Child Loop BB0_50 Depth 4 + // Child Loop BB0_53 Depth 3 + // Child Loop BB0_55 Depth 3 + // Child Loop BB0_59 Depth 3 + // Child Loop BB0_61 Depth 3 + // Child Loop BB0_65 Depth 3 + // Child Loop BB0_68 Depth 3 + // Child Loop BB0_71 Depth 3 + // Child Loop BB0_73 Depth 4 + // Child Loop BB0_75 Depth 4 + // Child Loop BB0_78 Depth 3 + // Child Loop BB0_80 Depth 3 + // Child Loop BB0_84 Depth 3 + // Child Loop BB0_86 Depth 3 + // Child Loop BB0_90 Depth 3 + // Child Loop BB0_93 Depth 3 + // Child Loop BB0_96 Depth 3 + // Child Loop BB0_98 Depth 4 + // Child Loop BB0_100 Depth 4 + // Child Loop BB0_103 Depth 3 + // Child Loop BB0_105 Depth 3 + // Child Loop BB0_109 Depth 3 + // Child Loop BB0_111 Depth 3 + // Child Loop BB0_115 Depth 3 + // Child Loop BB0_118 Depth 3 + // Child Loop BB0_121 Depth 3 + // Child Loop BB0_123 Depth 4 + // Child Loop BB0_125 Depth 4 + // Child Loop BB0_128 Depth 3 + // Child Loop BB0_130 Depth 3 + // Child Loop BB0_134 Depth 3 + // Child Loop BB0_136 Depth 3 + ldr x8, [sp, #40] // 8-byte Folded Reload + cmp x13, x8 + 
b.ge .LBB0_137 +// %bb.3: // in Loop: Header=BB0_2 Depth=1 + add x8, x13, #1 + str x10, [sp, #64] // 8-byte Folded Spill + ldp x15, x16, [sp, #216] // 16-byte Folded Reload + stp x8, x9, [sp, #48] // 16-byte Folded Spill + mov x19, xzr + str x10, [sp, #616] // 8-byte Folded Spill + str x9, [sp, #608] // 8-byte Folded Spill + ldp x10, x8, [sp, #120] // 16-byte Folded Reload + str x13, [sp, #1064] // 8-byte Folded Spill + str x8, [sp, #600] // 8-byte Folded Spill + ldp x8, x9, [sp, #104] // 16-byte Folded Reload + ldp x13, x14, [sp, #200] // 16-byte Folded Reload + str x9, [sp, #888] // 8-byte Folded Spill + str x8, [sp, #872] // 8-byte Folded Spill + ldp x8, x9, [sp, #88] // 16-byte Folded Reload + str x9, [sp, #864] // 8-byte Folded Spill + str x8, [sp, #856] // 8-byte Folded Spill + ldp x8, x9, [sp, #72] // 16-byte Folded Reload + str x8, [sp, #680] // 8-byte Folded Spill + ldr x8, [sp, #272] // 8-byte Folded Reload + str x9, [sp, #648] // 8-byte Folded Spill + str x8, [sp, #656] // 8-byte Folded Spill + ldp x12, x8, [sp, #136] // 16-byte Folded Reload + str x8, [sp, #880] // 8-byte Folded Spill + ldp x9, x8, [sp, #152] // 16-byte Folded Reload + str x9, [sp, #904] // 8-byte Folded Spill + str x8, [sp, #1096] // 8-byte Folded Spill + ldp x9, x8, [sp, #168] // 16-byte Folded Reload + str x9, [sp, #1088] // 8-byte Folded Spill + str x8, [sp, #672] // 8-byte Folded Spill + ldp x9, x8, [sp, #184] // 16-byte Folded Reload + str x8, [sp, #664] // 8-byte Folded Spill + str x9, [sp, #1152] // 8-byte Folded Spill + ldp x17, x8, [sp, #232] // 16-byte Folded Reload + str x8, [sp, #640] // 8-byte Folded Spill + ldp x9, x8, [sp, #248] // 16-byte Folded Reload + str x8, [sp, #624] // 8-byte Folded Spill + ldr x8, [sp, #264] // 8-byte Folded Reload + str x9, [sp, #632] // 8-byte Folded Spill + str x8, [sp, #1120] // 8-byte Folded Spill + b .LBB0_7 + .p2align 2 +.LBB0_4: // in Loop: Header=BB0_7 Depth=2 + str s0, [x15, x9, lsl #2] +.LBB0_5: // in Loop: Header=BB0_7 Depth=2 + bl 
free + ldr x30, [sp, #1080] // 8-byte Folded Reload +.LBB0_6: // %.backedge69 + // in Loop: Header=BB0_7 Depth=2 + ldr x9, [sp, #488] // 8-byte Folded Reload + ldr x8, [sp, #1120] // 8-byte Folded Reload + add x8, x8, x9 + ldr x10, [sp, #624] // 8-byte Folded Reload + ldr x17, [sp, #536] // 8-byte Folded Reload + ldr x16, [sp, #544] // 8-byte Folded Reload + ldr x15, [sp, #552] // 8-byte Folded Reload + ldr x14, [sp, #560] // 8-byte Folded Reload + ldr x13, [sp, #568] // 8-byte Folded Reload + ldr x12, [sp, #896] // 8-byte Folded Reload + ldr x19, [sp, #528] // 8-byte Folded Reload + add x12, x12, x9 + str x8, [sp, #1120] // 8-byte Folded Spill + ldr x8, [sp, #496] // 8-byte Folded Reload + add x10, x10, x8 + add x17, x17, x8 + add x16, x16, x8 + add x15, x15, x8 + add x14, x14, x8 + add x13, x13, x8 + str x10, [sp, #624] // 8-byte Folded Spill + ldr x10, [sp, #632] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #632] // 8-byte Folded Spill + ldr x10, [sp, #640] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #640] // 8-byte Folded Spill + ldr x10, [sp, #664] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #664] // 8-byte Folded Spill + ldr x10, [sp, #1152] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #1152] // 8-byte Folded Spill + ldr x10, [sp, #672] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #672] // 8-byte Folded Spill + ldr x10, [sp, #1088] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #1088] // 8-byte Folded Spill + ldr x10, [sp, #1096] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #1096] // 8-byte Folded Spill + ldr x10, [sp, #904] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #904] // 8-byte Folded Spill + ldr x10, [sp, #880] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #880] // 8-byte Folded Spill + ldr x10, [sp, #656] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #656] // 8-byte Folded Spill + ldr x10, [sp, #680] // 8-byte 
Folded Reload + add x10, x10, x9 + str x10, [sp, #680] // 8-byte Folded Spill + ldr x10, [sp, #648] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #648] // 8-byte Folded Spill + ldr x10, [sp, #856] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #856] // 8-byte Folded Spill + ldr x10, [sp, #864] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #864] // 8-byte Folded Spill + ldr x10, [sp, #872] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #872] // 8-byte Folded Spill + ldr x10, [sp, #888] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #888] // 8-byte Folded Spill + ldr x10, [sp, #600] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #600] // 8-byte Folded Spill + ldr x10, [sp, #576] // 8-byte Folded Reload + add x10, x10, x8 + ldr x8, [sp, #608] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #608] // 8-byte Folded Spill + ldr x8, [sp, #616] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #616] // 8-byte Folded Spill +.LBB0_7: // Parent Loop BB0_2 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_11 Depth 3 + // Child Loop BB0_13 Depth 4 + // Child Loop BB0_16 Depth 4 + // Child Loop BB0_19 Depth 4 + // Child Loop BB0_21 Depth 5 + // Child Loop BB0_23 Depth 5 + // Child Loop BB0_26 Depth 4 + // Child Loop BB0_28 Depth 4 + // Child Loop BB0_32 Depth 4 + // Child Loop BB0_34 Depth 4 + // Child Loop BB0_40 Depth 3 + // Child Loop BB0_43 Depth 3 + // Child Loop BB0_46 Depth 3 + // Child Loop BB0_48 Depth 4 + // Child Loop BB0_50 Depth 4 + // Child Loop BB0_53 Depth 3 + // Child Loop BB0_55 Depth 3 + // Child Loop BB0_59 Depth 3 + // Child Loop BB0_61 Depth 3 + // Child Loop BB0_65 Depth 3 + // Child Loop BB0_68 Depth 3 + // Child Loop BB0_71 Depth 3 + // Child Loop BB0_73 Depth 4 + // Child Loop BB0_75 Depth 4 + // Child Loop BB0_78 Depth 3 + // Child Loop BB0_80 Depth 3 + // Child Loop BB0_84 Depth 3 + // Child Loop BB0_86 Depth 3 + // Child Loop BB0_90 Depth 3 + // Child 
Loop BB0_93 Depth 3 + // Child Loop BB0_96 Depth 3 + // Child Loop BB0_98 Depth 4 + // Child Loop BB0_100 Depth 4 + // Child Loop BB0_103 Depth 3 + // Child Loop BB0_105 Depth 3 + // Child Loop BB0_109 Depth 3 + // Child Loop BB0_111 Depth 3 + // Child Loop BB0_115 Depth 3 + // Child Loop BB0_118 Depth 3 + // Child Loop BB0_121 Depth 3 + // Child Loop BB0_123 Depth 4 + // Child Loop BB0_125 Depth 4 + // Child Loop BB0_128 Depth 3 + // Child Loop BB0_130 Depth 3 + // Child Loop BB0_134 Depth 3 + // Child Loop BB0_136 Depth 3 + ldr x8, [sp, #520] // 8-byte Folded Reload + cmp x19, x8 + b.ge .LBB0_1 +// %bb.8: // in Loop: Header=BB0_7 Depth=2 + add x8, x19, #1 + str x15, [sp, #552] // 8-byte Folded Spill + mov x0, xzr + str x15, [sp, #1264] // 8-byte Folded Spill + ldr x15, [sp, #640] // 8-byte Folded Reload + str x8, [sp, #528] // 8-byte Folded Spill + ldr x8, [sp, #672] // 8-byte Folded Reload + str x16, [sp, #544] // 8-byte Folded Spill + str x16, [sp, #1256] // 8-byte Folded Spill + ldr x16, [sp, #632] // 8-byte Folded Reload + str x17, [sp, #536] // 8-byte Folded Spill + str x17, [sp, #1248] // 8-byte Folded Spill + ldr x17, [sp, #624] // 8-byte Folded Reload + str x12, [sp, #896] // 8-byte Folded Spill + str x10, [sp, #576] // 8-byte Folded Spill + str x13, [sp, #568] // 8-byte Folded Spill + str x13, [sp, #1280] // 8-byte Folded Spill + str x14, [sp, #560] // 8-byte Folded Spill + str x14, [sp, #1272] // 8-byte Folded Spill + str x19, [sp, #1072] // 8-byte Folded Spill + str x8, [sp, #1144] // 8-byte Folded Spill + ldr x8, [sp, #664] // 8-byte Folded Reload + str x8, [sp, #1136] // 8-byte Folded Spill + b .LBB0_11 + .p2align 2 +.LBB0_9: // in Loop: Header=BB0_11 Depth=3 + stp q3, q2, [x8] + stp q1, q0, [x8, #32] +.LBB0_10: // %.backedge + // in Loop: Header=BB0_11 Depth=3 + ldr x8, [sp, #1248] // 8-byte Folded Reload + ldr x0, [sp, #1112] // 8-byte Folded Reload + add x17, x17, #64 + add x16, x16, #64 + add x15, x15, #64 + add x8, x8, #64 + str x8, [sp, #1248] 
// 8-byte Folded Spill + ldr x8, [sp, #1256] // 8-byte Folded Reload + add x8, x8, #64 + str x8, [sp, #1256] // 8-byte Folded Spill + ldr x8, [sp, #1264] // 8-byte Folded Reload + add x8, x8, #64 + str x8, [sp, #1264] // 8-byte Folded Spill + ldr x8, [sp, #1272] // 8-byte Folded Reload + add x8, x8, #64 + str x8, [sp, #1272] // 8-byte Folded Spill + ldr x8, [sp, #1280] // 8-byte Folded Reload + add x8, x8, #64 + str x8, [sp, #1280] // 8-byte Folded Spill + ldr x8, [sp, #1136] // 8-byte Folded Reload + add x8, x8, #64 + str x8, [sp, #1136] // 8-byte Folded Spill + ldr x8, [sp, #1144] // 8-byte Folded Reload + add x8, x8, #64 + str x8, [sp, #1144] // 8-byte Folded Spill +.LBB0_11: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_13 Depth 4 + // Child Loop BB0_16 Depth 4 + // Child Loop BB0_19 Depth 4 + // Child Loop BB0_21 Depth 5 + // Child Loop BB0_23 Depth 5 + // Child Loop BB0_26 Depth 4 + // Child Loop BB0_28 Depth 4 + // Child Loop BB0_32 Depth 4 + // Child Loop BB0_34 Depth 4 + ldr x8, [sp, #1104] // 8-byte Folded Reload + cmp x0, x8 + b.ge .LBB0_35 +// %bb.12: // in Loop: Header=BB0_11 Depth=3 + ldr x8, [sp, #944] // 8-byte Folded Reload + ldr x9, [sp, #936] // 8-byte Folded Reload + mov x5, xzr + mov x7, xzr + ldr x10, [sp, #928] // 8-byte Folded Reload + ldr x11, [sp, #920] // 8-byte Folded Reload + add x1, x9, x8, lsl #2 + add x8, x0, #16 + add x14, x11, x10, lsl #2 + ldr x11, [sp, #1008] // 8-byte Folded Reload + ldr x12, [sp, #1064] // 8-byte Folded Reload + ldr x10, [sp, #1040] // 8-byte Folded Reload + lsl x9, x29, #1 + str x8, [sp, #1112] // 8-byte Folded Spill + ldr x8, [sp, #1016] // 8-byte Folded Reload + mul x10, x19, x10 + mul x8, x19, x8 + madd x2, x12, x11, x8 + ldr x11, [sp, #1048] // 8-byte Folded Reload + add x8, x9, x29 + madd x6, x12, x11, x10 + add x10, x2, x0 + add x11, x1, x10, lsl #2 + add x8, x10, x8 + add x9, x10, x9 + add x10, x10, x29 + add x8, x1, x8, lsl #2 + add 
x9, x1, x9, lsl #2 + add x10, x1, x10, lsl #2 + ldp q4, q3, [x11, #32] + ldp q1, q0, [x11] + add x11, x6, x0 + ldp q18, q16, [x8, #32] + ldp q23, q21, [x8] + ldp q19, q17, [x9, #32] + ldp q22, q20, [x9] + ldr x9, [sp, #1056] // 8-byte Folded Reload + ldp q5, q2, [x10, #32] + add x8, x14, x11, lsl #2 + ldp q7, q6, [x10] + ldr x10, [sp, #952] // 8-byte Folded Reload + ldp q29, q28, [x8, #32] + ldp q31, q30, [x8] + ldr x8, [sp, #1032] // 8-byte Folded Reload + mul x8, x19, x8 + ldr x19, [sp, #1120] // 8-byte Folded Reload + madd x4, x12, x9, x8 + ldr x9, [sp, #960] // 8-byte Folded Reload + add x3, x10, x9, lsl #2 + lsl x8, x4, #2 + ldr q26, [x3, x8] + add x8, x4, x28 + lsl x8, x8, #2 + ldr q25, [x3, x8] + add x8, x4, x28, lsl #1 + lsl x8, x8, #2 + ldr q24, [x3, x8] + ldr x8, [sp, #912] // 8-byte Folded Reload + .p2align 2 +.LBB0_13: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + ldr x9, [sp, #1288] // 8-byte Folded Reload + fmla v1.4s, v31.4s, v26.s[0] + fmla v0.4s, v30.4s, v26.s[0] + cmp x7, x22 + add x9, x9, x19 + prfm pldl1keep, [x9, #16] + ldr q27, [x9] + b.ge .LBB0_15 +// %bb.14: // in Loop: Header=BB0_13 Depth=4 + ldr x11, [sp, #1216] // 8-byte Folded Reload + ldr x12, [sp, #1264] // 8-byte Folded Reload + fmla v4.4s, v29.4s, v26.s[0] + fmla v3.4s, v28.4s, v26.s[0] + ldr x9, [sp, #1224] // 8-byte Folded Reload + ldr x10, [sp, #1208] // 8-byte Folded Reload + fmla v7.4s, v31.4s, v25.s[0] + fmla v6.4s, v30.4s, v25.s[0] + fmla v5.4s, v29.4s, v25.s[0] + fmla v2.4s, v28.4s, v25.s[0] + stp q31, q30, [x8, #-128] + fmla v17.4s, v28.4s, v24.s[0] + fmla v19.4s, v29.4s, v24.s[0] + stp q29, q28, [x8, #-96] + add x21, x11, x19 + ldr x11, [sp, #1248] // 8-byte Folded Reload + add x12, x12, x5 + fmla v22.4s, v31.4s, v24.s[0] + fmla v20.4s, v30.4s, v24.s[0] + fmla v16.4s, v28.4s, v27.s[0] + fmla v18.4s, v29.4s, v27.s[0] + add x30, x16, x5 + fmla v21.4s, v30.4s, v27.s[0] + fmla v23.4s, 
v31.4s, v27.s[0] + add x9, x9, x19 + add x10, x10, x19 + add x23, x9, #32 + add x24, x10, #32 + add x25, x21, #32 + add x7, x7, #4 + add x11, x11, x5 + add x19, x19, #16 + prfm pldl1keep, [x11] + ldr x13, [sp, #1272] // 8-byte Folded Reload + ldp q28, q29, [x12, #32] + ldp q30, q31, [x12] + add x12, x15, x5 + add x11, x17, x5 + add x18, x13, x5 + fmla v3.4s, v29.4s, v26.s[1] + fmla v0.4s, v31.4s, v26.s[1] + fmla v2.4s, v29.4s, v25.s[1] + fmla v6.4s, v31.4s, v25.s[1] + fmla v20.4s, v31.4s, v24.s[1] + fmla v17.4s, v29.4s, v24.s[1] + fmla v21.4s, v31.4s, v27.s[1] + fmla v16.4s, v29.4s, v27.s[1] + fmla v4.4s, v28.4s, v26.s[1] + fmla v1.4s, v30.4s, v26.s[1] + fmla v5.4s, v28.4s, v25.s[1] + fmla v7.4s, v30.4s, v25.s[1] + fmla v22.4s, v30.4s, v24.s[1] + stp q28, q29, [x8, #-32] + fmla v19.4s, v28.4s, v24.s[1] + fmla v23.4s, v30.4s, v27.s[1] + fmla v18.4s, v28.4s, v27.s[1] + stp q30, q31, [x8, #-64] + prfm pldl1keep, [x12] + ldp q29, q28, [x18, #32] + ldp q31, q30, [x18] + ldr x12, [sp, #1280] // 8-byte Folded Reload + add x12, x12, x5 + fmla v0.4s, v30.4s, v26.s[2] + fmla v3.4s, v28.4s, v26.s[2] + fmla v2.4s, v28.4s, v25.s[2] + fmla v17.4s, v28.4s, v24.s[2] + fmla v6.4s, v30.4s, v25.s[2] + fmla v20.4s, v30.4s, v24.s[2] + stp q31, q30, [x8] + stp q29, q28, [x8, #32] + prfm pldl1keep, [x30] + ldp q8, q9, [x12] + fmla v1.4s, v31.4s, v26.s[2] + ldp q10, q11, [x12, #32] + ldr x12, [sp, #1256] // 8-byte Folded Reload + fmla v4.4s, v29.4s, v26.s[2] + fmla v5.4s, v29.4s, v25.s[2] + fmla v19.4s, v29.4s, v24.s[2] + fmla v7.4s, v31.4s, v25.s[2] + fmla v22.4s, v31.4s, v24.s[2] + add x12, x12, x5 + fmla v16.4s, v28.4s, v27.s[2] + fmla v18.4s, v29.4s, v27.s[2] + fmla v21.4s, v30.4s, v27.s[2] + fmla v23.4s, v31.4s, v27.s[2] + fmla v3.4s, v11.4s, v26.s[3] + fmla v4.4s, v10.4s, v26.s[3] + fmla v0.4s, v9.4s, v26.s[3] + stp q8, q9, [x8, #64] + stp q10, q11, [x8, #96] + prfm pldl1keep, [x11] + fmla v1.4s, v8.4s, v26.s[3] + fmla v2.4s, v11.4s, v25.s[3] + fmla v5.4s, v10.4s, v25.s[3] + fmla 
v6.4s, v9.4s, v25.s[3] + fmla v7.4s, v8.4s, v25.s[3] + ldp q29, q28, [x12, #32] + fmla v20.4s, v9.4s, v24.s[3] + fmla v22.4s, v8.4s, v24.s[3] + fmla v19.4s, v10.4s, v24.s[3] + fmla v17.4s, v11.4s, v24.s[3] + ldp q31, q30, [x12] + prfm pldl1keep, [x25] + ldr q26, [x21, #16] + prfm pldl1keep, [x24] + ldr q25, [x10, #16] + prfm pldl1keep, [x23] + ldr q24, [x9, #16] + ldr x9, [sp, #1240] // 8-byte Folded Reload + fmla v23.4s, v8.4s, v27.s[3] + fmla v21.4s, v9.4s, v27.s[3] + fmla v18.4s, v10.4s, v27.s[3] + fmla v16.4s, v11.4s, v27.s[3] + add x5, x5, x9 + add x8, x8, #256 + b .LBB0_13 + .p2align 2 +.LBB0_15: // in Loop: Header=BB0_11 Depth=3 + ldr x11, [sp, #1000] // 8-byte Folded Reload + ldr x12, [sp, #984] // 8-byte Folded Reload + add x9, x26, x22, lsl #6 + fmla v4.4s, v29.4s, v26.s[0] + ldr x13, [sp, #976] // 8-byte Folded Reload + fmla v3.4s, v28.4s, v26.s[0] + fmla v2.4s, v28.4s, v25.s[0] + fmla v7.4s, v31.4s, v25.s[0] + stp q31, q30, [x9] + stp q29, q28, [x9, #32] + fmla v6.4s, v30.4s, v25.s[0] + fmla v5.4s, v29.4s, v25.s[0] + fmla v17.4s, v28.4s, v24.s[0] + fmla v19.4s, v29.4s, v24.s[0] + madd x8, x12, x11, x6 + madd x10, x13, x11, x6 + fmla v20.4s, v30.4s, v24.s[0] + fmla v22.4s, v31.4s, v24.s[0] + fmla v16.4s, v28.4s, v27.s[0] + fmla v18.4s, v29.4s, v27.s[0] + add x5, x26, x12, lsl #6 + ldr x12, [sp, #968] // 8-byte Folded Reload + fmla v21.4s, v30.4s, v27.s[0] + fmla v23.4s, v31.4s, v27.s[0] + ldr x18, [sp, #1136] // 8-byte Folded Reload + mov x19, xzr + add x8, x8, x0 + add x9, x10, x0 + add x7, x26, x12, lsl #6 + add x8, x14, x8, lsl #2 + add x9, x14, x9, lsl #2 + ldp q28, q29, [x8] + fmla v0.4s, v29.4s, v26.s[1] + ldp q30, q31, [x8, #32] + madd x8, x12, x11, x6 + fmla v3.4s, v31.4s, v26.s[1] + fmla v6.4s, v29.4s, v25.s[1] + fmla v2.4s, v31.4s, v25.s[1] + fmla v20.4s, v29.4s, v24.s[1] + fmla v17.4s, v31.4s, v24.s[1] + fmla v21.4s, v29.4s, v27.s[1] + fmla v16.4s, v31.4s, v27.s[1] + add x6, x26, x13, lsl #6 + add x8, x8, x0 + ldr x13, [sp, #992] // 8-byte 
Folded Reload + stp q28, q29, [x5] + fmla v4.4s, v30.4s, v26.s[1] + stp q30, q31, [x5, #32] + fmla v1.4s, v28.4s, v26.s[1] + fmla v5.4s, v30.4s, v25.s[1] + fmla v7.4s, v28.4s, v25.s[1] + fmla v22.4s, v28.4s, v24.s[1] + fmla v19.4s, v30.4s, v24.s[1] + fmla v23.4s, v28.4s, v27.s[1] + fmla v18.4s, v30.4s, v27.s[1] + add x8, x14, x8, lsl #2 + ldr x14, [sp, #1144] // 8-byte Folded Reload + ldp q29, q28, [x9, #32] + ldp q31, q30, [x9] + ldr x9, [sp, #1304] // 8-byte Folded Reload + fmla v0.4s, v30.4s, v26.s[2] + fmla v3.4s, v28.4s, v26.s[2] + fmla v2.4s, v28.4s, v25.s[2] + fmla v6.4s, v30.4s, v25.s[2] + fmla v17.4s, v28.4s, v24.s[2] + fmla v20.4s, v30.4s, v24.s[2] + fmla v16.4s, v28.4s, v27.s[2] + fmla v21.4s, v30.4s, v27.s[2] + stp q31, q30, [x6] + fmla v1.4s, v31.4s, v26.s[2] + stp q29, q28, [x6, #32] + fmla v4.4s, v29.4s, v26.s[2] + fmla v7.4s, v31.4s, v25.s[2] + fmla v5.4s, v29.4s, v25.s[2] + fmla v19.4s, v29.4s, v24.s[2] + fmla v22.4s, v31.4s, v24.s[2] + fmla v18.4s, v29.4s, v27.s[2] + fmla v23.4s, v31.4s, v27.s[2] + ldp q28, q29, [x8] + fmla v0.4s, v29.4s, v26.s[3] + ldp q30, q31, [x8, #32] + fmla v3.4s, v31.4s, v26.s[3] + fmla v6.4s, v29.4s, v25.s[3] + fmla v2.4s, v31.4s, v25.s[3] + fmla v20.4s, v29.4s, v24.s[3] + fmla v17.4s, v31.4s, v24.s[3] + fmla v21.4s, v29.4s, v27.s[3] + ldr x8, [sp, #1152] // 8-byte Folded Reload + fmla v16.4s, v31.4s, v27.s[3] + fmla v4.4s, v30.4s, v26.s[3] + fmla v1.4s, v28.4s, v26.s[3] + fmla v5.4s, v30.4s, v25.s[3] + fmla v7.4s, v28.4s, v25.s[3] + stp q28, q29, [x7] + stp q30, q31, [x7, #32] + fmla v22.4s, v28.4s, v24.s[3] + fmla v19.4s, v30.4s, v24.s[3] + fmla v23.4s, v28.4s, v27.s[3] + fmla v18.4s, v30.4s, v27.s[3] + cmp x9, x20 + b.ge .LBB0_17 + .p2align 2 +.LBB0_16: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + add x10, x8, x27 + add x11, x18, x19 + prfm pldl1keep, [x8] + ldur s24, [x8, #-4] + add x12, x10, x27 + prfm pldl1keep, [x10] + ldur s25, 
[x10, #-4] + add x10, x14, x19 + prfm pldl1keep, [x12] + ldur s26, [x12, #-4] + add x12, x12, x27 + add x19, x19, x13 + prfm pldl1keep, [x12] + ldur s27, [x12, #-4] + add x8, x8, #4 + prfm pldl1keep, [x11] + ldp q28, q29, [x10, #32] + fmla v3.4s, v29.4s, v24.s[0] + fmla v2.4s, v29.4s, v25.s[0] + ldp q30, q31, [x10] + add x10, x26, x9, lsl #6 + fmla v0.4s, v31.4s, v24.s[0] + fmla v6.4s, v31.4s, v25.s[0] + add x9, x9, #1 + fmla v20.4s, v31.4s, v26.s[0] + fmla v17.4s, v29.4s, v26.s[0] + fmla v4.4s, v28.4s, v24.s[0] + fmla v1.4s, v30.4s, v24.s[0] + fmla v5.4s, v28.4s, v25.s[0] + fmla v7.4s, v30.4s, v25.s[0] + fmla v22.4s, v30.4s, v26.s[0] + fmla v19.4s, v28.4s, v26.s[0] + fmla v23.4s, v30.4s, v27.s[0] + fmla v21.4s, v31.4s, v27.s[0] + fmla v18.4s, v28.4s, v27.s[0] + stp q30, q31, [x10] + fmla v16.4s, v29.4s, v27.s[0] + stp q28, q29, [x10, #32] + cmp x9, x20 + b.lt .LBB0_16 +.LBB0_17: // %.preheader + // in Loop: Header=BB0_11 Depth=3 + ldr x30, [sp, #1096] // 8-byte Folded Reload + ldr x9, [sp, #1088] // 8-byte Folded Reload + mov x8, xzr + mov w19, #1 // =0x1 + mov w25, #2 // =0x2 + mov w23, #3 // =0x3 + mov w24, #4 // =0x4 + b .LBB0_19 + .p2align 2 +.LBB0_18: // %.loopexit + // in Loop: Header=BB0_19 Depth=4 + ldr x8, [sp, #1232] // 8-byte Folded Reload + add x9, x9, x8 + add x30, x30, x8 + mov x8, x24 + mov x24, x21 +.LBB0_19: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Loop Header: Depth=4 + // Child Loop BB0_21 Depth 5 + // Child Loop BB0_23 Depth 5 + madd x8, x8, x29, x2 + add x8, x8, x0 + madd x10, x19, x29, x2 + madd x11, x25, x29, x2 + add x10, x10, x0 + add x11, x11, x0 + add x8, x1, x8, lsl #2 + stp q1, q0, [x8] + stp q4, q3, [x8, #32] + add x8, x1, x10, lsl #2 + add x10, x1, x11, lsl #2 + stp q7, q6, [x8] + stp q5, q2, [x8, #32] + madd x8, x23, x29, x2 + add x8, x8, x0 + stp q22, q20, [x10] + stp q19, q17, [x10, #32] + ldr x10, [sp, #1296] // 8-byte Folded Reload + cmp x24, x10 + add x8, x1, x8, 
lsl #2 + stp q23, q21, [x8] + stp q18, q16, [x8, #32] + b.ge .LBB0_24 +// %bb.20: // in Loop: Header=BB0_19 Depth=4 + madd x10, x24, x29, x2 + add x23, x24, #3 + add x19, x24, #1 + add x25, x24, #2 + madd x11, x19, x29, x2 + ldp q28, q29, [x26, #32] + mov x8, xzr + madd x12, x25, x29, x2 + ldp q30, q31, [x26] + add x21, x24, #4 + mov x18, x9 + add x10, x10, x0 + add x10, x1, x10, lsl #2 + add x11, x11, x0 + add x11, x1, x11, lsl #2 + ldp q4, q3, [x10, #32] + ldp q1, q0, [x10] + madd x10, x23, x29, x2 + add x10, x10, x0 + ldp q5, q2, [x11, #32] + ldp q7, q6, [x11] + add x11, x12, x0 + add x11, x1, x11, lsl #2 + ldp q19, q17, [x11, #32] + ldp q22, q20, [x11] + add x10, x1, x10, lsl #2 + ldp q18, q16, [x10, #32] + ldp q23, q21, [x10] + madd x10, x24, x28, x4 + lsl x10, x10, #2 + ldr q27, [x3, x10] + madd x10, x19, x28, x4 + lsl x10, x10, #2 + ldr q26, [x3, x10] + madd x10, x25, x28, x4 + lsl x10, x10, #2 + ldr q25, [x3, x10] + madd x10, x23, x28, x4 + lsl x10, x10, #2 + ldr q24, [x3, x10] + ldr x10, [sp, #1200] // 8-byte Folded Reload + fmla v3.4s, v29.4s, v27.s[0] + cmp xzr, x22 + b.ge .LBB0_22 + .p2align 2 +.LBB0_21: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // Parent Loop BB0_19 Depth=4 + // => This Inner Loop Header: Depth=5 + add x14, x10, #64 + fmla v4.4s, v28.4s, v27.s[0] + fmla v1.4s, v30.4s, v27.s[0] + add x13, x10, #128 + prfm pldl1keep, [x14] + ldp q9, q8, [x10, #-160] + fmla v0.4s, v31.4s, v27.s[0] + ldp q12, q15, [x10, #-192] + fmla v2.4s, v29.4s, v26.s[0] + fmla v5.4s, v28.4s, v26.s[0] + fmla v6.4s, v31.4s, v26.s[0] + fmla v7.4s, v30.4s, v26.s[0] + fmla v17.4s, v29.4s, v25.s[0] + prfm pldl1keep, [x13] + fmla v19.4s, v28.4s, v25.s[0] + fmla v20.4s, v31.4s, v25.s[0] + ldp q11, q10, [x10, #-128] + fmla v22.4s, v30.4s, v25.s[0] + fmla v16.4s, v29.4s, v24.s[0] + ldp q13, q14, [x10, #-96] + fmla v18.4s, v28.4s, v24.s[0] + fmla v21.4s, v31.4s, v24.s[0] + add x12, x10, #192 + prfm pldl1keep, [x12] + fmla 
v23.4s, v30.4s, v24.s[0] + fmla v0.4s, v15.4s, v27.s[1] + add x11, x10, #256 + add x8, x8, #4 + fmla v1.4s, v12.4s, v27.s[1] + fmla v4.4s, v9.4s, v27.s[1] + fmla v3.4s, v8.4s, v27.s[1] + fmla v7.4s, v12.4s, v26.s[1] + fmla v6.4s, v15.4s, v26.s[1] + fmla v5.4s, v9.4s, v26.s[1] + fmla v2.4s, v8.4s, v26.s[1] + fmla v22.4s, v12.4s, v25.s[1] + fmla v20.4s, v15.4s, v25.s[1] + fmla v19.4s, v9.4s, v25.s[1] + fmla v17.4s, v8.4s, v25.s[1] + fmla v23.4s, v12.4s, v24.s[1] + fmla v21.4s, v15.4s, v24.s[1] + ldp q15, q12, [x10, #-64] + fmla v18.4s, v9.4s, v24.s[1] + fmla v16.4s, v8.4s, v24.s[1] + ldp q9, q8, [x10, #-32] + prfm pldl1keep, [x11] + ldp q28, q29, [x10, #32] + ldp q30, q31, [x10] + add x10, x18, x27 + prfm pldl1keep, [x18] + fmla v3.4s, v14.4s, v27.s[2] + fmla v4.4s, v13.4s, v27.s[2] + fmla v1.4s, v11.4s, v27.s[2] + fmla v0.4s, v10.4s, v27.s[2] + fmla v2.4s, v14.4s, v26.s[2] + fmla v5.4s, v13.4s, v26.s[2] + fmla v6.4s, v10.4s, v26.s[2] + fmla v7.4s, v11.4s, v26.s[2] + fmla v17.4s, v14.4s, v25.s[2] + fmla v19.4s, v13.4s, v25.s[2] + fmla v20.4s, v10.4s, v25.s[2] + fmla v22.4s, v11.4s, v25.s[2] + fmla v16.4s, v14.4s, v24.s[2] + fmla v18.4s, v13.4s, v24.s[2] + fmla v21.4s, v10.4s, v24.s[2] + fmla v23.4s, v11.4s, v24.s[2] + fmla v0.4s, v12.4s, v27.s[3] + fmla v1.4s, v15.4s, v27.s[3] + fmla v4.4s, v9.4s, v27.s[3] + fmla v3.4s, v8.4s, v27.s[3] + ldur q27, [x18, #-16] + prfm pldl1keep, [x10] + add x18, x18, #16 + fmla v7.4s, v15.4s, v26.s[3] + fmla v6.4s, v12.4s, v26.s[3] + fmla v5.4s, v9.4s, v26.s[3] + fmla v2.4s, v8.4s, v26.s[3] + ldur q26, [x10, #-16] + add x10, x10, x27 + add x12, x10, x27 + prfm pldl1keep, [x10] + fmla v22.4s, v15.4s, v25.s[3] + fmla v20.4s, v12.4s, v25.s[3] + fmla v19.4s, v9.4s, v25.s[3] + fmla v17.4s, v8.4s, v25.s[3] + ldur q25, [x10, #-16] + prfm pldl1keep, [x12] + mov x10, x11 + fmla v23.4s, v15.4s, v24.s[3] + fmla v21.4s, v12.4s, v24.s[3] + fmla v18.4s, v9.4s, v24.s[3] + fmla v16.4s, v8.4s, v24.s[3] + ldur q24, [x12, #-16] + fmla v3.4s, v29.4s, 
v27.s[0] + cmp x8, x22 + b.lt .LBB0_21 +.LBB0_22: // in Loop: Header=BB0_19 Depth=4 + ldp q10, q8, [x5, #32] + ldp q12, q11, [x5] + fmla v4.4s, v28.4s, v27.s[0] + fmla v0.4s, v31.4s, v27.s[0] + fmla v1.4s, v30.4s, v27.s[0] + fmla v2.4s, v29.4s, v26.s[0] + fmla v5.4s, v28.4s, v26.s[0] + fmla v6.4s, v31.4s, v26.s[0] + ldp q9, q13, [x6, #32] + fmla v7.4s, v30.4s, v26.s[0] + fmla v17.4s, v29.4s, v25.s[0] + ldr x10, [sp, #1192] // 8-byte Folded Reload + ldr x11, [sp, #1304] // 8-byte Folded Reload + fmla v19.4s, v28.4s, v25.s[0] + fmla v20.4s, v31.4s, v25.s[0] + mov x8, x30 + fmla v22.4s, v30.4s, v25.s[0] + fmla v16.4s, v29.4s, v24.s[0] + fmla v18.4s, v28.4s, v24.s[0] + fmla v21.4s, v31.4s, v24.s[0] + fmla v23.4s, v30.4s, v24.s[0] + ldp q29, q30, [x6] + ldp q31, q28, [x7, #32] + fmla v1.4s, v12.4s, v27.s[1] + fmla v0.4s, v11.4s, v27.s[1] + fmla v4.4s, v10.4s, v27.s[1] + fmla v3.4s, v8.4s, v27.s[1] + fmla v7.4s, v12.4s, v26.s[1] + fmla v6.4s, v11.4s, v26.s[1] + fmla v5.4s, v10.4s, v26.s[1] + fmla v2.4s, v8.4s, v26.s[1] + fmla v22.4s, v12.4s, v25.s[1] + fmla v20.4s, v11.4s, v25.s[1] + fmla v19.4s, v10.4s, v25.s[1] + fmla v17.4s, v8.4s, v25.s[1] + fmla v23.4s, v12.4s, v24.s[1] + fmla v21.4s, v11.4s, v24.s[1] + fmla v18.4s, v10.4s, v24.s[1] + fmla v16.4s, v8.4s, v24.s[1] + ldp q10, q8, [x7] + fmla v3.4s, v13.4s, v27.s[2] + fmla v4.4s, v9.4s, v27.s[2] + fmla v0.4s, v30.4s, v27.s[2] + fmla v1.4s, v29.4s, v27.s[2] + fmla v2.4s, v13.4s, v26.s[2] + fmla v5.4s, v9.4s, v26.s[2] + fmla v6.4s, v30.4s, v26.s[2] + fmla v7.4s, v29.4s, v26.s[2] + fmla v17.4s, v13.4s, v25.s[2] + fmla v19.4s, v9.4s, v25.s[2] + fmla v20.4s, v30.4s, v25.s[2] + fmla v22.4s, v29.4s, v25.s[2] + fmla v16.4s, v13.4s, v24.s[2] + fmla v18.4s, v9.4s, v24.s[2] + fmla v21.4s, v30.4s, v24.s[2] + fmla v23.4s, v29.4s, v24.s[2] + fmla v1.4s, v10.4s, v27.s[3] + fmla v0.4s, v8.4s, v27.s[3] + fmla v4.4s, v31.4s, v27.s[3] + fmla v3.4s, v28.4s, v27.s[3] + fmla v7.4s, v10.4s, v26.s[3] + fmla v6.4s, v8.4s, v26.s[3] + fmla 
v5.4s, v31.4s, v26.s[3] + fmla v2.4s, v28.4s, v26.s[3] + fmla v22.4s, v10.4s, v25.s[3] + fmla v20.4s, v8.4s, v25.s[3] + fmla v19.4s, v31.4s, v25.s[3] + fmla v17.4s, v28.4s, v25.s[3] + fmla v23.4s, v10.4s, v24.s[3] + fmla v21.4s, v8.4s, v24.s[3] + fmla v18.4s, v31.4s, v24.s[3] + fmla v16.4s, v28.4s, v24.s[3] + cmp x11, x20 + b.ge .LBB0_18 + .p2align 2 +.LBB0_23: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // Parent Loop BB0_19 Depth=4 + // => This Inner Loop Header: Depth=5 + add x12, x8, x27 + prfm pldl1keep, [x8] + ldur s24, [x8, #-4] + add x11, x11, #1 + prfm pldl1keep, [x12] + ldur s25, [x12, #-4] + add x12, x12, x27 + add x8, x8, #4 + prfm pldl1keep, [x12] + ldur s26, [x12, #-4] + add x12, x12, x27 + prfm pldl1keep, [x12] + ldur s27, [x12, #-4] + prfm pldl1keep, [x10] + ldp q28, q29, [x10, #-32] + fmla v3.4s, v29.4s, v24.s[0] + ldp q30, q31, [x10, #-64] + fmla v0.4s, v31.4s, v24.s[0] + fmla v6.4s, v31.4s, v25.s[0] + fmla v2.4s, v29.4s, v25.s[0] + fmla v20.4s, v31.4s, v26.s[0] + fmla v17.4s, v29.4s, v26.s[0] + add x10, x10, #64 + fmla v4.4s, v28.4s, v24.s[0] + fmla v1.4s, v30.4s, v24.s[0] + fmla v5.4s, v28.4s, v25.s[0] + fmla v7.4s, v30.4s, v25.s[0] + fmla v22.4s, v30.4s, v26.s[0] + fmla v19.4s, v28.4s, v26.s[0] + fmla v23.4s, v30.4s, v27.s[0] + fmla v21.4s, v31.4s, v27.s[0] + fmla v18.4s, v28.4s, v27.s[0] + fmla v16.4s, v29.4s, v27.s[0] + cmp x11, x20 + b.lt .LBB0_23 + b .LBB0_18 + .p2align 2 +.LBB0_24: // in Loop: Header=BB0_11 Depth=3 + ldr x8, [sp, #1296] // 8-byte Folded Reload + ldr x9, [sp, #1128] // 8-byte Folded Reload + cmp x8, x9 + ldr x30, [sp, #1080] // 8-byte Folded Reload + b.ge .LBB0_30 +// %bb.25: // in Loop: Header=BB0_11 Depth=3 + ldr x12, [sp, #1296] // 8-byte Folded Reload + ldp q20, q21, [x26, #32] + mov x9, xzr + ldp q18, q19, [x26] + add x10, x12, #1 + madd x8, x12, x29, x2 + madd x11, x10, x29, x2 + madd x10, x10, x28, x4 + add x8, x8, x0 + add x11, x11, x0 + add x8, x1, x8, lsl #2 + lsl 
x10, x10, #2 + add x18, x1, x11, lsl #2 + madd x11, x12, x28, x4 + ldr q16, [x3, x10] + ldr x10, [sp, #1120] // 8-byte Folded Reload + lsl x11, x11, #2 + ldp q1, q0, [x8, #32] + ldp q4, q2, [x8] + ldp q5, q3, [x18, #32] + ldp q7, q6, [x18] + ldr q17, [x3, x11] + ldr x11, [sp, #1200] // 8-byte Folded Reload + cmp xzr, x22 + b.ge .LBB0_27 + .p2align 2 +.LBB0_26: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + ldr x12, [sp, #1184] // 8-byte Folded Reload + add x25, x11, #64 + fmla v0.4s, v21.4s, v17.s[0] + fmla v1.4s, v20.4s, v17.s[0] + fmla v4.4s, v18.4s, v17.s[0] + fmla v2.4s, v19.4s, v17.s[0] + add x14, x11, #128 + add x13, x11, #192 + fmla v3.4s, v21.4s, v16.s[0] + fmla v5.4s, v20.4s, v16.s[0] + add x19, x11, #256 + add x9, x9, #4 + fmla v6.4s, v19.4s, v16.s[0] + fmla v7.4s, v18.4s, v16.s[0] + add x21, x12, x10 + ldr x12, [sp, #1176] // 8-byte Folded Reload + prfm pldl1keep, [x25] + ldp q23, q22, [x11, #-160] + add x23, x21, #32 + ldp q24, q25, [x11, #-192] + prfm pldl1keep, [x14] + fmla v2.4s, v25.4s, v17.s[1] + ldp q19, q18, [x11, #-128] + ldp q20, q21, [x11, #-96] + fmla v0.4s, v22.4s, v17.s[1] + fmla v6.4s, v25.4s, v16.s[1] + fmla v3.4s, v22.4s, v16.s[1] + prfm pldl1keep, [x13] + add x24, x12, x10 + add x10, x10, #16 + fmla v4.4s, v24.4s, v17.s[1] + fmla v1.4s, v23.4s, v17.s[1] + fmla v7.4s, v24.4s, v16.s[1] + fmla v5.4s, v23.4s, v16.s[1] + ldp q23, q22, [x11, #-32] + ldp q24, q25, [x11, #-64] + add x12, x24, #32 + prfm pldl1keep, [x12] + fmla v0.4s, v21.4s, v17.s[2] + fmla v2.4s, v18.4s, v17.s[2] + fmla v3.4s, v21.4s, v16.s[2] + fmla v6.4s, v18.4s, v16.s[2] + fmla v1.4s, v20.4s, v17.s[2] + fmla v4.4s, v19.4s, v17.s[2] + fmla v5.4s, v20.4s, v16.s[2] + fmla v7.4s, v19.4s, v16.s[2] + fmla v2.4s, v25.4s, v17.s[3] + fmla v0.4s, v22.4s, v17.s[3] + fmla v6.4s, v25.4s, v16.s[3] + fmla v3.4s, v22.4s, v16.s[3] + fmla v4.4s, v24.4s, v17.s[3] + fmla v1.4s, v23.4s, v17.s[3] + ldr q17, 
[x24, #16] + prfm pldl1keep, [x23] + fmla v7.4s, v24.4s, v16.s[3] + fmla v5.4s, v23.4s, v16.s[3] + ldr q16, [x21, #16] + prfm pldl1keep, [x19] + ldp q20, q21, [x11, #32] + ldp q18, q19, [x11] + mov x11, x19 + cmp x9, x22 + b.lt .LBB0_26 +.LBB0_27: // in Loop: Header=BB0_11 Depth=3 + ldp q23, q22, [x5, #32] + ldp q25, q24, [x5] + fmla v0.4s, v21.4s, v17.s[0] + fmla v1.4s, v20.4s, v17.s[0] + fmla v2.4s, v19.4s, v17.s[0] + fmla v4.4s, v18.4s, v17.s[0] + fmla v3.4s, v21.4s, v16.s[0] + fmla v5.4s, v20.4s, v16.s[0] + ldp q20, q21, [x6, #32] + fmla v6.4s, v19.4s, v16.s[0] + fmla v7.4s, v18.4s, v16.s[0] + ldp q18, q19, [x6] + fmla v2.4s, v24.4s, v17.s[1] + fmla v0.4s, v22.4s, v17.s[1] + ldr x9, [sp, #1120] // 8-byte Folded Reload + ldr x10, [sp, #1192] // 8-byte Folded Reload + fmla v4.4s, v25.4s, v17.s[1] + fmla v1.4s, v23.4s, v17.s[1] + ldr x11, [sp, #1304] // 8-byte Folded Reload + fmla v7.4s, v25.4s, v16.s[1] + fmla v6.4s, v24.4s, v16.s[1] + ldp q25, q24, [x7] + fmla v5.4s, v23.4s, v16.s[1] + fmla v3.4s, v22.4s, v16.s[1] + ldp q23, q22, [x7, #32] + fmla v0.4s, v21.4s, v17.s[2] + fmla v2.4s, v19.4s, v17.s[2] + fmla v3.4s, v21.4s, v16.s[2] + fmla v6.4s, v19.4s, v16.s[2] + fmla v1.4s, v20.4s, v17.s[2] + fmla v4.4s, v18.4s, v17.s[2] + fmla v5.4s, v20.4s, v16.s[2] + fmla v7.4s, v18.4s, v16.s[2] + fmla v2.4s, v24.4s, v17.s[3] + fmla v0.4s, v22.4s, v17.s[3] + fmla v6.4s, v24.4s, v16.s[3] + fmla v3.4s, v22.4s, v16.s[3] + fmla v4.4s, v25.4s, v17.s[3] + fmla v1.4s, v23.4s, v17.s[3] + fmla v7.4s, v25.4s, v16.s[3] + fmla v5.4s, v23.4s, v16.s[3] + cmp x11, x20 + b.ge .LBB0_29 + .p2align 2 +.LBB0_28: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + ldr x14, [sp, #1168] // 8-byte Folded Reload + ldr x19, [sp, #1160] // 8-byte Folded Reload + add x11, x11, #1 + add x12, x14, x9 + add x13, x19, x9 + add x13, x13, #4 + add x12, x12, #4 + prfm pldl1keep, [x13] + ldr s16, [x19, x9] + prfm pldl1keep, [x12] 
+ ldr s17, [x14, x9] + prfm pldl1keep, [x10] + add x9, x9, #4 + ldp q18, q19, [x10, #-64] + ldp q20, q21, [x10, #-32] + add x10, x10, #64 + fmla v0.4s, v21.4s, v16.s[0] + fmla v1.4s, v20.4s, v16.s[0] + fmla v2.4s, v19.4s, v16.s[0] + fmla v4.4s, v18.4s, v16.s[0] + fmla v7.4s, v18.4s, v17.s[0] + fmla v6.4s, v19.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + fmla v3.4s, v21.4s, v17.s[0] + cmp x11, x20 + b.lt .LBB0_28 +.LBB0_29: // in Loop: Header=BB0_11 Depth=3 + stp q4, q2, [x8] + stp q1, q0, [x8, #32] + stp q7, q6, [x18] + stp q5, q3, [x18, #32] +.LBB0_30: // in Loop: Header=BB0_11 Depth=3 + ldr x8, [sp, #1024] // 8-byte Folded Reload + ldr x9, [sp, #1128] // 8-byte Folded Reload + cmp x9, x8 + ldr x19, [sp, #1072] // 8-byte Folded Reload + b.ge .LBB0_10 +// %bb.31: // in Loop: Header=BB0_11 Depth=3 + ldr x10, [sp, #1128] // 8-byte Folded Reload + ldp q7, q16, [x26, #32] + mov x9, xzr + ldp q6, q5, [x26] + ldr x11, [sp, #1200] // 8-byte Folded Reload + madd x8, x10, x29, x2 + madd x10, x10, x28, x4 + add x8, x8, x0 + lsl x10, x10, #2 + ldr x0, [sp, #848] // 8-byte Folded Reload + add x8, x1, x8, lsl #2 + ldr q4, [x3, x10] + ldr x10, [sp, #904] // 8-byte Folded Reload + ldr x1, [sp, #896] // 8-byte Folded Reload + ldp q1, q0, [x8, #32] + ldp q3, q2, [x8] + cmp xzr, x22 + b.ge .LBB0_33 + .p2align 2 +.LBB0_32: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + add x18, x11, #64 + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + add x14, x11, #128 + prfm pldl1keep, [x18] + ldp q18, q17, [x11, #-160] + fmla v3.4s, v6.4s, v4.s[0] + ldp q19, q20, [x11, #-192] + fmla v2.4s, v5.4s, v4.s[0] + prfm pldl1keep, [x14] + ldp q6, q5, [x11, #-128] + ldp q7, q16, [x11, #-96] + add x13, x11, #192 + prfm pldl1keep, [x13] + add x12, x11, #256 + add x9, x9, #4 + fmla v2.4s, v20.4s, v4.s[1] + fmla v0.4s, v17.4s, v4.s[1] + fmla v3.4s, v19.4s, v4.s[1] + fmla v1.4s, v18.4s, v4.s[1] + ldp q18, q17, 
[x11, #-32] + ldp q19, q20, [x11, #-64] + prfm pldl1keep, [x10] + fmla v0.4s, v16.4s, v4.s[2] + fmla v2.4s, v5.4s, v4.s[2] + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v2.4s, v20.4s, v4.s[3] + fmla v0.4s, v17.4s, v4.s[3] + fmla v3.4s, v19.4s, v4.s[3] + fmla v1.4s, v18.4s, v4.s[3] + ldur q4, [x10, #-16] + prfm pldl1keep, [x12] + add x10, x10, #16 + ldp q7, q16, [x11, #32] + ldp q6, q5, [x11] + mov x11, x12 + cmp x9, x22 + b.lt .LBB0_32 +.LBB0_33: // in Loop: Header=BB0_11 Depth=3 + ldp q18, q17, [x5, #32] + ldp q20, q19, [x5] + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + fmla v2.4s, v5.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q5, q6, [x6] + ldp q7, q16, [x6, #32] + ldr x10, [sp, #880] // 8-byte Folded Reload + mov x9, xzr + mov w11, #64 // =0x40 + fmla v2.4s, v19.4s, v4.s[1] + fmla v0.4s, v17.4s, v4.s[1] + fmla v3.4s, v20.4s, v4.s[1] + fmla v1.4s, v18.4s, v4.s[1] + fmla v0.4s, v16.4s, v4.s[2] + ldp q18, q17, [x7, #32] + ldp q20, q19, [x7] + fmla v2.4s, v6.4s, v4.s[2] + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v5.4s, v4.s[2] + fmla v2.4s, v19.4s, v4.s[3] + fmla v0.4s, v17.4s, v4.s[3] + fmla v3.4s, v20.4s, v4.s[3] + fmla v1.4s, v18.4s, v4.s[3] + ldr x12, [sp, #1304] // 8-byte Folded Reload + add x12, x12, xzr + cmp x12, x20 + b.ge .LBB0_9 + .p2align 2 +.LBB0_34: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + add x13, x0, x9, lsl #6 + add x12, x0, x11 + prfm pldl1keep, [x10] + add x11, x11, #64 + ldr s4, [x1, x9, lsl #2] + prfm pldl1keep, [x12] + add x9, x9, #1 + ldp q5, q6, [x13] + ldp q7, q16, [x13, #32] + add x10, x10, #4 + fmla v0.4s, v16.4s, v4.s[0] + fmla v2.4s, v6.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + fmla v3.4s, v5.4s, v4.s[0] + ldr x12, [sp, #1304] // 8-byte Folded Reload + add x12, x12, x9 + cmp x12, x20 + b.lt .LBB0_34 + b .LBB0_9 + .p2align 2 +.LBB0_35: // in Loop: Header=BB0_7 Depth=2 + cmp x8, x30 + ldr x8, [sp, #944] // 
8-byte Folded Reload + ldr x9, [sp, #936] // 8-byte Folded Reload + add x8, x9, x8, lsl #2 + ldr x9, [sp, #920] // 8-byte Folded Reload + str x8, [sp, #1272] // 8-byte Folded Spill + lsl x8, x29, #1 + str x8, [sp, #1256] // 8-byte Folded Spill + ldr x8, [sp, #928] // 8-byte Folded Reload + add x8, x9, x8, lsl #2 + ldr x9, [sp, #952] // 8-byte Folded Reload + str x8, [sp, #1264] // 8-byte Folded Spill + ldr x8, [sp, #960] // 8-byte Folded Reload + add x8, x9, x8, lsl #2 + str x8, [sp, #1280] // 8-byte Folded Spill + b.lt .LBB0_39 +// %bb.36: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #592] // 8-byte Folded Reload + cmp x30, x8 + b.lt .LBB0_64 +.LBB0_37: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #592] // 8-byte Folded Reload + ldr x9, [sp, #584] // 8-byte Folded Reload + cmp x8, x9 + b.lt .LBB0_89 +.LBB0_38: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #504] // 8-byte Folded Reload + ldr x9, [sp, #584] // 8-byte Folded Reload + cmp x9, x8 + b.ge .LBB0_6 + b .LBB0_114 + .p2align 2 +.LBB0_39: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #440] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x9, [sp, #1040] // 8-byte Folded Reload + ldr x8, [sp, #1016] // 8-byte Folded Reload + add x14, x0, #63 + mov x11, xzr + ldr x17, [sp, #1064] // 8-byte Folded Reload + ldr x1, [sp, #1256] // 8-byte Folded Reload + mul x12, x19, x9 + ldr x9, [sp, #1008] // 8-byte Folded Reload + mul x8, x19, x8 + ldr x16, [sp, #1104] // 8-byte Folded Reload + ldr x18, [sp, #1272] // 8-byte Folded Reload + add x10, x1, x29 + ldr x6, [sp, #992] // 8-byte Folded Reload + ldr x7, [sp, #976] // 8-byte Folded Reload + ldp x23, x21, [sp, #368] // 16-byte Folded Reload + madd x9, x17, x9, x8 + ldr x8, [sp, #1048] // 8-byte Folded Reload + ldp x25, x24, [sp, #352] // 16-byte Folded Reload + ldr x30, [sp, #344] // 8-byte Folded Reload + madd x13, x17, x8, x12 + add x12, x9, x16 + and x8, x14, #0xffffffffffffffc0 + add x10, x12, x10 + add x14, x18, x12, lsl #2 + add x15, x12, x29 + 
add x12, x12, x1 + add x10, x18, x10, lsl #2 + add x15, x18, x15, lsl #2 + add x12, x18, x12, lsl #2 + ldp q1, q0, [x14] + ldr x14, [sp, #1056] // 8-byte Folded Reload + ldp q7, q5, [x10] + ldr x10, [sp, #1032] // 8-byte Folded Reload + mul x10, x19, x10 + ldp q6, q3, [x15] + ldr x15, [sp, #1280] // 8-byte Folded Reload + ldr x19, [sp, #968] // 8-byte Folded Reload + ldp q4, q2, [x12] + add x12, x13, x16 + madd x10, x17, x14, x10 + lsl x14, x10, #2 + ldr q18, [x15, x14] + add x14, x10, x28 + lsl x14, x14, #2 + ldr q17, [x15, x14] + add x14, x10, x28, lsl #1 + lsl x14, x14, #2 + ldr q16, [x15, x14] + ldr x14, [sp, #1264] // 8-byte Folded Reload + ldr x15, [sp, #1120] // 8-byte Folded Reload + add x12, x14, x12, lsl #2 + ldr x14, [sp, #656] // 8-byte Folded Reload + ldp q21, q20, [x12] + add x12, x8, #64 + .p2align 2 +.LBB0_40: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x16, [sp, #1288] // 8-byte Folded Reload + fmla v1.4s, v21.4s, v18.s[0] + fmla v0.4s, v20.4s, v18.s[0] + cmp x11, x22 + add x16, x16, x15 + prfm pldl1keep, [x16, #16] + ldr q19, [x16] + b.ge .LBB0_42 +// %bb.41: // in Loop: Header=BB0_40 Depth=3 + ldr x17, [sp, #800] // 8-byte Folded Reload + add x16, x25, x14 + fmla v6.4s, v21.4s, v17.s[0] + fmla v3.4s, v20.4s, v17.s[0] + fmla v4.4s, v21.4s, v16.s[0] + fmla v2.4s, v20.4s, v16.s[0] + stp q21, q20, [x12, #-64] + fmla v7.4s, v21.4s, v19.s[0] + fmla v5.4s, v20.4s, v19.s[0] + prfm pldl1keep, [x16] + add x16, x24, x14 + add x1, x21, x14 + add x5, x30, x14 + add x11, x11, #4 + add x17, x17, x14 + ldp q20, q21, [x17] + ldr x17, [sp, #792] // 8-byte Folded Reload + add x17, x17, x14 + fmla v0.4s, v21.4s, v18.s[1] + fmla v3.4s, v21.4s, v17.s[1] + fmla v2.4s, v21.4s, v16.s[1] + fmla v5.4s, v21.4s, v19.s[1] + fmla v1.4s, v20.4s, v18.s[1] + fmla v6.4s, v20.4s, v17.s[1] + fmla v4.4s, v20.4s, v16.s[1] + fmla v7.4s, v20.4s, v19.s[1] + stp q20, q21, [x12, #-32] + prfm pldl1keep, [x16] + add x16, x23, x14 
+ ldp q21, q20, [x17] + ldr x17, [sp, #784] // 8-byte Folded Reload + add x17, x17, x14 + fmla v0.4s, v20.4s, v18.s[2] + fmla v3.4s, v20.4s, v17.s[2] + fmla v2.4s, v20.4s, v16.s[2] + fmla v5.4s, v20.4s, v19.s[2] + fmla v1.4s, v21.4s, v18.s[2] + fmla v6.4s, v21.4s, v17.s[2] + fmla v4.4s, v21.4s, v16.s[2] + fmla v7.4s, v21.4s, v19.s[2] + stp q21, q20, [x12] + prfm pldl1keep, [x16] + ldr x16, [sp, #1224] // 8-byte Folded Reload + ldp q20, q21, [x17] + ldr x17, [sp, #1208] // 8-byte Folded Reload + ldr x18, [sp, #1216] // 8-byte Folded Reload + add x16, x16, x15 + add x17, x17, x15 + add x18, x18, x15 + fmla v0.4s, v21.4s, v18.s[3] + fmla v3.4s, v21.4s, v17.s[3] + fmla v2.4s, v21.4s, v16.s[3] + fmla v5.4s, v21.4s, v19.s[3] + add x15, x15, #16 + add x2, x16, #32 + add x3, x17, #32 + add x4, x18, #32 + fmla v1.4s, v20.4s, v18.s[3] + fmla v6.4s, v20.4s, v17.s[3] + fmla v4.4s, v20.4s, v16.s[3] + fmla v7.4s, v20.4s, v19.s[3] + stp q20, q21, [x12, #32] + prfm pldl1keep, [x1] + add x12, x12, #128 + ldp q21, q20, [x5] + prfm pldl1keep, [x4] + ldr q18, [x18, #16] + prfm pldl1keep, [x3] + ldr q17, [x17, #16] + prfm pldl1keep, [x2] + ldr q16, [x16, #16] + ldr x16, [sp, #1240] // 8-byte Folded Reload + add x14, x14, x16 + b .LBB0_40 + .p2align 2 +.LBB0_42: // in Loop: Header=BB0_7 Depth=2 + ldr x16, [sp, #1000] // 8-byte Folded Reload + ldr x12, [sp, #984] // 8-byte Folded Reload + add x11, x8, x22, lsl #5 + fmla v6.4s, v21.4s, v17.s[0] + ldr x17, [sp, #1104] // 8-byte Folded Reload + ldr x18, [sp, #1264] // 8-byte Folded Reload + fmla v3.4s, v20.4s, v17.s[0] + fmla v4.4s, v21.4s, v16.s[0] + stp q21, q20, [x11] + fmla v2.4s, v20.4s, v16.s[0] + fmla v5.4s, v20.4s, v19.s[0] + fmla v7.4s, v21.4s, v19.s[0] + ldr x23, [sp, #648] // 8-byte Folded Reload + ldr x24, [sp, #680] // 8-byte Folded Reload + mov x14, xzr + madd x11, x12, x16, x13 + ldr x25, [sp, #1272] // 8-byte Folded Reload + mov x15, xzr + add x11, x11, x17 + add x11, x18, x11, lsl #2 + ldp q20, q21, [x11] + add x11, x8, 
x12, lsl #5 + madd x12, x7, x16, x13 + madd x13, x19, x16, x13 + ldr x16, [sp, #664] // 8-byte Folded Reload + add x12, x12, x17 + fmla v0.4s, v21.4s, v18.s[1] + fmla v3.4s, v21.4s, v17.s[1] + fmla v2.4s, v21.4s, v16.s[1] + fmla v5.4s, v21.4s, v19.s[1] + add x13, x13, x17 + ldr x17, [sp, #472] // 8-byte Folded Reload + add x12, x18, x12, lsl #2 + fmla v1.4s, v20.4s, v18.s[1] + stp q20, q21, [x11] + fmla v6.4s, v20.4s, v17.s[1] + fmla v4.4s, v20.4s, v16.s[1] + fmla v7.4s, v20.4s, v19.s[1] + add x13, x18, x13, lsl #2 + ldr x18, [sp, #672] // 8-byte Folded Reload + add x16, x16, x17 + ldp q21, q20, [x12] + add x12, x8, x7, lsl #5 + add x17, x18, x17 + ldr x18, [sp, #1304] // 8-byte Folded Reload + fmla v0.4s, v20.4s, v18.s[2] + fmla v3.4s, v20.4s, v17.s[2] + fmla v2.4s, v20.4s, v16.s[2] + fmla v5.4s, v20.4s, v19.s[2] + stp q21, q20, [x12] + fmla v1.4s, v21.4s, v18.s[2] + fmla v6.4s, v21.4s, v17.s[2] + fmla v4.4s, v21.4s, v16.s[2] + fmla v7.4s, v21.4s, v19.s[2] + ldp q20, q21, [x13] + add x13, x8, x19, lsl #5 + fmla v0.4s, v21.4s, v18.s[3] + fmla v3.4s, v21.4s, v17.s[3] + fmla v2.4s, v21.4s, v16.s[3] + fmla v5.4s, v21.4s, v19.s[3] + fmla v1.4s, v20.4s, v18.s[3] + fmla v6.4s, v20.4s, v17.s[3] + fmla v4.4s, v20.4s, v16.s[3] + fmla v7.4s, v20.4s, v19.s[3] + stp q20, q21, [x13] + cmp x18, x20 + b.ge .LBB0_44 + .p2align 2 +.LBB0_43: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x2, [sp, #1152] // 8-byte Folded Reload + add x1, x16, x15 + add x3, x8, x18, lsl #5 + add x18, x18, #1 + add x2, x2, x14 + add x14, x14, #4 + prfm pldl1keep, [x2] + ldur s16, [x2, #-4] + add x2, x2, x27 + prfm pldl1keep, [x2] + ldur s17, [x2, #-4] + add x2, x2, x27 + prfm pldl1keep, [x2] + ldur s18, [x2, #-4] + add x2, x2, x27 + prfm pldl1keep, [x2] + ldur s19, [x2, #-4] + add x2, x17, x15 + prfm pldl1keep, [x1] + add x15, x15, x6 + ldp q20, q21, [x2] + fmla v0.4s, v21.4s, v16.s[0] + fmla v3.4s, v21.4s, v17.s[0] + fmla v2.4s, v21.4s, 
v18.s[0] + fmla v5.4s, v21.4s, v19.s[0] + fmla v1.4s, v20.4s, v16.s[0] + fmla v6.4s, v20.4s, v17.s[0] + fmla v4.4s, v20.4s, v18.s[0] + fmla v7.4s, v20.4s, v19.s[0] + stp q20, q21, [x3] + cmp x18, x20 + b.lt .LBB0_43 +.LBB0_44: // %.preheader67 + // in Loop: Header=BB0_7 Depth=2 + ldr x15, [sp, #336] // 8-byte Folded Reload + ldr x16, [sp, #1096] // 8-byte Folded Reload + mov x5, xzr + add x14, x8, #128 + ldr x17, [sp, #1088] // 8-byte Folded Reload + mov w2, #1 // =0x1 + mov w3, #2 // =0x2 + mov w1, #3 // =0x3 + mov w18, #4 // =0x4 + add x15, x8, x15 + b .LBB0_46 + .p2align 2 +.LBB0_45: // %.loopexit63 + // in Loop: Header=BB0_46 Depth=3 + ldr x5, [sp, #1232] // 8-byte Folded Reload + add x17, x17, x5 + add x16, x16, x5 + mov x5, x18 + mov x18, x4 +.LBB0_46: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_48 Depth 4 + // Child Loop BB0_50 Depth 4 + madd x4, x5, x29, x9 + ldr x21, [sp, #1104] // 8-byte Folded Reload + add x4, x4, x21 + madd x2, x2, x29, x9 + madd x3, x3, x29, x9 + madd x1, x1, x29, x9 + add x2, x2, x21 + add x1, x1, x21 + add x4, x25, x4, lsl #2 + add x2, x25, x2, lsl #2 + stp q1, q0, [x4] + stp q6, q3, [x2] + add x2, x3, x21 + add x1, x25, x1, lsl #2 + add x2, x25, x2, lsl #2 + stp q4, q2, [x2] + stp q7, q5, [x1] + ldr x1, [sp, #1296] // 8-byte Folded Reload + cmp x18, x1 + b.ge .LBB0_51 +// %bb.47: // in Loop: Header=BB0_46 Depth=3 + madd x4, x18, x29, x9 + add x2, x18, #1 + add x3, x18, #2 + add x1, x18, #3 + madd x6, x2, x29, x9 + ldp q20, q21, [x8] + mov x5, xzr + madd x7, x3, x29, x9 + add x4, x4, x21 + madd x19, x1, x29, x9 + add x4, x25, x4, lsl #2 + ldp q1, q0, [x4] + add x4, x6, x21 + add x6, x7, x21 + add x7, x19, x21 + add x6, x25, x6, lsl #2 + add x7, x25, x7, lsl #2 + add x4, x25, x4, lsl #2 + ldp q4, q2, [x6] + madd x6, x18, x28, x10 + lsl x6, x6, #2 + ldp q7, q5, [x7] + ldr x7, [sp, #1280] // 8-byte Folded Reload + ldp q6, q3, [x4] + add x4, x18, #4 + ldr q19, [x7, x6] 
+ madd x6, x2, x28, x10 + lsl x6, x6, #2 + ldr q18, [x7, x6] + madd x6, x3, x28, x10 + lsl x6, x6, #2 + ldr q17, [x7, x6] + madd x6, x1, x28, x10 + lsl x6, x6, #2 + ldr q16, [x7, x6] + mov x6, x14 + mov x7, x17 + cmp xzr, x22 + b.ge .LBB0_49 + .p2align 2 +.LBB0_48: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_46 Depth=3 + // => This Inner Loop Header: Depth=4 + add x19, x6, #32 + fmla v1.4s, v20.4s, v19.s[0] + fmla v0.4s, v21.4s, v19.s[0] + add x5, x5, #4 + prfm pldl1keep, [x19] + ldp q22, q23, [x6, #-96] + fmla v3.4s, v21.4s, v18.s[0] + fmla v6.4s, v20.4s, v18.s[0] + fmla v2.4s, v21.4s, v17.s[0] + fmla v4.4s, v20.4s, v17.s[0] + add x19, x6, #96 + fmla v5.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q21, q20, [x6, #-64] + prfm pldl1keep, [x19] + add x19, x7, x27 + add x21, x19, x27 + fmla v0.4s, v23.4s, v19.s[1] + fmla v3.4s, v23.4s, v18.s[1] + fmla v2.4s, v23.4s, v17.s[1] + fmla v5.4s, v23.4s, v16.s[1] + fmla v1.4s, v22.4s, v19.s[1] + fmla v6.4s, v22.4s, v18.s[1] + fmla v4.4s, v22.4s, v17.s[1] + fmla v7.4s, v22.4s, v16.s[1] + fmla v0.4s, v20.4s, v19.s[2] + ldp q22, q23, [x6, #-32] + fmla v3.4s, v20.4s, v18.s[2] + fmla v2.4s, v20.4s, v17.s[2] + fmla v5.4s, v20.4s, v16.s[2] + fmla v1.4s, v21.4s, v19.s[2] + fmla v6.4s, v21.4s, v18.s[2] + fmla v4.4s, v21.4s, v17.s[2] + fmla v7.4s, v21.4s, v16.s[2] + ldp q20, q21, [x6], #128 + prfm pldl1keep, [x7] + fmla v0.4s, v23.4s, v19.s[3] + fmla v3.4s, v23.4s, v18.s[3] + fmla v2.4s, v23.4s, v17.s[3] + fmla v5.4s, v23.4s, v16.s[3] + fmla v1.4s, v22.4s, v19.s[3] + ldur q19, [x7, #-16] + prfm pldl1keep, [x19] + fmla v6.4s, v22.4s, v18.s[3] + ldur q18, [x19, #-16] + add x19, x21, x27 + prfm pldl1keep, [x21] + add x7, x7, #16 + fmla v4.4s, v22.4s, v17.s[3] + ldur q17, [x21, #-16] + prfm pldl1keep, [x19] + fmla v7.4s, v22.4s, v16.s[3] + ldur q16, [x19, #-16] + cmp x5, x22 + b.lt .LBB0_48 +.LBB0_49: // in Loop: Header=BB0_46 Depth=3 + ldp q23, q22, [x11] + fmla v0.4s, v21.4s, v19.s[0] + 
fmla v1.4s, v20.4s, v19.s[0] + fmla v3.4s, v21.4s, v18.s[0] + fmla v6.4s, v20.4s, v18.s[0] + ldr x7, [sp, #1304] // 8-byte Folded Reload + mov x5, x16 + fmla v2.4s, v21.4s, v17.s[0] + fmla v4.4s, v20.4s, v17.s[0] + mov x6, x15 + fmla v5.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q20, q21, [x12] + fmla v0.4s, v22.4s, v19.s[1] + fmla v3.4s, v22.4s, v18.s[1] + fmla v2.4s, v22.4s, v17.s[1] + fmla v5.4s, v22.4s, v16.s[1] + fmla v1.4s, v23.4s, v19.s[1] + fmla v6.4s, v23.4s, v18.s[1] + fmla v4.4s, v23.4s, v17.s[1] + fmla v7.4s, v23.4s, v16.s[1] + fmla v0.4s, v21.4s, v19.s[2] + ldp q23, q22, [x13] + fmla v3.4s, v21.4s, v18.s[2] + fmla v2.4s, v21.4s, v17.s[2] + fmla v5.4s, v21.4s, v16.s[2] + fmla v1.4s, v20.4s, v19.s[2] + fmla v6.4s, v20.4s, v18.s[2] + fmla v4.4s, v20.4s, v17.s[2] + fmla v7.4s, v20.4s, v16.s[2] + fmla v0.4s, v22.4s, v19.s[3] + fmla v3.4s, v22.4s, v18.s[3] + fmla v2.4s, v22.4s, v17.s[3] + fmla v5.4s, v22.4s, v16.s[3] + fmla v1.4s, v23.4s, v19.s[3] + fmla v6.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v17.s[3] + fmla v7.4s, v23.4s, v16.s[3] + cmp x7, x20 + b.ge .LBB0_45 + .p2align 2 +.LBB0_50: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_46 Depth=3 + // => This Inner Loop Header: Depth=4 + add x19, x5, x27 + prfm pldl1keep, [x5] + ldur s16, [x5, #-4] + add x7, x7, #1 + prfm pldl1keep, [x19] + ldur s17, [x19, #-4] + add x19, x19, x27 + add x5, x5, #4 + prfm pldl1keep, [x19] + ldur s18, [x19, #-4] + add x19, x19, x27 + prfm pldl1keep, [x19] + ldur s19, [x19, #-4] + prfm pldl1keep, [x6] + ldp q20, q21, [x6, #-32] + add x6, x6, #32 + fmla v0.4s, v21.4s, v16.s[0] + fmla v3.4s, v21.4s, v17.s[0] + fmla v2.4s, v21.4s, v18.s[0] + fmla v1.4s, v20.4s, v16.s[0] + fmla v6.4s, v20.4s, v17.s[0] + fmla v4.4s, v20.4s, v18.s[0] + fmla v7.4s, v20.4s, v19.s[0] + fmla v5.4s, v21.4s, v19.s[0] + cmp x7, x20 + b.lt .LBB0_50 + b .LBB0_45 + .p2align 2 +.LBB0_51: // in Loop: Header=BB0_7 Depth=2 + ldr x15, [sp, #1296] // 8-byte Folded 
Reload + ldr x16, [sp, #1128] // 8-byte Folded Reload + cmp x15, x16 + ldr x19, [sp, #1072] // 8-byte Folded Reload + b.ge .LBB0_57 +// %bb.52: // in Loop: Header=BB0_7 Depth=2 + ldr x2, [sp, #1296] // 8-byte Folded Reload + ldr x1, [sp, #1104] // 8-byte Folded Reload + mov x17, xzr + add x18, x2, #1 + madd x15, x2, x29, x9 + ldp q6, q7, [x8] + madd x16, x18, x29, x9 + madd x18, x18, x28, x10 + add x15, x15, x1 + add x16, x16, x1 + madd x1, x2, x28, x10 + ldr x2, [sp, #1280] // 8-byte Folded Reload + add x15, x25, x15, lsl #2 + lsl x18, x18, #2 + add x16, x25, x16, lsl #2 + ldp q1, q0, [x15] + ldp q3, q2, [x16] + lsl x1, x1, #2 + ldr q4, [x2, x18] + mov x18, x14 + ldr q5, [x2, x1] + ldr x1, [sp, #1120] // 8-byte Folded Reload + cmp xzr, x22 + b.ge .LBB0_54 + .p2align 2 +.LBB0_53: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x7, x18, #32 + ldr x2, [sp, #1184] // 8-byte Folded Reload + ldr x4, [sp, #1176] // 8-byte Folded Reload + fmla v1.4s, v6.4s, v5.s[0] + prfm pldl1keep, [x7] + ldp q16, q17, [x18, #-96] + fmla v0.4s, v7.4s, v5.s[0] + fmla v2.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q7, q6, [x18, #-64] + add x6, x18, #96 + prfm pldl1keep, [x6] + add x17, x17, #4 + add x2, x2, x1 + add x4, x4, x1 + add x1, x1, #16 + fmla v0.4s, v17.4s, v5.s[1] + fmla v2.4s, v17.4s, v4.s[1] + add x3, x2, #32 + add x5, x4, #32 + fmla v1.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + ldp q16, q17, [x18, #-32] + fmla v0.4s, v6.4s, v5.s[2] + fmla v2.4s, v6.4s, v4.s[2] + fmla v1.4s, v7.4s, v5.s[2] + fmla v3.4s, v7.4s, v4.s[2] + fmla v0.4s, v17.4s, v5.s[3] + fmla v2.4s, v17.4s, v4.s[3] + ldp q6, q7, [x18], #128 + prfm pldl1keep, [x5] + fmla v1.4s, v16.4s, v5.s[3] + ldr q5, [x4, #16] + prfm pldl1keep, [x3] + fmla v3.4s, v16.4s, v4.s[3] + ldr q4, [x2, #16] + cmp x17, x22 + b.lt .LBB0_53 +.LBB0_54: // in Loop: Header=BB0_7 Depth=2 + ldp q17, q16, [x11] + fmla v0.4s, v7.4s, v5.s[0] + fmla v1.4s, v6.4s, v5.s[0] + 
fmla v2.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q6, q7, [x12] + ldr x1, [sp, #416] // 8-byte Folded Reload + ldr x2, [sp, #1304] // 8-byte Folded Reload + mov x17, xzr + mov x18, xzr + fmla v0.4s, v16.4s, v5.s[1] + fmla v2.4s, v16.4s, v4.s[1] + add x1, x8, x1 + fmla v1.4s, v17.4s, v5.s[1] + fmla v3.4s, v17.4s, v4.s[1] + ldp q17, q16, [x13] + fmla v0.4s, v7.4s, v5.s[2] + fmla v2.4s, v7.4s, v4.s[2] + fmla v1.4s, v6.4s, v5.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v0.4s, v16.4s, v5.s[3] + fmla v2.4s, v16.4s, v4.s[3] + fmla v1.4s, v17.4s, v5.s[3] + fmla v3.4s, v17.4s, v4.s[3] + cmp x2, x20 + b.ge .LBB0_56 + .p2align 2 +.LBB0_55: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x3, x1, x18, lsl #3 + add x4, x24, x18 + add x5, x23, x18 + add x2, x2, #1 + add x4, x4, #4 + add x5, x5, #4 + add x3, x3, #32 + prfm pldl1keep, [x5] + ldr s4, [x23, x18] + prfm pldl1keep, [x4] + add x4, x1, x17 + ldr s5, [x24, x18] + prfm pldl1keep, [x3] + add x18, x18, #4 + ldp q6, q7, [x4] + add x17, x17, #32 + fmla v0.4s, v7.4s, v4.s[0] + fmla v1.4s, v6.4s, v4.s[0] + fmla v2.4s, v7.4s, v5.s[0] + fmla v3.4s, v6.4s, v5.s[0] + cmp x2, x20 + b.lt .LBB0_55 +.LBB0_56: // in Loop: Header=BB0_7 Depth=2 + stp q1, q0, [x15] + stp q3, q2, [x16] +.LBB0_57: // in Loop: Header=BB0_7 Depth=2 + ldr x15, [sp, #1024] // 8-byte Folded Reload + ldr x16, [sp, #1128] // 8-byte Folded Reload + cmp x16, x15 + b.ge .LBB0_63 +// %bb.58: // in Loop: Header=BB0_7 Depth=2 + ldr x17, [sp, #1128] // 8-byte Folded Reload + ldr x16, [sp, #1104] // 8-byte Folded Reload + mov x15, xzr + madd x9, x17, x29, x9 + madd x10, x17, x28, x10 + ldp q4, q3, [x8] + ldr x18, [sp, #896] // 8-byte Folded Reload + add x9, x9, x16 + ldr x16, [sp, #1280] // 8-byte Folded Reload + lsl x10, x10, #2 + add x9, x25, x9, lsl #2 + ldp q1, q0, [x9] + ldr q2, [x16, x10] + ldr x10, [sp, #904] // 8-byte Folded Reload + cmp xzr, x22 + b.ge .LBB0_60 + .p2align 2 +.LBB0_59: // Parent Loop 
BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x17, x14, #32 + fmla v1.4s, v4.4s, v2.s[0] + fmla v0.4s, v3.4s, v2.s[0] + add x16, x14, #96 + prfm pldl1keep, [x17] + ldp q5, q6, [x14, #-96] + add x15, x15, #4 + ldp q4, q3, [x14, #-64] + prfm pldl1keep, [x16] + fmla v0.4s, v6.4s, v2.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldp q5, q6, [x14, #-32] + prfm pldl1keep, [x10] + fmla v0.4s, v3.4s, v2.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v6.4s, v2.s[3] + fmla v1.4s, v5.4s, v2.s[3] + ldur q2, [x10, #-16] + ldp q4, q3, [x14], #128 + add x10, x10, #16 + cmp x15, x22 + b.lt .LBB0_59 +.LBB0_60: // in Loop: Header=BB0_7 Depth=2 + ldp q6, q5, [x11] + fmla v0.4s, v3.4s, v2.s[0] + fmla v1.4s, v4.4s, v2.s[0] + ldp q3, q4, [x12] + ldr x11, [sp, #416] // 8-byte Folded Reload + mov x10, xzr + mov x14, xzr + fmla v0.4s, v5.4s, v2.s[1] + add x8, x8, x11 + ldr x11, [sp, #1304] // 8-byte Folded Reload + fmla v1.4s, v6.4s, v2.s[1] + ldp q6, q5, [x13] + fmla v0.4s, v4.4s, v2.s[2] + fmla v1.4s, v3.4s, v2.s[2] + fmla v0.4s, v5.4s, v2.s[3] + fmla v1.4s, v6.4s, v2.s[3] + cmp x11, x20 + b.ge .LBB0_62 + .p2align 2 +.LBB0_61: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x12, x8, x14, lsl #3 + add x13, x18, x14 + add x11, x11, #1 + add x13, x13, #4 + add x12, x12, #32 + prfm pldl1keep, [x13] + ldr s2, [x18, x14] + add x13, x8, x10 + add x14, x14, #4 + add x10, x10, #32 + prfm pldl1keep, [x12] + ldp q3, q4, [x13] + fmla v0.4s, v4.4s, v2.s[0] + fmla v1.4s, v3.4s, v2.s[0] + cmp x11, x20 + b.lt .LBB0_61 +.LBB0_62: // in Loop: Header=BB0_7 Depth=2 + stp q1, q0, [x9] +.LBB0_63: // in Loop: Header=BB0_7 Depth=2 + bl free + ldr x30, [sp, #1080] // 8-byte Folded Reload + ldr x8, [sp, #592] // 8-byte Folded Reload + cmp x30, x8 + b.ge .LBB0_37 +.LBB0_64: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #432] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #1016] // 8-byte Folded 
Reload + ldr x9, [sp, #1008] // 8-byte Folded Reload + mov x16, x19 + mov x12, xzr + ldr x15, [sp, #1064] // 8-byte Folded Reload + ldr x19, [sp, #1080] // 8-byte Folded Reload + mul x8, x16, x8 + ldr x14, [sp, #1256] // 8-byte Folded Reload + ldr x17, [sp, #1272] // 8-byte Folded Reload + add x10, x14, x29 + ldr x21, [sp, #992] // 8-byte Folded Reload + ldr x23, [sp, #976] // 8-byte Folded Reload + ldr x24, [sp, #968] // 8-byte Folded Reload + ldp x30, x25, [sp, #320] // 16-byte Folded Reload + madd x9, x15, x9, x8 + add x8, x9, x19 + add x13, x8, x29 + lsl x11, x8, #2 + add x14, x8, x14 + add x8, x8, x10 + lsl x10, x13, #2 + ldr q0, [x17, x11] + lsl x11, x14, #2 + ldr x13, [sp, #1264] // 8-byte Folded Reload + ldr x14, [sp, #1280] // 8-byte Folded Reload + lsl x8, x8, #2 + ldr q2, [x17, x10] + ldr x10, [sp, #1040] // 8-byte Folded Reload + mul x10, x16, x10 + ldr q1, [x17, x11] + ldr x11, [sp, #1048] // 8-byte Folded Reload + madd x11, x15, x11, x10 + ldr q3, [x17, x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + add x10, x11, x19 + lsl x10, x10, #2 + ldr q7, [x13, x10] + ldr x10, [sp, #1032] // 8-byte Folded Reload + ldr x13, [sp, #1056] // 8-byte Folded Reload + mul x10, x16, x10 + madd x10, x15, x13, x10 + ldr x15, [sp, #1120] // 8-byte Folded Reload + lsl x13, x10, #2 + ldr q4, [x14, x13] + add x13, x10, x28 + lsl x13, x13, #2 + ldr q5, [x14, x13] + add x13, x10, x28, lsl #1 + lsl x13, x13, #2 + ldr q6, [x14, x13] + ldr x14, [sp, #656] // 8-byte Folded Reload + orr x13, x8, #0x20 + .p2align 2 +.LBB0_65: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x16, [sp, #1288] // 8-byte Folded Reload + fmla v0.4s, v7.4s, v4.s[0] + fmla v2.4s, v7.4s, v5.s[0] + cmp x12, x22 + add x16, x16, x15 + prfm pldl1keep, [x16, #16] + ldr q16, [x16] + b.ge .LBB0_67 +// %bb.66: // in Loop: Header=BB0_65 Depth=3 + ldr x7, [sp, #768] // 8-byte Folded Reload + ldr x16, [sp, #1224] // 8-byte Folded Reload + fmla v1.4s, 
v7.4s, v6.s[0] + fmla v3.4s, v7.4s, v16.s[0] + ldr x18, [sp, #1208] // 8-byte Folded Reload + ldr x2, [sp, #1216] // 8-byte Folded Reload + add x5, x30, x14 + add x4, x25, x14 + ldr x6, [sp, #776] // 8-byte Folded Reload + stur q7, [x13, #-32] + add x12, x12, #4 + add x7, x7, x14 + add x6, x6, x14 + add x16, x16, x15 + add x18, x18, x15 + add x2, x2, x15 + add x15, x15, #16 + prfm pldl1keep, [x7] + ldr x7, [sp, #752] // 8-byte Folded Reload + add x17, x16, #32 + add x1, x18, #32 + add x3, x2, #32 + ldr q7, [x7, x14] + stur q7, [x13, #-16] + prfm pldl1keep, [x6] + ldr x6, [sp, #744] // 8-byte Folded Reload + fmla v0.4s, v7.4s, v4.s[1] + fmla v2.4s, v7.4s, v5.s[1] + fmla v1.4s, v7.4s, v6.s[1] + fmla v3.4s, v7.4s, v16.s[1] + ldr q7, [x6, x14] + str q7, [x13] + prfm pldl1keep, [x5] + ldr x5, [sp, #736] // 8-byte Folded Reload + fmla v0.4s, v7.4s, v4.s[2] + fmla v2.4s, v7.4s, v5.s[2] + fmla v1.4s, v7.4s, v6.s[2] + fmla v3.4s, v7.4s, v16.s[2] + ldr q7, [x5, x14] + str q7, [x13, #16] + prfm pldl1keep, [x4] + ldr x4, [sp, #760] // 8-byte Folded Reload + fmla v0.4s, v7.4s, v4.s[3] + fmla v2.4s, v7.4s, v5.s[3] + fmla v1.4s, v7.4s, v6.s[3] + fmla v3.4s, v7.4s, v16.s[3] + add x13, x13, #64 + ldr q7, [x4, x14] + prfm pldl1keep, [x3] + ldr q4, [x2, #16] + prfm pldl1keep, [x1] + ldr q5, [x18, #16] + prfm pldl1keep, [x17] + ldr q6, [x16, #16] + ldr x16, [sp, #1240] // 8-byte Folded Reload + add x14, x14, x16 + b .LBB0_65 + .p2align 2 +.LBB0_67: // in Loop: Header=BB0_7 Depth=2 + ldr x15, [sp, #1000] // 8-byte Folded Reload + ldr x6, [sp, #984] // 8-byte Folded Reload + fmla v1.4s, v7.4s, v6.s[0] + fmla v3.4s, v7.4s, v16.s[0] + ldr x16, [sp, #1264] // 8-byte Folded Reload + str q7, [x8, x22, lsl #4] + mov x12, xzr + ldr x7, [sp, #1272] // 8-byte Folded Reload + mov x13, xzr + madd x14, x6, x15, x11 + add x14, x14, x19 + lsl x14, x14, #2 + ldr q17, [x16, x14] + madd x14, x23, x15, x11 + madd x11, x24, x15, x11 + ldr x15, [sp, #672] // 8-byte Folded Reload + add x14, x14, x19 + add 
x11, x11, x19 + lsl x14, x14, #2 + lsl x11, x11, #2 + str q17, [x8, x6, lsl #4] + fmla v0.4s, v17.4s, v4.s[1] + fmla v2.4s, v17.4s, v5.s[1] + fmla v1.4s, v17.4s, v6.s[1] + fmla v3.4s, v17.4s, v16.s[1] + ldr q7, [x16, x14] + ldr x14, [sp, #464] // 8-byte Folded Reload + fmla v0.4s, v7.4s, v4.s[2] + str q7, [x8, x23, lsl #4] + fmla v2.4s, v7.4s, v5.s[2] + fmla v1.4s, v7.4s, v6.s[2] + fmla v3.4s, v7.4s, v16.s[2] + ldr q7, [x16, x11] + ldr x11, [sp, #664] // 8-byte Folded Reload + add x11, x11, x14 + add x14, x15, x14 + ldr x15, [sp, #1304] // 8-byte Folded Reload + fmla v0.4s, v7.4s, v4.s[3] + fmla v2.4s, v7.4s, v5.s[3] + fmla v1.4s, v7.4s, v6.s[3] + fmla v3.4s, v7.4s, v16.s[3] + str q7, [x8, x24, lsl #4] + cmp x15, x20 + b.ge .LBB0_69 + .p2align 2 +.LBB0_68: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x17, [sp, #1152] // 8-byte Folded Reload + add x16, x11, x13 + add x17, x17, x12 + add x12, x12, #4 + prfm pldl1keep, [x17] + ldur s4, [x17, #-4] + add x17, x17, x27 + prfm pldl1keep, [x17] + ldur s5, [x17, #-4] + add x17, x17, x27 + prfm pldl1keep, [x17] + ldur s6, [x17, #-4] + add x17, x17, x27 + prfm pldl1keep, [x17] + ldur s7, [x17, #-4] + prfm pldl1keep, [x16] + ldr q16, [x14, x13] + add x13, x13, x21 + fmla v0.4s, v16.4s, v4.s[0] + str q16, [x8, x15, lsl #4] + add x15, x15, #1 + fmla v2.4s, v16.4s, v5.s[0] + fmla v1.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v7.s[0] + cmp x15, x20 + b.lt .LBB0_68 +.LBB0_69: // %.preheader66 + // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #312] // 8-byte Folded Reload + ldr x13, [sp, #1096] // 8-byte Folded Reload + mov x2, xzr + add x11, x8, #48 + ldr x14, [sp, #1088] // 8-byte Folded Reload + mov w16, #1 // =0x1 + mov w17, #2 // =0x2 + mov w18, #3 // =0x3 + mov w15, #4 // =0x4 + add x12, x8, x12 + b .LBB0_71 + .p2align 2 +.LBB0_70: // %.loopexit62 + // in Loop: Header=BB0_71 Depth=3 + ldr x2, [sp, #1232] // 8-byte Folded Reload + add x14, x14, x2 + add x13, x13, x2 + 
mov x2, x15 + mov x15, x1 +.LBB0_71: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_73 Depth 4 + // Child Loop BB0_75 Depth 4 + madd x1, x2, x29, x9 + add x1, x1, x19 + madd x16, x16, x29, x9 + madd x17, x17, x29, x9 + madd x18, x18, x29, x9 + add x16, x16, x19 + add x17, x17, x19 + lsl x1, x1, #2 + lsl x16, x16, #2 + lsl x17, x17, #2 + str q0, [x7, x1] + str q2, [x7, x16] + add x16, x18, x19 + lsl x16, x16, #2 + str q1, [x7, x17] + str q3, [x7, x16] + ldr x16, [sp, #1296] // 8-byte Folded Reload + cmp x15, x16 + b.ge .LBB0_76 +// %bb.72: // in Loop: Header=BB0_71 Depth=3 + add x16, x15, #1 + add x17, x15, #2 + madd x1, x15, x29, x9 + add x18, x15, #3 + madd x3, x16, x29, x9 + ldr q16, [x8] + mov x2, xzr + add x1, x1, x19 + madd x4, x17, x29, x9 + add x3, x3, x19 + add x4, x4, x19 + lsl x1, x1, #2 + lsl x3, x3, #2 + lsl x4, x4, #2 + ldr q0, [x7, x1] + madd x1, x18, x29, x9 + ldr q2, [x7, x3] + madd x3, x15, x28, x10 + ldr q1, [x7, x4] + ldr x4, [sp, #1280] // 8-byte Folded Reload + add x1, x1, x19 + lsl x3, x3, #2 + lsl x1, x1, #2 + ldr q7, [x4, x3] + madd x3, x16, x28, x10 + ldr q3, [x7, x1] + add x1, x15, #4 + lsl x3, x3, #2 + ldr q6, [x4, x3] + madd x3, x17, x28, x10 + lsl x3, x3, #2 + ldr q5, [x4, x3] + madd x3, x18, x28, x10 + lsl x3, x3, #2 + ldr q4, [x4, x3] + mov x3, x11 + mov x4, x14 + cmp xzr, x22 + b.ge .LBB0_74 + .p2align 2 +.LBB0_73: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_71 Depth=3 + // => This Inner Loop Header: Depth=4 + add x5, x3, #32 + fmla v0.4s, v16.4s, v7.s[0] + fmla v2.4s, v16.4s, v6.s[0] + add x2, x2, #4 + fmla v1.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, v4.s[0] + prfm pldl1keep, [x5] + add x5, x4, x27 + ldp q16, q17, [x3, #-32] + fmla v0.4s, v16.4s, v7.s[1] + fmla v2.4s, v16.4s, v6.s[1] + fmla v1.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + fmla v0.4s, v17.4s, v7.s[2] + fmla v2.4s, v17.4s, v6.s[2] + fmla v1.4s, v17.4s, 
v5.s[2] + fmla v3.4s, v17.4s, v4.s[2] + ldp q17, q16, [x3], #64 + prfm pldl1keep, [x4] + fmla v0.4s, v17.4s, v7.s[3] + ldur q7, [x4, #-16] + prfm pldl1keep, [x5] + fmla v2.4s, v17.4s, v6.s[3] + ldur q6, [x5, #-16] + add x5, x5, x27 + fmla v1.4s, v17.4s, v5.s[3] + fmla v3.4s, v17.4s, v4.s[3] + add x4, x4, #16 + prfm pldl1keep, [x5] + ldur q5, [x5, #-16] + add x5, x5, x27 + prfm pldl1keep, [x5] + ldur q4, [x5, #-16] + cmp x2, x22 + b.lt .LBB0_73 +.LBB0_74: // in Loop: Header=BB0_71 Depth=3 + ldr q17, [x8, x6, lsl #4] + fmla v0.4s, v16.4s, v7.s[0] + fmla v2.4s, v16.4s, v6.s[0] + fmla v1.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, v4.s[0] + ldr q16, [x8, x23, lsl #4] + ldr q18, [x8, x24, lsl #4] + ldr x4, [sp, #1304] // 8-byte Folded Reload + mov x2, x13 + mov x3, x12 + fmla v0.4s, v17.4s, v7.s[1] + fmla v2.4s, v17.4s, v6.s[1] + fmla v1.4s, v17.4s, v5.s[1] + fmla v3.4s, v17.4s, v4.s[1] + fmla v0.4s, v16.4s, v7.s[2] + fmla v2.4s, v16.4s, v6.s[2] + fmla v1.4s, v16.4s, v5.s[2] + fmla v3.4s, v16.4s, v4.s[2] + fmla v0.4s, v18.4s, v7.s[3] + fmla v2.4s, v18.4s, v6.s[3] + fmla v1.4s, v18.4s, v5.s[3] + fmla v3.4s, v18.4s, v4.s[3] + cmp x4, x20 + b.ge .LBB0_70 + .p2align 2 +.LBB0_75: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_71 Depth=3 + // => This Inner Loop Header: Depth=4 + add x5, x2, x27 + prfm pldl1keep, [x2] + ldur s4, [x2, #-4] + add x4, x4, #1 + prfm pldl1keep, [x5] + ldur s5, [x5, #-4] + add x5, x5, x27 + add x2, x2, #4 + prfm pldl1keep, [x5] + ldur s6, [x5, #-4] + add x5, x5, x27 + prfm pldl1keep, [x5] + ldur s7, [x5, #-4] + prfm pldl1keep, [x3] + ldur q16, [x3, #-16] + add x3, x3, #16 + fmla v0.4s, v16.4s, v4.s[0] + fmla v2.4s, v16.4s, v5.s[0] + fmla v1.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v7.s[0] + cmp x4, x20 + b.lt .LBB0_75 + b .LBB0_70 + .p2align 2 +.LBB0_76: // in Loop: Header=BB0_7 Depth=2 + ldr x13, [sp, #1296] // 8-byte Folded Reload + ldr x14, [sp, #1128] // 8-byte Folded Reload + cmp x13, x14 + b.ge .LBB0_82 +// 
%bb.77: // in Loop: Header=BB0_7 Depth=2 + ldr x17, [sp, #1296] // 8-byte Folded Reload + ldr x18, [sp, #1280] // 8-byte Folded Reload + mov x15, xzr + add x16, x17, #1 + madd x13, x17, x29, x9 + madd x17, x17, x28, x10 + ldr q4, [x8] + madd x14, x16, x29, x9 + madd x16, x16, x28, x10 + add x13, x13, x19 + lsl x17, x17, #2 + add x14, x14, x19 + add x13, x7, x13, lsl #2 + lsl x16, x16, #2 + ldr q3, [x18, x17] + ldr x17, [sp, #1120] // 8-byte Folded Reload + add x14, x7, x14, lsl #2 + ldr q2, [x18, x16] + mov x16, x11 + ldr q0, [x13] + ldr q1, [x14] + cmp xzr, x22 + b.ge .LBB0_79 + .p2align 2 +.LBB0_78: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x16, #32 + ldr x18, [sp, #1184] // 8-byte Folded Reload + ldr x2, [sp, #1176] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v3.s[0] + prfm pldl1keep, [x4] + fmla v1.4s, v4.4s, v2.s[0] + ldp q4, q5, [x16, #-32] + add x15, x15, #4 + add x18, x18, x17 + add x2, x2, x17 + add x17, x17, #16 + add x1, x18, #32 + add x3, x2, #32 + fmla v0.4s, v4.4s, v3.s[1] + fmla v1.4s, v4.4s, v2.s[1] + fmla v0.4s, v5.4s, v3.s[2] + fmla v1.4s, v5.4s, v2.s[2] + ldp q5, q4, [x16], #64 + prfm pldl1keep, [x3] + fmla v0.4s, v5.4s, v3.s[3] + ldr q3, [x2, #16] + prfm pldl1keep, [x1] + fmla v1.4s, v5.4s, v2.s[3] + ldr q2, [x18, #16] + cmp x15, x22 + b.lt .LBB0_78 +.LBB0_79: // in Loop: Header=BB0_7 Depth=2 + ldr q5, [x8, x6, lsl #4] + fmla v0.4s, v4.4s, v3.s[0] + fmla v1.4s, v4.4s, v2.s[0] + ldr q4, [x8, x23, lsl #4] + ldr x15, [sp, #1120] // 8-byte Folded Reload + ldr x16, [sp, #1304] // 8-byte Folded Reload + fmla v0.4s, v5.4s, v3.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldr q5, [x8, x24, lsl #4] + fmla v0.4s, v4.4s, v3.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v5.4s, v3.s[3] + fmla v1.4s, v5.4s, v2.s[3] + cmp x16, x20 + b.ge .LBB0_81 + .p2align 2 +.LBB0_80: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x1, [sp, #1168] // 8-byte 
Folded Reload + ldr x2, [sp, #1160] // 8-byte Folded Reload + add x16, x16, #1 + add x17, x1, x15 + add x18, x2, x15 + add x17, x17, #4 + add x18, x18, #4 + prfm pldl1keep, [x18] + ldr s2, [x2, x15] + prfm pldl1keep, [x17] + ldr s3, [x1, x15] + prfm pldl1keep, [x12] + ldur q4, [x12, #-16] + add x12, x12, #16 + add x15, x15, #4 + fmla v0.4s, v4.4s, v2.s[0] + fmla v1.4s, v4.4s, v3.s[0] + cmp x16, x20 + b.lt .LBB0_80 +.LBB0_81: // in Loop: Header=BB0_7 Depth=2 + str q0, [x13] + str q1, [x14] +.LBB0_82: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #1024] // 8-byte Folded Reload + ldr x13, [sp, #1128] // 8-byte Folded Reload + cmp x13, x12 + b.ge .LBB0_88 +// %bb.83: // in Loop: Header=BB0_7 Depth=2 + ldr x13, [sp, #1128] // 8-byte Folded Reload + ldr q2, [x8] + mov x12, xzr + madd x9, x13, x29, x9 + madd x10, x13, x28, x10 + ldr x13, [sp, #1280] // 8-byte Folded Reload + ldr x14, [sp, #896] // 8-byte Folded Reload + add x9, x9, x19 + lsl x10, x10, #2 + add x9, x7, x9, lsl #2 + ldr q1, [x13, x10] + ldr x10, [sp, #904] // 8-byte Folded Reload + ldr q0, [x9] + cmp xzr, x22 + b.ge .LBB0_85 + .p2align 2 +.LBB0_84: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x11, #32 + fmla v0.4s, v2.4s, v1.s[0] + add x12, x12, #4 + prfm pldl1keep, [x13] + ldp q2, q3, [x11, #-32] + fmla v0.4s, v2.4s, v1.s[1] + fmla v0.4s, v3.4s, v1.s[2] + ldp q3, q2, [x11], #64 + prfm pldl1keep, [x10] + fmla v0.4s, v3.4s, v1.s[3] + ldur q1, [x10, #-16] + add x10, x10, #16 + cmp x12, x22 + b.lt .LBB0_84 +.LBB0_85: // in Loop: Header=BB0_7 Depth=2 + ldr q3, [x8, x6, lsl #4] + fmla v0.4s, v2.4s, v1.s[0] + ldr x11, [sp, #280] // 8-byte Folded Reload + ldr q2, [x8, x23, lsl #4] + mov x10, xzr + mov w12, #16 // =0x10 + fmla v0.4s, v3.4s, v1.s[1] + ldr q3, [x8, x24, lsl #4] + add x8, x8, x11 + ldr x11, [sp, #880] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[2] + fmla v0.4s, v3.4s, v1.s[3] + ldr x13, [sp, #1304] // 8-byte Folded Reload + add 
x13, x13, xzr + cmp x13, x20 + b.ge .LBB0_87 + .p2align 2 +.LBB0_86: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x8, x12 + prfm pldl1keep, [x11] + ldr s1, [x14, x10, lsl #2] + prfm pldl1keep, [x13] + ldr q2, [x8, x10, lsl #4] + add x10, x10, #1 + add x12, x12, #16 + add x11, x11, #4 + fmla v0.4s, v2.4s, v1.s[0] + ldr x13, [sp, #1304] // 8-byte Folded Reload + add x13, x13, x10 + cmp x13, x20 + b.lt .LBB0_86 +.LBB0_87: // in Loop: Header=BB0_7 Depth=2 + str q0, [x9] +.LBB0_88: // in Loop: Header=BB0_7 Depth=2 + bl free + ldr x30, [sp, #1080] // 8-byte Folded Reload + ldr x19, [sp, #1072] // 8-byte Folded Reload + ldr x8, [sp, #592] // 8-byte Folded Reload + ldr x9, [sp, #584] // 8-byte Folded Reload + cmp x8, x9 + b.ge .LBB0_38 +.LBB0_89: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #424] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #1016] // 8-byte Folded Reload + ldr x9, [sp, #1008] // 8-byte Folded Reload + mov x16, x19 + mov x12, xzr + ldr x15, [sp, #1064] // 8-byte Folded Reload + ldr x14, [sp, #1256] // 8-byte Folded Reload + mul x8, x19, x8 + ldr x19, [sp, #592] // 8-byte Folded Reload + ldr x17, [sp, #1272] // 8-byte Folded Reload + add x10, x14, x29 + ldr x21, [sp, #992] // 8-byte Folded Reload + ldr x23, [sp, #976] // 8-byte Folded Reload + ldr x24, [sp, #968] // 8-byte Folded Reload + ldp x30, x25, [sp, #296] // 16-byte Folded Reload + madd x9, x15, x9, x8 + add x8, x9, x19 + add x13, x8, x29 + lsl x11, x8, #2 + add x14, x8, x14 + add x8, x8, x10 + lsl x10, x13, #2 + ldr d0, [x17, x11] + lsl x11, x14, #2 + ldr x13, [sp, #1264] // 8-byte Folded Reload + ldr x14, [sp, #1280] // 8-byte Folded Reload + lsl x8, x8, #2 + ldr d2, [x17, x10] + ldr x10, [sp, #1040] // 8-byte Folded Reload + mul x10, x16, x10 + ldr d1, [x17, x11] + ldr x11, [sp, #1048] // 8-byte Folded Reload + madd x11, x15, x11, x10 + ldr d3, [x17, x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 
+ add x10, x11, x19 + lsl x10, x10, #2 + ldr d7, [x13, x10] + ldr x10, [sp, #1032] // 8-byte Folded Reload + ldr x13, [sp, #1056] // 8-byte Folded Reload + mul x10, x16, x10 + madd x10, x15, x13, x10 + ldr x15, [sp, #1120] // 8-byte Folded Reload + lsl x13, x10, #2 + ldr q4, [x14, x13] + add x13, x10, x28 + lsl x13, x13, #2 + ldr q5, [x14, x13] + add x13, x10, x28, lsl #1 + lsl x13, x13, #2 + ldr q6, [x14, x13] + ldr x14, [sp, #656] // 8-byte Folded Reload + orr x13, x8, #0x10 + .p2align 2 +.LBB0_90: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x16, [sp, #1288] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v4.s[0] + fmla v2.2s, v7.2s, v5.s[0] + cmp x12, x22 + add x16, x16, x15 + prfm pldl1keep, [x16, #16] + ldr q16, [x16] + b.ge .LBB0_92 +// %bb.91: // in Loop: Header=BB0_90 Depth=3 + ldr x7, [sp, #720] // 8-byte Folded Reload + ldr x16, [sp, #1224] // 8-byte Folded Reload + fmla v1.2s, v7.2s, v6.s[0] + fmla v3.2s, v7.2s, v16.s[0] + ldr x18, [sp, #1208] // 8-byte Folded Reload + ldr x2, [sp, #1216] // 8-byte Folded Reload + add x5, x30, x14 + add x4, x25, x14 + ldr x6, [sp, #728] // 8-byte Folded Reload + stur d7, [x13, #-16] + add x12, x12, #4 + add x7, x7, x14 + add x6, x6, x14 + add x16, x16, x15 + add x18, x18, x15 + add x2, x2, x15 + add x15, x15, #16 + prfm pldl1keep, [x7] + ldr x7, [sp, #704] // 8-byte Folded Reload + add x17, x16, #32 + add x1, x18, #32 + add x3, x2, #32 + ldr d7, [x7, x14] + stur d7, [x13, #-8] + prfm pldl1keep, [x6] + ldr x6, [sp, #696] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v4.s[1] + fmla v2.2s, v7.2s, v5.s[1] + fmla v1.2s, v7.2s, v6.s[1] + fmla v3.2s, v7.2s, v16.s[1] + ldr d7, [x6, x14] + str d7, [x13] + prfm pldl1keep, [x5] + ldr x5, [sp, #688] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v4.s[2] + fmla v2.2s, v7.2s, v5.s[2] + fmla v1.2s, v7.2s, v6.s[2] + fmla v3.2s, v7.2s, v16.s[2] + ldr d7, [x5, x14] + str d7, [x13, #8] + prfm pldl1keep, [x4] + ldr x4, [sp, #712] // 
8-byte Folded Reload + fmla v0.2s, v7.2s, v4.s[3] + fmla v2.2s, v7.2s, v5.s[3] + fmla v1.2s, v7.2s, v6.s[3] + fmla v3.2s, v7.2s, v16.s[3] + add x13, x13, #32 + ldr d7, [x4, x14] + prfm pldl1keep, [x3] + ldr q4, [x2, #16] + prfm pldl1keep, [x1] + ldr q5, [x18, #16] + prfm pldl1keep, [x17] + ldr q6, [x16, #16] + ldr x16, [sp, #1240] // 8-byte Folded Reload + add x14, x14, x16 + b .LBB0_90 + .p2align 2 +.LBB0_92: // in Loop: Header=BB0_7 Depth=2 + ldr x15, [sp, #1000] // 8-byte Folded Reload + ldr x6, [sp, #984] // 8-byte Folded Reload + fmla v1.2s, v7.2s, v6.s[0] + fmla v3.2s, v7.2s, v16.s[0] + ldr x16, [sp, #1264] // 8-byte Folded Reload + str d7, [x8, x22, lsl #3] + mov x12, xzr + ldr x7, [sp, #648] // 8-byte Folded Reload + ldr x25, [sp, #680] // 8-byte Folded Reload + mov x13, xzr + madd x14, x6, x15, x11 + ldr x30, [sp, #1272] // 8-byte Folded Reload + add x14, x14, x19 + lsl x14, x14, #2 + ldr d17, [x16, x14] + madd x14, x23, x15, x11 + madd x11, x24, x15, x11 + ldr x15, [sp, #672] // 8-byte Folded Reload + add x14, x14, x19 + add x11, x11, x19 + lsl x14, x14, #2 + lsl x11, x11, #2 + str d17, [x8, x6, lsl #3] + fmla v0.2s, v17.2s, v4.s[1] + fmla v2.2s, v17.2s, v5.s[1] + fmla v1.2s, v17.2s, v6.s[1] + fmla v3.2s, v17.2s, v16.s[1] + ldr d7, [x16, x14] + ldr x14, [sp, #456] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v4.s[2] + str d7, [x8, x23, lsl #3] + fmla v2.2s, v7.2s, v5.s[2] + fmla v1.2s, v7.2s, v6.s[2] + fmla v3.2s, v7.2s, v16.s[2] + ldr d7, [x16, x11] + ldr x11, [sp, #664] // 8-byte Folded Reload + add x11, x11, x14 + add x14, x15, x14 + ldr x15, [sp, #1304] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v4.s[3] + fmla v2.2s, v7.2s, v5.s[3] + fmla v1.2s, v7.2s, v6.s[3] + fmla v3.2s, v7.2s, v16.s[3] + str d7, [x8, x24, lsl #3] + cmp x15, x20 + b.ge .LBB0_94 + .p2align 2 +.LBB0_93: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x17, [sp, #1152] // 8-byte Folded Reload + add x16, x11, x13 + add x17, 
x17, x12 + add x12, x12, #4 + prfm pldl1keep, [x17] + ldur s4, [x17, #-4] + add x17, x17, x27 + prfm pldl1keep, [x17] + ldur s5, [x17, #-4] + add x17, x17, x27 + prfm pldl1keep, [x17] + ldur s6, [x17, #-4] + add x17, x17, x27 + prfm pldl1keep, [x17] + ldur s7, [x17, #-4] + prfm pldl1keep, [x16] + ldr d16, [x14, x13] + add x13, x13, x21 + fmla v0.2s, v16.2s, v4.s[0] + str d16, [x8, x15, lsl #3] + add x15, x15, #1 + fmla v2.2s, v16.2s, v5.s[0] + fmla v1.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v7.s[0] + cmp x15, x20 + b.lt .LBB0_93 +.LBB0_94: // %.preheader65 + // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #288] // 8-byte Folded Reload + ldr x13, [sp, #1096] // 8-byte Folded Reload + mov x2, xzr + add x11, x8, #24 + ldr x14, [sp, #1088] // 8-byte Folded Reload + mov w16, #1 // =0x1 + mov w17, #2 // =0x2 + mov w18, #3 // =0x3 + mov w15, #4 // =0x4 + add x12, x8, x12 + b .LBB0_96 + .p2align 2 +.LBB0_95: // %.loopexit61 + // in Loop: Header=BB0_96 Depth=3 + ldr x2, [sp, #1232] // 8-byte Folded Reload + add x14, x14, x2 + add x13, x13, x2 + mov x2, x15 + mov x15, x1 +.LBB0_96: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_98 Depth 4 + // Child Loop BB0_100 Depth 4 + madd x1, x2, x29, x9 + add x1, x1, x19 + madd x16, x16, x29, x9 + madd x17, x17, x29, x9 + madd x18, x18, x29, x9 + add x16, x16, x19 + add x17, x17, x19 + lsl x1, x1, #2 + lsl x16, x16, #2 + lsl x17, x17, #2 + str d0, [x30, x1] + str d2, [x30, x16] + add x16, x18, x19 + lsl x16, x16, #2 + str d1, [x30, x17] + str d3, [x30, x16] + ldr x16, [sp, #1296] // 8-byte Folded Reload + cmp x15, x16 + b.ge .LBB0_101 +// %bb.97: // in Loop: Header=BB0_96 Depth=3 + add x16, x15, #1 + add x17, x15, #2 + madd x1, x15, x29, x9 + add x18, x15, #3 + madd x3, x16, x29, x9 + ldr d16, [x8] + mov x2, xzr + add x1, x1, x19 + madd x4, x17, x29, x9 + add x3, x3, x19 + add x4, x4, x19 + lsl x1, x1, #2 + lsl x3, x3, #2 + lsl x4, x4, #2 + ldr d0, [x30, x1] + madd 
x1, x18, x29, x9 + ldr d2, [x30, x3] + madd x3, x15, x28, x10 + ldr d1, [x30, x4] + ldr x4, [sp, #1280] // 8-byte Folded Reload + add x1, x1, x19 + lsl x3, x3, #2 + lsl x1, x1, #2 + ldr q7, [x4, x3] + madd x3, x16, x28, x10 + ldr d3, [x30, x1] + add x1, x15, #4 + lsl x3, x3, #2 + ldr q6, [x4, x3] + madd x3, x17, x28, x10 + lsl x3, x3, #2 + ldr q5, [x4, x3] + madd x3, x18, x28, x10 + lsl x3, x3, #2 + ldr q4, [x4, x3] + mov x3, x11 + mov x4, x14 + cmp xzr, x22 + b.ge .LBB0_99 + .p2align 2 +.LBB0_98: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_96 Depth=3 + // => This Inner Loop Header: Depth=4 + add x5, x3, #16 + fmla v0.2s, v16.2s, v7.s[0] + fmla v2.2s, v16.2s, v6.s[0] + add x2, x2, #4 + fmla v1.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + prfm pldl1keep, [x5] + add x5, x4, x27 + ldp d16, d17, [x3, #-16] + fmla v0.2s, v16.2s, v7.s[1] + fmla v2.2s, v16.2s, v6.s[1] + fmla v1.2s, v16.2s, v5.s[1] + fmla v3.2s, v16.2s, v4.s[1] + fmla v0.2s, v17.2s, v7.s[2] + fmla v2.2s, v17.2s, v6.s[2] + fmla v1.2s, v17.2s, v5.s[2] + fmla v3.2s, v17.2s, v4.s[2] + ldp d17, d16, [x3], #32 + prfm pldl1keep, [x4] + fmla v0.2s, v17.2s, v7.s[3] + ldur q7, [x4, #-16] + prfm pldl1keep, [x5] + fmla v2.2s, v17.2s, v6.s[3] + ldur q6, [x5, #-16] + add x5, x5, x27 + fmla v1.2s, v17.2s, v5.s[3] + fmla v3.2s, v17.2s, v4.s[3] + add x4, x4, #16 + prfm pldl1keep, [x5] + ldur q5, [x5, #-16] + add x5, x5, x27 + prfm pldl1keep, [x5] + ldur q4, [x5, #-16] + cmp x2, x22 + b.lt .LBB0_98 +.LBB0_99: // in Loop: Header=BB0_96 Depth=3 + ldr d17, [x8, x6, lsl #3] + fmla v0.2s, v16.2s, v7.s[0] + fmla v2.2s, v16.2s, v6.s[0] + fmla v1.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + ldr d16, [x8, x23, lsl #3] + ldr d18, [x8, x24, lsl #3] + ldr x4, [sp, #1304] // 8-byte Folded Reload + mov x2, x13 + mov x3, x12 + fmla v0.2s, v17.2s, v7.s[1] + fmla v2.2s, v17.2s, v6.s[1] + fmla v1.2s, v17.2s, v5.s[1] + fmla v3.2s, v17.2s, v4.s[1] + fmla v0.2s, v16.2s, v7.s[2] + fmla v2.2s, 
v16.2s, v6.s[2] + fmla v1.2s, v16.2s, v5.s[2] + fmla v3.2s, v16.2s, v4.s[2] + fmla v0.2s, v18.2s, v7.s[3] + fmla v2.2s, v18.2s, v6.s[3] + fmla v1.2s, v18.2s, v5.s[3] + fmla v3.2s, v18.2s, v4.s[3] + cmp x4, x20 + b.ge .LBB0_95 + .p2align 2 +.LBB0_100: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_96 Depth=3 + // => This Inner Loop Header: Depth=4 + add x5, x2, x27 + prfm pldl1keep, [x2] + ldur s4, [x2, #-4] + add x4, x4, #1 + prfm pldl1keep, [x5] + ldur s5, [x5, #-4] + add x5, x5, x27 + add x2, x2, #4 + prfm pldl1keep, [x5] + ldur s6, [x5, #-4] + add x5, x5, x27 + prfm pldl1keep, [x5] + ldur s7, [x5, #-4] + prfm pldl1keep, [x3] + ldur d16, [x3, #-8] + add x3, x3, #8 + fmla v0.2s, v16.2s, v4.s[0] + fmla v2.2s, v16.2s, v5.s[0] + fmla v1.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v7.s[0] + cmp x4, x20 + b.lt .LBB0_100 + b .LBB0_95 + .p2align 2 +.LBB0_101: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #1296] // 8-byte Folded Reload + ldr x13, [sp, #1128] // 8-byte Folded Reload + cmp x12, x13 + b.ge .LBB0_107 +// %bb.102: // in Loop: Header=BB0_7 Depth=2 + ldr x16, [sp, #1296] // 8-byte Folded Reload + ldr x17, [sp, #1280] // 8-byte Folded Reload + mov x14, xzr + add x15, x16, #1 + madd x12, x16, x29, x9 + madd x16, x16, x28, x10 + ldr d4, [x8] + madd x13, x15, x29, x9 + madd x15, x15, x28, x10 + add x12, x12, x19 + lsl x16, x16, #2 + add x13, x13, x19 + add x12, x30, x12, lsl #2 + lsl x15, x15, #2 + ldr q3, [x17, x16] + ldr x16, [sp, #1120] // 8-byte Folded Reload + add x13, x30, x13, lsl #2 + ldr q2, [x17, x15] + mov x15, x11 + ldr d0, [x12] + ldr d1, [x13] + cmp xzr, x22 + b.ge .LBB0_104 + .p2align 2 +.LBB0_103: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x3, x15, #16 + ldr x17, [sp, #1184] // 8-byte Folded Reload + ldr x1, [sp, #1176] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[0] + prfm pldl1keep, [x3] + fmla v1.2s, v4.2s, v2.s[0] + ldp d4, d5, [x15, #-16] + add 
x14, x14, #4 + add x17, x17, x16 + add x1, x1, x16 + add x16, x16, #16 + add x18, x17, #32 + add x2, x1, #32 + fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v5.2s, v3.s[2] + fmla v1.2s, v5.2s, v2.s[2] + ldp d5, d4, [x15], #32 + prfm pldl1keep, [x2] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x1, #16] + prfm pldl1keep, [x18] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x17, #16] + cmp x14, x22 + b.lt .LBB0_103 +.LBB0_104: // in Loop: Header=BB0_7 Depth=2 + ldr d5, [x8, x6, lsl #3] + fmla v0.2s, v4.2s, v3.s[0] + fmla v1.2s, v4.2s, v2.s[0] + ldr d4, [x8, x23, lsl #3] + ldr x16, [sp, #408] // 8-byte Folded Reload + mov x14, xzr + mov x15, xzr + add x16, x8, x16 + fmla v0.2s, v5.2s, v3.s[1] + fmla v1.2s, v5.2s, v2.s[1] + ldr d5, [x8, x24, lsl #3] + fmla v0.2s, v4.2s, v3.s[2] + fmla v1.2s, v4.2s, v2.s[2] + fmla v0.2s, v5.2s, v3.s[3] + fmla v1.2s, v5.2s, v2.s[3] + ldr x17, [sp, #1304] // 8-byte Folded Reload + add x17, x17, xzr + cmp x17, x20 + b.ge .LBB0_106 + .p2align 2 +.LBB0_105: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x17, x16, x15, lsl #3 + add x18, x25, x14 + add x1, x7, x14 + add x14, x14, #4 + add x1, x1, #4 + add x18, x18, #4 + add x17, x17, #8 + prfm pldl1keep, [x1] + ldr s2, [x7, x15, lsl #2] + prfm pldl1keep, [x18] + ldr s3, [x25, x15, lsl #2] + prfm pldl1keep, [x17] + ldr d4, [x16, x15, lsl #3] + add x15, x15, #1 + fmla v0.2s, v4.2s, v2.s[0] + fmla v1.2s, v4.2s, v3.s[0] + ldr x17, [sp, #1304] // 8-byte Folded Reload + add x17, x17, x15 + cmp x17, x20 + b.lt .LBB0_105 +.LBB0_106: // in Loop: Header=BB0_7 Depth=2 + str d0, [x12] + str d1, [x13] +.LBB0_107: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #1024] // 8-byte Folded Reload + ldr x13, [sp, #1128] // 8-byte Folded Reload + cmp x13, x12 + b.ge .LBB0_113 +// %bb.108: // in Loop: Header=BB0_7 Depth=2 + ldr x13, [sp, #1128] // 8-byte Folded Reload + ldr d2, [x8] + mov x12, xzr + madd x9, x13, x29, x9 + madd x10, x13, x28, 
x10 + ldr x13, [sp, #1280] // 8-byte Folded Reload + ldr x14, [sp, #896] // 8-byte Folded Reload + add x9, x9, x19 + lsl x10, x10, #2 + add x9, x30, x9, lsl #2 + ldr q1, [x13, x10] + ldr x10, [sp, #904] // 8-byte Folded Reload + ldr d0, [x9] + cmp xzr, x22 + b.ge .LBB0_110 + .p2align 2 +.LBB0_109: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x11, #16 + fmla v0.2s, v2.2s, v1.s[0] + add x12, x12, #4 + prfm pldl1keep, [x13] + ldp d2, d3, [x11, #-16] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v3.2s, v1.s[2] + ldp d3, d2, [x11], #32 + prfm pldl1keep, [x10] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x10, #-16] + add x10, x10, #16 + cmp x12, x22 + b.lt .LBB0_109 +.LBB0_110: // in Loop: Header=BB0_7 Depth=2 + ldr d3, [x8, x6, lsl #3] + fmla v0.2s, v2.2s, v1.s[0] + ldr x11, [sp, #408] // 8-byte Folded Reload + ldr d4, [x8, x23, lsl #3] + ldr d2, [x8, x24, lsl #3] + mov x10, xzr + add x8, x8, x11 + ldr x11, [sp, #880] // 8-byte Folded Reload + fmla v0.2s, v3.2s, v1.s[1] + fmla v0.2s, v4.2s, v1.s[2] + fmla v0.2s, v2.2s, v1.s[3] + ldr x12, [sp, #1304] // 8-byte Folded Reload + add x12, x12, xzr + cmp x12, x20 + b.ge .LBB0_112 + .p2align 2 +.LBB0_111: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x12, x8, x10, lsl #3 + prfm pldl1keep, [x11] + ldr s1, [x14, x10, lsl #2] + add x11, x11, #4 + add x12, x12, #8 + prfm pldl1keep, [x12] + ldr d2, [x8, x10, lsl #3] + add x10, x10, #1 + fmla v0.2s, v2.2s, v1.s[0] + ldr x12, [sp, #1304] // 8-byte Folded Reload + add x12, x12, x10 + cmp x12, x20 + b.lt .LBB0_111 +.LBB0_112: // in Loop: Header=BB0_7 Depth=2 + str d0, [x9] +.LBB0_113: // in Loop: Header=BB0_7 Depth=2 + bl free + ldr x30, [sp, #1080] // 8-byte Folded Reload + ldr x19, [sp, #1072] // 8-byte Folded Reload + ldr x8, [sp, #504] // 8-byte Folded Reload + ldr x9, [sp, #584] // 8-byte Folded Reload + cmp x9, x8 + b.ge .LBB0_6 +.LBB0_114: // in Loop: 
Header=BB0_7 Depth=2 + ldr x8, [sp, #480] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #1016] // 8-byte Folded Reload + ldr x9, [sp, #1008] // 8-byte Folded Reload + add x10, x0, #63 + mov x12, xzr + ldr x15, [sp, #1064] // 8-byte Folded Reload + ldr x21, [sp, #584] // 8-byte Folded Reload + mov x13, xzr + mul x8, x19, x8 + ldr x14, [sp, #1256] // 8-byte Folded Reload + ldr x16, [sp, #1272] // 8-byte Folded Reload + ldr x23, [sp, #992] // 8-byte Folded Reload + ldr x30, [sp, #384] // 8-byte Folded Reload + ldp x25, x24, [sp, #392] // 16-byte Folded Reload + madd x9, x15, x9, x8 + add x8, x9, x21 + add x11, x8, x14 + add x14, x14, x29 + ldr s1, [x16, x8, lsl #2] + add x14, x8, x14 + add x8, x8, x29 + ldr s2, [x16, x11, lsl #2] + ldr x11, [sp, #1048] // 8-byte Folded Reload + ldr s3, [x16, x8, lsl #2] + ldr x8, [sp, #1040] // 8-byte Folded Reload + ldr s0, [x16, x14, lsl #2] + ldr x14, [sp, #1264] // 8-byte Folded Reload + mul x8, x19, x8 + madd x11, x15, x11, x8 + add x8, x11, x21 + ldr s16, [x14, x8, lsl #2] + and x8, x10, #0xffffffffffffffc0 + ldr x10, [sp, #1032] // 8-byte Folded Reload + ldr x14, [sp, #1056] // 8-byte Folded Reload + mul x10, x19, x10 + madd x10, x15, x14, x10 + ldr x15, [sp, #1280] // 8-byte Folded Reload + lsl x14, x10, #2 + ldr q4, [x15, x14] + add x14, x10, x28 + lsl x14, x14, #2 + ldr q5, [x15, x14] + add x14, x10, x28, lsl #1 + lsl x14, x14, #2 + ldr q6, [x15, x14] + ldr x15, [sp, #656] // 8-byte Folded Reload + orr x14, x8, #0xc + .p2align 2 +.LBB0_115: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x16, [sp, #888] // 8-byte Folded Reload + ext v20.16b, v4.16b, v4.16b, #8 + cmp x13, x22 + ext v19.16b, v5.16b, v5.16b, #8 + add x16, x16, x12 + prfm pldl1keep, [x16, #16] + ldr q7, [x16] + ext v18.16b, v6.16b, v6.16b, #8 + ext v17.16b, v7.16b, v7.16b, #8 + b.ge .LBB0_117 +// %bb.116: // in Loop: Header=BB0_115 Depth=3 + ldr x7, [sp, #840] // 8-byte Folded 
Reload + add x19, x14, x12 + ldr x16, [sp, #856] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v4.2s + ldr x18, [sp, #864] // 8-byte Folded Reload + ldr x2, [sp, #872] // 8-byte Folded Reload + fmla v3.2s, v16.2s, v5.2s + fmla v2.2s, v16.2s, v6.2s + stur s16, [x19, #-12] + fmla v0.2s, v16.2s, v7.2s + add x6, x30, x15 + add x5, x25, x15 + add x4, x24, x15 + add x13, x13, #4 + add x7, x7, x15 + add x16, x16, x12 + add x18, x18, x12 + add x2, x2, x12 + add x12, x12, #16 + prfm pldl1keep, [x7] + ldr x7, [sp, #824] // 8-byte Folded Reload + add x17, x16, #32 + add x1, x18, #32 + add x3, x2, #32 + ldr s16, [x7, x15] + stur s16, [x19, #-8] + prfm pldl1keep, [x6] + ldr x6, [sp, #816] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v4.s[1] + fmla v3.2s, v16.2s, v5.s[1] + fmla v2.2s, v16.2s, v6.s[1] + fmla v0.2s, v16.2s, v7.s[1] + ldr s16, [x6, x15] + stur s16, [x19, #-4] + prfm pldl1keep, [x5] + ldr x5, [sp, #808] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v20.2s + fmla v3.2s, v16.2s, v19.2s + fmla v2.2s, v16.2s, v18.2s + fmla v0.2s, v16.2s, v17.2s + ldr s16, [x5, x15] + str s16, [x19] + prfm pldl1keep, [x4] + ldr x4, [sp, #832] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v4.s[3] + fmla v3.2s, v16.2s, v5.s[3] + fmla v2.2s, v16.2s, v6.s[3] + fmla v0.2s, v16.2s, v7.s[3] + ldr s16, [x4, x15] + prfm pldl1keep, [x3] + ldr q4, [x2, #16] + prfm pldl1keep, [x1] + ldr q5, [x18, #16] + prfm pldl1keep, [x17] + ldr q6, [x16, #16] + ldr x16, [sp, #1240] // 8-byte Folded Reload + add x15, x15, x16 + b .LBB0_115 + .p2align 2 +.LBB0_117: // in Loop: Header=BB0_7 Depth=2 + ldr x15, [sp, #1000] // 8-byte Folded Reload + ldr x6, [sp, #984] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + ldr x16, [sp, #1264] // 8-byte Folded Reload + ldr x7, [sp, #976] // 8-byte Folded Reload + fmla v2.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + str s16, [x8, x22, lsl #2] + ldr x19, [sp, #968] // 8-byte Folded Reload + mov x12, xzr + ldr x24, [sp, #616] // 8-byte Folded 
Reload + ldr x25, [sp, #608] // 8-byte Folded Reload + mov x13, xzr + madd x14, x6, x15, x11 + ldr x17, [sp, #576] // 8-byte Folded Reload + ldr x18, [sp, #600] // 8-byte Folded Reload + add x14, x14, x21 + ldr x30, [sp, #648] // 8-byte Folded Reload + ldr s16, [x16, x14, lsl #2] + madd x14, x7, x15, x11 + madd x11, x19, x15, x11 + add x14, x14, x21 + add x11, x11, x21 + str s16, [x8, x6, lsl #2] + fmla v1.2s, v16.2s, v4.s[1] + fmla v3.2s, v16.2s, v5.s[1] + fmla v2.2s, v16.2s, v6.s[1] + fmla v0.2s, v16.2s, v7.s[1] + ldr s16, [x16, x14, lsl #2] + ldr x14, [sp, #1304] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v20.2s + str s16, [x8, x7, lsl #2] + fmla v3.2s, v16.2s, v19.2s + fmla v2.2s, v16.2s, v18.2s + fmla v0.2s, v16.2s, v17.2s + ldr s16, [x16, x11, lsl #2] + ldr x11, [sp, #512] // 8-byte Folded Reload + add x11, x8, x11 + fmla v1.2s, v16.2s, v4.s[3] + fmla v3.2s, v16.2s, v5.s[3] + fmla v2.2s, v16.2s, v6.s[3] + fmla v0.2s, v16.2s, v7.s[3] + str s16, [x8, x19, lsl #2] + cmp x14, x20 + b.ge .LBB0_119 + .p2align 2 +.LBB0_118: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x16, [sp, #1152] // 8-byte Folded Reload + add x15, x18, x13 + add x14, x14, #1 + add x16, x16, x12 + prfm pldl1keep, [x16] + ldur s4, [x16, #-4] + add x16, x16, x27 + prfm pldl1keep, [x16] + ldur s5, [x16, #-4] + add x16, x16, x27 + prfm pldl1keep, [x16] + ldur s6, [x16, #-4] + add x16, x16, x27 + prfm pldl1keep, [x16] + ldur s7, [x16, #-4] + prfm pldl1keep, [x15] + ldr s16, [x17, x13] + add x13, x13, x23 + fmla v1.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + fmla v2.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + str s16, [x11, x12] + add x12, x12, #4 + cmp x14, x20 + b.lt .LBB0_118 +.LBB0_119: // %.preheader64 + // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #448] // 8-byte Folded Reload + ldr x13, [sp, #1096] // 8-byte Folded Reload + mov x2, xzr + add x11, x8, #12 + ldr x14, [sp, #1088] // 8-byte Folded Reload + mov w17, #1 // 
=0x1 + mov w18, #2 // =0x2 + mov w16, #3 // =0x3 + mov w15, #4 // =0x4 + add x12, x8, x12 + b .LBB0_121 + .p2align 2 +.LBB0_120: // %.loopexit60 + // in Loop: Header=BB0_121 Depth=3 + ldr x2, [sp, #1232] // 8-byte Folded Reload + add x14, x14, x2 + add x13, x13, x2 + mov x2, x15 + mov x15, x1 +.LBB0_121: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_123 Depth 4 + // Child Loop BB0_125 Depth 4 + madd x1, x2, x29, x9 + ldr x23, [sp, #1272] // 8-byte Folded Reload + add x1, x1, x21 + madd x17, x17, x29, x9 + madd x18, x18, x29, x9 + madd x16, x16, x29, x9 + add x17, x17, x21 + add x16, x16, x21 + str s1, [x23, x1, lsl #2] + str s3, [x23, x17, lsl #2] + add x17, x18, x21 + str s2, [x23, x17, lsl #2] + str s0, [x23, x16, lsl #2] + ldr x16, [sp, #1296] // 8-byte Folded Reload + cmp x15, x16 + b.ge .LBB0_126 +// %bb.122: // in Loop: Header=BB0_121 Depth=3 + add x17, x15, #1 + add x18, x15, #2 + add x16, x15, #3 + madd x1, x15, x29, x9 + madd x3, x17, x29, x9 + ldr s16, [x8] + mov x2, xzr + add x1, x1, x21 + madd x4, x18, x29, x9 + madd x5, x16, x29, x9 + add x3, x3, x21 + add x4, x4, x21 + add x5, x5, x21 + ldr s1, [x23, x1, lsl #2] + add x1, x15, #4 + ldr s3, [x23, x3, lsl #2] + madd x3, x15, x28, x10 + lsl x3, x3, #2 + ldr s2, [x23, x4, lsl #2] + ldr x4, [sp, #1280] // 8-byte Folded Reload + ldr s0, [x23, x5, lsl #2] + ldr q7, [x4, x3] + madd x3, x17, x28, x10 + lsl x3, x3, #2 + ldr q6, [x4, x3] + madd x3, x18, x28, x10 + lsl x3, x3, #2 + ldr q5, [x4, x3] + madd x3, x16, x28, x10 + lsl x3, x3, #2 + ldr q4, [x4, x3] + mov x3, x11 + mov x4, x14 + ext v20.16b, v7.16b, v7.16b, #8 + cmp xzr, x22 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.ge .LBB0_124 + .p2align 2 +.LBB0_123: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_121 Depth=3 + // => This Inner Loop Header: Depth=4 + add x5, x3, #8 + fmla v1.2s, v16.2s, 
v7.2s + fmla v3.2s, v16.2s, v6.2s + add x2, x2, #4 + fmla v2.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + prfm pldl1keep, [x5] + add x5, x4, x27 + ldp s16, s21, [x3, #-8] + fmla v0.2s, v16.2s, v4.s[1] + fmla v1.2s, v16.2s, v7.s[1] + fmla v3.2s, v16.2s, v6.s[1] + fmla v2.2s, v16.2s, v5.s[1] + fmla v0.2s, v21.2s, v17.2s + fmla v1.2s, v21.2s, v20.2s + ldp s17, s16, [x3], #16 + fmla v3.2s, v21.2s, v19.2s + fmla v2.2s, v21.2s, v18.2s + prfm pldl1keep, [x4] + fmla v1.2s, v17.2s, v7.s[3] + ldur q7, [x4, #-16] + prfm pldl1keep, [x5] + fmla v3.2s, v17.2s, v6.s[3] + ldur q6, [x5, #-16] + add x5, x5, x27 + fmla v2.2s, v17.2s, v5.s[3] + fmla v0.2s, v17.2s, v4.s[3] + add x4, x4, #16 + prfm pldl1keep, [x5] + ldur q5, [x5, #-16] + add x5, x5, x27 + prfm pldl1keep, [x5] + ldur q4, [x5, #-16] + ext v20.16b, v7.16b, v7.16b, #8 + cmp x2, x22 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.lt .LBB0_123 +.LBB0_124: // in Loop: Header=BB0_121 Depth=3 + ldr s21, [x8, x6, lsl #2] + fmla v1.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + fmla v2.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + ldr s16, [x8, x7, lsl #2] + ldr s22, [x8, x19, lsl #2] + ldr x4, [sp, #1304] // 8-byte Folded Reload + mov x2, x13 + mov x3, x12 + fmla v1.2s, v21.2s, v7.s[1] + fmla v3.2s, v21.2s, v6.s[1] + fmla v2.2s, v21.2s, v5.s[1] + fmla v0.2s, v21.2s, v4.s[1] + fmla v1.2s, v16.2s, v20.2s + fmla v3.2s, v16.2s, v19.2s + fmla v2.2s, v16.2s, v18.2s + fmla v0.2s, v16.2s, v17.2s + fmla v1.2s, v22.2s, v7.s[3] + fmla v3.2s, v22.2s, v6.s[3] + fmla v2.2s, v22.2s, v5.s[3] + fmla v0.2s, v22.2s, v4.s[3] + cmp x4, x20 + b.ge .LBB0_120 + .p2align 2 +.LBB0_125: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_121 Depth=3 + // => This Inner Loop Header: Depth=4 + add x5, x2, x27 + prfm pldl1keep, [x2] + ldur s4, [x2, #-4] + add x4, x4, #1 + prfm pldl1keep, [x5] + ldur s5, [x5, #-4] + add x5, x5, x27 + add x2, x2, #4 + prfm pldl1keep, 
[x5] + ldur s6, [x5, #-4] + add x5, x5, x27 + prfm pldl1keep, [x5] + ldur s7, [x5, #-4] + prfm pldl1keep, [x3] + ldur s16, [x3, #-4] + add x3, x3, #4 + fmla v1.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + fmla v2.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + cmp x4, x20 + b.lt .LBB0_125 + b .LBB0_120 + .p2align 2 +.LBB0_126: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #1296] // 8-byte Folded Reload + ldr x13, [sp, #1128] // 8-byte Folded Reload + cmp x12, x13 + b.ge .LBB0_132 +// %bb.127: // in Loop: Header=BB0_7 Depth=2 + ldr x16, [sp, #1296] // 8-byte Folded Reload + ldr x17, [sp, #1280] // 8-byte Folded Reload + mov x14, xzr + mov x15, xzr + ldr s4, [x8] + madd x12, x16, x28, x10 + add x13, x16, #1 + lsl x12, x12, #2 + ldr q3, [x17, x12] + madd x12, x13, x28, x10 + madd x13, x13, x29, x9 + lsl x12, x12, #2 + add x13, x13, x21 + ldr q2, [x17, x12] + madd x12, x16, x29, x9 + ldr x16, [sp, #1272] // 8-byte Folded Reload + add x12, x12, x21 + ldr s0, [x16, x13, lsl #2] + ldr s1, [x16, x12, lsl #2] + ext v6.16b, v3.16b, v3.16b, #8 + cmp xzr, x22 + ext v5.16b, v2.16b, v2.16b, #8 + b.ge .LBB0_129 + .p2align 2 +.LBB0_128: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x2, x8, x14 + fmla v1.2s, v4.2s, v3.2s + fmla v0.2s, v4.2s, v2.2s + add x16, x25, x14 + add x3, x2, #20 + add x18, x24, x14 + add x17, x16, #32 + add x1, x18, #32 + prfm pldl1keep, [x3] + ldp s4, s7, [x2, #4] + add x15, x15, #4 + add x14, x14, #16 + fmla v0.2s, v4.2s, v2.s[1] + fmla v1.2s, v4.2s, v3.s[1] + fmla v0.2s, v7.2s, v5.2s + ldp s5, s4, [x2, #12] + fmla v1.2s, v7.2s, v6.2s + prfm pldl1keep, [x1] + fmla v1.2s, v5.2s, v3.s[3] + ldr q3, [x18, #16] + prfm pldl1keep, [x17] + fmla v0.2s, v5.2s, v2.s[3] + ldr q2, [x16, #16] + ext v6.16b, v3.16b, v3.16b, #8 + cmp x15, x22 + ext v5.16b, v2.16b, v2.16b, #8 + b.lt .LBB0_128 +.LBB0_129: // in Loop: Header=BB0_7 Depth=2 + ldr s7, [x8, x6, lsl #2] + fmla v1.2s, v4.2s, v3.2s + fmla v0.2s, v4.2s, 
v2.2s + ldr s4, [x8, x7, lsl #2] + ldr x15, [sp, #512] // 8-byte Folded Reload + ldr x16, [sp, #1304] // 8-byte Folded Reload + mov x14, xzr + add x15, x8, x15 + fmla v1.2s, v7.2s, v3.s[1] + fmla v0.2s, v7.2s, v2.s[1] + ldr s7, [x8, x19, lsl #2] + fmla v1.2s, v4.2s, v6.2s + fmla v0.2s, v4.2s, v5.2s + fmla v1.2s, v7.2s, v3.s[3] + fmla v0.2s, v7.2s, v2.s[3] + cmp x16, x20 + b.ge .LBB0_131 + .p2align 2 +.LBB0_130: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x2, [sp, #680] // 8-byte Folded Reload + add x17, x15, x14 + add x1, x30, x14 + add x16, x16, #1 + add x17, x17, #4 + add x1, x1, #4 + prfm pldl1keep, [x1] + add x18, x2, x14 + add x18, x18, #4 + prfm pldl1keep, [x18] + ldr s2, [x30, x14] + prfm pldl1keep, [x17] + ldr s3, [x15, x14] + fmla v1.2s, v3.2s, v2.2s + ldr s2, [x2, x14] + add x14, x14, #4 + fmla v0.2s, v3.2s, v2.2s + cmp x16, x20 + b.lt .LBB0_130 +.LBB0_131: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #1272] // 8-byte Folded Reload + str s1, [x14, x12, lsl #2] + str s0, [x14, x13, lsl #2] +.LBB0_132: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #1024] // 8-byte Folded Reload + ldr x13, [sp, #1128] // 8-byte Folded Reload + cmp x13, x12 + b.ge .LBB0_5 +// %bb.133: // in Loop: Header=BB0_7 Depth=2 + ldr x13, [sp, #1128] // 8-byte Folded Reload + ldr x15, [sp, #1272] // 8-byte Folded Reload + mov x12, xzr + madd x9, x13, x29, x9 + madd x10, x13, x28, x10 + ldr x13, [sp, #1280] // 8-byte Folded Reload + ldr s2, [x8] + ldr x14, [sp, #896] // 8-byte Folded Reload + add x9, x9, x21 + lsl x10, x10, #2 + ldr s0, [x15, x9, lsl #2] + ldr q1, [x13, x10] + ldr x10, [sp, #904] // 8-byte Folded Reload + ext v3.16b, v1.16b, v1.16b, #8 + cmp xzr, x22 + b.ge .LBB0_135 + .p2align 2 +.LBB0_134: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x11, #8 + fmla v0.2s, v2.2s, v1.2s + add x12, x12, #4 + prfm pldl1keep, [x13] + ldp s2, s4, [x11, #-8] + 
fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v4.2s, v3.2s + ldp s3, s2, [x11], #16 + prfm pldl1keep, [x10] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x10, #-16] + add x10, x10, #16 + ext v3.16b, v1.16b, v1.16b, #8 + cmp x12, x22 + b.lt .LBB0_134 +.LBB0_135: // in Loop: Header=BB0_7 Depth=2 + ldr s4, [x8, x6, lsl #2] + fmla v0.2s, v2.2s, v1.2s + ldr x11, [sp, #512] // 8-byte Folded Reload + ldr s5, [x8, x7, lsl #2] + ldr s2, [x8, x19, lsl #2] + mov x10, xzr + add x8, x8, x11 + ldr x11, [sp, #1304] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[1] + fmla v0.2s, v5.2s, v3.2s + fmla v0.2s, v2.2s, v1.s[3] + cmp x11, x20 + b.ge .LBB0_4 + .p2align 2 +.LBB0_136: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x12, x8, x10 + add x13, x14, x10 + add x11, x11, #1 + add x12, x12, #4 + add x13, x13, #4 + prfm pldl1keep, [x13] + prfm pldl1keep, [x12] + ldr s1, [x8, x10] + ldr s2, [x14, x10] + add x10, x10, #4 + fmla v0.2s, v1.2s, v2.2s + cmp x11, x20 + b.lt .LBB0_136 + b .LBB0_4 +.LBB0_137: + ldr x0, [sp, #16] // 8-byte Folded Reload + bl free + add sp, sp, #1312 + ldp d9, d8, [sp, #48] // 16-byte Folded Reload + ldp d11, d10, [sp, #32] // 16-byte Folded Reload + ldp d13, d12, [sp, #16] // 16-byte Folded Reload + ldp x20, x19, [sp, #144] // 16-byte Folded Reload + ldp x22, x21, [sp, #128] // 16-byte Folded Reload + ldp x24, x23, [sp, #112] // 16-byte Folded Reload + ldp x26, x25, [sp, #96] // 16-byte Folded Reload + ldp x28, x27, [sp, #80] // 16-byte Folded Reload + ldp x29, x30, [sp, #64] // 16-byte Folded Reload + ldp d15, d14, [sp], #160 // 16-byte Folded Reload + ret +.Lfunc_end0: + .size sbatch_matmul_4d_nn_mlir, .Lfunc_end0-sbatch_matmul_4d_nn_mlir + .cfi_endproc + // -- End function + .section ".note.GNU-stack","",@progbits diff --git a/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nt_mlir.s b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nt_mlir.s new file mode 100644 
index 00000000000000..89f885cbd35df1 --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nt_mlir.s @@ -0,0 +1,3208 @@ + .text + .file "LLVMDialectModule" + .globl sbatch_matmul_4d_nt_mlir // -- Begin function sbatch_matmul_4d_nt_mlir + .p2align 4 + .type sbatch_matmul_4d_nt_mlir,@function +sbatch_matmul_4d_nt_mlir: // @sbatch_matmul_4d_nt_mlir + .cfi_startproc +// %bb.0: + stp d15, d14, [sp, #-160]! // 16-byte Folded Spill + stp d13, d12, [sp, #16] // 16-byte Folded Spill + stp x29, x30, [sp, #64] // 16-byte Folded Spill + stp x28, x27, [sp, #80] // 16-byte Folded Spill + stp x26, x25, [sp, #96] // 16-byte Folded Spill + stp x24, x23, [sp, #112] // 16-byte Folded Spill + stp x22, x21, [sp, #128] // 16-byte Folded Spill + stp x20, x19, [sp, #144] // 16-byte Folded Spill + stp d11, d10, [sp, #32] // 16-byte Folded Spill + stp d9, d8, [sp, #48] // 16-byte Folded Spill + sub sp, sp, #688 + .cfi_def_cfa_offset 848 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -136 + .cfi_offset b13, -144 + .cfi_offset b14, -152 + .cfi_offset b15, -160 + cmp x5, #0 + ldr x13, [sp, #912] + ldr x14, [sp, #848] + mov x20, x6 + cinv x8, x5, lt + ldr x28, [sp, #1032] + ldr x22, [sp, #856] + mov x27, x2 + add x9, x8, x8, lsr #63 + add x10, x8, #3 + ldr x25, [sp, #944] + str x7, [sp, #664] // 8-byte Folded Spill + stp x13, x4, [sp, #296] // 16-byte Folded Spill + str x3, [sp, #32] // 8-byte Folded Spill + mov x19, x1 + asr x9, x9, #1 + str x14, [sp, #656] // 8-byte Folded Spill + str x5, [sp, #528] // 8-byte Folded Spill + cinv x21, x9, lt + ldr x9, [sp, #1024] + cmp x8, #0 + csel x8, x10, x8, lt + ldr x10, [sp, #976] + 
cmp x5, #0 + asr x8, x8, #2 + cinv x29, x8, lt + cmp x13, #0 + str x9, [sp, #520] // 8-byte Folded Spill + ldr x9, [sp, #1016] + cinv x8, x13, lt + add x11, x8, #7 + add x12, x8, #3 + str x9, [sp, #512] // 8-byte Folded Spill + ldr x9, [sp, #968] + stp x9, x10, [sp, #480] // 16-byte Folded Spill + add x9, x8, x8, lsr #63 + add x10, x8, #15 + asr x9, x9, #1 + cinv x14, x9, lt + cmp x8, #0 + csel x9, x10, x8, lt + csel x10, x11, x8, lt + ldr x11, [sp, #888] + csel x8, x12, x8, lt + cmp x13, #0 + str x14, [sp, #616] // 8-byte Folded Spill + asr x9, x9, #4 + asr x10, x10, #3 + asr x8, x8, #2 + cinv x24, x9, lt + cinv x26, x10, lt + cinv x23, x8, lt + lsl x8, x24, #4 + str x11, [sp, #672] // 8-byte Folded Spill + ldr x11, [sp, #880] + str x8, [sp, #568] // 8-byte Folded Spill + lsl x8, x23, #2 + str x11, [sp, #648] // 8-byte Folded Spill + ldr x11, [sp, #936] + str x11, [sp, #632] // 8-byte Folded Spill + ldr x11, [sp, #928] + str x11, [sp, #624] // 8-byte Folded Spill + lsl x11, x26, #3 + stp x8, x11, [sp, #440] // 16-byte Folded Spill + lsl x8, x14, #1 + str x8, [sp, #432] // 8-byte Folded Spill + lsl x8, x6, #6 + add x0, x8, #64 + str x8, [sp, #640] // 8-byte Folded Spill + bl malloc + add x12, x0, #63 + mul x9, x24, x25 + ldr x1, [sp, #672] // 8-byte Folded Reload + ldr x2, [sp, #648] // 8-byte Folded Reload + and x24, x12, #0xffffffffffffffc0 + ldr x12, [sp, #624] // 8-byte Folded Reload + mul x15, x21, x22 + lsl x8, x29, #2 + str x8, [sp, #680] // 8-byte Folded Spill + lsl x8, x21, #1 + mov w11, #1 // =0x1 + str x8, [sp, #592] // 8-byte Folded Spill + negs x8, x20 + bfi x11, x29, #2, #62 + and x10, x20, #0x3 + lsl x21, x22, #2 + str x0, [sp, #8] // 8-byte Folded Spill + mul x18, x22, x11 + and x8, x8, #0x3 + add x11, x20, x15, lsl #1 + lsl x12, x12, #2 + lsl x0, x27, #2 + mov w14, #1 // =0x1 + add x1, x2, x1, lsl #2 + str x12, [sp, #24] // 8-byte Folded Spill + ldr x12, [sp, #632] // 8-byte Folded Reload + csneg x8, x10, x8, mi + mul x10, x26, x25 + bfi x14, x23, 
#2, #62 + add x2, x0, x19 + mul x16, x25, x14 + add x4, x1, #4 + add x5, x2, #4 + add x9, x4, x9, lsl #6 + mul x17, x29, x22 + sub x29, x20, x8 + mul x13, x23, x25 + lsl x23, x22, #4 + add x2, x2, x23 + add x2, x2, #32 + lsl x12, x12, #2 + str x2, [sp, #152] // 8-byte Folded Spill + sub x2, x24, x8, lsl #6 + ldr x6, [sp, #640] // 8-byte Folded Reload + str x12, [sp, #288] // 8-byte Folded Spill + lsl x12, x25, #6 + add x18, x0, x18, lsl #2 + str x27, [sp, #504] // 8-byte Folded Spill + str x12, [sp, #472] // 8-byte Folded Spill + add x12, x20, x21 + add x17, x0, x17, lsl #4 + lsl x27, x25, #2 + sub x14, x12, x8 + sub x12, x11, x8 + add x15, x0, x15, lsl #3 + lsl x0, x8, #2 + add x12, x5, x12, lsl #2 + ldr x11, [sp, #616] // 8-byte Folded Reload + add x2, x2, x6 + add x6, x19, x18 + add x16, x1, x16, lsl #2 + add x7, x19, x17 + str xzr, [sp, #176] // 8-byte Folded Spill + mov x3, xzr + stp x9, x12, [sp, #96] // 16-byte Folded Spill + add x9, x4, x10, lsl #5 + lsl x10, x20, #4 + lsl x12, x20, #3 + add x13, x1, x13, lsl #4 + stp x13, x16, [sp, #136] // 16-byte Folded Spill + add x13, x15, x19 + add x13, x13, #32 + str x9, [sp, #88] // 8-byte Folded Spill + lsl x9, x20, #5 + mul x11, x11, x25 + lsl x25, x20, #2 + stp x10, x9, [sp, #248] // 16-byte Folded Spill + sub x9, x9, x8, lsl #5 + str x13, [sp, #128] // 8-byte Folded Spill + add x13, x15, x25 + sub x10, x10, x8, lsl #4 + sub x13, x13, x0 + add x18, x18, x25 + add x17, x17, x25 + stp x9, x12, [sp, #232] // 16-byte Folded Spill + sub x8, x12, x8, lsl #3 + sub x12, x29, #3 + add x13, x19, x13 + str x12, [sp, #648] // 8-byte Folded Spill + sub x12, x29, #2 + add x14, x5, x14, lsl #2 + sub x18, x18, x0 + str x12, [sp, #640] // 8-byte Folded Spill + sub x12, x29, #1 + sub x17, x17, x0 + stp x13, x14, [sp, #112] // 16-byte Folded Spill + str x12, [sp, #632] // 8-byte Folded Spill + ldr x12, [sp, #664] // 8-byte Folded Reload + sub x13, x25, x0 + ldr x0, [sp, #568] // 8-byte Folded Reload + add x9, x9, #32 + add x11, x4, 
x11, lsl #3 + stp x9, x8, [sp, #216] // 16-byte Folded Spill + str x10, [sp, #184] // 8-byte Folded Spill + add x10, x10, #16 + add x8, x8, #8 + add x18, x19, x18 + add x17, x19, x17 + stp x8, x10, [sp, #200] // 16-byte Folded Spill + add x8, x13, #4 + str x19, [sp, #496] // 8-byte Folded Spill + stp x13, x25, [sp, #264] // 16-byte Folded Spill + lsl x12, x12, #2 + stp x5, x4, [sp, #160] // 16-byte Folded Spill + str x18, [sp, #376] // 8-byte Folded Spill + sub x19, x29, #4 + str x12, [sp, #16] // 8-byte Folded Spill + ldr x12, [sp, #656] // 8-byte Folded Reload + mov x9, x11 + str x8, [sp, #192] // 8-byte Folded Spill + str x7, [sp, #600] // 8-byte Folded Spill + str x6, [sp, #608] // 8-byte Folded Spill + lsl x12, x12, #2 + stp x17, x6, [sp, #72] // 16-byte Folded Spill + stp x2, x17, [sp, #360] // 16-byte Folded Spill + str x18, [sp, #64] // 8-byte Folded Spill + str x12, [sp, #280] // 8-byte Folded Spill + add x12, x24, #256 + str x12, [sp, #624] // 8-byte Folded Spill + add x12, x2, #64 + str x12, [sp, #616] // 8-byte Folded Spill + b .LBB0_2 + .p2align 2 +.LBB0_1: // %.loopexit40 + // in Loop: Header=BB0_2 Depth=1 + ldp x10, x9, [sp, #16] // 16-byte Folded Reload + ldr x8, [sp, #168] // 8-byte Folded Reload + add x8, x8, x9 + ldr x3, [sp, #40] // 8-byte Folded Reload + str x8, [sp, #168] // 8-byte Folded Spill + ldr x8, [sp, #160] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #160] // 8-byte Folded Spill + ldr x8, [sp, #152] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #152] // 8-byte Folded Spill + ldr x8, [sp, #120] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #120] // 8-byte Folded Spill + ldr x8, [sp, #176] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #176] // 8-byte Folded Spill + ldr x8, [sp, #128] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #128] // 8-byte Folded Spill + ldr x8, [sp, #104] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #104] // 8-byte Folded Spill + ldr x8, [sp, 
#112] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #112] // 8-byte Folded Spill + ldr x8, [sp, #96] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #96] // 8-byte Folded Spill + ldr x8, [sp, #64] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #64] // 8-byte Folded Spill + ldr x8, [sp, #72] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #72] // 8-byte Folded Spill + ldr x8, [sp, #88] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #88] // 8-byte Folded Spill + ldr x8, [sp, #144] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #144] // 8-byte Folded Spill + ldr x8, [sp, #136] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #136] // 8-byte Folded Spill + ldp x7, x8, [sp, #48] // 16-byte Folded Reload + add x9, x8, x9 + ldr x8, [sp, #80] // 8-byte Folded Reload + add x7, x7, x10 + add x8, x8, x10 + str x8, [sp, #80] // 8-byte Folded Spill +.LBB0_2: // =>This Loop Header: Depth=1 + // Child Loop BB0_7 Depth 2 + // Child Loop BB0_11 Depth 3 + // Child Loop BB0_13 Depth 4 + // Child Loop BB0_16 Depth 4 + // Child Loop BB0_18 Depth 5 + // Child Loop BB0_20 Depth 5 + // Child Loop BB0_23 Depth 4 + // Child Loop BB0_25 Depth 4 + // Child Loop BB0_29 Depth 4 + // Child Loop BB0_31 Depth 4 + // Child Loop BB0_37 Depth 3 + // Child Loop BB0_40 Depth 3 + // Child Loop BB0_42 Depth 4 + // Child Loop BB0_44 Depth 4 + // Child Loop BB0_47 Depth 3 + // Child Loop BB0_49 Depth 3 + // Child Loop BB0_53 Depth 3 + // Child Loop BB0_55 Depth 3 + // Child Loop BB0_59 Depth 3 + // Child Loop BB0_62 Depth 3 + // Child Loop BB0_64 Depth 4 + // Child Loop BB0_66 Depth 4 + // Child Loop BB0_69 Depth 3 + // Child Loop BB0_71 Depth 3 + // Child Loop BB0_75 Depth 3 + // Child Loop BB0_77 Depth 3 + // Child Loop BB0_81 Depth 3 + // Child Loop BB0_84 Depth 3 + // Child Loop BB0_86 Depth 4 + // Child Loop BB0_88 Depth 4 + // Child Loop BB0_91 Depth 3 + // Child Loop BB0_93 Depth 3 + // Child Loop BB0_97 Depth 3 + // Child Loop BB0_99 
Depth 3 + // Child Loop BB0_103 Depth 3 + // Child Loop BB0_106 Depth 3 + // Child Loop BB0_108 Depth 4 + // Child Loop BB0_110 Depth 4 + // Child Loop BB0_113 Depth 3 + // Child Loop BB0_115 Depth 3 + // Child Loop BB0_119 Depth 3 + // Child Loop BB0_121 Depth 3 + ldr x8, [sp, #32] // 8-byte Folded Reload + cmp x3, x8 + b.ge .LBB0_122 +// %bb.3: // in Loop: Header=BB0_2 Depth=1 + add x8, x3, #1 + str x9, [sp, #56] // 8-byte Folded Spill + mov x25, xzr + str x9, [sp, #328] // 8-byte Folded Spill + stp x8, x7, [sp, #40] // 16-byte Folded Spill + ldr x8, [sp, #80] // 8-byte Folded Reload + str x3, [sp, #672] // 8-byte Folded Spill + stp x8, x7, [sp, #336] // 16-byte Folded Spill + ldp x9, x8, [sp, #136] // 16-byte Folded Reload + stp x8, x9, [sp, #400] // 16-byte Folded Spill + ldr x9, [sp, #88] // 8-byte Folded Reload + ldp x11, x10, [sp, #64] // 16-byte Folded Reload + str x10, [sp, #352] // 8-byte Folded Spill + ldp x10, x8, [sp, #96] // 16-byte Folded Reload + stp x8, x11, [sp, #416] // 16-byte Folded Spill + ldr x8, [sp, #128] // 8-byte Folded Reload + stp x10, x9, [sp, #384] // 16-byte Folded Spill + str x8, [sp, #456] // 8-byte Folded Spill + ldp x12, x8, [sp, #168] // 16-byte Folded Reload + str x8, [sp, #536] // 8-byte Folded Spill + ldp x11, x8, [sp, #112] // 16-byte Folded Reload + str x8, [sp, #552] // 8-byte Folded Spill + ldp x8, x16, [sp, #152] // 16-byte Folded Reload + str x8, [sp, #544] // 8-byte Folded Spill + b .LBB0_7 + .p2align 2 +.LBB0_4: // in Loop: Header=BB0_7 Depth=2 + str s0, [x6, x9, lsl #2] +.LBB0_5: // in Loop: Header=BB0_7 Depth=2 + bl free + ldr x16, [sp, #560] // 8-byte Folded Reload +.LBB0_6: // %.backedge41 + // in Loop: Header=BB0_7 Depth=2 + ldp x9, x8, [sp, #280] // 16-byte Folded Reload + ldr x10, [sp, #544] // 8-byte Folded Reload + add x10, x10, x9 + ldp x25, x12, [sp, #312] // 16-byte Folded Reload + add x12, x12, x8 + ldr x0, [sp, #568] // 8-byte Folded Reload + add x16, x16, x9 + str x10, [sp, #544] // 8-byte Folded Spill 
+ ldr x10, [sp, #552] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #552] // 8-byte Folded Spill + ldr x10, [sp, #536] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #536] // 8-byte Folded Spill + ldp x10, x11, [sp, #456] // 16-byte Folded Reload + add x10, x10, x9 + add x11, x11, x9 + str x10, [sp, #456] // 8-byte Folded Spill + ldr x10, [sp, #416] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #416] // 8-byte Folded Spill + ldr x10, [sp, #384] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #384] // 8-byte Folded Spill + ldr x10, [sp, #424] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #424] // 8-byte Folded Spill + ldr x10, [sp, #352] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #352] // 8-byte Folded Spill + ldr x10, [sp, #392] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #392] // 8-byte Folded Spill + ldr x10, [sp, #400] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #400] // 8-byte Folded Spill + ldr x10, [sp, #408] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #408] // 8-byte Folded Spill + ldr x10, [sp, #328] // 8-byte Folded Reload + add x10, x10, x8 + ldr x8, [sp, #336] // 8-byte Folded Reload + add x8, x8, x9 + stp x10, x8, [sp, #328] // 16-byte Folded Spill + ldr x8, [sp, #344] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #344] // 8-byte Folded Spill +.LBB0_7: // Parent Loop BB0_2 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_11 Depth 3 + // Child Loop BB0_13 Depth 4 + // Child Loop BB0_16 Depth 4 + // Child Loop BB0_18 Depth 5 + // Child Loop BB0_20 Depth 5 + // Child Loop BB0_23 Depth 4 + // Child Loop BB0_25 Depth 4 + // Child Loop BB0_29 Depth 4 + // Child Loop BB0_31 Depth 4 + // Child Loop BB0_37 Depth 3 + // Child Loop BB0_40 Depth 3 + // Child Loop BB0_42 Depth 4 + // Child Loop BB0_44 Depth 4 + // Child Loop BB0_47 Depth 3 + // Child Loop BB0_49 Depth 3 + // Child Loop BB0_53 Depth 3 + // Child Loop BB0_55 
Depth 3 + // Child Loop BB0_59 Depth 3 + // Child Loop BB0_62 Depth 3 + // Child Loop BB0_64 Depth 4 + // Child Loop BB0_66 Depth 4 + // Child Loop BB0_69 Depth 3 + // Child Loop BB0_71 Depth 3 + // Child Loop BB0_75 Depth 3 + // Child Loop BB0_77 Depth 3 + // Child Loop BB0_81 Depth 3 + // Child Loop BB0_84 Depth 3 + // Child Loop BB0_86 Depth 4 + // Child Loop BB0_88 Depth 4 + // Child Loop BB0_91 Depth 3 + // Child Loop BB0_93 Depth 3 + // Child Loop BB0_97 Depth 3 + // Child Loop BB0_99 Depth 3 + // Child Loop BB0_103 Depth 3 + // Child Loop BB0_106 Depth 3 + // Child Loop BB0_108 Depth 4 + // Child Loop BB0_110 Depth 4 + // Child Loop BB0_113 Depth 3 + // Child Loop BB0_115 Depth 3 + // Child Loop BB0_119 Depth 3 + // Child Loop BB0_121 Depth 3 + ldr x8, [sp, #304] // 8-byte Folded Reload + cmp x25, x8 + b.ge .LBB0_1 +// %bb.8: // in Loop: Header=BB0_7 Depth=2 + mov x10, xzr + add x8, x25, #1 + mov x1, x12 + str x11, [sp, #464] // 8-byte Folded Spill + stp x8, x12, [sp, #312] // 16-byte Folded Spill + str x16, [sp, #560] // 8-byte Folded Spill + b .LBB0_11 + .p2align 2 +.LBB0_9: // in Loop: Header=BB0_11 Depth=3 + stp q3, q2, [x10] + stp q1, q0, [x10, #32] +.LBB0_10: // %.backedge + // in Loop: Header=BB0_11 Depth=3 + ldr x8, [sp, #472] // 8-byte Folded Reload + ldr x1, [sp, #584] // 8-byte Folded Reload + add x1, x1, x8 + ldr x10, [sp, #576] // 8-byte Folded Reload + ldr x16, [sp, #560] // 8-byte Folded Reload +.LBB0_11: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_13 Depth 4 + // Child Loop BB0_16 Depth 4 + // Child Loop BB0_18 Depth 5 + // Child Loop BB0_20 Depth 5 + // Child Loop BB0_23 Depth 4 + // Child Loop BB0_25 Depth 4 + // Child Loop BB0_29 Depth 4 + // Child Loop BB0_31 Depth 4 + ldp x9, x8, [sp, #496] // 16-byte Folded Reload + cmp x10, x0 + add x26, x9, x8, lsl #2 + b.ge .LBB0_32 +// %bb.12: // in Loop: Header=BB0_11 Depth=3 + ldr x8, [sp, #520] // 8-byte Folded Reload + ldr 
x11, [sp, #672] // 8-byte Folded Reload + mov x13, xzr + mul x9, x25, x8 + ldr x8, [sp, #512] // 8-byte Folded Reload + madd x12, x11, x8, x9 + ldp x9, x8, [sp, #480] // 16-byte Folded Reload + add x11, x9, x8, lsl #2 + add x14, x12, x10 + add x8, x10, #16 + add x15, x14, x28 + str x8, [sp, #576] // 8-byte Folded Spill + add x15, x11, x15, lsl #2 + add x9, x11, x14, lsl #2 + ldp q3, q1, [x15, #32] + ldp q5, q4, [x15] + lsl x15, x28, #1 + ldp q17, q6, [x9, #32] + ldp q2, q0, [x9] + add x9, x14, x15 + add x15, x15, x28 + add x14, x14, x15 + add x9, x11, x9, lsl #2 + mov x15, x1 + add x14, x11, x14, lsl #2 + ldp q18, q7, [x9, #32] + ldp q21, q20, [x9] + ldp q19, q16, [x14, #32] + ldp q23, q22, [x14] + mov x14, x16 + cmp xzr, x20 + b.ge .LBB0_14 + .p2align 2 +.LBB0_13: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + add x16, x14, x21 + prfm pldl1keep, [x14] + ldur s27, [x14, #-4] + add x14, x14, #4 + add x17, x16, x21 + prfm pldl1keep, [x16] + ldur s28, [x16, #-4] + add x16, x15, x27 + add x18, x17, x21 + prfm pldl1keep, [x17] + ldur s26, [x17, #-4] + sub x17, x16, #4 + prfm pldl1keep, [x18] + ldur s25, [x18, #-4] + add x18, x16, x27 + prfm pldl1keep, [x15] + ldur s24, [x15, #-4] + add x15, x15, #4 + prfm pldl1keep, [x16] + sub x16, x18, #4 + prfm pldl1keep, [x18] + ld1 { v24.s }[1], [x17] + add x17, x18, x27 + prfm pldl1keep, [x17] + ld1 { v24.s }[2], [x16] + add x16, x17, x27 + sub x17, x17, #4 + prfm pldl1keep, [x16] + ldur s29, [x16, #-4] + add x16, x16, x27 + sub x18, x16, #4 + add x0, x16, x27 + ld1 { v24.s }[3], [x17] + prfm pldl1keep, [x16] + prfm pldl1keep, [x0] + ld1 { v29.s }[1], [x18] + sub x16, x0, #4 + add x17, x0, x27 + prfm pldl1keep, [x17] + fmla v2.4s, v24.4s, v27.s[0] + ld1 { v29.s }[2], [x16] + add x16, x17, x27 + sub x17, x17, #4 + fmla v5.4s, v24.4s, v28.s[0] + fmla v21.4s, v24.4s, v26.s[0] + fmla v23.4s, v24.4s, v25.s[0] + prfm pldl1keep, [x16] + ldur s30, [x16, 
#-4] + add x16, x16, x27 + sub x18, x16, #4 + add x0, x16, x27 + ld1 { v29.s }[3], [x17] + prfm pldl1keep, [x16] + prfm pldl1keep, [x0] + ld1 { v30.s }[1], [x18] + sub x16, x0, #4 + add x17, x0, x27 + prfm pldl1keep, [x17] + ld1 { v30.s }[2], [x16] + add x16, x17, x27 + sub x17, x17, #4 + fmla v0.4s, v29.4s, v27.s[0] + fmla v4.4s, v29.4s, v28.s[0] + fmla v20.4s, v29.4s, v26.s[0] + fmla v22.4s, v29.4s, v25.s[0] + prfm pldl1keep, [x16] + ldur s31, [x16, #-4] + add x16, x16, x27 + sub x18, x16, #4 + add x0, x16, x27 + ld1 { v30.s }[3], [x17] + prfm pldl1keep, [x16] + prfm pldl1keep, [x0] + ld1 { v31.s }[1], [x18] + sub x16, x0, #4 + add x17, x0, x27 + prfm pldl1keep, [x17] + fmla v17.4s, v30.4s, v27.s[0] + ld1 { v31.s }[2], [x16] + sub x16, x17, #4 + fmla v3.4s, v30.4s, v28.s[0] + fmla v18.4s, v30.4s, v26.s[0] + fmla v19.4s, v30.4s, v25.s[0] + ld1 { v31.s }[3], [x16] + add x16, x24, x13, lsl #6 + add x13, x13, #1 + stp q24, q29, [x16] + fmla v6.4s, v31.4s, v27.s[0] + fmla v1.4s, v31.4s, v28.s[0] + fmla v7.4s, v31.4s, v26.s[0] + fmla v16.4s, v31.4s, v25.s[0] + stp q30, q31, [x16, #32] + cmp x13, x20 + b.lt .LBB0_13 +.LBB0_14: // %.preheader + // in Loop: Header=BB0_11 Depth=3 + ldr x16, [sp, #552] // 8-byte Folded Reload + ldr x17, [sp, #544] // 8-byte Folded Reload + mov x13, xzr + mov w2, #2 // =0x2 + str x1, [sp, #584] // 8-byte Folded Spill + mov w1, #1 // =0x1 + mov w0, #3 // =0x3 + mov w18, #4 // =0x4 + b .LBB0_16 + .p2align 2 +.LBB0_15: // %.loopexit + // in Loop: Header=BB0_16 Depth=4 + add x17, x17, x23 + add x16, x16, x23 + mov x13, x18 + mov x18, x3 +.LBB0_16: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Loop Header: Depth=4 + // Child Loop BB0_18 Depth 5 + // Child Loop BB0_20 Depth 5 + ldr x8, [sp, #680] // 8-byte Folded Reload + madd x13, x13, x28, x12 + cmp x18, x8 + madd x14, x1, x28, x12 + madd x15, x2, x28, x12 + ldr x8, [sp, #648] // 8-byte Folded Reload + add x13, x13, x10 + add x14, x14, 
x10 + add x15, x15, x10 + add x13, x11, x13, lsl #2 + stp q2, q0, [x13] + stp q17, q6, [x13, #32] + add x13, x11, x14, lsl #2 + add x14, x11, x15, lsl #2 + add x15, x24, x8, lsl #6 + ldr x8, [sp, #640] // 8-byte Folded Reload + stp q5, q4, [x13] + stp q3, q1, [x13, #32] + madd x13, x0, x28, x12 + add x13, x13, x10 + stp q21, q20, [x14] + stp q18, q7, [x14, #32] + add x14, x24, x8, lsl #6 + ldr x8, [sp, #632] // 8-byte Folded Reload + add x0, x11, x13, lsl #2 + add x13, x24, x8, lsl #6 + stp q23, q22, [x0] + stp q19, q16, [x0, #32] + b.ge .LBB0_21 +// %bb.17: // in Loop: Header=BB0_16 Depth=4 + madd x5, x18, x28, x12 + add x1, x18, #1 + add x2, x18, #2 + add x0, x18, #3 + ldr x8, [sp, #656] // 8-byte Folded Reload + ldr x9, [sp, #672] // 8-byte Folded Reload + ldp q28, q29, [x24, #32] + ldp q30, q31, [x24] + mov x4, xzr + add x3, x18, #4 + add x5, x5, x10 + mul x6, x25, x8 + ldr x8, [sp, #664] // 8-byte Folded Reload + add x5, x11, x5, lsl #2 + ldp q17, q6, [x5, #32] + ldp q2, q0, [x5] + madd x5, x1, x28, x12 + add x5, x5, x10 + add x5, x11, x5, lsl #2 + ldp q3, q1, [x5, #32] + ldp q5, q4, [x5] + madd x5, x2, x28, x12 + add x5, x5, x10 + add x5, x11, x5, lsl #2 + ldp q18, q7, [x5, #32] + ldp q21, q20, [x5] + madd x5, x0, x28, x12 + add x5, x5, x10 + add x5, x11, x5, lsl #2 + ldp q19, q16, [x5, #32] + ldp q23, q22, [x5] + madd x5, x9, x8, x6 + madd x6, x18, x22, x5 + lsl x6, x6, #2 + ldr q27, [x26, x6] + madd x6, x1, x22, x5 + lsl x6, x6, #2 + ldr q26, [x26, x6] + madd x6, x2, x22, x5 + madd x5, x0, x22, x5 + lsl x6, x6, #2 + lsl x5, x5, #2 + ldr q25, [x26, x6] + ldr q24, [x26, x5] + ldr x6, [sp, #624] // 8-byte Folded Reload + mov x5, x17 + fmla v6.4s, v29.4s, v27.s[0] + cmp xzr, x19 + b.ge .LBB0_19 + .p2align 2 +.LBB0_18: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // Parent Loop BB0_16 Depth=4 + // => This Inner Loop Header: Depth=5 + add x8, x6, #64 + fmla v17.4s, v28.4s, v27.s[0] + fmla v2.4s, v30.4s, v27.s[0] + 
add x9, x6, #128 + prfm pldl1keep, [x8] + ldp q9, q8, [x6, #-160] + fmla v0.4s, v31.4s, v27.s[0] + ldp q12, q15, [x6, #-192] + fmla v1.4s, v29.4s, v26.s[0] + fmla v3.4s, v28.4s, v26.s[0] + fmla v4.4s, v31.4s, v26.s[0] + fmla v5.4s, v30.4s, v26.s[0] + fmla v7.4s, v29.4s, v25.s[0] + prfm pldl1keep, [x9] + fmla v18.4s, v28.4s, v25.s[0] + fmla v20.4s, v31.4s, v25.s[0] + ldp q11, q10, [x6, #-128] + fmla v21.4s, v30.4s, v25.s[0] + fmla v16.4s, v29.4s, v24.s[0] + ldp q13, q14, [x6, #-96] + fmla v19.4s, v28.4s, v24.s[0] + fmla v22.4s, v31.4s, v24.s[0] + add x30, x6, #192 + prfm pldl1keep, [x30] + fmla v23.4s, v30.4s, v24.s[0] + fmla v0.4s, v15.4s, v27.s[1] + add x7, x6, #256 + add x8, x5, x21 + fmla v2.4s, v12.4s, v27.s[1] + fmla v17.4s, v9.4s, v27.s[1] + add x4, x4, #4 + fmla v6.4s, v8.4s, v27.s[1] + fmla v5.4s, v12.4s, v26.s[1] + fmla v4.4s, v15.4s, v26.s[1] + fmla v3.4s, v9.4s, v26.s[1] + fmla v1.4s, v8.4s, v26.s[1] + fmla v21.4s, v12.4s, v25.s[1] + fmla v20.4s, v15.4s, v25.s[1] + fmla v18.4s, v9.4s, v25.s[1] + fmla v7.4s, v8.4s, v25.s[1] + fmla v23.4s, v12.4s, v24.s[1] + fmla v22.4s, v15.4s, v24.s[1] + ldp q15, q12, [x6, #-64] + fmla v19.4s, v9.4s, v24.s[1] + fmla v16.4s, v8.4s, v24.s[1] + ldp q9, q8, [x6, #-32] + prfm pldl1keep, [x7] + ldp q28, q29, [x6, #32] + fmla v6.4s, v14.4s, v27.s[2] + ldp q30, q31, [x6] + prfm pldl1keep, [x5] + mov x6, x7 + fmla v17.4s, v13.4s, v27.s[2] + fmla v2.4s, v11.4s, v27.s[2] + fmla v0.4s, v10.4s, v27.s[2] + fmla v1.4s, v14.4s, v26.s[2] + fmla v3.4s, v13.4s, v26.s[2] + fmla v4.4s, v10.4s, v26.s[2] + fmla v5.4s, v11.4s, v26.s[2] + fmla v7.4s, v14.4s, v25.s[2] + fmla v18.4s, v13.4s, v25.s[2] + fmla v20.4s, v10.4s, v25.s[2] + fmla v21.4s, v11.4s, v25.s[2] + fmla v16.4s, v14.4s, v24.s[2] + fmla v19.4s, v13.4s, v24.s[2] + fmla v22.4s, v10.4s, v24.s[2] + fmla v23.4s, v11.4s, v24.s[2] + fmla v0.4s, v12.4s, v27.s[3] + fmla v2.4s, v15.4s, v27.s[3] + fmla v17.4s, v9.4s, v27.s[3] + fmla v6.4s, v8.4s, v27.s[3] + ldur q27, [x5, #-16] + prfm 
pldl1keep, [x8] + add x5, x5, #16 + fmla v5.4s, v15.4s, v26.s[3] + fmla v4.4s, v12.4s, v26.s[3] + fmla v3.4s, v9.4s, v26.s[3] + fmla v1.4s, v8.4s, v26.s[3] + ldur q26, [x8, #-16] + add x8, x8, x21 + prfm pldl1keep, [x8] + fmla v21.4s, v15.4s, v25.s[3] + fmla v20.4s, v12.4s, v25.s[3] + fmla v18.4s, v9.4s, v25.s[3] + fmla v7.4s, v8.4s, v25.s[3] + ldur q25, [x8, #-16] + add x8, x8, x21 + prfm pldl1keep, [x8] + fmla v23.4s, v15.4s, v24.s[3] + fmla v22.4s, v12.4s, v24.s[3] + fmla v19.4s, v9.4s, v24.s[3] + fmla v16.4s, v8.4s, v24.s[3] + ldur q24, [x8, #-16] + fmla v6.4s, v29.4s, v27.s[0] + cmp x4, x19 + b.lt .LBB0_18 +.LBB0_19: // in Loop: Header=BB0_16 Depth=4 + ldp q10, q8, [x15, #32] + ldp q11, q12, [x15] + fmla v17.4s, v28.4s, v27.s[0] + fmla v2.4s, v30.4s, v27.s[0] + fmla v0.4s, v31.4s, v27.s[0] + fmla v1.4s, v29.4s, v26.s[0] + fmla v3.4s, v28.4s, v26.s[0] + fmla v4.4s, v31.4s, v26.s[0] + ldp q9, q13, [x14, #32] + fmla v5.4s, v30.4s, v26.s[0] + fmla v7.4s, v29.4s, v25.s[0] + mov x15, x29 + fmla v18.4s, v28.4s, v25.s[0] + fmla v20.4s, v31.4s, v25.s[0] + fmla v21.4s, v30.4s, v25.s[0] + fmla v16.4s, v29.4s, v24.s[0] + fmla v19.4s, v28.4s, v24.s[0] + fmla v22.4s, v31.4s, v24.s[0] + ldp q31, q28, [x13, #32] + fmla v23.4s, v30.4s, v24.s[0] + ldp q29, q30, [x14] + mov x14, x16 + fmla v0.4s, v12.4s, v27.s[1] + fmla v2.4s, v11.4s, v27.s[1] + fmla v17.4s, v10.4s, v27.s[1] + fmla v6.4s, v8.4s, v27.s[1] + fmla v5.4s, v11.4s, v26.s[1] + fmla v4.4s, v12.4s, v26.s[1] + fmla v3.4s, v10.4s, v26.s[1] + fmla v1.4s, v8.4s, v26.s[1] + fmla v21.4s, v11.4s, v25.s[1] + fmla v20.4s, v12.4s, v25.s[1] + fmla v18.4s, v10.4s, v25.s[1] + fmla v7.4s, v8.4s, v25.s[1] + fmla v23.4s, v11.4s, v24.s[1] + fmla v22.4s, v12.4s, v24.s[1] + fmla v19.4s, v10.4s, v24.s[1] + fmla v16.4s, v8.4s, v24.s[1] + fmla v6.4s, v13.4s, v27.s[2] + ldp q8, q10, [x13] + ldr x13, [sp, #616] // 8-byte Folded Reload + fmla v17.4s, v9.4s, v27.s[2] + fmla v2.4s, v29.4s, v27.s[2] + fmla v0.4s, v30.4s, v27.s[2] + fmla v1.4s, 
v13.4s, v26.s[2] + fmla v3.4s, v9.4s, v26.s[2] + fmla v4.4s, v30.4s, v26.s[2] + fmla v5.4s, v29.4s, v26.s[2] + fmla v7.4s, v13.4s, v25.s[2] + fmla v18.4s, v9.4s, v25.s[2] + fmla v20.4s, v30.4s, v25.s[2] + fmla v21.4s, v29.4s, v25.s[2] + fmla v16.4s, v13.4s, v24.s[2] + fmla v19.4s, v9.4s, v24.s[2] + fmla v22.4s, v30.4s, v24.s[2] + fmla v23.4s, v29.4s, v24.s[2] + fmla v0.4s, v10.4s, v27.s[3] + fmla v2.4s, v8.4s, v27.s[3] + fmla v17.4s, v31.4s, v27.s[3] + fmla v6.4s, v28.4s, v27.s[3] + fmla v5.4s, v8.4s, v26.s[3] + fmla v4.4s, v10.4s, v26.s[3] + fmla v3.4s, v31.4s, v26.s[3] + fmla v1.4s, v28.4s, v26.s[3] + fmla v21.4s, v8.4s, v25.s[3] + fmla v20.4s, v10.4s, v25.s[3] + fmla v18.4s, v31.4s, v25.s[3] + fmla v7.4s, v28.4s, v25.s[3] + fmla v23.4s, v8.4s, v24.s[3] + fmla v22.4s, v10.4s, v24.s[3] + fmla v19.4s, v31.4s, v24.s[3] + fmla v16.4s, v28.4s, v24.s[3] + cmp x29, x20 + b.ge .LBB0_15 + .p2align 2 +.LBB0_20: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // Parent Loop BB0_16 Depth=4 + // => This Inner Loop Header: Depth=5 + prfm pldl1keep, [x13] + ldp q24, q25, [x13, #-64] + add x8, x14, x21 + ldp q26, q27, [x13, #-32] + prfm pldl1keep, [x14] + add x15, x15, #1 + ldur s28, [x14, #-4] + prfm pldl1keep, [x8] + add x14, x14, #4 + add x13, x13, #64 + ldur s29, [x8, #-4] + add x8, x8, x21 + prfm pldl1keep, [x8] + fmla v6.4s, v27.4s, v28.s[0] + ldur s30, [x8, #-4] + add x8, x8, x21 + prfm pldl1keep, [x8] + fmla v17.4s, v26.4s, v28.s[0] + fmla v0.4s, v25.4s, v28.s[0] + fmla v2.4s, v24.4s, v28.s[0] + ldur s28, [x8, #-4] + fmla v4.4s, v25.4s, v29.s[0] + fmla v5.4s, v24.4s, v29.s[0] + fmla v3.4s, v26.4s, v29.s[0] + fmla v1.4s, v27.4s, v29.s[0] + fmla v21.4s, v24.4s, v30.s[0] + fmla v20.4s, v25.4s, v30.s[0] + fmla v18.4s, v26.4s, v30.s[0] + fmla v7.4s, v27.4s, v30.s[0] + fmla v23.4s, v24.4s, v28.s[0] + fmla v22.4s, v25.4s, v28.s[0] + fmla v19.4s, v26.4s, v28.s[0] + fmla v16.4s, v27.4s, v28.s[0] + cmp x15, x20 + b.lt .LBB0_20 + b 
.LBB0_15 + .p2align 2 +.LBB0_21: // in Loop: Header=BB0_11 Depth=3 + ldr x8, [sp, #680] // 8-byte Folded Reload + ldr x9, [sp, #592] // 8-byte Folded Reload + cmp x8, x9 + b.ge .LBB0_27 +// %bb.22: // in Loop: Header=BB0_11 Depth=3 + ldr x9, [sp, #656] // 8-byte Folded Reload + ldr x0, [sp, #680] // 8-byte Folded Reload + mov x18, xzr + mul x9, x25, x9 + ldr x17, [sp, #664] // 8-byte Folded Reload + ldr x1, [sp, #672] // 8-byte Folded Reload + madd x8, x0, x28, x12 + madd x9, x1, x17, x9 + ldp q20, q21, [x24, #32] + ldp q18, q19, [x24] + ldr x1, [sp, #624] // 8-byte Folded Reload + madd x17, x0, x22, x9 + add x8, x8, x10 + add x16, x11, x8, lsl #2 + add x8, x0, #1 + ldr x0, [sp, #536] // 8-byte Folded Reload + lsl x17, x17, #2 + ldr q17, [x26, x17] + madd x17, x8, x28, x12 + madd x8, x8, x22, x9 + ldp q1, q0, [x16, #32] + ldp q3, q2, [x16] + add x17, x17, x10 + lsl x8, x8, #2 + add x17, x11, x17, lsl #2 + ldr q16, [x26, x8] + ldp q5, q4, [x17, #32] + ldp q7, q6, [x17] + cmp xzr, x19 + b.ge .LBB0_24 + .p2align 2 +.LBB0_23: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + ldr x8, [sp, #608] // 8-byte Folded Reload + add x7, x1, #64 + fmla v0.4s, v21.4s, v17.s[0] + fmla v1.4s, v20.4s, v17.s[0] + fmla v3.4s, v18.4s, v17.s[0] + fmla v2.4s, v19.4s, v17.s[0] + add x9, x1, #128 + add x2, x1, #256 + fmla v4.4s, v21.4s, v16.s[0] + fmla v5.4s, v20.4s, v16.s[0] + add x18, x18, #4 + add x3, x8, x0 + ldr x8, [sp, #600] // 8-byte Folded Reload + prfm pldl1keep, [x7] + ldp q23, q22, [x1, #-160] + ldp q24, q25, [x1, #-192] + fmla v6.4s, v19.4s, v16.s[0] + fmla v7.4s, v18.4s, v16.s[0] + prfm pldl1keep, [x9] + ldp q19, q18, [x1, #-128] + add x4, x3, #32 + ldp q20, q21, [x1, #-96] + fmla v2.4s, v25.4s, v17.s[1] + fmla v0.4s, v22.4s, v17.s[1] + fmla v6.4s, v25.4s, v16.s[1] + fmla v4.4s, v22.4s, v16.s[1] + add x5, x8, x0 + add x8, x1, #192 + add x0, x0, #16 + fmla v3.4s, v24.4s, v17.s[1] + fmla v1.4s, 
v23.4s, v17.s[1] + fmla v7.4s, v24.4s, v16.s[1] + fmla v5.4s, v23.4s, v16.s[1] + prfm pldl1keep, [x8] + ldp q23, q22, [x1, #-32] + ldp q24, q25, [x1, #-64] + add x6, x5, #32 + prfm pldl1keep, [x6] + fmla v0.4s, v21.4s, v17.s[2] + fmla v2.4s, v18.4s, v17.s[2] + fmla v4.4s, v21.4s, v16.s[2] + fmla v6.4s, v18.4s, v16.s[2] + fmla v1.4s, v20.4s, v17.s[2] + fmla v3.4s, v19.4s, v17.s[2] + fmla v5.4s, v20.4s, v16.s[2] + fmla v7.4s, v19.4s, v16.s[2] + fmla v2.4s, v25.4s, v17.s[3] + fmla v0.4s, v22.4s, v17.s[3] + fmla v6.4s, v25.4s, v16.s[3] + fmla v4.4s, v22.4s, v16.s[3] + fmla v3.4s, v24.4s, v17.s[3] + fmla v1.4s, v23.4s, v17.s[3] + ldr q17, [x5, #16] + prfm pldl1keep, [x4] + fmla v7.4s, v24.4s, v16.s[3] + fmla v5.4s, v23.4s, v16.s[3] + ldr q16, [x3, #16] + prfm pldl1keep, [x2] + ldp q20, q21, [x1, #32] + ldp q18, q19, [x1] + mov x1, x2 + cmp x18, x19 + b.lt .LBB0_23 +.LBB0_24: // in Loop: Header=BB0_11 Depth=3 + ldp q23, q22, [x15, #32] + ldp q24, q25, [x15] + fmla v0.4s, v21.4s, v17.s[0] + fmla v1.4s, v20.4s, v17.s[0] + fmla v3.4s, v18.4s, v17.s[0] + fmla v2.4s, v19.4s, v17.s[0] + fmla v4.4s, v21.4s, v16.s[0] + fmla v5.4s, v20.4s, v16.s[0] + ldp q20, q21, [x14, #32] + fmla v6.4s, v19.4s, v16.s[0] + fmla v7.4s, v18.4s, v16.s[0] + ldp q19, q18, [x14] + fmla v2.4s, v25.4s, v17.s[1] + fmla v0.4s, v22.4s, v17.s[1] + ldr x18, [sp, #616] // 8-byte Folded Reload + ldr x0, [sp, #536] // 8-byte Folded Reload + fmla v3.4s, v24.4s, v17.s[1] + fmla v1.4s, v23.4s, v17.s[1] + ldp x3, x2, [sp, #368] // 16-byte Folded Reload + fmla v7.4s, v24.4s, v16.s[1] + fmla v6.4s, v25.4s, v16.s[1] + ldp q24, q25, [x13] + fmla v5.4s, v23.4s, v16.s[1] + fmla v4.4s, v22.4s, v16.s[1] + ldp q23, q22, [x13, #32] + fmla v0.4s, v21.4s, v17.s[2] + fmla v2.4s, v18.4s, v17.s[2] + mov x1, x29 + fmla v4.4s, v21.4s, v16.s[2] + fmla v1.4s, v20.4s, v17.s[2] + fmla v3.4s, v19.4s, v17.s[2] + fmla v5.4s, v20.4s, v16.s[2] + fmla v6.4s, v18.4s, v16.s[2] + fmla v7.4s, v19.4s, v16.s[2] + fmla v2.4s, v25.4s, v17.s[3] + 
fmla v0.4s, v22.4s, v17.s[3] + fmla v6.4s, v25.4s, v16.s[3] + fmla v4.4s, v22.4s, v16.s[3] + fmla v3.4s, v24.4s, v17.s[3] + fmla v1.4s, v23.4s, v17.s[3] + fmla v7.4s, v24.4s, v16.s[3] + fmla v5.4s, v23.4s, v16.s[3] + cmp x29, x20 + b.ge .LBB0_26 + .p2align 2 +.LBB0_25: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + add x8, x2, x0 + add x9, x3, x0 + prfm pldl1keep, [x18] + add x1, x1, #1 + add x8, x8, #4 + add x9, x9, #4 + ldp q16, q17, [x18, #-64] + ldp q18, q19, [x18, #-32] + prfm pldl1keep, [x9] + add x18, x18, #64 + ldr s20, [x3, x0] + prfm pldl1keep, [x8] + fmla v0.4s, v19.4s, v20.s[0] + ldr s21, [x2, x0] + fmla v1.4s, v18.4s, v20.s[0] + fmla v2.4s, v17.4s, v20.s[0] + fmla v3.4s, v16.4s, v20.s[0] + fmla v6.4s, v17.4s, v21.s[0] + fmla v7.4s, v16.4s, v21.s[0] + fmla v5.4s, v18.4s, v21.s[0] + fmla v4.4s, v19.4s, v21.s[0] + add x0, x0, #4 + cmp x1, x20 + b.lt .LBB0_25 +.LBB0_26: // in Loop: Header=BB0_11 Depth=3 + stp q3, q2, [x16] + stp q1, q0, [x16, #32] + stp q7, q6, [x17] + stp q5, q4, [x17, #32] +.LBB0_27: // in Loop: Header=BB0_11 Depth=3 + ldr x8, [sp, #528] // 8-byte Folded Reload + ldr x9, [sp, #592] // 8-byte Folded Reload + cmp x9, x8 + ldr x0, [sp, #568] // 8-byte Folded Reload + b.ge .LBB0_10 +// %bb.28: // in Loop: Header=BB0_11 Depth=3 + ldr x17, [sp, #592] // 8-byte Folded Reload + ldr x9, [sp, #664] // 8-byte Folded Reload + mov x16, xzr + madd x8, x17, x28, x12 + ldp q7, q16, [x24, #32] + ldp q6, q5, [x24] + ldr x12, [sp, #624] // 8-byte Folded Reload + ldr x1, [sp, #360] // 8-byte Folded Reload + add x8, x8, x10 + add x10, x11, x8, lsl #2 + ldr x8, [sp, #656] // 8-byte Folded Reload + ldr x11, [sp, #672] // 8-byte Folded Reload + ldp q1, q0, [x10, #32] + ldp q3, q2, [x10] + mul x8, x25, x8 + madd x8, x11, x9, x8 + ldr x11, [sp, #456] // 8-byte Folded Reload + madd x8, x17, x22, x8 + lsl x8, x8, #2 + ldr q4, [x26, x8] + cmp xzr, x19 + b.ge .LBB0_30 + .p2align 2 
+.LBB0_29: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + add x18, x12, #64 + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + add x9, x12, #128 + prfm pldl1keep, [x18] + ldp q18, q17, [x12, #-160] + fmla v3.4s, v6.4s, v4.s[0] + ldp q19, q20, [x12, #-192] + fmla v2.4s, v5.4s, v4.s[0] + prfm pldl1keep, [x9] + ldp q6, q5, [x12, #-128] + ldp q7, q16, [x12, #-96] + add x8, x12, #192 + prfm pldl1keep, [x8] + add x17, x12, #256 + add x16, x16, #4 + fmla v2.4s, v20.4s, v4.s[1] + fmla v0.4s, v17.4s, v4.s[1] + fmla v3.4s, v19.4s, v4.s[1] + fmla v1.4s, v18.4s, v4.s[1] + ldp q18, q17, [x12, #-32] + ldp q19, q20, [x12, #-64] + prfm pldl1keep, [x11] + fmla v0.4s, v16.4s, v4.s[2] + fmla v2.4s, v5.4s, v4.s[2] + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v2.4s, v20.4s, v4.s[3] + fmla v0.4s, v17.4s, v4.s[3] + fmla v3.4s, v19.4s, v4.s[3] + fmla v1.4s, v18.4s, v4.s[3] + ldur q4, [x11, #-16] + prfm pldl1keep, [x17] + add x11, x11, #16 + ldp q7, q16, [x12, #32] + ldp q6, q5, [x12] + mov x12, x17 + cmp x16, x19 + b.lt .LBB0_29 +.LBB0_30: // in Loop: Header=BB0_11 Depth=3 + ldp q18, q17, [x15, #32] + ldp q19, q20, [x15] + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + fmla v2.4s, v5.4s, v4.s[0] + ldp q6, q5, [x14] + ldp q7, q16, [x14, #32] + ldr x14, [sp, #464] // 8-byte Folded Reload + mov x11, xzr + mov w12, #64 // =0x40 + fmla v2.4s, v20.4s, v4.s[1] + fmla v0.4s, v17.4s, v4.s[1] + fmla v3.4s, v19.4s, v4.s[1] + fmla v1.4s, v18.4s, v4.s[1] + fmla v0.4s, v16.4s, v4.s[2] + ldp q18, q17, [x13, #32] + ldp q19, q20, [x13] + fmla v2.4s, v5.4s, v4.s[2] + ldr x13, [sp, #416] // 8-byte Folded Reload + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v2.4s, v20.4s, v4.s[3] + fmla v0.4s, v17.4s, v4.s[3] + fmla v3.4s, v19.4s, v4.s[3] + fmla v1.4s, v18.4s, v4.s[3] + add x8, x29, xzr + cmp x8, x20 + b.ge .LBB0_9 + .p2align 2 
+.LBB0_31: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + add x9, x1, x11, lsl #6 + add x8, x1, x12 + add x12, x12, #64 + prfm pldl1keep, [x8] + ldp q4, q5, [x9] + ldp q6, q7, [x9, #32] + prfm pldl1keep, [x13] + ldr s16, [x14, x11, lsl #2] + add x11, x11, #1 + add x13, x13, #4 + fmla v0.4s, v7.4s, v16.s[0] + fmla v1.4s, v6.4s, v16.s[0] + fmla v2.4s, v5.4s, v16.s[0] + fmla v3.4s, v4.4s, v16.s[0] + add x8, x29, x11 + cmp x8, x20 + b.lt .LBB0_31 + b .LBB0_9 + .p2align 2 +.LBB0_32: // in Loop: Header=BB0_7 Depth=2 + ldp x9, x8, [sp, #480] // 16-byte Folded Reload + ldr x10, [sp, #448] // 8-byte Folded Reload + add x8, x9, x8, lsl #2 + cmp x0, x10 + str x8, [sp, #584] // 8-byte Folded Spill + lsl x8, x28, #1 + str x8, [sp, #576] // 8-byte Folded Spill + b.lt .LBB0_36 +// %bb.33: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #440] // 8-byte Folded Reload + cmp x10, x8 + b.lt .LBB0_58 +.LBB0_34: // in Loop: Header=BB0_7 Depth=2 + ldr x9, [sp, #432] // 8-byte Folded Reload + cmp x8, x9 + b.lt .LBB0_80 +.LBB0_35: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #296] // 8-byte Folded Reload + cmp x9, x8 + b.ge .LBB0_6 + b .LBB0_102 + .p2align 2 +.LBB0_36: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #256] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #520] // 8-byte Folded Reload + ldr x9, [sp, #512] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldr x12, [sp, #672] // 8-byte Folded Reload + ldr x13, [sp, #584] // 8-byte Folded Reload + mul x8, x25, x8 + ldr x14, [sp, #576] // 8-byte Folded Reload + ldr x15, [sp, #560] // 8-byte Folded Reload + madd x9, x12, x9, x8 + ldr x8, [sp, #568] // 8-byte Folded Reload + add x8, x9, x8 + add x12, x13, x8, lsl #2 + ldp q3, q2, [x12] + add x12, x8, x28 + add x12, x13, x12, lsl #2 + ldp q1, q0, [x12] + add x12, x8, x14 + add x12, x13, x12, lsl #2 + ldp q5, q4, [x12] + add x12, x14, x28 + add x8, x8, x12 + add x8, 
x13, x8, lsl #2 + ldp q7, q6, [x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + cmp xzr, x20 + b.ge .LBB0_38 + .p2align 2 +.LBB0_37: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x12, [sp, #384] // 8-byte Folded Reload + add x13, x15, x10 + prfm pldl1keep, [x13] + ldur s16, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s17, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s18, [x13, #-4] + add x13, x13, x21 + add x12, x12, x10 + prfm pldl1keep, [x13] + ldur s20, [x13, #-4] + add x10, x10, #4 + prfm pldl1keep, [x12] + ldur s19, [x12, #-4] + add x12, x12, x27 + prfm pldl1keep, [x12] + sub x13, x12, #4 + add x12, x12, x27 + prfm pldl1keep, [x12] + sub x14, x12, #4 + add x12, x12, x27 + ld1 { v19.s }[1], [x13] + prfm pldl1keep, [x12] + sub x13, x12, #4 + add x12, x12, x27 + ld1 { v19.s }[2], [x14] + prfm pldl1keep, [x12] + ldur s21, [x12, #-4] + add x12, x12, x27 + ld1 { v19.s }[3], [x13] + prfm pldl1keep, [x12] + sub x13, x12, #4 + add x12, x12, x27 + prfm pldl1keep, [x12] + ld1 { v21.s }[1], [x13] + sub x14, x12, #4 + add x12, x12, x27 + prfm pldl1keep, [x12] + sub x12, x12, #4 + fmla v3.4s, v19.4s, v16.s[0] + fmla v1.4s, v19.4s, v17.s[0] + fmla v5.4s, v19.4s, v18.s[0] + fmla v7.4s, v19.4s, v20.s[0] + ld1 { v21.s }[2], [x14] + ld1 { v21.s }[3], [x12] + add x12, x8, x11, lsl #5 + add x11, x11, #1 + fmla v2.4s, v21.4s, v16.s[0] + fmla v0.4s, v21.4s, v17.s[0] + fmla v4.4s, v21.4s, v18.s[0] + fmla v6.4s, v21.4s, v20.s[0] + stp q19, q21, [x12] + cmp x11, x20 + b.lt .LBB0_37 +.LBB0_38: // %.preheader39 + // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #216] // 8-byte Folded Reload + ldr x15, [sp, #552] // 8-byte Folded Reload + mov x11, xzr + add x10, x8, #128 + ldr x16, [sp, #544] // 8-byte Folded Reload + mov w18, #1 // =0x1 + mov w2, #2 // =0x2 + mov w1, #3 // =0x3 + mov w17, #4 // =0x4 + add x14, x8, x12 + b .LBB0_40 + .p2align 2 +.LBB0_39: // %.loopexit35 + // in Loop: 
Header=BB0_40 Depth=3 + add x16, x16, x23 + add x15, x15, x23 + mov x11, x17 + mov x17, x3 +.LBB0_40: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_42 Depth 4 + // Child Loop BB0_44 Depth 4 + madd x11, x11, x28, x9 + ldr x7, [sp, #568] // 8-byte Folded Reload + ldr x30, [sp, #584] // 8-byte Folded Reload + add x11, x11, x7 + madd x12, x18, x28, x9 + madd x13, x2, x28, x9 + add x12, x12, x7 + add x13, x13, x7 + add x11, x30, x11, lsl #2 + add x12, x30, x12, lsl #2 + stp q3, q2, [x11] + madd x11, x1, x28, x9 + stp q1, q0, [x12] + add x12, x30, x13, lsl #2 + stp q5, q4, [x12] + add x11, x11, x7 + add x11, x30, x11, lsl #2 + stp q7, q6, [x11] + ldr x11, [sp, #680] // 8-byte Folded Reload + cmp x17, x11 + ldr x11, [sp, #648] // 8-byte Folded Reload + add x13, x8, x11, lsl #5 + ldr x11, [sp, #640] // 8-byte Folded Reload + add x12, x8, x11, lsl #5 + ldr x11, [sp, #632] // 8-byte Folded Reload + add x11, x8, x11, lsl #5 + b.ge .LBB0_45 +// %bb.41: // in Loop: Header=BB0_40 Depth=3 + madd x5, x17, x28, x9 + add x18, x17, #1 + add x2, x17, #2 + add x1, x17, #3 + madd x6, x18, x28, x9 + ldp q20, q21, [x8] + mov x4, xzr + add x3, x17, #4 + add x5, x5, x7 + add x5, x30, x5, lsl #2 + add x6, x6, x7 + add x6, x30, x6, lsl #2 + ldp q3, q2, [x5] + madd x5, x2, x28, x9 + ldp q1, q0, [x6] + madd x6, x1, x28, x9 + add x5, x5, x7 + add x6, x6, x7 + add x5, x30, x5, lsl #2 + ldr x7, [sp, #672] // 8-byte Folded Reload + add x6, x30, x6, lsl #2 + ldp q5, q4, [x5] + ldr x5, [sp, #656] // 8-byte Folded Reload + mul x5, x25, x5 + ldp q7, q6, [x6] + ldr x6, [sp, #664] // 8-byte Folded Reload + madd x5, x7, x6, x5 + madd x6, x17, x22, x5 + lsl x6, x6, #2 + ldr q19, [x26, x6] + madd x6, x18, x22, x5 + lsl x6, x6, #2 + ldr q18, [x26, x6] + madd x6, x2, x22, x5 + madd x5, x1, x22, x5 + lsl x6, x6, #2 + lsl x5, x5, #2 + ldr q17, [x26, x6] + ldr q16, [x26, x5] + mov x5, x10 + mov x6, x16 + cmp xzr, x19 + b.ge .LBB0_43 + .p2align 2 
+.LBB0_42: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_40 Depth=3 + // => This Inner Loop Header: Depth=4 + add x7, x5, #32 + fmla v3.4s, v20.4s, v19.s[0] + fmla v2.4s, v21.4s, v19.s[0] + add x4, x4, #4 + prfm pldl1keep, [x7] + ldp q22, q23, [x5, #-96] + fmla v0.4s, v21.4s, v18.s[0] + fmla v1.4s, v20.4s, v18.s[0] + fmla v4.4s, v21.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + add x7, x5, #96 + fmla v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q21, q20, [x5, #-64] + prfm pldl1keep, [x7] + add x7, x6, x21 + add x30, x7, x21 + fmla v2.4s, v23.4s, v19.s[1] + fmla v0.4s, v23.4s, v18.s[1] + fmla v4.4s, v23.4s, v17.s[1] + fmla v6.4s, v23.4s, v16.s[1] + fmla v3.4s, v22.4s, v19.s[1] + fmla v1.4s, v22.4s, v18.s[1] + fmla v5.4s, v22.4s, v17.s[1] + fmla v7.4s, v22.4s, v16.s[1] + fmla v2.4s, v20.4s, v19.s[2] + ldp q22, q23, [x5, #-32] + fmla v0.4s, v20.4s, v18.s[2] + fmla v4.4s, v20.4s, v17.s[2] + fmla v6.4s, v20.4s, v16.s[2] + fmla v3.4s, v21.4s, v19.s[2] + fmla v1.4s, v21.4s, v18.s[2] + fmla v5.4s, v21.4s, v17.s[2] + fmla v7.4s, v21.4s, v16.s[2] + ldp q20, q21, [x5], #128 + prfm pldl1keep, [x6] + fmla v2.4s, v23.4s, v19.s[3] + fmla v0.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v17.s[3] + fmla v6.4s, v23.4s, v16.s[3] + fmla v3.4s, v22.4s, v19.s[3] + ldur q19, [x6, #-16] + prfm pldl1keep, [x7] + fmla v1.4s, v22.4s, v18.s[3] + ldur q18, [x7, #-16] + add x7, x30, x21 + prfm pldl1keep, [x30] + add x6, x6, #16 + fmla v5.4s, v22.4s, v17.s[3] + ldur q17, [x30, #-16] + prfm pldl1keep, [x7] + fmla v7.4s, v22.4s, v16.s[3] + ldur q16, [x7, #-16] + cmp x4, x19 + b.lt .LBB0_42 +.LBB0_43: // in Loop: Header=BB0_40 Depth=3 + ldp q22, q23, [x13] + fmla v3.4s, v20.4s, v19.s[0] + fmla v2.4s, v21.4s, v19.s[0] + fmla v0.4s, v21.4s, v18.s[0] + fmla v1.4s, v20.4s, v18.s[0] + mov x13, x29 + fmla v4.4s, v21.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + fmla v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + fmla v2.4s, v23.4s, v19.s[1] + ldp 
q21, q20, [x12] + fmla v0.4s, v23.4s, v18.s[1] + mov x12, x15 + fmla v4.4s, v23.4s, v17.s[1] + fmla v6.4s, v23.4s, v16.s[1] + fmla v3.4s, v22.4s, v19.s[1] + fmla v1.4s, v22.4s, v18.s[1] + fmla v5.4s, v22.4s, v17.s[1] + fmla v7.4s, v22.4s, v16.s[1] + ldp q22, q23, [x11] + mov x11, x14 + fmla v2.4s, v20.4s, v19.s[2] + fmla v0.4s, v20.4s, v18.s[2] + fmla v4.4s, v20.4s, v17.s[2] + fmla v6.4s, v20.4s, v16.s[2] + fmla v3.4s, v21.4s, v19.s[2] + fmla v1.4s, v21.4s, v18.s[2] + fmla v5.4s, v21.4s, v17.s[2] + fmla v7.4s, v21.4s, v16.s[2] + fmla v2.4s, v23.4s, v19.s[3] + fmla v0.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v17.s[3] + fmla v6.4s, v23.4s, v16.s[3] + fmla v3.4s, v22.4s, v19.s[3] + fmla v1.4s, v22.4s, v18.s[3] + fmla v5.4s, v22.4s, v17.s[3] + fmla v7.4s, v22.4s, v16.s[3] + cmp x29, x20 + b.ge .LBB0_39 + .p2align 2 +.LBB0_44: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_40 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x12, x21 + prfm pldl1keep, [x11] + ldp q16, q17, [x11, #-32] + prfm pldl1keep, [x12] + ldur s18, [x12, #-4] + add x13, x13, #1 + add x12, x12, #4 + prfm pldl1keep, [x4] + ldur s19, [x4, #-4] + add x4, x4, x21 + add x11, x11, #32 + prfm pldl1keep, [x4] + ldur s20, [x4, #-4] + add x4, x4, x21 + fmla v2.4s, v17.4s, v18.s[0] + prfm pldl1keep, [x4] + ldur s21, [x4, #-4] + fmla v3.4s, v16.4s, v18.s[0] + fmla v0.4s, v17.4s, v19.s[0] + fmla v1.4s, v16.4s, v19.s[0] + fmla v4.4s, v17.4s, v20.s[0] + fmla v5.4s, v16.4s, v20.s[0] + fmla v6.4s, v17.4s, v21.s[0] + fmla v7.4s, v16.4s, v21.s[0] + cmp x13, x20 + b.lt .LBB0_44 + b .LBB0_39 + .p2align 2 +.LBB0_45: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #680] // 8-byte Folded Reload + ldr x15, [sp, #592] // 8-byte Folded Reload + mov x7, x30 + cmp x14, x15 + b.ge .LBB0_51 +// %bb.46: // in Loop: Header=BB0_7 Depth=2 + ldr x15, [sp, #656] // 8-byte Folded Reload + ldr x18, [sp, #664] // 8-byte Folded Reload + mov x16, xzr + mul x15, x25, x15 + ldr x3, [sp, #672] // 
8-byte Folded Reload + ldr x2, [sp, #680] // 8-byte Folded Reload + add x17, x2, #1 + madd x14, x2, x28, x9 + ldr x1, [sp, #568] // 8-byte Folded Reload + ldp q6, q7, [x8] + madd x18, x3, x18, x15 + madd x15, x2, x22, x18 + add x14, x14, x1 + add x14, x7, x14, lsl #2 + lsl x15, x15, #2 + ldr q4, [x26, x15] + madd x15, x17, x28, x9 + madd x17, x17, x22, x18 + ldp q1, q0, [x14] + ldr x18, [sp, #536] // 8-byte Folded Reload + add x15, x15, x1 + lsl x17, x17, #2 + add x15, x7, x15, lsl #2 + ldr q5, [x26, x17] + mov x17, x10 + ldp q3, q2, [x15] + cmp xzr, x19 + b.ge .LBB0_48 + .p2align 2 +.LBB0_47: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x6, x17, #32 + ldr x1, [sp, #608] // 8-byte Folded Reload + ldr x3, [sp, #600] // 8-byte Folded Reload + fmla v1.4s, v6.4s, v4.s[0] + prfm pldl1keep, [x6] + ldp q16, q17, [x17, #-96] + fmla v0.4s, v7.4s, v4.s[0] + fmla v2.4s, v7.4s, v5.s[0] + fmla v3.4s, v6.4s, v5.s[0] + ldp q7, q6, [x17, #-64] + add x5, x17, #96 + prfm pldl1keep, [x5] + add x16, x16, #4 + add x1, x1, x18 + add x3, x3, x18 + add x18, x18, #16 + fmla v0.4s, v17.4s, v4.s[1] + fmla v2.4s, v17.4s, v5.s[1] + add x2, x1, #32 + add x4, x3, #32 + fmla v1.4s, v16.4s, v4.s[1] + fmla v3.4s, v16.4s, v5.s[1] + ldp q16, q17, [x17, #-32] + fmla v0.4s, v6.4s, v4.s[2] + fmla v2.4s, v6.4s, v5.s[2] + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v7.4s, v5.s[2] + fmla v0.4s, v17.4s, v4.s[3] + fmla v2.4s, v17.4s, v5.s[3] + ldp q6, q7, [x17], #128 + prfm pldl1keep, [x4] + fmla v1.4s, v16.4s, v4.s[3] + ldr q4, [x3, #16] + prfm pldl1keep, [x2] + fmla v3.4s, v16.4s, v5.s[3] + ldr q5, [x1, #16] + cmp x16, x19 + b.lt .LBB0_47 +.LBB0_48: // in Loop: Header=BB0_7 Depth=2 + ldp q16, q17, [x13] + fmla v1.4s, v6.4s, v4.s[0] + fmla v0.4s, v7.4s, v4.s[0] + fmla v2.4s, v7.4s, v5.s[0] + fmla v3.4s, v6.4s, v5.s[0] + ldp q7, q6, [x12] + ldr x18, [sp, #232] // 8-byte Folded Reload + mov x16, xzr + mov x17, xzr + mov x1, x29 + fmla v0.4s, v17.4s, 
v4.s[1] + fmla v2.4s, v17.4s, v5.s[1] + add x18, x8, x18 + fmla v1.4s, v16.4s, v4.s[1] + fmla v3.4s, v16.4s, v5.s[1] + ldp q16, q17, [x11] + fmla v0.4s, v6.4s, v4.s[2] + fmla v2.4s, v6.4s, v5.s[2] + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v7.4s, v5.s[2] + fmla v0.4s, v17.4s, v4.s[3] + fmla v2.4s, v17.4s, v5.s[3] + fmla v1.4s, v16.4s, v4.s[3] + fmla v3.4s, v16.4s, v5.s[3] + cmp x29, x20 + b.ge .LBB0_50 + .p2align 2 +.LBB0_49: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x2, [sp, #424] // 8-byte Folded Reload + ldr x6, [sp, #352] // 8-byte Folded Reload + add x4, x18, x17, lsl #3 + add x5, x18, x16 + add x1, x1, #1 + add x16, x16, #32 + add x4, x4, #32 + prfm pldl1keep, [x4] + ldp q4, q5, [x5] + add x2, x2, x17 + add x3, x6, x17 + add x2, x2, #4 + add x3, x3, #4 + prfm pldl1keep, [x3] + ldr s6, [x6, x17] + prfm pldl1keep, [x2] + ldr x2, [sp, #424] // 8-byte Folded Reload + ldr s7, [x2, x17] + add x17, x17, #4 + fmla v0.4s, v5.4s, v6.s[0] + fmla v1.4s, v4.4s, v6.s[0] + fmla v2.4s, v5.4s, v7.s[0] + fmla v3.4s, v4.4s, v7.s[0] + cmp x1, x20 + b.lt .LBB0_49 +.LBB0_50: // in Loop: Header=BB0_7 Depth=2 + stp q1, q0, [x14] + stp q3, q2, [x15] +.LBB0_51: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #528] // 8-byte Folded Reload + ldr x15, [sp, #592] // 8-byte Folded Reload + cmp x15, x14 + b.ge .LBB0_57 +// %bb.52: // in Loop: Header=BB0_7 Depth=2 + ldr x17, [sp, #592] // 8-byte Folded Reload + ldr x15, [sp, #568] // 8-byte Folded Reload + mov x14, xzr + madd x9, x17, x28, x9 + ldr x16, [sp, #664] // 8-byte Folded Reload + ldr x18, [sp, #672] // 8-byte Folded Reload + ldp q4, q3, [x8] + add x9, x9, x15 + ldr x15, [sp, #656] // 8-byte Folded Reload + add x9, x7, x9, lsl #2 + mul x15, x25, x15 + ldp q1, q0, [x9] + madd x15, x18, x16, x15 + madd x15, x17, x22, x15 + lsl x15, x15, #2 + ldr q2, [x26, x15] + ldr x15, [sp, #456] // 8-byte Folded Reload + cmp xzr, x19 + b.ge .LBB0_54 + .p2align 2 +.LBB0_53: // Parent 
Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x17, x10, #32 + fmla v1.4s, v4.4s, v2.s[0] + fmla v0.4s, v3.4s, v2.s[0] + add x16, x10, #96 + prfm pldl1keep, [x17] + ldp q5, q6, [x10, #-96] + add x14, x14, #4 + ldp q4, q3, [x10, #-64] + prfm pldl1keep, [x16] + fmla v0.4s, v6.4s, v2.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldp q5, q6, [x10, #-32] + prfm pldl1keep, [x15] + fmla v0.4s, v3.4s, v2.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v6.4s, v2.s[3] + fmla v1.4s, v5.4s, v2.s[3] + ldur q2, [x15, #-16] + ldp q4, q3, [x10], #128 + add x15, x15, #16 + cmp x14, x19 + b.lt .LBB0_53 +.LBB0_54: // in Loop: Header=BB0_7 Depth=2 + ldp q5, q6, [x13] + fmla v1.4s, v4.4s, v2.s[0] + fmla v0.4s, v3.4s, v2.s[0] + ldp q4, q3, [x12] + mov x10, xzr + mov x14, xzr + fmla v0.4s, v6.4s, v2.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldp q5, q6, [x11] + ldr x11, [sp, #232] // 8-byte Folded Reload + fmla v0.4s, v3.4s, v2.s[2] + fmla v1.4s, v4.4s, v2.s[2] + add x8, x8, x11 + mov x11, x29 + fmla v0.4s, v6.4s, v2.s[3] + fmla v1.4s, v5.4s, v2.s[3] + cmp x29, x20 + b.ge .LBB0_56 + .p2align 2 +.LBB0_55: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x16, [sp, #464] // 8-byte Folded Reload + add x13, x8, x14, lsl #3 + add x15, x8, x10 + add x11, x11, #1 + add x10, x10, #32 + add x13, x13, #32 + prfm pldl1keep, [x13] + add x12, x16, x14 + ldp q2, q3, [x15] + add x12, x12, #4 + prfm pldl1keep, [x12] + ldr s4, [x16, x14] + add x14, x14, #4 + fmla v0.4s, v3.4s, v4.s[0] + fmla v1.4s, v2.4s, v4.s[0] + cmp x11, x20 + b.lt .LBB0_55 +.LBB0_56: // in Loop: Header=BB0_7 Depth=2 + stp q1, q0, [x9] +.LBB0_57: // in Loop: Header=BB0_7 Depth=2 + bl free + ldp x8, x10, [sp, #440] // 16-byte Folded Reload + ldr x16, [sp, #560] // 8-byte Folded Reload + cmp x10, x8 + b.ge .LBB0_34 +.LBB0_58: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #248] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, 
#520] // 8-byte Folded Reload + ldr x9, [sp, #512] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldr x12, [sp, #672] // 8-byte Folded Reload + ldr x7, [sp, #584] // 8-byte Folded Reload + mul x8, x25, x8 + ldr x13, [sp, #576] // 8-byte Folded Reload + ldr x15, [sp, #560] // 8-byte Folded Reload + ldp x6, x5, [sp, #368] // 16-byte Folded Reload + madd x9, x12, x9, x8 + ldr x8, [sp, #448] // 8-byte Folded Reload + add x8, x9, x8 + lsl x12, x8, #2 + ldr q0, [x7, x12] + add x12, x8, x28 + lsl x12, x12, #2 + ldr q1, [x7, x12] + add x12, x8, x13 + lsl x12, x12, #2 + ldr q2, [x7, x12] + add x12, x13, x28 + add x8, x8, x12 + lsl x8, x8, #2 + ldr q3, [x7, x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + cmp xzr, x20 + b.ge .LBB0_60 + .p2align 2 +.LBB0_59: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x12, [sp, #392] // 8-byte Folded Reload + add x13, x15, x10 + prfm pldl1keep, [x13] + ldur s4, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s5, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s6, [x13, #-4] + add x13, x13, x21 + add x12, x12, x10 + prfm pldl1keep, [x13] + ldur s7, [x13, #-4] + add x10, x10, #4 + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x12, x12, x27 + prfm pldl1keep, [x12] + sub x13, x12, #4 + add x12, x12, x27 + prfm pldl1keep, [x12] + sub x14, x12, #4 + add x12, x12, x27 + ld1 { v16.s }[1], [x13] + prfm pldl1keep, [x12] + sub x12, x12, #4 + ld1 { v16.s }[2], [x14] + ld1 { v16.s }[3], [x12] + str q16, [x8, x11, lsl #4] + add x11, x11, #1 + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v16.4s, v5.s[0] + fmla v2.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v7.s[0] + cmp x11, x20 + b.lt .LBB0_59 +.LBB0_60: // %.preheader38 + // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #208] // 8-byte Folded Reload + ldr x12, [sp, #552] // 8-byte Folded Reload + mov x1, xzr + add x10, x8, #48 + ldr x13, [sp, #544] // 8-byte Folded Reload + mov w15, #1 // =0x1 + mov 
w16, #2 // =0x2 + mov w17, #3 // =0x3 + mov w14, #4 // =0x4 + add x11, x8, x11 + b .LBB0_62 + .p2align 2 +.LBB0_61: // %.loopexit34 + // in Loop: Header=BB0_62 Depth=3 + add x13, x13, x23 + add x12, x12, x23 + mov x1, x14 + mov x14, x18 +.LBB0_62: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_64 Depth 4 + // Child Loop BB0_66 Depth 4 + madd x18, x1, x28, x9 + ldr x4, [sp, #448] // 8-byte Folded Reload + add x18, x18, x4 + madd x15, x15, x28, x9 + madd x16, x16, x28, x9 + madd x17, x17, x28, x9 + add x15, x15, x4 + add x16, x16, x4 + lsl x18, x18, #2 + lsl x15, x15, #2 + lsl x16, x16, #2 + str q0, [x7, x18] + str q1, [x7, x15] + add x15, x17, x4 + str q2, [x7, x16] + lsl x15, x15, #2 + str q3, [x7, x15] + ldr x15, [sp, #680] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_67 +// %bb.63: // in Loop: Header=BB0_62 Depth=3 + madd x2, x14, x28, x9 + add x15, x14, #1 + add x17, x14, #3 + add x16, x14, #2 + madd x3, x16, x28, x9 + ldr q16, [x8] + mov x1, xzr + add x18, x14, #4 + add x2, x2, x4 + lsl x2, x2, #2 + add x3, x3, x4 + lsl x3, x3, #2 + ldr q0, [x7, x2] + madd x2, x15, x28, x9 + add x2, x2, x4 + ldr q2, [x7, x3] + ldr x3, [sp, #664] // 8-byte Folded Reload + lsl x2, x2, #2 + ldr q1, [x7, x2] + madd x2, x17, x28, x9 + add x2, x2, x4 + ldr x4, [sp, #672] // 8-byte Folded Reload + lsl x2, x2, #2 + ldr q3, [x7, x2] + ldr x2, [sp, #656] // 8-byte Folded Reload + mul x2, x25, x2 + madd x2, x4, x3, x2 + madd x3, x14, x22, x2 + lsl x3, x3, #2 + ldr q7, [x26, x3] + madd x3, x15, x22, x2 + lsl x3, x3, #2 + ldr q6, [x26, x3] + madd x3, x16, x22, x2 + madd x2, x17, x22, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q5, [x26, x3] + ldr q4, [x26, x2] + mov x2, x10 + mov x3, x13 + cmp xzr, x19 + b.ge .LBB0_65 + .p2align 2 +.LBB0_64: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_62 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x2, #32 + fmla v0.4s, v16.4s, v7.s[0] 
+ fmla v1.4s, v16.4s, v6.s[0] + add x1, x1, #4 + fmla v2.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, v4.s[0] + prfm pldl1keep, [x4] + add x4, x3, x21 + ldp q16, q17, [x2, #-32] + fmla v0.4s, v16.4s, v7.s[1] + fmla v1.4s, v16.4s, v6.s[1] + fmla v2.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + fmla v0.4s, v17.4s, v7.s[2] + fmla v1.4s, v17.4s, v6.s[2] + fmla v2.4s, v17.4s, v5.s[2] + fmla v3.4s, v17.4s, v4.s[2] + ldp q17, q16, [x2], #64 + prfm pldl1keep, [x3] + fmla v0.4s, v17.4s, v7.s[3] + ldur q7, [x3, #-16] + prfm pldl1keep, [x4] + fmla v1.4s, v17.4s, v6.s[3] + ldur q6, [x4, #-16] + add x4, x4, x21 + fmla v2.4s, v17.4s, v5.s[3] + fmla v3.4s, v17.4s, v4.s[3] + add x3, x3, #16 + prfm pldl1keep, [x4] + ldur q5, [x4, #-16] + add x4, x4, x21 + prfm pldl1keep, [x4] + ldur q4, [x4, #-16] + cmp x1, x19 + b.lt .LBB0_64 +.LBB0_65: // in Loop: Header=BB0_62 Depth=3 + ldr x1, [sp, #648] // 8-byte Folded Reload + fmla v0.4s, v16.4s, v7.s[0] + fmla v1.4s, v16.4s, v6.s[0] + mov x2, x12 + fmla v2.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, v4.s[0] + mov x3, x29 + ldr q17, [x8, x1, lsl #4] + ldr x1, [sp, #640] // 8-byte Folded Reload + fmla v0.4s, v17.4s, v7.s[1] + ldr q16, [x8, x1, lsl #4] + ldr x1, [sp, #632] // 8-byte Folded Reload + fmla v1.4s, v17.4s, v6.s[1] + fmla v2.4s, v17.4s, v5.s[1] + fmla v3.4s, v17.4s, v4.s[1] + ldr q18, [x8, x1, lsl #4] + mov x1, x11 + fmla v0.4s, v16.4s, v7.s[2] + fmla v1.4s, v16.4s, v6.s[2] + fmla v2.4s, v16.4s, v5.s[2] + fmla v3.4s, v16.4s, v4.s[2] + fmla v0.4s, v18.4s, v7.s[3] + fmla v1.4s, v18.4s, v6.s[3] + fmla v2.4s, v18.4s, v5.s[3] + fmla v3.4s, v18.4s, v4.s[3] + cmp x29, x20 + b.ge .LBB0_61 + .p2align 2 +.LBB0_66: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_62 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x2, x21 + prfm pldl1keep, [x1] + ldur q4, [x1, #-16] + add x3, x3, #1 + prfm pldl1keep, [x2] + ldur s5, [x2, #-4] + add x2, x2, #4 + add x1, x1, #16 + prfm pldl1keep, [x4] + ldur s6, [x4, 
#-4] + add x4, x4, x21 + fmla v0.4s, v4.4s, v5.s[0] + prfm pldl1keep, [x4] + ldur s7, [x4, #-4] + add x4, x4, x21 + prfm pldl1keep, [x4] + ldur s16, [x4, #-4] + fmla v1.4s, v4.4s, v6.s[0] + fmla v2.4s, v4.4s, v7.s[0] + fmla v3.4s, v4.4s, v16.s[0] + cmp x3, x20 + b.lt .LBB0_66 + b .LBB0_61 + .p2align 2 +.LBB0_67: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #680] // 8-byte Folded Reload + ldr x13, [sp, #592] // 8-byte Folded Reload + cmp x12, x13 + b.ge .LBB0_73 +// %bb.68: // in Loop: Header=BB0_7 Depth=2 + ldr x13, [sp, #656] // 8-byte Folded Reload + ldr x15, [sp, #664] // 8-byte Folded Reload + mov x14, xzr + mul x13, x25, x13 + ldr x18, [sp, #672] // 8-byte Folded Reload + ldr x16, [sp, #680] // 8-byte Folded Reload + madd x12, x16, x28, x9 + ldr x17, [sp, #448] // 8-byte Folded Reload + ldr q4, [x8] + madd x15, x18, x15, x13 + madd x13, x16, x22, x15 + add x16, x16, #1 + madd x15, x16, x22, x15 + add x12, x12, x17 + add x12, x7, x12, lsl #2 + lsl x13, x13, #2 + ldr q2, [x26, x13] + madd x13, x16, x28, x9 + lsl x15, x15, #2 + ldr q0, [x12] + ldr x16, [sp, #536] // 8-byte Folded Reload + ldr q3, [x26, x15] + mov x15, x10 + add x13, x13, x17 + add x13, x7, x13, lsl #2 + ldr q1, [x13] + cmp xzr, x19 + b.ge .LBB0_70 + .p2align 2 +.LBB0_69: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x3, x15, #32 + ldr x17, [sp, #608] // 8-byte Folded Reload + ldr x1, [sp, #600] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v2.s[0] + prfm pldl1keep, [x3] + fmla v1.4s, v4.4s, v3.s[0] + ldp q4, q5, [x15, #-32] + add x14, x14, #4 + add x17, x17, x16 + add x1, x1, x16 + add x16, x16, #16 + add x18, x17, #32 + add x2, x1, #32 + fmla v0.4s, v4.4s, v2.s[1] + fmla v1.4s, v4.4s, v3.s[1] + fmla v0.4s, v5.4s, v2.s[2] + fmla v1.4s, v5.4s, v3.s[2] + ldp q5, q4, [x15], #64 + prfm pldl1keep, [x2] + fmla v0.4s, v5.4s, v2.s[3] + ldr q2, [x1, #16] + prfm pldl1keep, [x18] + fmla v1.4s, v5.4s, v3.s[3] + ldr q3, [x17, #16] + cmp x14, x19 + 
b.lt .LBB0_69 +.LBB0_70: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #648] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v2.s[0] + fmla v1.4s, v4.4s, v3.s[0] + mov x15, x29 + ldr q5, [x8, x14, lsl #4] + ldr x14, [sp, #640] // 8-byte Folded Reload + fmla v0.4s, v5.4s, v2.s[1] + ldr q4, [x8, x14, lsl #4] + ldr x14, [sp, #632] // 8-byte Folded Reload + fmla v1.4s, v5.4s, v3.s[1] + ldr q5, [x8, x14, lsl #4] + ldr x14, [sp, #536] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v2.s[2] + fmla v1.4s, v4.4s, v3.s[2] + fmla v0.4s, v5.4s, v2.s[3] + fmla v1.4s, v5.4s, v3.s[3] + cmp x29, x20 + b.ge .LBB0_72 + .p2align 2 +.LBB0_71: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x16, x5, x14 + add x17, x6, x14 + prfm pldl1keep, [x11] + ldur q2, [x11, #-16] + add x16, x16, #4 + add x17, x17, #4 + add x15, x15, #1 + add x11, x11, #16 + prfm pldl1keep, [x17] + ldr s3, [x6, x14] + prfm pldl1keep, [x16] + ldr s4, [x5, x14] + add x14, x14, #4 + fmla v0.4s, v2.4s, v3.s[0] + fmla v1.4s, v2.4s, v4.s[0] + cmp x15, x20 + b.lt .LBB0_71 +.LBB0_72: // in Loop: Header=BB0_7 Depth=2 + str q0, [x12] + str q1, [x13] +.LBB0_73: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #528] // 8-byte Folded Reload + ldr x12, [sp, #592] // 8-byte Folded Reload + cmp x12, x11 + b.ge .LBB0_79 +// %bb.74: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #592] // 8-byte Folded Reload + ldr x12, [sp, #448] // 8-byte Folded Reload + mov x11, xzr + madd x9, x14, x28, x9 + ldr x13, [sp, #664] // 8-byte Folded Reload + ldr x15, [sp, #672] // 8-byte Folded Reload + ldr q2, [x8] + add x9, x9, x12 + ldr x12, [sp, #656] // 8-byte Folded Reload + add x9, x7, x9, lsl #2 + ldr q0, [x9] + mul x12, x25, x12 + madd x12, x15, x13, x12 + madd x12, x14, x22, x12 + lsl x12, x12, #2 + ldr q1, [x26, x12] + ldp x12, x14, [sp, #456] // 16-byte Folded Reload + cmp xzr, x19 + b.ge .LBB0_76 + .p2align 2 +.LBB0_75: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // 
=> This Inner Loop Header: Depth=3 + add x13, x10, #32 + fmla v0.4s, v2.4s, v1.s[0] + add x11, x11, #4 + prfm pldl1keep, [x13] + ldp q2, q3, [x10, #-32] + fmla v0.4s, v2.4s, v1.s[1] + fmla v0.4s, v3.4s, v1.s[2] + ldp q3, q2, [x10], #64 + prfm pldl1keep, [x12] + fmla v0.4s, v3.4s, v1.s[3] + ldur q1, [x12, #-16] + add x12, x12, #16 + cmp x11, x19 + b.lt .LBB0_75 +.LBB0_76: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #648] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[0] + ldr x12, [sp, #416] // 8-byte Folded Reload + mov x10, xzr + ldr q3, [x8, x11, lsl #4] + ldr x11, [sp, #640] // 8-byte Folded Reload + fmla v0.4s, v3.4s, v1.s[1] + ldr q2, [x8, x11, lsl #4] + ldr x11, [sp, #632] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[2] + ldr q3, [x8, x11, lsl #4] + ldr x11, [sp, #184] // 8-byte Folded Reload + add x8, x8, x11 + mov w11, #16 // =0x10 + fmla v0.4s, v3.4s, v1.s[3] + add x13, x29, xzr + cmp x13, x20 + b.ge .LBB0_78 + .p2align 2 +.LBB0_77: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x8, x11 + add x11, x11, #16 + prfm pldl1keep, [x13] + ldr q1, [x8, x10, lsl #4] + prfm pldl1keep, [x12] + add x12, x12, #4 + ldr s2, [x14, x10, lsl #2] + add x10, x10, #1 + fmla v0.4s, v1.4s, v2.s[0] + add x13, x29, x10 + cmp x13, x20 + b.lt .LBB0_77 +.LBB0_78: // in Loop: Header=BB0_7 Depth=2 + str q0, [x9] +.LBB0_79: // in Loop: Header=BB0_7 Depth=2 + bl free + ldp x9, x8, [sp, #432] // 16-byte Folded Reload + ldr x16, [sp, #560] // 8-byte Folded Reload + cmp x8, x9 + b.ge .LBB0_35 +.LBB0_80: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #240] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #520] // 8-byte Folded Reload + ldr x9, [sp, #512] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldr x12, [sp, #672] // 8-byte Folded Reload + ldr x5, [sp, #584] // 8-byte Folded Reload + mul x8, x25, x8 + ldr x13, [sp, #576] // 8-byte Folded Reload + ldr x17, [sp, #560] // 8-byte 
Folded Reload + madd x9, x12, x9, x8 + ldr x8, [sp, #440] // 8-byte Folded Reload + add x8, x9, x8 + lsl x12, x8, #2 + ldr d0, [x5, x12] + add x12, x8, x28 + lsl x12, x12, #2 + ldr d1, [x5, x12] + add x12, x8, x13 + lsl x12, x12, #2 + ldr d2, [x5, x12] + add x12, x13, x28 + add x8, x8, x12 + lsl x8, x8, #2 + ldr d3, [x5, x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + cmp xzr, x20 + b.ge .LBB0_82 + .p2align 2 +.LBB0_81: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldp x12, x16, [sp, #400] // 16-byte Folded Reload + add x15, x17, x10 + prfm pldl1keep, [x15] + ldur s4, [x15, #-4] + add x15, x15, x21 + prfm pldl1keep, [x15] + ldur s5, [x15, #-4] + add x15, x15, x21 + prfm pldl1keep, [x15] + ldur s6, [x15, #-4] + add x15, x15, x21 + add x12, x12, x10 + add x14, x16, x10 + prfm pldl1keep, [x15] + ldur s7, [x15, #-4] + add x13, x12, #4 + add x14, x14, #4 + prfm pldl1keep, [x14] + prfm pldl1keep, [x13] + ldr s16, [x16, x10] + add x10, x10, #4 + ld1 { v16.s }[1], [x12] + str d16, [x8, x11, lsl #3] + add x11, x11, #1 + fmla v0.2s, v16.2s, v4.s[0] + fmla v1.2s, v16.2s, v5.s[0] + fmla v2.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v7.s[0] + cmp x11, x20 + b.lt .LBB0_81 +.LBB0_82: // %.preheader37 + // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #200] // 8-byte Folded Reload + ldr x12, [sp, #552] // 8-byte Folded Reload + mov x1, xzr + add x10, x8, #24 + ldr x13, [sp, #544] // 8-byte Folded Reload + mov w15, #1 // =0x1 + mov w16, #2 // =0x2 + mov w17, #3 // =0x3 + mov w14, #4 // =0x4 + add x11, x8, x11 + b .LBB0_84 + .p2align 2 +.LBB0_83: // %.loopexit33 + // in Loop: Header=BB0_84 Depth=3 + add x13, x13, x23 + add x12, x12, x23 + mov x1, x14 + mov x14, x18 +.LBB0_84: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_86 Depth 4 + // Child Loop BB0_88 Depth 4 + madd x18, x1, x28, x9 + ldr x4, [sp, #440] // 8-byte Folded Reload + add x18, x18, x4 + 
madd x15, x15, x28, x9 + madd x16, x16, x28, x9 + madd x17, x17, x28, x9 + add x15, x15, x4 + add x16, x16, x4 + lsl x18, x18, #2 + lsl x15, x15, #2 + lsl x16, x16, #2 + str d0, [x5, x18] + str d1, [x5, x15] + add x15, x17, x4 + str d2, [x5, x16] + lsl x15, x15, #2 + str d3, [x5, x15] + ldr x15, [sp, #680] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_89 +// %bb.85: // in Loop: Header=BB0_84 Depth=3 + madd x2, x14, x28, x9 + add x15, x14, #1 + add x17, x14, #3 + add x16, x14, #2 + madd x3, x16, x28, x9 + ldr d16, [x8] + mov x1, xzr + add x18, x14, #4 + add x2, x2, x4 + lsl x2, x2, #2 + add x3, x3, x4 + lsl x3, x3, #2 + ldr d0, [x5, x2] + madd x2, x15, x28, x9 + add x2, x2, x4 + ldr d2, [x5, x3] + ldr x3, [sp, #664] // 8-byte Folded Reload + lsl x2, x2, #2 + ldr d1, [x5, x2] + madd x2, x17, x28, x9 + add x2, x2, x4 + ldr x4, [sp, #672] // 8-byte Folded Reload + lsl x2, x2, #2 + ldr d3, [x5, x2] + ldr x2, [sp, #656] // 8-byte Folded Reload + mul x2, x25, x2 + madd x2, x4, x3, x2 + madd x3, x14, x22, x2 + lsl x3, x3, #2 + ldr q7, [x26, x3] + madd x3, x15, x22, x2 + lsl x3, x3, #2 + ldr q6, [x26, x3] + madd x3, x16, x22, x2 + madd x2, x17, x22, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q5, [x26, x3] + ldr q4, [x26, x2] + mov x2, x10 + mov x3, x13 + cmp xzr, x19 + b.ge .LBB0_87 + .p2align 2 +.LBB0_86: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_84 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x2, #16 + fmla v0.2s, v16.2s, v7.s[0] + fmla v1.2s, v16.2s, v6.s[0] + add x1, x1, #4 + fmla v2.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + prfm pldl1keep, [x4] + add x4, x3, x21 + ldp d16, d17, [x2, #-16] + fmla v0.2s, v16.2s, v7.s[1] + fmla v1.2s, v16.2s, v6.s[1] + fmla v2.2s, v16.2s, v5.s[1] + fmla v3.2s, v16.2s, v4.s[1] + fmla v0.2s, v17.2s, v7.s[2] + fmla v1.2s, v17.2s, v6.s[2] + fmla v2.2s, v17.2s, v5.s[2] + fmla v3.2s, v17.2s, v4.s[2] + ldp d17, d16, [x2], #32 + prfm pldl1keep, [x3] + fmla v0.2s, v17.2s, v7.s[3] 
+ ldur q7, [x3, #-16] + prfm pldl1keep, [x4] + fmla v1.2s, v17.2s, v6.s[3] + ldur q6, [x4, #-16] + add x4, x4, x21 + fmla v2.2s, v17.2s, v5.s[3] + fmla v3.2s, v17.2s, v4.s[3] + add x3, x3, #16 + prfm pldl1keep, [x4] + ldur q5, [x4, #-16] + add x4, x4, x21 + prfm pldl1keep, [x4] + ldur q4, [x4, #-16] + cmp x1, x19 + b.lt .LBB0_86 +.LBB0_87: // in Loop: Header=BB0_84 Depth=3 + ldr x1, [sp, #648] // 8-byte Folded Reload + fmla v0.2s, v16.2s, v7.s[0] + fmla v1.2s, v16.2s, v6.s[0] + mov x2, x12 + fmla v2.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + mov x3, x29 + ldr d17, [x8, x1, lsl #3] + ldr x1, [sp, #640] // 8-byte Folded Reload + fmla v0.2s, v17.2s, v7.s[1] + ldr d16, [x8, x1, lsl #3] + ldr x1, [sp, #632] // 8-byte Folded Reload + fmla v1.2s, v17.2s, v6.s[1] + fmla v2.2s, v17.2s, v5.s[1] + fmla v3.2s, v17.2s, v4.s[1] + ldr d18, [x8, x1, lsl #3] + mov x1, x11 + fmla v0.2s, v16.2s, v7.s[2] + fmla v1.2s, v16.2s, v6.s[2] + fmla v2.2s, v16.2s, v5.s[2] + fmla v3.2s, v16.2s, v4.s[2] + fmla v0.2s, v18.2s, v7.s[3] + fmla v1.2s, v18.2s, v6.s[3] + fmla v2.2s, v18.2s, v5.s[3] + fmla v3.2s, v18.2s, v4.s[3] + cmp x29, x20 + b.ge .LBB0_83 + .p2align 2 +.LBB0_88: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_84 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x2, x21 + prfm pldl1keep, [x1] + ldur d4, [x1, #-8] + add x3, x3, #1 + prfm pldl1keep, [x2] + ldur s5, [x2, #-4] + add x2, x2, #4 + add x1, x1, #8 + prfm pldl1keep, [x4] + ldur s6, [x4, #-4] + add x4, x4, x21 + fmla v0.2s, v4.2s, v5.s[0] + prfm pldl1keep, [x4] + ldur s7, [x4, #-4] + add x4, x4, x21 + prfm pldl1keep, [x4] + ldur s16, [x4, #-4] + fmla v1.2s, v4.2s, v6.s[0] + fmla v2.2s, v4.2s, v7.s[0] + fmla v3.2s, v4.2s, v16.s[0] + cmp x3, x20 + b.lt .LBB0_88 + b .LBB0_83 + .p2align 2 +.LBB0_89: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #680] // 8-byte Folded Reload + ldr x12, [sp, #592] // 8-byte Folded Reload + cmp x11, x12 + b.ge .LBB0_95 +// %bb.90: // in Loop: 
Header=BB0_7 Depth=2 + ldr x12, [sp, #656] // 8-byte Folded Reload + ldr x14, [sp, #664] // 8-byte Folded Reload + mov x13, xzr + mul x12, x25, x12 + ldr x17, [sp, #672] // 8-byte Folded Reload + ldr x15, [sp, #680] // 8-byte Folded Reload + madd x11, x15, x28, x9 + ldr x16, [sp, #440] // 8-byte Folded Reload + ldr d4, [x8] + madd x14, x17, x14, x12 + madd x12, x15, x22, x14 + add x15, x15, #1 + madd x14, x15, x22, x14 + add x11, x11, x16 + add x11, x5, x11, lsl #2 + lsl x12, x12, #2 + ldr q2, [x26, x12] + madd x12, x15, x28, x9 + lsl x14, x14, #2 + ldr d0, [x11] + ldr x15, [sp, #536] // 8-byte Folded Reload + ldr q3, [x26, x14] + mov x14, x10 + add x12, x12, x16 + add x12, x5, x12, lsl #2 + ldr d1, [x12] + cmp xzr, x19 + b.ge .LBB0_92 + .p2align 2 +.LBB0_91: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x2, x14, #16 + ldr x16, [sp, #608] // 8-byte Folded Reload + ldr x18, [sp, #600] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v2.s[0] + prfm pldl1keep, [x2] + fmla v1.2s, v4.2s, v3.s[0] + ldp d4, d5, [x14, #-16] + add x13, x13, #4 + add x16, x16, x15 + add x18, x18, x15 + add x15, x15, #16 + add x17, x16, #32 + add x1, x18, #32 + fmla v0.2s, v4.2s, v2.s[1] + fmla v1.2s, v4.2s, v3.s[1] + fmla v0.2s, v5.2s, v2.s[2] + fmla v1.2s, v5.2s, v3.s[2] + ldp d5, d4, [x14], #32 + prfm pldl1keep, [x1] + fmla v0.2s, v5.2s, v2.s[3] + ldr q2, [x18, #16] + prfm pldl1keep, [x17] + fmla v1.2s, v5.2s, v3.s[3] + ldr q3, [x16, #16] + cmp x13, x19 + b.lt .LBB0_91 +.LBB0_92: // in Loop: Header=BB0_7 Depth=2 + ldr x15, [sp, #648] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v2.s[0] + fmla v1.2s, v4.2s, v3.s[0] + ldr x1, [sp, #352] // 8-byte Folded Reload + ldr x2, [sp, #424] // 8-byte Folded Reload + mov x13, xzr + mov x14, xzr + ldr d5, [x8, x15, lsl #3] + ldr x15, [sp, #640] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v2.s[1] + ldr d4, [x8, x15, lsl #3] + ldr x15, [sp, #632] // 8-byte Folded Reload + fmla v1.2s, v5.2s, v3.s[1] 
+ ldr d5, [x8, x15, lsl #3] + ldr x15, [sp, #224] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v2.s[2] + fmla v1.2s, v4.2s, v3.s[2] + add x15, x8, x15 + fmla v0.2s, v5.2s, v2.s[3] + fmla v1.2s, v5.2s, v3.s[3] + add x16, x29, xzr + cmp x16, x20 + b.ge .LBB0_94 + .p2align 2 +.LBB0_93: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x18, x15, x14, lsl #3 + add x16, x2, x13 + add x17, x1, x13 + add x13, x13, #4 + add x17, x17, #4 + add x16, x16, #4 + add x18, x18, #8 + prfm pldl1keep, [x18] + ldr d2, [x15, x14, lsl #3] + prfm pldl1keep, [x17] + ldr s3, [x1, x14, lsl #2] + prfm pldl1keep, [x16] + fmla v0.2s, v2.2s, v3.s[0] + ldr s4, [x2, x14, lsl #2] + fmla v1.2s, v2.2s, v4.s[0] + add x14, x14, #1 + add x16, x29, x14 + cmp x16, x20 + b.lt .LBB0_93 +.LBB0_94: // in Loop: Header=BB0_7 Depth=2 + str d0, [x11] + str d1, [x12] +.LBB0_95: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #528] // 8-byte Folded Reload + ldr x12, [sp, #592] // 8-byte Folded Reload + cmp x12, x11 + b.ge .LBB0_101 +// %bb.96: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #592] // 8-byte Folded Reload + ldr x12, [sp, #440] // 8-byte Folded Reload + mov x11, xzr + madd x9, x14, x28, x9 + ldr x13, [sp, #664] // 8-byte Folded Reload + ldr x15, [sp, #672] // 8-byte Folded Reload + ldr d1, [x8] + add x9, x9, x12 + ldr x12, [sp, #656] // 8-byte Folded Reload + add x9, x5, x9, lsl #2 + ldr d0, [x9] + mul x12, x25, x12 + madd x12, x15, x13, x12 + madd x12, x14, x22, x12 + lsl x12, x12, #2 + ldr q2, [x26, x12] + ldp x12, x14, [sp, #456] // 16-byte Folded Reload + cmp xzr, x19 + b.ge .LBB0_98 + .p2align 2 +.LBB0_97: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x10, #16 + fmla v0.2s, v1.2s, v2.s[0] + add x11, x11, #4 + prfm pldl1keep, [x13] + ldp d1, d3, [x10, #-16] + fmla v0.2s, v1.2s, v2.s[1] + fmla v0.2s, v3.2s, v2.s[2] + ldp d3, d1, [x10], #32 + prfm pldl1keep, [x12] + fmla v0.2s, 
v3.2s, v2.s[3] + ldur q2, [x12, #-16] + add x12, x12, #16 + cmp x11, x19 + b.lt .LBB0_97 +.LBB0_98: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #648] // 8-byte Folded Reload + fmla v0.2s, v1.2s, v2.s[0] + mov x10, xzr + ldr d3, [x8, x11, lsl #3] + ldr x11, [sp, #640] // 8-byte Folded Reload + fmla v0.2s, v3.2s, v2.s[1] + ldr d4, [x8, x11, lsl #3] + ldr x11, [sp, #632] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v2.s[2] + ldr d1, [x8, x11, lsl #3] + ldr x11, [sp, #224] // 8-byte Folded Reload + add x8, x8, x11 + ldr x11, [sp, #416] // 8-byte Folded Reload + fmla v0.2s, v1.2s, v2.s[3] + add x12, x29, xzr + cmp x12, x20 + b.ge .LBB0_100 + .p2align 2 +.LBB0_99: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x12, x8, x10, lsl #3 + add x12, x12, #8 + prfm pldl1keep, [x12] + ldr d1, [x8, x10, lsl #3] + prfm pldl1keep, [x11] + add x11, x11, #4 + ldr s2, [x14, x10, lsl #2] + add x10, x10, #1 + fmla v0.2s, v1.2s, v2.s[0] + add x12, x29, x10 + cmp x12, x20 + b.lt .LBB0_99 +.LBB0_100: // in Loop: Header=BB0_7 Depth=2 + str d0, [x9] +.LBB0_101: // in Loop: Header=BB0_7 Depth=2 + bl free + ldr x9, [sp, #432] // 8-byte Folded Reload + ldr x16, [sp, #560] // 8-byte Folded Reload + ldr x8, [sp, #296] // 8-byte Folded Reload + cmp x9, x8 + b.ge .LBB0_6 +.LBB0_102: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #272] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #520] // 8-byte Folded Reload + ldr x9, [sp, #512] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldr x12, [sp, #672] // 8-byte Folded Reload + ldr x13, [sp, #576] // 8-byte Folded Reload + mul x8, x25, x8 + ldr x6, [sp, #584] // 8-byte Folded Reload + ldr x14, [sp, #560] // 8-byte Folded Reload + madd x9, x12, x9, x8 + ldr x8, [sp, #432] // 8-byte Folded Reload + add x12, x9, x8 + add x8, x13, x28 + add x8, x12, x8 + add x13, x12, x13 + ldr s1, [x6, x12, lsl #2] + add x12, x12, x28 + ldr s0, [x6, x8, lsl #2] + ldr s2, [x6, x13, 
lsl #2] + ldr s3, [x6, x12, lsl #2] + ldr x12, [sp, #328] // 8-byte Folded Reload + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + cmp xzr, x20 + b.ge .LBB0_104 + .p2align 2 +.LBB0_103: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x14, x10 + add x11, x11, #1 + prfm pldl1keep, [x13] + ldur s4, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s5, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s6, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s7, [x13, #-4] + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x12, x12, #4 + fmla v1.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + fmla v2.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + str s16, [x8, x10] + add x10, x10, #4 + cmp x11, x20 + b.lt .LBB0_103 +.LBB0_104: // %.preheader36 + // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #192] // 8-byte Folded Reload + ldr x12, [sp, #552] // 8-byte Folded Reload + mov x1, xzr + add x10, x8, #12 + ldr x13, [sp, #544] // 8-byte Folded Reload + mov w16, #1 // =0x1 + mov w17, #2 // =0x2 + mov w15, #3 // =0x3 + mov w14, #4 // =0x4 + add x11, x8, x11 + b .LBB0_106 + .p2align 2 +.LBB0_105: // %.loopexit32 + // in Loop: Header=BB0_106 Depth=3 + add x13, x13, x23 + add x12, x12, x23 + mov x1, x14 + mov x14, x18 +.LBB0_106: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_108 Depth 4 + // Child Loop BB0_110 Depth 4 + madd x18, x1, x28, x9 + ldr x5, [sp, #432] // 8-byte Folded Reload + add x18, x18, x5 + madd x16, x16, x28, x9 + madd x17, x17, x28, x9 + madd x15, x15, x28, x9 + add x16, x16, x5 + add x15, x15, x5 + str s1, [x6, x18, lsl #2] + str s3, [x6, x16, lsl #2] + add x16, x17, x5 + str s2, [x6, x16, lsl #2] + str s0, [x6, x15, lsl #2] + ldr x15, [sp, #680] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_111 +// %bb.107: // in Loop: Header=BB0_106 Depth=3 + madd x2, x14, x28, x9 + add x15, x14, #3 + 
add x16, x14, #1 + add x17, x14, #2 + madd x3, x16, x28, x9 + ldr s16, [x8] + mov x1, xzr + add x18, x14, #4 + madd x4, x17, x28, x9 + add x2, x2, x5 + ldr s1, [x6, x2, lsl #2] + madd x2, x15, x28, x9 + add x4, x4, x5 + ldr s2, [x6, x4, lsl #2] + ldr x4, [sp, #672] // 8-byte Folded Reload + add x2, x2, x5 + ldr s0, [x6, x2, lsl #2] + add x2, x3, x5 + ldr x3, [sp, #664] // 8-byte Folded Reload + ldr s3, [x6, x2, lsl #2] + ldr x2, [sp, #656] // 8-byte Folded Reload + mul x2, x25, x2 + madd x2, x4, x3, x2 + madd x3, x14, x22, x2 + lsl x3, x3, #2 + ldr q7, [x26, x3] + madd x3, x16, x22, x2 + lsl x3, x3, #2 + ldr q6, [x26, x3] + madd x3, x17, x22, x2 + madd x2, x15, x22, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q5, [x26, x3] + ldr q4, [x26, x2] + mov x2, x10 + mov x3, x13 + ext v20.16b, v7.16b, v7.16b, #8 + cmp xzr, x19 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.ge .LBB0_109 + .p2align 2 +.LBB0_108: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_106 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x2, #8 + fmla v1.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + add x1, x1, #4 + fmla v2.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + prfm pldl1keep, [x4] + add x4, x3, x21 + ldp s16, s21, [x2, #-8] + fmla v0.2s, v16.2s, v4.s[1] + fmla v1.2s, v16.2s, v7.s[1] + fmla v3.2s, v16.2s, v6.s[1] + fmla v2.2s, v16.2s, v5.s[1] + fmla v0.2s, v21.2s, v17.2s + fmla v1.2s, v21.2s, v20.2s + ldp s17, s16, [x2], #16 + fmla v3.2s, v21.2s, v19.2s + fmla v2.2s, v21.2s, v18.2s + prfm pldl1keep, [x3] + fmla v1.2s, v17.2s, v7.s[3] + ldur q7, [x3, #-16] + prfm pldl1keep, [x4] + fmla v3.2s, v17.2s, v6.s[3] + ldur q6, [x4, #-16] + add x4, x4, x21 + fmla v2.2s, v17.2s, v5.s[3] + fmla v0.2s, v17.2s, v4.s[3] + add x3, x3, #16 + prfm pldl1keep, [x4] + ldur q5, [x4, #-16] + add x4, x4, x21 + prfm pldl1keep, [x4] + ldur q4, [x4, #-16] + ext v20.16b, v7.16b, v7.16b, #8 + cmp x1, x19 + ext v19.16b, 
v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.lt .LBB0_108 +.LBB0_109: // in Loop: Header=BB0_106 Depth=3 + ldr x1, [sp, #648] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + mov x2, x12 + fmla v2.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + mov x3, x29 + ldr s21, [x8, x1, lsl #2] + ldr x1, [sp, #640] // 8-byte Folded Reload + fmla v1.2s, v21.2s, v7.s[1] + ldr s16, [x8, x1, lsl #2] + ldr x1, [sp, #632] // 8-byte Folded Reload + fmla v3.2s, v21.2s, v6.s[1] + fmla v2.2s, v21.2s, v5.s[1] + fmla v0.2s, v21.2s, v4.s[1] + ldr s22, [x8, x1, lsl #2] + mov x1, x11 + fmla v1.2s, v16.2s, v20.2s + fmla v3.2s, v16.2s, v19.2s + fmla v2.2s, v16.2s, v18.2s + fmla v0.2s, v16.2s, v17.2s + fmla v1.2s, v22.2s, v7.s[3] + fmla v3.2s, v22.2s, v6.s[3] + fmla v2.2s, v22.2s, v5.s[3] + fmla v0.2s, v22.2s, v4.s[3] + cmp x29, x20 + b.ge .LBB0_105 + .p2align 2 +.LBB0_110: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_106 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x2, x21 + prfm pldl1keep, [x1] + ldur s4, [x1, #-4] + add x3, x3, #1 + prfm pldl1keep, [x2] + ldur s5, [x2, #-4] + add x2, x2, #4 + add x1, x1, #4 + prfm pldl1keep, [x4] + ldur s6, [x4, #-4] + add x4, x4, x21 + fmla v1.2s, v4.2s, v5.2s + prfm pldl1keep, [x4] + ldur s7, [x4, #-4] + add x4, x4, x21 + prfm pldl1keep, [x4] + ldur s16, [x4, #-4] + fmla v3.2s, v4.2s, v6.2s + fmla v2.2s, v4.2s, v7.2s + fmla v0.2s, v4.2s, v16.2s + cmp x3, x20 + b.lt .LBB0_110 + b .LBB0_105 + .p2align 2 +.LBB0_111: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #680] // 8-byte Folded Reload + ldr x12, [sp, #592] // 8-byte Folded Reload + cmp x11, x12 + b.ge .LBB0_117 +// %bb.112: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #656] // 8-byte Folded Reload + ldr x15, [sp, #664] // 8-byte Folded Reload + mov x13, xzr + mov x14, xzr + ldr x18, [sp, #672] // 8-byte Folded Reload + ldr x16, [sp, #680] // 8-byte Folded Reload + mul 
x12, x25, x12 + madd x11, x16, x28, x9 + ldr x17, [sp, #432] // 8-byte Folded Reload + ldr s4, [x8] + madd x12, x18, x15, x12 + madd x15, x16, x22, x12 + add x11, x11, x17 + ldr s1, [x6, x11, lsl #2] + lsl x15, x15, #2 + ldr q2, [x26, x15] + add x15, x16, #1 + madd x16, x15, x22, x12 + madd x12, x15, x28, x9 + add x12, x12, x17 + lsl x15, x16, #2 + ldr s0, [x6, x12, lsl #2] + ldr q3, [x26, x15] + ext v6.16b, v2.16b, v2.16b, #8 + cmp xzr, x19 + ext v5.16b, v3.16b, v3.16b, #8 + b.ge .LBB0_114 + .p2align 2 +.LBB0_113: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x1, x8, x13 + ldp x15, x17, [sp, #336] // 16-byte Folded Reload + fmla v1.2s, v4.2s, v2.2s + add x2, x1, #20 + fmla v0.2s, v4.2s, v3.2s + add x14, x14, #4 + prfm pldl1keep, [x2] + ldp s4, s7, [x1, #4] + add x15, x15, x13 + add x17, x17, x13 + add x13, x13, #16 + add x16, x15, #32 + add x18, x17, #32 + fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v7.2s, v5.2s + ldp s5, s4, [x1, #12] + fmla v1.2s, v7.2s, v6.2s + prfm pldl1keep, [x18] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x17, #16] + prfm pldl1keep, [x16] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x15, #16] + ext v6.16b, v2.16b, v2.16b, #8 + cmp x14, x19 + ext v5.16b, v3.16b, v3.16b, #8 + b.lt .LBB0_113 +.LBB0_114: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #648] // 8-byte Folded Reload + fmla v1.2s, v4.2s, v2.2s + fmla v0.2s, v4.2s, v3.2s + ldr x1, [sp, #352] // 8-byte Folded Reload + ldr x2, [sp, #424] // 8-byte Folded Reload + mov x13, xzr + mov x15, x29 + ldr s7, [x8, x14, lsl #2] + ldr x14, [sp, #640] // 8-byte Folded Reload + fmla v1.2s, v7.2s, v2.s[1] + ldr s4, [x8, x14, lsl #2] + ldr x14, [sp, #632] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v3.s[1] + ldr s7, [x8, x14, lsl #2] + ldr x14, [sp, #264] // 8-byte Folded Reload + fmla v1.2s, v4.2s, v6.2s + fmla v0.2s, v4.2s, v5.2s + add x14, x8, x14 + fmla v1.2s, v7.2s, v2.s[3] + fmla v0.2s, v7.2s, v3.s[3] + cmp x29, 
x20 + b.ge .LBB0_116 + .p2align 2 +.LBB0_115: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x16, x2, x13 + add x17, x1, x13 + add x18, x14, x13 + add x15, x15, #1 + add x16, x16, #4 + add x17, x17, #4 + add x18, x18, #4 + prfm pldl1keep, [x18] + ldr s2, [x14, x13] + prfm pldl1keep, [x17] + prfm pldl1keep, [x16] + ldr s3, [x1, x13] + fmla v1.2s, v2.2s, v3.2s + ldr s3, [x2, x13] + add x13, x13, #4 + fmla v0.2s, v2.2s, v3.2s + cmp x15, x20 + b.lt .LBB0_115 +.LBB0_116: // in Loop: Header=BB0_7 Depth=2 + str s1, [x6, x11, lsl #2] + str s0, [x6, x12, lsl #2] +.LBB0_117: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #528] // 8-byte Folded Reload + ldr x12, [sp, #592] // 8-byte Folded Reload + cmp x12, x11 + b.ge .LBB0_5 +// %bb.118: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #592] // 8-byte Folded Reload + ldr x12, [sp, #432] // 8-byte Folded Reload + mov x11, xzr + madd x9, x14, x28, x9 + ldr x13, [sp, #664] // 8-byte Folded Reload + ldr x15, [sp, #672] // 8-byte Folded Reload + ldr s2, [x8] + add x9, x9, x12 + ldr x12, [sp, #656] // 8-byte Folded Reload + ldr s0, [x6, x9, lsl #2] + mul x12, x25, x12 + madd x12, x15, x13, x12 + madd x12, x14, x22, x12 + lsl x12, x12, #2 + ldr q1, [x26, x12] + ldp x12, x14, [sp, #456] // 16-byte Folded Reload + ext v3.16b, v1.16b, v1.16b, #8 + cmp xzr, x19 + b.ge .LBB0_120 + .p2align 2 +.LBB0_119: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x10, #8 + fmla v0.2s, v2.2s, v1.2s + add x11, x11, #4 + prfm pldl1keep, [x13] + ldp s2, s4, [x10, #-8] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v4.2s, v3.2s + ldp s3, s2, [x10], #16 + prfm pldl1keep, [x12] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x12, #-16] + add x12, x12, #16 + ext v3.16b, v1.16b, v1.16b, #8 + cmp x11, x19 + b.lt .LBB0_119 +.LBB0_120: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #648] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.2s + mov x10, 
xzr + ldr s4, [x8, x11, lsl #2] + ldr x11, [sp, #640] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[1] + ldr s5, [x8, x11, lsl #2] + ldr x11, [sp, #632] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v3.2s + ldr s2, [x8, x11, lsl #2] + ldr x11, [sp, #264] // 8-byte Folded Reload + add x8, x8, x11 + mov x11, x29 + fmla v0.2s, v2.2s, v1.s[3] + cmp x29, x20 + b.ge .LBB0_4 + .p2align 2 +.LBB0_121: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x12, x14, x10 + add x13, x8, x10 + add x11, x11, #1 + add x12, x12, #4 + add x13, x13, #4 + prfm pldl1keep, [x13] + ldr s1, [x8, x10] + prfm pldl1keep, [x12] + ldr s2, [x14, x10] + add x10, x10, #4 + fmla v0.2s, v1.2s, v2.2s + cmp x11, x20 + b.lt .LBB0_121 + b .LBB0_4 +.LBB0_122: + ldr x0, [sp, #8] // 8-byte Folded Reload + bl free + add sp, sp, #688 + ldp d9, d8, [sp, #48] // 16-byte Folded Reload + ldp d11, d10, [sp, #32] // 16-byte Folded Reload + ldp d13, d12, [sp, #16] // 16-byte Folded Reload + ldp x20, x19, [sp, #144] // 16-byte Folded Reload + ldp x22, x21, [sp, #128] // 16-byte Folded Reload + ldp x24, x23, [sp, #112] // 16-byte Folded Reload + ldp x26, x25, [sp, #96] // 16-byte Folded Reload + ldp x28, x27, [sp, #80] // 16-byte Folded Reload + ldp x29, x30, [sp, #64] // 16-byte Folded Reload + ldp d15, d14, [sp], #160 // 16-byte Folded Reload + ret +.Lfunc_end0: + .size sbatch_matmul_4d_nt_mlir, .Lfunc_end0-sbatch_matmul_4d_nt_mlir + .cfi_endproc + // -- End function + .section ".note.GNU-stack","",@progbits diff --git a/third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemm_nn_alpha1_beta1_mlir.s b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemm_nn_alpha1_beta1_mlir.s new file mode 100644 index 00000000000000..efa5087d8c2dfe --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemm_nn_alpha1_beta1_mlir.s @@ -0,0 +1,4104 @@ + .text + .file "LLVMDialectModule" + .globl sgemm_nn_alpha1_beta1_mlir // -- Begin function 
sgemm_nn_alpha1_beta1_mlir + .p2align 4 + .type sgemm_nn_alpha1_beta1_mlir,@function +sgemm_nn_alpha1_beta1_mlir: // @sgemm_nn_alpha1_beta1_mlir + .cfi_startproc +// %bb.0: + str d12, [sp, #-144]! // 8-byte Folded Spill + stp d11, d10, [sp, #16] // 16-byte Folded Spill + stp x29, x30, [sp, #48] // 16-byte Folded Spill + stp x28, x27, [sp, #64] // 16-byte Folded Spill + stp x26, x25, [sp, #80] // 16-byte Folded Spill + stp x24, x23, [sp, #96] // 16-byte Folded Spill + stp x22, x21, [sp, #112] // 16-byte Folded Spill + stp x20, x19, [sp, #128] // 16-byte Folded Spill + stp d9, d8, [sp, #32] // 16-byte Folded Spill + sub sp, sp, #512 + .cfi_def_cfa_offset 656 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -144 + cmp x3, #0 + ldr x29, [sp, #688] + ldr x20, [sp, #656] + mov x22, x5 + cinv x8, x3, lt + ldr x26, [sp, #664] + ldr x27, [sp, #744] + mov x19, x4 + add x10, x8, x8, lsr #63 + add x9, x8, #7 + str x2, [sp, #320] // 8-byte Folded Spill + mov x25, x1 + str x3, [sp, #288] // 8-byte Folded Spill + asr x10, x10, #1 + cinv x23, x10, lt + cmp x8, #0 + add x10, x8, #3 + csel x9, x9, x8, lt + csel x8, x10, x8, lt + cmp x3, #0 + asr x9, x9, #3 + asr x8, x8, #2 + cinv x24, x9, lt + ldr x9, [sp, #680] + cinv x21, x8, lt + cmp x9, #0 + str x9, [sp, #128] // 8-byte Folded Spill + cinv x10, x9, lt + add x8, x10, #7 + cmp x10, #0 + str x10, [sp, #112] // 8-byte Folded Spill + csel x8, x8, x10, lt + cmp x9, #0 + ldr x10, [sp, #712] + ldr x9, [sp, #720] + asr x8, x8, #3 + cinv x8, x8, lt + str x8, [sp, #16] // 8-byte Folded Spill + lsl x8, x8, #3 + str x8, [sp, #384] // 8-byte Folded Spill + lsl x8, x4, #5 + stp x9, x10, [sp, #256] // 16-byte 
Folded Spill + add x0, x8, #64 + str x8, [sp, #504] // 8-byte Folded Spill + bl malloc + lsl x8, x24, #3 + mul x3, x24, x22 + mov w9, #1 // =0x1 + add x12, x0, #63 + str x8, [sp, #520] // 8-byte Folded Spill + lsl x8, x21, #2 + bfi x9, x21, #2, #62 + and x11, x19, #0x3 + str x8, [sp, #336] // 8-byte Folded Spill + lsl x8, x23, #1 + mul x9, x22, x9 + str x0, [sp, #56] // 8-byte Folded Spill + str x8, [sp, #312] // 8-byte Folded Spill + negs x8, x19 + mul x0, x23, x22 + mul x2, x21, x22 + and x13, x8, #0x3 + and x8, x12, #0xffffffffffffffc0 + ldr x12, [sp, #320] // 8-byte Folded Reload + lsl x5, x19, #2 + csneg x18, x11, x13, mi + add x11, x19, x3, lsl #3 + add x16, x19, x0, lsl #1 + lsl x28, x22, #2 + lsl x6, x18, #2 + add x10, x19, x22, lsl #3 + lsl x4, x22, #5 + mov w15, #28 // =0x1c + sub x23, x11, x18 + add x11, x19, x9 + add x9, x25, x9, lsl #2 + sub x13, x4, x28 + lsl x24, x12, #2 + sub x11, x11, x18 + add x14, x19, x2, lsl #2 + sub x12, x5, x6 + str x9, [sp, #168] // 8-byte Folded Spill + add x9, x24, x11, lsl #2 + lsl x21, x26, #2 + add x17, x12, #4 + madd x1, x29, x15, x20 + sub x15, x16, x18 + add x16, x24, x25 + stp x13, x12, [sp, #64] // 16-byte Folded Spill + sub x12, x10, x18 + sub x10, x14, x18 + madd x14, x29, x17, x21 + add x16, x13, x16 + add x9, x25, x9 + add x16, x16, #16 + str x9, [sp, #480] // 8-byte Folded Spill + add x9, x24, x10, lsl #2 + str x16, [sp, #240] // 8-byte Folded Spill + mov w16, #16 // =0x10 + sub x30, x19, x18 + sub x13, x16, x13 + add x9, x25, x9 + mul x17, x29, x30 + str x13, [sp, #232] // 8-byte Folded Spill + add x13, x20, x14 + stp x13, x9, [sp, #464] // 16-byte Folded Spill + add x9, x24, x15, lsl #2 + ldr x13, [sp, #504] // 8-byte Folded Reload + add x14, x5, x24 + sub x14, x14, x6 + add x16, x21, x17, lsl #2 + add x14, x14, x25 + add x9, x9, x25 + sub x10, x30, #3 + stp x17, x18, [sp, #96] // 16-byte Folded Spill + add x11, x20, x29, lsl #5 + add x9, x9, #4 + str x26, [sp, #280] // 8-byte Folded Spill + lsl x26, x29, #4 
+ stp x20, x29, [sp, #296] // 16-byte Folded Spill + str x9, [sp, #152] // 8-byte Folded Spill + sub x9, x30, #2 + sub x17, x13, x18, lsl #5 + add x13, x14, #4 + add x14, x24, x12, lsl #2 + add x12, x4, x24 + stp x10, x9, [sp, #400] // 16-byte Folded Spill + sub x9, x30, #1 + str x13, [sp, #224] // 8-byte Folded Spill + add x13, x20, x16 + add x12, x12, x25 + str x9, [sp, #416] // 8-byte Folded Spill + mov w9, #20 // =0x14 + str x13, [sp, #456] // 8-byte Folded Spill + add x13, x12, #32 + add x12, x14, x25 + madd x9, x29, x9, x20 + add x12, x12, #4 + stp x12, x13, [sp, #208] // 16-byte Folded Spill + add x12, x24, x3, lsl #5 + add x13, x24, x23, lsl #2 + stp x6, x14, [sp, #80] // 16-byte Folded Spill + mov w10, #24 // =0x18 + lsl x18, x29, #2 + add x12, x12, x25 + madd x6, x29, x10, x20 + add x10, x20, x18 + str x25, [sp, #328] // 8-byte Folded Spill + stp x9, x11, [sp, #440] // 16-byte Folded Spill + add x11, x20, x29, lsl #3 + add x14, x12, #32 + add x12, x13, x25 + add x12, x12, #4 + add x9, x20, x26 + mov x7, xzr + str x5, [sp, #120] // 8-byte Folded Spill + str x11, [sp, #432] // 8-byte Folded Spill + add x11, x8, #64 + stp x12, x14, [sp, #176] // 16-byte Folded Spill + add x12, x17, #32 + str x11, [sp, #192] // 8-byte Folded Spill + mov w11, #12 // =0xc + add x13, x8, x17 + sub x23, x30, #4 + madd x11, x29, x11, x20 + mov x20, x1 + mov x1, x9 + add x9, x8, x12 + stp x9, x4, [sp, #488] // 16-byte Folded Spill + add x9, x24, x0, lsl #3 + add x5, x8, #128 + stp x2, x0, [sp, #40] // 16-byte Folded Spill + mov x0, x10 + stp x9, x3, [sp, #24] // 16-byte Folded Spill + str x12, [sp, #160] // 8-byte Folded Spill + add x9, x25, x9 + str x30, [sp, #504] // 8-byte Folded Spill + str x24, [sp, #344] // 8-byte Folded Spill + str x9, [sp, #144] // 8-byte Folded Spill + add x9, x9, #32 + str x11, [sp, #424] // 8-byte Folded Spill + str x18, [sp, #272] // 8-byte Folded Spill + str x9, [sp, #136] // 8-byte Folded Spill + add x9, x25, x2, lsl #4 + ldr x25, [sp, #384] // 8-byte 
Folded Reload + str x13, [sp, #200] // 8-byte Folded Spill + str x9, [sp, #248] // 8-byte Folded Spill + b .LBB0_3 + .p2align 2 +.LBB0_1: // in Loop: Header=BB0_3 Depth=1 + stp q1, q0, [x10] +.LBB0_2: // %.backedge + // in Loop: Header=BB0_3 Depth=1 + ldp x9, x11, [sp, #440] // 16-byte Folded Reload + ldr x20, [sp, #376] // 8-byte Folded Reload + add x6, x6, #32 + add x1, x1, #32 + ldp x7, x0, [sp, #352] // 16-byte Folded Reload + ldr x30, [sp, #504] // 8-byte Folded Reload + add x10, x11, #32 + add x20, x20, #32 + add x0, x0, #32 + add x9, x9, #32 + stp x9, x10, [sp, #440] // 16-byte Folded Spill + ldp x9, x11, [sp, #424] // 16-byte Folded Reload + add x10, x11, #32 + add x9, x9, #32 + stp x9, x10, [sp, #424] // 16-byte Folded Spill + ldp x9, x11, [sp, #456] // 16-byte Folded Reload + add x10, x11, #32 + add x9, x9, #32 + stp x9, x10, [sp, #456] // 16-byte Folded Spill +.LBB0_3: // =>This Loop Header: Depth=1 + // Child Loop BB0_5 Depth 2 + // Child Loop BB0_7 Depth 2 + // Child Loop BB0_10 Depth 2 + // Child Loop BB0_12 Depth 3 + // Child Loop BB0_14 Depth 3 + // Child Loop BB0_19 Depth 2 + // Child Loop BB0_21 Depth 2 + // Child Loop BB0_24 Depth 2 + // Child Loop BB0_26 Depth 2 + // Child Loop BB0_29 Depth 2 + // Child Loop BB0_31 Depth 2 + cmp x7, x25 + b.ge .LBB0_32 +// %bb.4: // in Loop: Header=BB0_3 Depth=1 + add x10, x7, #8 + add x12, x7, x27, lsl #1 + ldr x17, [sp, #192] // 8-byte Folded Reload + mov x9, xzr + str x10, [sp, #352] // 8-byte Folded Spill + ldp x11, x10, [sp, #256] // 16-byte Folded Reload + stp x0, x1, [sp, #360] // 16-byte Folded Spill + ldp x14, x3, [sp, #232] // 16-byte Folded Reload + ldp x4, x24, [sp, #440] // 16-byte Folded Reload + str x6, [sp, #392] // 8-byte Folded Spill + str x20, [sp, #376] // 8-byte Folded Spill + add x2, x10, x11, lsl #2 + add x11, x27, x7 + lsl x10, x7, #2 + add x11, x2, x11, lsl #2 + add x15, x2, x12, lsl #2 + add x12, x12, x27 + add x13, x2, x10 + add x12, x2, x12, lsl #2 + ldp q1, q0, [x13] + ldp q3, q2, 
[x11] + add x11, x7, x27, lsl #2 + ldp q6, q5, [x15] + ldp x15, x18, [sp, #424] // 16-byte Folded Reload + ldp q7, q4, [x12] + add x12, x2, x11, lsl #2 + add x11, x11, x27 + add x11, x2, x11, lsl #2 + ldp q17, q16, [x12] + ldp x13, x12, [sp, #320] // 16-byte Folded Reload + ldp q19, q18, [x11] + mov w11, #6 // =0x6 + madd x11, x27, x11, x7 + add x16, x12, x13, lsl #2 + lsl x12, x22, #3 + add x11, x2, x11, lsl #2 + ldr q25, [x16, x12] + ldr x12, [sp, #280] // 8-byte Folded Reload + ldr q26, [x16, x28] + ldr q28, [x16, x22, lsl #4] + ldr q30, [x16] + ldp q21, q20, [x11] + mov w11, #12 // =0xc + mul x11, x22, x11 + ldr q27, [x16, x11] + sub x11, x7, x27 + add x11, x11, x27, lsl #3 + add x11, x2, x11, lsl #2 + ldp q23, q22, [x11] + ldr x11, [sp, #296] // 8-byte Folded Reload + add x11, x11, x12, lsl #2 + mov w12, #20 // =0x14 + mul x12, x22, x12 + add x10, x11, x10 + ldp q8, q9, [x10] + ldr q24, [x16, x12] + mov w12, #24 // =0x18 + mul x12, x22, x12 + ldr q29, [x16, x12] + prfm pldl1keep, [x3] + ldur q31, [x3, #-16] + cmp xzr, x23 + b.ge .LBB0_6 + .p2align 2 +.LBB0_5: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x4, x21 + add x29, x0, x21 + fmla v1.4s, v8.4s, v30.s[0] + fmla v0.4s, v9.4s, v30.s[0] + fmla v3.4s, v8.4s, v26.s[0] + fmla v2.4s, v9.4s, v26.s[0] + stp q8, q9, [x17, #-64] + fmla v6.4s, v8.4s, v25.s[0] + fmla v5.4s, v9.4s, v25.s[0] + prfm pldl1keep, [x12] + add x25, x6, x21 + fmla v7.4s, v8.4s, v27.s[0] + fmla v4.4s, v9.4s, v27.s[0] + add x12, x18, x21 + add x13, x20, x21 + fmla v16.4s, v9.4s, v28.s[0] + fmla v17.4s, v8.4s, v28.s[0] + add x10, x24, x21 + add x9, x9, #4 + fmla v18.4s, v9.4s, v24.s[0] + fmla v19.4s, v8.4s, v24.s[0] + add x24, x24, x26 + add x20, x20, x26 + fmla v20.4s, v9.4s, v29.s[0] + fmla v21.4s, v8.4s, v29.s[0] + add x6, x6, x26 + add x4, x4, x26 + fmla v22.4s, v9.4s, v31.s[0] + fmla v23.4s, v8.4s, v31.s[0] + ldp q8, q9, [x29] + fmla v0.4s, v9.4s, v30.s[1] + fmla v1.4s, v8.4s, v30.s[1] + stp q8, q9, [x17, 
#-32] + prfm pldl1keep, [x25] + fmla v2.4s, v9.4s, v26.s[1] + fmla v3.4s, v8.4s, v26.s[1] + add x0, x0, x26 + fmla v5.4s, v9.4s, v25.s[1] + fmla v6.4s, v8.4s, v25.s[1] + add x18, x18, x26 + fmla v4.4s, v9.4s, v27.s[1] + fmla v7.4s, v8.4s, v27.s[1] + fmla v17.4s, v8.4s, v28.s[1] + fmla v16.4s, v9.4s, v28.s[1] + fmla v19.4s, v8.4s, v24.s[1] + fmla v18.4s, v9.4s, v24.s[1] + fmla v21.4s, v8.4s, v29.s[1] + fmla v20.4s, v9.4s, v29.s[1] + fmla v23.4s, v8.4s, v31.s[1] + fmla v22.4s, v9.4s, v31.s[1] + ldp q9, q8, [x12] + add x12, x15, x21 + stp q9, q8, [x17] + prfm pldl1keep, [x13] + add x15, x15, x26 + ldp q11, q10, [x12] + add x12, x1, x21 + add x1, x1, x26 + fmla v1.4s, v9.4s, v30.s[2] + fmla v0.4s, v8.4s, v30.s[2] + stp q11, q10, [x17, #32] + fmla v3.4s, v9.4s, v26.s[2] + fmla v2.4s, v8.4s, v26.s[2] + prfm pldl1keep, [x10] + add x10, x3, x14 + fmla v6.4s, v9.4s, v25.s[2] + fmla v5.4s, v8.4s, v25.s[2] + add x3, x3, #16 + add x17, x17, #128 + fmla v7.4s, v9.4s, v27.s[2] + fmla v4.4s, v8.4s, v27.s[2] + fmla v16.4s, v8.4s, v28.s[2] + fmla v17.4s, v9.4s, v28.s[2] + fmla v18.4s, v8.4s, v24.s[2] + fmla v19.4s, v9.4s, v24.s[2] + fmla v20.4s, v8.4s, v29.s[2] + fmla v21.4s, v9.4s, v29.s[2] + fmla v22.4s, v8.4s, v31.s[2] + fmla v23.4s, v9.4s, v31.s[2] + ldp q8, q9, [x12] + prfm pldl1keep, [x10] + fmla v0.4s, v10.4s, v30.s[3] + fmla v1.4s, v11.4s, v30.s[3] + ldur q30, [x10, #-16] + add x10, x10, x28 + prfm pldl1keep, [x10] + fmla v2.4s, v10.4s, v26.s[3] + fmla v3.4s, v11.4s, v26.s[3] + ldur q26, [x10, #-16] + add x10, x10, x28 + prfm pldl1keep, [x10] + fmla v5.4s, v10.4s, v25.s[3] + fmla v6.4s, v11.4s, v25.s[3] + ldur q25, [x10, #-16] + add x10, x10, x28 + prfm pldl1keep, [x10] + fmla v4.4s, v10.4s, v27.s[3] + fmla v7.4s, v11.4s, v27.s[3] + ldur q27, [x10, #-16] + add x10, x10, x28 + prfm pldl1keep, [x10] + fmla v17.4s, v11.4s, v28.s[3] + fmla v16.4s, v10.4s, v28.s[3] + ldur q28, [x10, #-16] + add x10, x10, x28 + prfm pldl1keep, [x10] + fmla v19.4s, v11.4s, v24.s[3] + fmla v18.4s, 
v10.4s, v24.s[3] + ldur q24, [x10, #-16] + add x10, x10, x28 + prfm pldl1keep, [x10] + fmla v21.4s, v11.4s, v29.s[3] + fmla v20.4s, v10.4s, v29.s[3] + ldur q29, [x10, #-16] + fmla v23.4s, v11.4s, v31.s[3] + fmla v22.4s, v10.4s, v31.s[3] + prfm pldl1keep, [x3] + ldur q31, [x3, #-16] + cmp x9, x23 + b.lt .LBB0_5 +.LBB0_6: // in Loop: Header=BB0_3 Depth=1 + ldp x13, x12, [sp, #400] // 16-byte Folded Reload + ldr x14, [sp, #304] // 8-byte Folded Reload + add x10, x8, x23, lsl #5 + fmla v1.4s, v8.4s, v30.s[0] + fmla v0.4s, v9.4s, v30.s[0] + fmla v3.4s, v8.4s, v26.s[0] + fmla v2.4s, v9.4s, v26.s[0] + stp q8, q9, [x10] + fmla v6.4s, v8.4s, v25.s[0] + fmla v5.4s, v9.4s, v25.s[0] + fmla v4.4s, v9.4s, v27.s[0] + fmla v7.4s, v8.4s, v27.s[0] + ldp x1, x18, [sp, #456] // 16-byte Folded Reload + madd x9, x13, x14, x7 + fmla v16.4s, v9.4s, v28.s[0] + fmla v17.4s, v8.4s, v28.s[0] + madd x10, x12, x14, x7 + fmla v18.4s, v9.4s, v24.s[0] + fmla v19.4s, v8.4s, v24.s[0] + add x0, x8, x12, lsl #5 + ldr x12, [sp, #416] // 8-byte Folded Reload + fmla v20.4s, v9.4s, v29.s[0] + fmla v21.4s, v8.4s, v29.s[0] + mov x15, xzr + add x9, x11, x9, lsl #2 + fmla v22.4s, v9.4s, v31.s[0] + fmla v23.4s, v8.4s, v31.s[0] + add x10, x11, x10, lsl #2 + ldp q8, q9, [x9] + add x9, x8, x13, lsl #5 + fmla v0.4s, v9.4s, v30.s[1] + fmla v2.4s, v9.4s, v26.s[1] + fmla v5.4s, v9.4s, v25.s[1] + fmla v4.4s, v9.4s, v27.s[1] + fmla v16.4s, v9.4s, v28.s[1] + fmla v18.4s, v9.4s, v24.s[1] + fmla v20.4s, v9.4s, v29.s[1] + fmla v22.4s, v9.4s, v31.s[1] + fmla v1.4s, v8.4s, v30.s[1] + stp q8, q9, [x9] + fmla v3.4s, v8.4s, v26.s[1] + fmla v6.4s, v8.4s, v25.s[1] + fmla v7.4s, v8.4s, v27.s[1] + fmla v17.4s, v8.4s, v28.s[1] + fmla v19.4s, v8.4s, v24.s[1] + fmla v21.4s, v8.4s, v29.s[1] + fmla v23.4s, v8.4s, v31.s[1] + ldp q9, q8, [x10] + madd x10, x12, x14, x7 + ldr x14, [sp, #272] // 8-byte Folded Reload + add x10, x11, x10, lsl #2 + fmla v0.4s, v8.4s, v30.s[2] + fmla v2.4s, v8.4s, v26.s[2] + fmla v5.4s, v8.4s, v25.s[2] + fmla 
v4.4s, v8.4s, v27.s[2] + fmla v16.4s, v8.4s, v28.s[2] + fmla v18.4s, v8.4s, v24.s[2] + fmla v20.4s, v8.4s, v29.s[2] + fmla v22.4s, v8.4s, v31.s[2] + mov x11, x30 + add x30, x8, x12, lsl #5 + stp q9, q8, [x0] + fmla v1.4s, v9.4s, v30.s[2] + fmla v3.4s, v9.4s, v26.s[2] + fmla v6.4s, v9.4s, v25.s[2] + fmla v7.4s, v9.4s, v27.s[2] + fmla v17.4s, v9.4s, v28.s[2] + fmla v19.4s, v9.4s, v24.s[2] + fmla v21.4s, v9.4s, v29.s[2] + fmla v23.4s, v9.4s, v31.s[2] + ldp q8, q9, [x10] + ldr x10, [sp, #224] // 8-byte Folded Reload + stp q8, q9, [x30] + fmla v0.4s, v9.4s, v30.s[3] + fmla v1.4s, v8.4s, v30.s[3] + fmla v2.4s, v9.4s, v26.s[3] + fmla v3.4s, v8.4s, v26.s[3] + fmla v5.4s, v9.4s, v25.s[3] + fmla v6.4s, v8.4s, v25.s[3] + fmla v7.4s, v8.4s, v27.s[3] + fmla v4.4s, v9.4s, v27.s[3] + fmla v17.4s, v8.4s, v28.s[3] + fmla v16.4s, v9.4s, v28.s[3] + fmla v19.4s, v8.4s, v24.s[3] + fmla v18.4s, v9.4s, v24.s[3] + fmla v21.4s, v8.4s, v29.s[3] + fmla v20.4s, v9.4s, v29.s[3] + fmla v23.4s, v8.4s, v31.s[3] + fmla v22.4s, v9.4s, v31.s[3] + cmp x11, x19 + b.ge .LBB0_8 + .p2align 2 +.LBB0_7: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x10, x28 + prfm pldl1keep, [x10] + ldur s24, [x10, #-4] + add x10, x10, #4 + add x13, x12, x28 + prfm pldl1keep, [x12] + ldur s25, [x12, #-4] + add x17, x13, x28 + prfm pldl1keep, [x13] + ldur s26, [x13, #-4] + add x12, x17, x28 + prfm pldl1keep, [x17] + ldur s27, [x17, #-4] + add x13, x12, x28 + prfm pldl1keep, [x12] + ldur s28, [x12, #-4] + add x12, x18, x15 + add x17, x13, x28 + prfm pldl1keep, [x13] + ldur s29, [x13, #-4] + add x13, x1, x15 + add x15, x15, x14 + prfm pldl1keep, [x17] + ldur s30, [x17, #-4] + add x17, x17, x28 + prfm pldl1keep, [x17] + ldur s31, [x17, #-4] + prfm pldl1keep, [x12] + add x12, x8, x11, lsl #5 + add x11, x11, #1 + ldp q8, q9, [x13] + fmla v0.4s, v9.4s, v24.s[0] + fmla v2.4s, v9.4s, v25.s[0] + fmla v5.4s, v9.4s, v26.s[0] + fmla v4.4s, v9.4s, v27.s[0] + fmla v16.4s, v9.4s, v28.s[0] + fmla v18.4s, 
v9.4s, v29.s[0] + fmla v20.4s, v9.4s, v30.s[0] + fmla v1.4s, v8.4s, v24.s[0] + fmla v3.4s, v8.4s, v25.s[0] + fmla v6.4s, v8.4s, v26.s[0] + fmla v7.4s, v8.4s, v27.s[0] + fmla v17.4s, v8.4s, v28.s[0] + fmla v19.4s, v8.4s, v29.s[0] + fmla v21.4s, v8.4s, v30.s[0] + fmla v23.4s, v8.4s, v31.s[0] + fmla v22.4s, v9.4s, v31.s[0] + stp q8, q9, [x12] + cmp x11, x19 + b.lt .LBB0_7 +.LBB0_8: // %.preheader29 + // in Loop: Header=BB0_3 Depth=1 + ldp x18, x13, [sp, #208] // 16-byte Folded Reload + mov x10, xzr + mov w6, #1 // =0x1 + mov w24, #2 // =0x2 + mov w20, #3 // =0x3 + mov w29, #4 // =0x4 + mov w15, #5 // =0x5 + mov w11, #6 // =0x6 + mov w25, #7 // =0x7 + mov w1, #8 // =0x8 + b .LBB0_10 + .p2align 2 +.LBB0_9: // %.loopexit28 + // in Loop: Header=BB0_10 Depth=2 + ldr x10, [sp, #496] // 8-byte Folded Reload + add x13, x13, x10 + add x18, x18, x10 + mov x10, x1 + mov x1, x3 +.LBB0_10: // Parent Loop BB0_3 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_12 Depth 3 + // Child Loop BB0_14 Depth 3 + madd x10, x10, x27, x7 + add x10, x2, x10, lsl #2 + stp q1, q0, [x10] + madd x10, x6, x27, x7 + add x10, x2, x10, lsl #2 + stp q3, q2, [x10] + madd x10, x24, x27, x7 + add x10, x2, x10, lsl #2 + stp q6, q5, [x10] + madd x10, x20, x27, x7 + add x10, x2, x10, lsl #2 + stp q7, q4, [x10] + madd x10, x29, x27, x7 + add x10, x2, x10, lsl #2 + stp q17, q16, [x10] + madd x10, x15, x27, x7 + add x10, x2, x10, lsl #2 + stp q19, q18, [x10] + madd x10, x11, x27, x7 + ldr x11, [sp, #520] // 8-byte Folded Reload + add x10, x2, x10, lsl #2 + cmp x1, x11 + stp q21, q20, [x10] + madd x10, x25, x27, x7 + add x10, x2, x10, lsl #2 + stp q23, q22, [x10] + b.ge .LBB0_15 +// %bb.11: // in Loop: Header=BB0_10 Depth=2 + madd x10, x1, x27, x7 + add x20, x1, #3 + add x29, x1, #4 + add x6, x1, #1 + madd x15, x20, x27, x7 + add x25, x1, #7 + add x24, x1, #2 + mov x4, xzr + madd x11, x6, x27, x7 + ldp q8, q9, [x8] + add x3, x1, #8 + madd x12, x24, x27, x7 + mov x17, x13 + add x10, x2, x10, lsl #2 + 
add x15, x2, x15, lsl #2 + ldp q1, q0, [x10] + madd x10, x29, x27, x7 + add x11, x2, x11, lsl #2 + ldp q7, q4, [x15] + add x15, x1, #5 + add x12, x2, x12, lsl #2 + ldp q3, q2, [x11] + add x11, x1, #6 + add x10, x2, x10, lsl #2 + ldp q6, q5, [x12] + ldp q17, q16, [x10] + madd x10, x15, x27, x7 + add x10, x2, x10, lsl #2 + ldp q19, q18, [x10] + madd x10, x11, x27, x7 + add x10, x2, x10, lsl #2 + ldp q21, q20, [x10] + madd x10, x25, x27, x7 + add x10, x2, x10, lsl #2 + ldp q23, q22, [x10] + mul x10, x1, x22 + lsl x10, x10, #2 + ldr q31, [x16, x10] + mul x10, x6, x22 + lsl x10, x10, #2 + ldr q30, [x16, x10] + mul x10, x24, x22 + lsl x10, x10, #2 + ldr q29, [x16, x10] + mul x10, x20, x22 + lsl x10, x10, #2 + ldr q28, [x16, x10] + mul x10, x29, x22 + lsl x10, x10, #2 + ldr q27, [x16, x10] + mul x10, x15, x22 + lsl x10, x10, #2 + ldr q26, [x16, x10] + mul x10, x11, x22 + lsl x10, x10, #2 + ldr q25, [x16, x10] + mul x10, x25, x22 + lsl x10, x10, #2 + ldr q24, [x16, x10] + mov x10, x5 + cmp xzr, x23 + b.ge .LBB0_13 + .p2align 2 +.LBB0_12: // Parent Loop BB0_3 Depth=1 + // Parent Loop BB0_10 Depth=2 + // => This Inner Loop Header: Depth=3 + add x14, x10, #32 + fmla v1.4s, v8.4s, v31.s[0] + fmla v0.4s, v9.4s, v31.s[0] + add x12, x10, #96 + fmla v2.4s, v9.4s, v30.s[0] + fmla v3.4s, v8.4s, v30.s[0] + prfm pldl1keep, [x14] + add x4, x4, #4 + fmla v5.4s, v9.4s, v29.s[0] + fmla v6.4s, v8.4s, v29.s[0] + fmla v4.4s, v9.4s, v28.s[0] + fmla v7.4s, v8.4s, v28.s[0] + fmla v16.4s, v9.4s, v27.s[0] + fmla v17.4s, v8.4s, v27.s[0] + fmla v18.4s, v9.4s, v26.s[0] + fmla v19.4s, v8.4s, v26.s[0] + fmla v20.4s, v9.4s, v25.s[0] + fmla v21.4s, v8.4s, v25.s[0] + fmla v22.4s, v9.4s, v24.s[0] + fmla v23.4s, v8.4s, v24.s[0] + ldp q8, q9, [x10, #-96] + fmla v0.4s, v9.4s, v31.s[1] + fmla v2.4s, v9.4s, v30.s[1] + fmla v1.4s, v8.4s, v31.s[1] + fmla v3.4s, v8.4s, v30.s[1] + fmla v6.4s, v8.4s, v29.s[1] + fmla v5.4s, v9.4s, v29.s[1] + fmla v7.4s, v8.4s, v28.s[1] + fmla v4.4s, v9.4s, v28.s[1] + fmla v17.4s, 
v8.4s, v27.s[1] + fmla v16.4s, v9.4s, v27.s[1] + fmla v19.4s, v8.4s, v26.s[1] + fmla v18.4s, v9.4s, v26.s[1] + fmla v21.4s, v8.4s, v25.s[1] + fmla v20.4s, v9.4s, v25.s[1] + fmla v23.4s, v8.4s, v24.s[1] + fmla v22.4s, v9.4s, v24.s[1] + ldp q9, q8, [x10, #-64] + prfm pldl1keep, [x12] + ldp q11, q10, [x10, #-32] + add x12, x17, x28 + fmla v1.4s, v9.4s, v31.s[2] + fmla v0.4s, v8.4s, v31.s[2] + fmla v2.4s, v8.4s, v30.s[2] + fmla v3.4s, v9.4s, v30.s[2] + fmla v5.4s, v8.4s, v29.s[2] + fmla v6.4s, v9.4s, v29.s[2] + fmla v4.4s, v8.4s, v28.s[2] + fmla v7.4s, v9.4s, v28.s[2] + fmla v16.4s, v8.4s, v27.s[2] + fmla v17.4s, v9.4s, v27.s[2] + fmla v18.4s, v8.4s, v26.s[2] + fmla v19.4s, v9.4s, v26.s[2] + fmla v20.4s, v8.4s, v25.s[2] + fmla v21.4s, v9.4s, v25.s[2] + fmla v22.4s, v8.4s, v24.s[2] + fmla v23.4s, v9.4s, v24.s[2] + ldp q8, q9, [x10], #128 + prfm pldl1keep, [x17] + fmla v0.4s, v10.4s, v31.s[3] + fmla v1.4s, v11.4s, v31.s[3] + ldur q31, [x17, #-16] + prfm pldl1keep, [x12] + add x17, x17, #16 + fmla v3.4s, v11.4s, v30.s[3] + fmla v2.4s, v10.4s, v30.s[3] + ldur q30, [x12, #-16] + add x12, x12, x28 + prfm pldl1keep, [x12] + fmla v6.4s, v11.4s, v29.s[3] + fmla v5.4s, v10.4s, v29.s[3] + ldur q29, [x12, #-16] + add x12, x12, x28 + prfm pldl1keep, [x12] + fmla v7.4s, v11.4s, v28.s[3] + fmla v4.4s, v10.4s, v28.s[3] + ldur q28, [x12, #-16] + add x12, x12, x28 + prfm pldl1keep, [x12] + fmla v17.4s, v11.4s, v27.s[3] + fmla v16.4s, v10.4s, v27.s[3] + ldur q27, [x12, #-16] + add x12, x12, x28 + prfm pldl1keep, [x12] + fmla v19.4s, v11.4s, v26.s[3] + fmla v18.4s, v10.4s, v26.s[3] + ldur q26, [x12, #-16] + add x12, x12, x28 + prfm pldl1keep, [x12] + fmla v21.4s, v11.4s, v25.s[3] + fmla v20.4s, v10.4s, v25.s[3] + ldur q25, [x12, #-16] + add x12, x12, x28 + prfm pldl1keep, [x12] + fmla v23.4s, v11.4s, v24.s[3] + fmla v22.4s, v10.4s, v24.s[3] + ldur q24, [x12, #-16] + cmp x4, x23 + b.lt .LBB0_12 +.LBB0_13: // in Loop: Header=BB0_10 Depth=2 + ldp q11, q10, [x9] + fmla v0.4s, v9.4s, v31.s[0] 
+ fmla v1.4s, v8.4s, v31.s[0] + fmla v2.4s, v9.4s, v30.s[0] + fmla v3.4s, v8.4s, v30.s[0] + ldr x17, [sp, #488] // 8-byte Folded Reload + ldr x4, [sp, #504] // 8-byte Folded Reload + fmla v5.4s, v9.4s, v29.s[0] + fmla v6.4s, v8.4s, v29.s[0] + mov x10, x18 + fmla v4.4s, v9.4s, v28.s[0] + fmla v7.4s, v8.4s, v28.s[0] + fmla v16.4s, v9.4s, v27.s[0] + fmla v17.4s, v8.4s, v27.s[0] + fmla v18.4s, v9.4s, v26.s[0] + fmla v19.4s, v8.4s, v26.s[0] + fmla v20.4s, v9.4s, v25.s[0] + fmla v21.4s, v8.4s, v25.s[0] + fmla v22.4s, v9.4s, v24.s[0] + ldp q9, q12, [x0] + fmla v23.4s, v8.4s, v24.s[0] + fmla v1.4s, v11.4s, v31.s[1] + fmla v0.4s, v10.4s, v31.s[1] + fmla v3.4s, v11.4s, v30.s[1] + fmla v2.4s, v10.4s, v30.s[1] + fmla v6.4s, v11.4s, v29.s[1] + fmla v5.4s, v10.4s, v29.s[1] + fmla v7.4s, v11.4s, v28.s[1] + fmla v4.4s, v10.4s, v28.s[1] + fmla v17.4s, v11.4s, v27.s[1] + fmla v16.4s, v10.4s, v27.s[1] + fmla v19.4s, v11.4s, v26.s[1] + fmla v18.4s, v10.4s, v26.s[1] + fmla v21.4s, v11.4s, v25.s[1] + fmla v20.4s, v10.4s, v25.s[1] + fmla v23.4s, v11.4s, v24.s[1] + fmla v22.4s, v10.4s, v24.s[1] + fmla v0.4s, v12.4s, v31.s[2] + ldp q10, q8, [x30] + fmla v1.4s, v9.4s, v31.s[2] + fmla v2.4s, v12.4s, v30.s[2] + fmla v3.4s, v9.4s, v30.s[2] + fmla v5.4s, v12.4s, v29.s[2] + fmla v6.4s, v9.4s, v29.s[2] + fmla v4.4s, v12.4s, v28.s[2] + fmla v7.4s, v9.4s, v28.s[2] + fmla v16.4s, v12.4s, v27.s[2] + fmla v17.4s, v9.4s, v27.s[2] + fmla v18.4s, v12.4s, v26.s[2] + fmla v19.4s, v9.4s, v26.s[2] + fmla v20.4s, v12.4s, v25.s[2] + fmla v21.4s, v9.4s, v25.s[2] + fmla v22.4s, v12.4s, v24.s[2] + fmla v23.4s, v9.4s, v24.s[2] + fmla v1.4s, v10.4s, v31.s[3] + fmla v0.4s, v8.4s, v31.s[3] + fmla v3.4s, v10.4s, v30.s[3] + fmla v2.4s, v8.4s, v30.s[3] + fmla v6.4s, v10.4s, v29.s[3] + fmla v5.4s, v8.4s, v29.s[3] + fmla v7.4s, v10.4s, v28.s[3] + fmla v4.4s, v8.4s, v28.s[3] + fmla v17.4s, v10.4s, v27.s[3] + fmla v16.4s, v8.4s, v27.s[3] + fmla v19.4s, v10.4s, v26.s[3] + fmla v18.4s, v8.4s, v26.s[3] + fmla v21.4s, v10.4s, 
v25.s[3] + fmla v20.4s, v8.4s, v25.s[3] + fmla v23.4s, v10.4s, v24.s[3] + fmla v22.4s, v8.4s, v24.s[3] + cmp x4, x19 + b.ge .LBB0_9 + .p2align 2 +.LBB0_14: // Parent Loop BB0_3 Depth=1 + // Parent Loop BB0_10 Depth=2 + // => This Inner Loop Header: Depth=3 + add x12, x10, x28 + prfm pldl1keep, [x10] + ldur s24, [x10, #-4] + add x4, x4, #1 + prfm pldl1keep, [x12] + ldur s25, [x12, #-4] + add x12, x12, x28 + add x10, x10, #4 + prfm pldl1keep, [x12] + ldur s26, [x12, #-4] + add x12, x12, x28 + prfm pldl1keep, [x12] + ldur s27, [x12, #-4] + add x12, x12, x28 + prfm pldl1keep, [x12] + ldur s28, [x12, #-4] + add x12, x12, x28 + prfm pldl1keep, [x12] + ldur s29, [x12, #-4] + add x12, x12, x28 + prfm pldl1keep, [x12] + ldur s30, [x12, #-4] + add x12, x12, x28 + prfm pldl1keep, [x12] + ldur s31, [x12, #-4] + prfm pldl1keep, [x17] + ldp q8, q9, [x17, #-32] + add x17, x17, #32 + fmla v0.4s, v9.4s, v24.s[0] + fmla v2.4s, v9.4s, v25.s[0] + fmla v5.4s, v9.4s, v26.s[0] + fmla v4.4s, v9.4s, v27.s[0] + fmla v16.4s, v9.4s, v28.s[0] + fmla v18.4s, v9.4s, v29.s[0] + fmla v20.4s, v9.4s, v30.s[0] + fmla v1.4s, v8.4s, v24.s[0] + fmla v3.4s, v8.4s, v25.s[0] + fmla v6.4s, v8.4s, v26.s[0] + fmla v7.4s, v8.4s, v27.s[0] + fmla v17.4s, v8.4s, v28.s[0] + fmla v19.4s, v8.4s, v29.s[0] + fmla v21.4s, v8.4s, v30.s[0] + fmla v23.4s, v8.4s, v31.s[0] + fmla v22.4s, v9.4s, v31.s[0] + cmp x4, x19 + b.lt .LBB0_14 + b .LBB0_9 + .p2align 2 +.LBB0_15: // in Loop: Header=BB0_3 Depth=1 + ldp x17, x24, [sp, #336] // 16-byte Folded Reload + ldr x20, [sp, #312] // 8-byte Folded Reload + cmp x11, x17 + ldp x25, x6, [sp, #384] // 16-byte Folded Reload + ldr x29, [sp, #200] // 8-byte Folded Reload + b.lt .LBB0_18 +// %bb.16: // in Loop: Header=BB0_3 Depth=1 + cmp x17, x20 + b.lt .LBB0_23 +.LBB0_17: // in Loop: Header=BB0_3 Depth=1 + ldr x10, [sp, #288] // 8-byte Folded Reload + ldr x1, [sp, #368] // 8-byte Folded Reload + cmp x20, x10 + b.ge .LBB0_2 + b .LBB0_28 + .p2align 2 +.LBB0_18: // in Loop: Header=BB0_3 
Depth=1 + ldr x15, [sp, #520] // 8-byte Folded Reload + ldp q20, q21, [x8] + mov x10, xzr + add x12, x15, #1 + add x13, x15, #2 + mul x11, x15, x27 + add x14, x15, #3 + mul x15, x15, x22 + madd x18, x12, x27, x7 + mul x12, x12, x22 + madd x1, x13, x27, x7 + lsl x3, x15, #2 + add x11, x11, x7 + lsl x12, x12, #2 + add x15, x2, x18, lsl #2 + madd x18, x14, x27, x7 + add x17, x2, x11, lsl #2 + add x11, x2, x1, lsl #2 + ldr x1, [sp, #184] // 8-byte Folded Reload + ldr q18, [x16, x3] + ldr q19, [x16, x12] + mul x12, x13, x22 + mov x13, x5 + ldp q3, q0, [x17] + ldp q4, q1, [x15] + ldp q5, q2, [x11] + add x18, x2, x18, lsl #2 + lsl x12, x12, #2 + ldr q17, [x16, x12] + mul x12, x14, x22 + ldp q7, q6, [x18] + lsl x12, x12, #2 + ldr q16, [x16, x12] + cmp xzr, x23 + b.ge .LBB0_20 + .p2align 2 +.LBB0_19: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x13, #32 + fmla v3.4s, v20.4s, v18.s[0] + fmla v0.4s, v21.4s, v18.s[0] + add x10, x10, #4 + prfm pldl1keep, [x12] + ldp q22, q23, [x13, #-96] + fmla v1.4s, v21.4s, v19.s[0] + fmla v4.4s, v20.4s, v19.s[0] + fmla v2.4s, v21.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + add x12, x13, #96 + fmla v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q21, q20, [x13, #-64] + prfm pldl1keep, [x12] + add x12, x1, x28 + add x14, x12, x28 + fmla v0.4s, v23.4s, v18.s[1] + fmla v1.4s, v23.4s, v19.s[1] + fmla v2.4s, v23.4s, v17.s[1] + fmla v6.4s, v23.4s, v16.s[1] + fmla v3.4s, v22.4s, v18.s[1] + fmla v4.4s, v22.4s, v19.s[1] + fmla v5.4s, v22.4s, v17.s[1] + fmla v7.4s, v22.4s, v16.s[1] + fmla v0.4s, v20.4s, v18.s[2] + ldp q22, q23, [x13, #-32] + fmla v1.4s, v20.4s, v19.s[2] + fmla v2.4s, v20.4s, v17.s[2] + fmla v6.4s, v20.4s, v16.s[2] + fmla v3.4s, v21.4s, v18.s[2] + fmla v4.4s, v21.4s, v19.s[2] + fmla v5.4s, v21.4s, v17.s[2] + fmla v7.4s, v21.4s, v16.s[2] + ldp q20, q21, [x13], #128 + prfm pldl1keep, [x1] + fmla v0.4s, v23.4s, v18.s[3] + fmla v1.4s, v23.4s, v19.s[3] + fmla v2.4s, v23.4s, v17.s[3] + fmla 
v6.4s, v23.4s, v16.s[3] + fmla v3.4s, v22.4s, v18.s[3] + ldur q18, [x1, #-16] + prfm pldl1keep, [x12] + fmla v4.4s, v22.4s, v19.s[3] + ldur q19, [x12, #-16] + add x12, x14, x28 + prfm pldl1keep, [x14] + add x1, x1, #16 + fmla v5.4s, v22.4s, v17.s[3] + ldur q17, [x14, #-16] + prfm pldl1keep, [x12] + fmla v7.4s, v22.4s, v16.s[3] + ldur q16, [x12, #-16] + cmp x10, x23 + b.lt .LBB0_19 +.LBB0_20: // in Loop: Header=BB0_3 Depth=1 + ldp q23, q22, [x9] + fmla v0.4s, v21.4s, v18.s[0] + fmla v3.4s, v20.4s, v18.s[0] + fmla v1.4s, v21.4s, v19.s[0] + fmla v4.4s, v20.4s, v19.s[0] + ldr x10, [sp, #176] // 8-byte Folded Reload + ldr x13, [sp, #488] // 8-byte Folded Reload + fmla v2.4s, v21.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + ldr x1, [sp, #504] // 8-byte Folded Reload + fmla v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q20, q21, [x0] + fmla v0.4s, v22.4s, v18.s[1] + fmla v1.4s, v22.4s, v19.s[1] + fmla v2.4s, v22.4s, v17.s[1] + fmla v6.4s, v22.4s, v16.s[1] + fmla v3.4s, v23.4s, v18.s[1] + fmla v4.4s, v23.4s, v19.s[1] + fmla v5.4s, v23.4s, v17.s[1] + fmla v7.4s, v23.4s, v16.s[1] + fmla v0.4s, v21.4s, v18.s[2] + ldp q23, q22, [x30] + fmla v1.4s, v21.4s, v19.s[2] + fmla v2.4s, v21.4s, v17.s[2] + fmla v6.4s, v21.4s, v16.s[2] + fmla v3.4s, v20.4s, v18.s[2] + fmla v4.4s, v20.4s, v19.s[2] + fmla v5.4s, v20.4s, v17.s[2] + fmla v7.4s, v20.4s, v16.s[2] + fmla v0.4s, v22.4s, v18.s[3] + fmla v1.4s, v22.4s, v19.s[3] + fmla v2.4s, v22.4s, v17.s[3] + fmla v6.4s, v22.4s, v16.s[3] + fmla v3.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v19.s[3] + fmla v5.4s, v23.4s, v17.s[3] + fmla v7.4s, v23.4s, v16.s[3] + cmp x1, x19 + b.ge .LBB0_22 + .p2align 2 +.LBB0_21: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x10, x28 + prfm pldl1keep, [x10] + ldur s16, [x10, #-4] + add x1, x1, #1 + prfm pldl1keep, [x12] + ldur s17, [x12, #-4] + add x12, x12, x28 + add x10, x10, #4 + prfm pldl1keep, [x12] + ldur s18, [x12, #-4] + add x12, x12, x28 + prfm pldl1keep, 
[x12] + ldur s19, [x12, #-4] + prfm pldl1keep, [x13] + ldp q20, q21, [x13, #-32] + add x13, x13, #32 + fmla v0.4s, v21.4s, v16.s[0] + fmla v1.4s, v21.4s, v17.s[0] + fmla v2.4s, v21.4s, v18.s[0] + fmla v3.4s, v20.4s, v16.s[0] + fmla v4.4s, v20.4s, v17.s[0] + fmla v5.4s, v20.4s, v18.s[0] + fmla v7.4s, v20.4s, v19.s[0] + fmla v6.4s, v21.4s, v19.s[0] + cmp x1, x19 + b.lt .LBB0_21 +.LBB0_22: // in Loop: Header=BB0_3 Depth=1 + stp q3, q0, [x17] + ldr x17, [sp, #336] // 8-byte Folded Reload + stp q4, q1, [x15] + stp q5, q2, [x11] + stp q7, q6, [x18] + cmp x17, x20 + b.ge .LBB0_17 +.LBB0_23: // in Loop: Header=BB0_3 Depth=1 + mul x10, x17, x27 + add x12, x17, #1 + ldp q6, q7, [x8] + madd x11, x12, x27, x7 + ldr x18, [sp, #168] // 8-byte Folded Reload + mov x13, xzr + mov x15, x5 + mul x14, x17, x22 + ldr x17, [sp, #248] // 8-byte Folded Reload + add x10, x10, x7 + mul x12, x12, x22 + lsl x14, x14, #2 + add x10, x2, x10, lsl #2 + add x11, x2, x11, lsl #2 + lsl x12, x12, #2 + ldr q5, [x16, x14] + ldr q4, [x16, x12] + ldp q1, q0, [x10] + ldp q3, q2, [x11] + cmp xzr, x23 + b.ge .LBB0_25 + .p2align 2 +.LBB0_24: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x6, x15, #32 + fmla v1.4s, v6.4s, v5.s[0] + fmla v0.4s, v7.4s, v5.s[0] + add x4, x15, #96 + prfm pldl1keep, [x6] + ldp q16, q17, [x15, #-96] + fmla v2.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q7, q6, [x15, #-64] + prfm pldl1keep, [x4] + add x12, x18, x24 + add x1, x17, x24 + add x14, x12, #32 + add x3, x1, #32 + add x13, x13, #4 + add x18, x18, #16 + add x17, x17, #16 + fmla v0.4s, v17.4s, v5.s[1] + fmla v2.4s, v17.4s, v4.s[1] + fmla v1.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + fmla v0.4s, v6.4s, v5.s[2] + ldp q16, q17, [x15, #-32] + fmla v2.4s, v6.4s, v4.s[2] + fmla v1.4s, v7.4s, v5.s[2] + fmla v3.4s, v7.4s, v4.s[2] + ldp q6, q7, [x15], #128 + prfm pldl1keep, [x3] + fmla v0.4s, v17.4s, v5.s[3] + fmla v2.4s, v17.4s, v4.s[3] + fmla v1.4s, v16.4s, v5.s[3] + ldr q5, [x1, #16] + 
prfm pldl1keep, [x14] + fmla v3.4s, v16.4s, v4.s[3] + ldr q4, [x12, #16] + cmp x13, x23 + b.lt .LBB0_24 +.LBB0_25: // in Loop: Header=BB0_3 Depth=1 + ldp q17, q16, [x9] + fmla v0.4s, v7.4s, v5.s[0] + fmla v1.4s, v6.4s, v5.s[0] + fmla v2.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q6, q7, [x0] + ldr x12, [sp, #504] // 8-byte Folded Reload + ldr x6, [sp, #392] // 8-byte Folded Reload + mov x13, xzr + mov x15, xzr + fmla v0.4s, v16.4s, v5.s[1] + fmla v2.4s, v16.4s, v4.s[1] + fmla v1.4s, v17.4s, v5.s[1] + fmla v3.4s, v17.4s, v4.s[1] + ldp q17, q16, [x30] + fmla v0.4s, v7.4s, v5.s[2] + fmla v2.4s, v7.4s, v4.s[2] + fmla v1.4s, v6.4s, v5.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v0.4s, v16.4s, v5.s[3] + fmla v2.4s, v16.4s, v4.s[3] + fmla v1.4s, v17.4s, v5.s[3] + fmla v3.4s, v17.4s, v4.s[3] + cmp x12, x19 + b.ge .LBB0_27 + .p2align 2 +.LBB0_26: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + ldp x3, x1, [sp, #472] // 16-byte Folded Reload + add x14, x29, x15, lsl #3 + add x12, x12, #1 + add x14, x14, #32 + add x17, x1, x15 + add x18, x3, x15 + add x17, x17, #4 + add x18, x18, #4 + prfm pldl1keep, [x18] + ldr s4, [x3, x15] + prfm pldl1keep, [x17] + ldr s5, [x1, x15] + add x17, x29, x13 + prfm pldl1keep, [x14] + add x15, x15, #4 + add x13, x13, #32 + ldp q6, q7, [x17] + fmla v0.4s, v7.4s, v4.s[0] + fmla v1.4s, v6.4s, v4.s[0] + fmla v2.4s, v7.4s, v5.s[0] + fmla v3.4s, v6.4s, v5.s[0] + cmp x12, x19 + b.lt .LBB0_26 +.LBB0_27: // in Loop: Header=BB0_3 Depth=1 + stp q1, q0, [x10] + stp q3, q2, [x11] + ldr x10, [sp, #288] // 8-byte Folded Reload + ldr x1, [sp, #368] // 8-byte Folded Reload + cmp x20, x10 + b.ge .LBB0_2 +.LBB0_28: // in Loop: Header=BB0_3 Depth=1 + mul x10, x20, x27 + ldp q4, q3, [x8] + ldr x13, [sp, #136] // 8-byte Folded Reload + mul x12, x20, x22 + mov x11, xzr + add x10, x10, x7 + lsl x12, x12, #2 + add x10, x2, x10, lsl #2 + ldr q2, [x16, x12] + mov x12, x5 + ldp q1, q0, [x10] + cmp xzr, x23 + b.ge .LBB0_30 + .p2align 2 
+.LBB0_29: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x15, x12, #32 + fmla v1.4s, v4.4s, v2.s[0] + fmla v0.4s, v3.4s, v2.s[0] + add x14, x12, #96 + prfm pldl1keep, [x15] + ldp q5, q6, [x12, #-96] + add x11, x11, #4 + ldp q4, q3, [x12, #-64] + prfm pldl1keep, [x14] + fmla v0.4s, v6.4s, v2.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldp q5, q6, [x12, #-32] + prfm pldl1keep, [x13] + fmla v0.4s, v3.4s, v2.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v6.4s, v2.s[3] + fmla v1.4s, v5.4s, v2.s[3] + ldur q2, [x13, #-16] + ldp q4, q3, [x12], #128 + add x13, x13, #16 + cmp x11, x23 + b.lt .LBB0_29 +.LBB0_30: // in Loop: Header=BB0_3 Depth=1 + ldp q6, q5, [x9] + fmla v0.4s, v3.4s, v2.s[0] + fmla v1.4s, v4.4s, v2.s[0] + ldp q3, q4, [x0] + ldp x9, x11, [sp, #152] // 16-byte Folded Reload + ldr x12, [sp, #504] // 8-byte Folded Reload + ldr x15, [sp, #144] // 8-byte Folded Reload + fmla v0.4s, v5.4s, v2.s[1] + fmla v1.4s, v6.4s, v2.s[1] + ldp q6, q5, [x30] + fmla v0.4s, v4.4s, v2.s[2] + fmla v1.4s, v3.4s, v2.s[2] + fmla v0.4s, v5.4s, v2.s[3] + fmla v1.4s, v6.4s, v2.s[3] + cmp x12, x19 + b.ge .LBB0_1 + .p2align 2 +.LBB0_31: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x14, x8, x12, lsl #5 + add x13, x8, x11 + prfm pldl1keep, [x9] + add x11, x11, #32 + ldr s2, [x15, x12, lsl #2] + prfm pldl1keep, [x13] + add x12, x12, #1 + ldp q3, q4, [x14] + add x9, x9, #4 + fmla v0.4s, v4.4s, v2.s[0] + fmla v1.4s, v3.4s, v2.s[0] + cmp x12, x19 + b.lt .LBB0_31 + b .LBB0_1 +.LBB0_32: + ldr x0, [sp, #56] // 8-byte Folded Reload + bl free + ldr x9, [sp, #112] // 8-byte Folded Reload + lsl x20, x22, #3 + str x20, [sp, #472] // 8-byte Folded Spill + add x8, x9, #3 + cmp x9, #0 + csel x8, x8, x9, lt + ldr x9, [sp, #128] // 8-byte Folded Reload + asr x8, x8, #2 + cmp x9, #0 + cinv x29, x8, lt + lsl x8, x29, #2 + str x8, [sp, #488] // 8-byte Folded Spill + cmp x25, x8 + ldp x9, x8, [sp, #256] // 16-byte Folded Reload + add x24, x8, x9, lsl #2 + b.ge 
.LBB0_63 +// %bb.33: + lsl x8, x19, #4 + str x29, [sp, #480] // 8-byte Folded Spill + add x0, x8, #64 + str x8, [sp, #464] // 8-byte Folded Spill + bl malloc + add x8, x25, x27, lsl #1 + add x10, x27, x25 + ldp x6, x5, [sp, #296] // 16-byte Folded Reload + lsl x10, x10, #2 + add x13, x25, x27, lsl #2 + add x11, x0, #63 + ldr x18, [sp, #328] // 8-byte Folded Reload + ldr q2, [x24, x10] + lsl x10, x8, #2 + add x8, x8, x27 + ldr q1, [x24, x10] + lsl x10, x13, #2 + lsl x8, x8, #2 + ldr q3, [x24, x10] + mov w10, #6 // =0x6 + ldr x1, [sp, #344] // 8-byte Folded Reload + ldr q4, [x24, x8] + mul x8, x27, x10 + mov w4, #12 // =0xc + add x13, x13, x27 + lsl x10, x13, #2 + lsl x9, x25, #2 + mov w16, #20 // =0x14 + ldr q5, [x24, x10] + add x10, x6, x21 + mov w15, #24 // =0x18 + ldr q0, [x24, x9] + ldr q23, [x10, x9] + add x8, x8, x25 + add x9, x18, x1 + mul x13, x22, x15 + ldr x7, [sp, #64] // 8-byte Folded Reload + lsl x8, x8, #2 + ldr q22, [x9, x13] + mov w13, #28 // =0x1c + ldr q6, [x24, x8] + sub x8, x25, x27 + ldr q16, [x9] + add x8, x8, x27, lsl #3 + ldr q17, [x9, x28] + ldr x29, [sp, #504] // 8-byte Folded Reload + ldr q18, [x9, x20] + ldr q20, [x9, x22, lsl #4] + lsl x8, x8, #2 + ldr x30, [sp, #104] // 8-byte Folded Reload + mov x12, xzr + ldr q7, [x24, x8] + and x8, x11, #0xffffffffffffffc0 + mul x11, x22, x4 + orr x3, x8, #0x20 + ldr q19, [x9, x11] + mul x11, x22, x16 + ldr q21, [x9, x11] + ldr x11, [sp, #16] // 8-byte Folded Reload + lsl x11, x11, #5 + madd x17, x5, x13, x11 + add x14, x11, x5, lsl #5 + madd x15, x5, x15, x11 + madd x16, x5, x16, x11 + madd x4, x5, x4, x11 + add x2, x11, x5, lsl #3 + add x13, x6, x14 + add x2, x6, x2 + add x14, x6, x17 + add x17, x1, x18 + add x1, x11, x5, lsl #2 + add x18, x11, x26 + mov w5, #16 // =0x10 + add x15, x6, x15 + add x16, x6, x16 + add x4, x6, x4 + add x17, x7, x17 + add x18, x6, x18 + sub x5, x5, x7 + add x17, x17, #16 + add x1, x6, x1 + prfm pldl1keep, [x17] + ldur q24, [x17, #-16] + fmla v0.4s, v23.4s, v16.s[0] + 
fmla v2.4s, v23.4s, v17.s[0] + cmp xzr, x23 + b.ge .LBB0_35 + .p2align 2 +.LBB0_34: // =>This Inner Loop Header: Depth=1 + add x6, x16, x21 + stur q23, [x3, #-32] + fmla v1.4s, v23.4s, v18.s[0] + fmla v4.4s, v23.4s, v19.s[0] + prfm pldl1keep, [x6] + ldr q25, [x1, x21] + fmla v3.4s, v23.4s, v20.s[0] + fmla v5.4s, v23.4s, v21.s[0] + fmla v6.4s, v23.4s, v22.s[0] + fmla v7.4s, v23.4s, v24.s[0] + add x6, x15, x21 + add x7, x17, x5 + add x20, x7, x28 + add x25, x20, x28 + add x12, x12, #4 + add x15, x15, x26 + add x16, x16, x26 + add x17, x17, #16 + add x1, x1, x26 + stur q25, [x3, #-16] + prfm pldl1keep, [x6] + ldr q23, [x2, x21] + fmla v0.4s, v25.4s, v16.s[1] + fmla v2.4s, v25.4s, v17.s[1] + fmla v1.4s, v25.4s, v18.s[1] + fmla v4.4s, v25.4s, v19.s[1] + fmla v3.4s, v25.4s, v20.s[1] + fmla v5.4s, v25.4s, v21.s[1] + fmla v6.4s, v25.4s, v22.s[1] + fmla v7.4s, v25.4s, v24.s[1] + add x6, x14, x21 + add x14, x14, x26 + add x2, x2, x26 + fmla v0.4s, v23.4s, v16.s[2] + fmla v2.4s, v23.4s, v17.s[2] + fmla v1.4s, v23.4s, v18.s[2] + fmla v4.4s, v23.4s, v19.s[2] + fmla v3.4s, v23.4s, v20.s[2] + fmla v5.4s, v23.4s, v21.s[2] + fmla v6.4s, v23.4s, v22.s[2] + fmla v7.4s, v23.4s, v24.s[2] + str q23, [x3] + prfm pldl1keep, [x6] + ldr q23, [x4, x21] + add x6, x13, x21 + add x13, x13, x26 + add x4, x4, x26 + str q23, [x3, #16] + prfm pldl1keep, [x6] + add x6, x25, x28 + fmla v0.4s, v23.4s, v16.s[3] + fmla v2.4s, v23.4s, v17.s[3] + fmla v1.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v19.s[3] + fmla v3.4s, v23.4s, v20.s[3] + fmla v5.4s, v23.4s, v21.s[3] + fmla v6.4s, v23.4s, v22.s[3] + fmla v7.4s, v23.4s, v24.s[3] + ldr q23, [x18, x21] + prfm pldl1keep, [x7] + ldur q16, [x7, #-16] + prfm pldl1keep, [x20] + ldur q17, [x20, #-16] + prfm pldl1keep, [x25] + ldur q18, [x25, #-16] + ldr x25, [sp, #384] // 8-byte Folded Reload + add x18, x18, x26 + add x3, x3, #64 + prfm pldl1keep, [x6] + ldur q19, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q20, [x6, #-16] + add x6, x6, x28 + prfm 
pldl1keep, [x6] + ldur q21, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q22, [x6, #-16] + prfm pldl1keep, [x17] + ldur q24, [x17, #-16] + fmla v0.4s, v23.4s, v16.s[0] + fmla v2.4s, v23.4s, v17.s[0] + cmp x12, x23 + b.lt .LBB0_34 +.LBB0_35: + ldp x13, x14, [sp, #400] // 16-byte Folded Reload + ldr x15, [sp, #304] // 8-byte Folded Reload + fmla v1.4s, v23.4s, v18.s[0] + str q23, [x8, x23, lsl #4] + fmla v4.4s, v23.4s, v19.s[0] + fmla v3.4s, v23.4s, v20.s[0] + fmla v5.4s, v23.4s, v21.s[0] + fmla v6.4s, v23.4s, v22.s[0] + fmla v7.4s, v23.4s, v24.s[0] + ldr x7, [sp, #520] // 8-byte Folded Reload + mul x12, x13, x15 + add x12, x12, x25 + lsl x12, x12, #2 + ldr q23, [x10, x12] + mul x12, x14, x15 + add x12, x12, x25 + lsl x12, x12, #2 + str q23, [x8, x13, lsl #4] + ldr x13, [sp, #416] // 8-byte Folded Reload + fmla v0.4s, v23.4s, v16.s[1] + fmla v2.4s, v23.4s, v17.s[1] + fmla v1.4s, v23.4s, v18.s[1] + fmla v4.4s, v23.4s, v19.s[1] + fmla v3.4s, v23.4s, v20.s[1] + fmla v5.4s, v23.4s, v21.s[1] + fmla v6.4s, v23.4s, v22.s[1] + fmla v7.4s, v23.4s, v24.s[1] + ldr q23, [x10, x12] + madd x12, x13, x15, x25 + fmla v0.4s, v23.4s, v16.s[2] + str q23, [x8, x14, lsl #4] + fmla v2.4s, v23.4s, v17.s[2] + fmla v1.4s, v23.4s, v18.s[2] + fmla v4.4s, v23.4s, v19.s[2] + fmla v3.4s, v23.4s, v20.s[2] + fmla v5.4s, v23.4s, v21.s[2] + fmla v6.4s, v23.4s, v22.s[2] + fmla v7.4s, v23.4s, v24.s[2] + mov x14, x29 + lsl x12, x12, #2 + ldr q23, [x10, x12] + ldr x10, [sp, #72] // 8-byte Folded Reload + add x12, x10, #4 + ldp x17, x10, [sp, #272] // 16-byte Folded Reload + str q23, [x8, x13, lsl #4] + ldr x13, [sp, #120] // 8-byte Folded Reload + fmla v0.4s, v23.4s, v16.s[3] + fmla v2.4s, v23.4s, v17.s[3] + fmla v1.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v19.s[3] + fmla v3.4s, v23.4s, v20.s[3] + fmla v5.4s, v23.4s, v21.s[3] + fmla v6.4s, v23.4s, v22.s[3] + fmla v7.4s, v23.4s, v24.s[3] + add x10, x11, x10, lsl #2 + ldr x11, [sp, #296] // 8-byte Folded Reload + add x10, x11, x10 + ldr 
x11, [sp, #320] // 8-byte Folded Reload + add x11, x13, x11, lsl #2 + ldr x13, [sp, #80] // 8-byte Folded Reload + sub x11, x11, x13 + ldr x13, [sp, #328] // 8-byte Folded Reload + add x13, x11, x13 + mul x11, x15, x12 + add x12, x13, #4 + ldr x13, [sp, #96] // 8-byte Folded Reload + lsl x13, x13, #2 + cmp x29, x19 + b.ge .LBB0_37 + .p2align 2 +.LBB0_36: // =>This Inner Loop Header: Depth=1 + add x16, x12, x28 + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x15, x10, x11 + prfm pldl1keep, [x16] + ldur s17, [x16, #-4] + add x16, x16, x28 + add x12, x12, #4 + prfm pldl1keep, [x16] + ldur s18, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s19, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s20, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s21, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s22, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s23, [x16, #-4] + prfm pldl1keep, [x15] + ldr q24, [x10, x13] + add x10, x10, x17 + fmla v0.4s, v24.4s, v16.s[0] + str q24, [x8, x14, lsl #4] + add x14, x14, #1 + fmla v2.4s, v24.4s, v17.s[0] + fmla v1.4s, v24.4s, v18.s[0] + fmla v4.4s, v24.4s, v19.s[0] + fmla v3.4s, v24.4s, v20.s[0] + fmla v5.4s, v24.4s, v21.s[0] + fmla v6.4s, v24.4s, v22.s[0] + fmla v7.4s, v24.4s, v23.s[0] + cmp x14, x19 + b.lt .LBB0_36 +.LBB0_37: // %.preheader27 + ldr x10, [sp, #344] // 8-byte Folded Reload + ldr x11, [sp, #496] // 8-byte Folded Reload + mov x6, xzr + mov w4, #7 // =0x7 + ldr x15, [sp, #328] // 8-byte Folded Reload + ldr x12, [sp, #88] // 8-byte Folded Reload + mov w3, #6 // =0x6 + mov w16, #5 // =0x5 + mov w1, #4 // =0x4 + mov w17, #3 // =0x3 + mov w18, #2 // =0x2 + mov w2, #1 // =0x1 + add x13, x11, x10 + sub x10, x8, x30, lsl #4 + add x14, x12, x15 + add x11, x8, #48 + mov w12, #8 // =0x8 + add x15, x13, x15 + add x13, x14, #4 + add x10, x10, x19, lsl #4 + add x14, x15, #32 + add x15, x10, #16 + b .LBB0_39 + .p2align 2 +.LBB0_38: // %.loopexit26 + // in Loop: 
Header=BB0_39 Depth=1 + ldr x6, [sp, #496] // 8-byte Folded Reload + ldr x7, [sp, #520] // 8-byte Folded Reload + add x14, x14, x6 + ldr x25, [sp, #384] // 8-byte Folded Reload + add x13, x13, x6 + mov x6, x12 + mov x12, x5 +.LBB0_39: // =>This Loop Header: Depth=1 + // Child Loop BB0_41 Depth 2 + // Child Loop BB0_43 Depth 2 + madd x5, x6, x27, x25 + cmp x12, x7 + lsl x5, x5, #2 + madd x2, x2, x27, x25 + madd x18, x18, x27, x25 + madd x17, x17, x27, x25 + madd x1, x1, x27, x25 + lsl x2, x2, #2 + lsl x18, x18, #2 + lsl x17, x17, #2 + lsl x1, x1, #2 + madd x16, x16, x27, x25 + lsl x16, x16, #2 + str q0, [x24, x5] + str q2, [x24, x2] + str q1, [x24, x18] + str q4, [x24, x17] + str q3, [x24, x1] + str q5, [x24, x16] + madd x16, x3, x27, x25 + lsl x16, x16, #2 + str q6, [x24, x16] + madd x16, x4, x27, x25 + lsl x16, x16, #2 + str q7, [x24, x16] + b.ge .LBB0_44 +// %bb.40: // in Loop: Header=BB0_39 Depth=1 + add x17, x12, #3 + add x2, x12, #1 + add x18, x12, #2 + mul x3, x12, x27 + mul x7, x17, x27 + add x1, x12, #4 + add x16, x12, #5 + ldr q24, [x8] + mul x4, x2, x27 + mov x6, xzr + add x3, x3, x25 + mul x5, x18, x27 + mul x20, x1, x27 + add x7, x7, x25 + lsl x3, x3, #2 + add x4, x4, x25 + add x5, x5, x25 + add x20, x20, x25 + lsl x7, x7, #2 + lsl x4, x4, #2 + ldr q0, [x24, x3] + mul x3, x16, x27 + lsl x5, x5, #2 + lsl x20, x20, #2 + ldr q4, [x24, x7] + mul x7, x12, x22 + ldr q2, [x24, x4] + ldr q1, [x24, x5] + ldr q3, [x24, x20] + mov x20, x14 + add x3, x3, x25 + lsl x7, x7, #2 + lsl x3, x3, #2 + ldr q23, [x9, x7] + mul x7, x2, x22 + ldr q5, [x24, x3] + add x3, x12, #6 + mul x4, x3, x27 + lsl x7, x7, #2 + ldr q22, [x9, x7] + mul x7, x18, x22 + add x4, x4, x25 + lsl x4, x4, #2 + lsl x7, x7, #2 + ldr q6, [x24, x4] + add x4, x12, #7 + mul x5, x4, x27 + ldr q21, [x9, x7] + mul x7, x17, x22 + add x5, x5, x25 + lsl x5, x5, #2 + lsl x7, x7, #2 + ldr q7, [x24, x5] + add x5, x12, #8 + ldr q20, [x9, x7] + mul x7, x1, x22 + lsl x7, x7, #2 + ldr q19, [x9, x7] + mul x7, x16, x22 + 
lsl x7, x7, #2 + ldr q18, [x9, x7] + mul x7, x3, x22 + lsl x7, x7, #2 + ldr q17, [x9, x7] + mul x7, x4, x22 + lsl x7, x7, #2 + ldr q16, [x9, x7] + mov x7, x11 + cmp xzr, x23 + b.ge .LBB0_42 + .p2align 2 +.LBB0_41: // Parent Loop BB0_39 Depth=1 + // => This Inner Loop Header: Depth=2 + add x25, x7, #32 + fmla v0.4s, v24.4s, v23.s[0] + fmla v2.4s, v24.4s, v22.s[0] + add x6, x6, #4 + fmla v1.4s, v24.4s, v21.s[0] + fmla v4.4s, v24.4s, v20.s[0] + prfm pldl1keep, [x25] + add x25, x20, x28 + fmla v3.4s, v24.4s, v19.s[0] + fmla v5.4s, v24.4s, v18.s[0] + fmla v6.4s, v24.4s, v17.s[0] + fmla v7.4s, v24.4s, v16.s[0] + ldp q24, q25, [x7, #-32] + fmla v0.4s, v24.4s, v23.s[1] + fmla v2.4s, v24.4s, v22.s[1] + fmla v1.4s, v24.4s, v21.s[1] + fmla v4.4s, v24.4s, v20.s[1] + fmla v3.4s, v24.4s, v19.s[1] + fmla v5.4s, v24.4s, v18.s[1] + fmla v6.4s, v24.4s, v17.s[1] + fmla v7.4s, v24.4s, v16.s[1] + fmla v0.4s, v25.4s, v23.s[2] + fmla v2.4s, v25.4s, v22.s[2] + ldp q26, q24, [x7], #64 + fmla v1.4s, v25.4s, v21.s[2] + fmla v4.4s, v25.4s, v20.s[2] + fmla v3.4s, v25.4s, v19.s[2] + prfm pldl1keep, [x20] + fmla v5.4s, v25.4s, v18.s[2] + fmla v6.4s, v25.4s, v17.s[2] + fmla v7.4s, v25.4s, v16.s[2] + fmla v0.4s, v26.4s, v23.s[3] + ldur q23, [x20, #-16] + prfm pldl1keep, [x25] + fmla v2.4s, v26.4s, v22.s[3] + ldur q22, [x25, #-16] + add x25, x25, x28 + fmla v1.4s, v26.4s, v21.s[3] + fmla v4.4s, v26.4s, v20.s[3] + fmla v3.4s, v26.4s, v19.s[3] + fmla v5.4s, v26.4s, v18.s[3] + add x20, x20, #16 + prfm pldl1keep, [x25] + ldur q21, [x25, #-16] + add x25, x25, x28 + fmla v6.4s, v26.4s, v17.s[3] + fmla v7.4s, v26.4s, v16.s[3] + prfm pldl1keep, [x25] + ldur q20, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q19, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q18, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q17, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q16, [x25, #-16] + cmp x6, x23 + b.lt .LBB0_41 +.LBB0_42: // in Loop: Header=BB0_39 
Depth=1 + ldp x7, x6, [sp, #400] // 16-byte Folded Reload + fmla v0.4s, v24.4s, v23.s[0] + fmla v2.4s, v24.4s, v22.s[0] + fmla v1.4s, v24.4s, v21.s[0] + fmla v4.4s, v24.4s, v20.s[0] + mov x20, x29 + fmla v3.4s, v24.4s, v19.s[0] + fmla v5.4s, v24.4s, v18.s[0] + ldr q25, [x8, x7, lsl #4] + fmla v6.4s, v24.4s, v17.s[0] + fmla v7.4s, v24.4s, v16.s[0] + ldr q24, [x8, x6, lsl #4] + ldr x6, [sp, #416] // 8-byte Folded Reload + mov x7, x15 + ldr q26, [x8, x6, lsl #4] + mov x6, x13 + fmla v0.4s, v25.4s, v23.s[1] + fmla v2.4s, v25.4s, v22.s[1] + fmla v1.4s, v25.4s, v21.s[1] + fmla v4.4s, v25.4s, v20.s[1] + fmla v3.4s, v25.4s, v19.s[1] + fmla v5.4s, v25.4s, v18.s[1] + fmla v6.4s, v25.4s, v17.s[1] + fmla v7.4s, v25.4s, v16.s[1] + fmla v0.4s, v24.4s, v23.s[2] + fmla v2.4s, v24.4s, v22.s[2] + fmla v1.4s, v24.4s, v21.s[2] + fmla v4.4s, v24.4s, v20.s[2] + fmla v3.4s, v24.4s, v19.s[2] + fmla v5.4s, v24.4s, v18.s[2] + fmla v6.4s, v24.4s, v17.s[2] + fmla v7.4s, v24.4s, v16.s[2] + fmla v0.4s, v26.4s, v23.s[3] + fmla v2.4s, v26.4s, v22.s[3] + fmla v1.4s, v26.4s, v21.s[3] + fmla v4.4s, v26.4s, v20.s[3] + fmla v3.4s, v26.4s, v19.s[3] + fmla v5.4s, v26.4s, v18.s[3] + fmla v6.4s, v26.4s, v17.s[3] + fmla v7.4s, v26.4s, v16.s[3] + cmp x29, x19 + b.ge .LBB0_38 + .p2align 2 +.LBB0_43: // Parent Loop BB0_39 Depth=1 + // => This Inner Loop Header: Depth=2 + add x25, x6, x28 + prfm pldl1keep, [x6] + ldur s16, [x6, #-4] + add x20, x20, #1 + prfm pldl1keep, [x25] + ldur s17, [x25, #-4] + add x25, x25, x28 + add x6, x6, #4 + prfm pldl1keep, [x25] + ldur s18, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s19, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s20, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s21, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s22, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s23, [x25, #-4] + prfm pldl1keep, [x7] + ldur q24, [x7, #-16] + add x7, x7, #16 + fmla v0.4s, v24.4s, v16.s[0] + fmla 
v2.4s, v24.4s, v17.s[0] + fmla v1.4s, v24.4s, v18.s[0] + fmla v4.4s, v24.4s, v19.s[0] + fmla v3.4s, v24.4s, v20.s[0] + fmla v5.4s, v24.4s, v21.s[0] + fmla v6.4s, v24.4s, v22.s[0] + fmla v7.4s, v24.4s, v23.s[0] + cmp x20, x19 + b.lt .LBB0_43 + b .LBB0_38 +.LBB0_44: + ldr x13, [sp, #336] // 8-byte Folded Reload + cmp x7, x13 + b.lt .LBB0_47 +// %bb.45: + ldr x10, [sp, #312] // 8-byte Folded Reload + cmp x13, x10 + b.lt .LBB0_52 +.LBB0_46: + ldr x10, [sp, #288] // 8-byte Folded Reload + ldr x11, [sp, #312] // 8-byte Folded Reload + cmp x11, x10 + b.lt .LBB0_57 + b .LBB0_62 +.LBB0_47: + add x18, x7, #1 + add x1, x7, #2 + add x2, x7, #3 + mul x11, x7, x27 + mul x12, x18, x27 + mov x16, xzr + add x11, x11, x25 + mul x18, x18, x22 + mul x13, x1, x27 + mul x14, x2, x27 + lsl x18, x18, #2 + mul x15, x7, x22 + add x12, x12, x25 + add x13, x13, x25 + add x14, x14, x25 + lsl x17, x15, #2 + ldr q5, [x9, x17] + mov x17, x8 + add x11, x24, x11, lsl #2 + ldr q7, [x9, x18] + ldr q16, [x17], #48 + ldr q0, [x11] + mul x18, x1, x22 + lsl x18, x18, #2 + add x12, x24, x12, lsl #2 + add x13, x24, x13, lsl #2 + add x14, x24, x14, lsl #2 + ldr q1, [x12] + ldr q2, [x13] + ldr q3, [x14] + ldr q6, [x9, x18] + mul x18, x2, x22 + ldp x2, x1, [sp, #320] // 16-byte Folded Reload + lsl x18, x18, #2 + ldr q4, [x9, x18] + ldr x18, [sp, #32] // 8-byte Folded Reload + lsl x18, x18, #5 + add x18, x18, x2, lsl #2 + add x18, x18, x1 + add x18, x18, #32 + cmp xzr, x23 + b.ge .LBB0_49 + .p2align 2 +.LBB0_48: // =>This Inner Loop Header: Depth=1 + add x1, x17, #32 + fmla v0.4s, v16.4s, v5.s[0] + fmla v1.4s, v16.4s, v7.s[0] + add x16, x16, #4 + fmla v2.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v4.s[0] + prfm pldl1keep, [x1] + add x1, x18, x28 + ldp q16, q17, [x17, #-32] + fmla v0.4s, v16.4s, v5.s[1] + fmla v1.4s, v16.4s, v7.s[1] + fmla v2.4s, v16.4s, v6.s[1] + fmla v3.4s, v16.4s, v4.s[1] + fmla v0.4s, v17.4s, v5.s[2] + fmla v1.4s, v17.4s, v7.s[2] + fmla v2.4s, v17.4s, v6.s[2] + fmla v3.4s, v17.4s, v4.s[2] + 
ldp q17, q16, [x17], #64 + prfm pldl1keep, [x18] + fmla v0.4s, v17.4s, v5.s[3] + ldur q5, [x18, #-16] + prfm pldl1keep, [x1] + fmla v1.4s, v17.4s, v7.s[3] + ldur q7, [x1, #-16] + add x1, x1, x28 + fmla v2.4s, v17.4s, v6.s[3] + fmla v3.4s, v17.4s, v4.s[3] + add x18, x18, #16 + prfm pldl1keep, [x1] + ldur q6, [x1, #-16] + add x1, x1, x28 + prfm pldl1keep, [x1] + ldur q4, [x1, #-16] + cmp x16, x23 + b.lt .LBB0_48 +.LBB0_49: + ldp x17, x16, [sp, #400] // 16-byte Folded Reload + fmla v0.4s, v16.4s, v5.s[0] + fmla v1.4s, v16.4s, v7.s[0] + fmla v2.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v4.s[0] + add x15, x19, x15 + sub x15, x15, x30 + add x10, x10, #16 + ldr q17, [x8, x17, lsl #4] + fmla v0.4s, v17.4s, v5.s[1] + ldr q16, [x8, x16, lsl #4] + ldr x16, [sp, #416] // 8-byte Folded Reload + fmla v1.4s, v17.4s, v7.s[1] + fmla v2.4s, v17.4s, v6.s[1] + fmla v3.4s, v17.4s, v4.s[1] + ldr q18, [x8, x16, lsl #4] + ldr x16, [sp, #344] // 8-byte Folded Reload + add x15, x16, x15, lsl #2 + ldr x16, [sp, #328] // 8-byte Folded Reload + fmla v0.4s, v16.4s, v5.s[2] + fmla v1.4s, v16.4s, v7.s[2] + fmla v2.4s, v16.4s, v6.s[2] + fmla v3.4s, v16.4s, v4.s[2] + add x15, x15, x16 + mov x16, x29 + add x15, x15, #4 + fmla v0.4s, v18.4s, v5.s[3] + fmla v1.4s, v18.4s, v7.s[3] + fmla v2.4s, v18.4s, v6.s[3] + fmla v3.4s, v18.4s, v4.s[3] + cmp x29, x19 + b.ge .LBB0_51 + .p2align 2 +.LBB0_50: // =>This Inner Loop Header: Depth=1 + add x17, x15, x28 + prfm pldl1keep, [x15] + ldur s4, [x15, #-4] + add x16, x16, #1 + prfm pldl1keep, [x17] + ldur s5, [x17, #-4] + add x17, x17, x28 + add x15, x15, #4 + prfm pldl1keep, [x17] + ldur s6, [x17, #-4] + add x17, x17, x28 + prfm pldl1keep, [x17] + ldur s7, [x17, #-4] + prfm pldl1keep, [x10] + ldur q16, [x10, #-16] + add x10, x10, #16 + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v16.4s, v5.s[0] + fmla v2.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v7.s[0] + cmp x16, x19 + b.lt .LBB0_50 +.LBB0_51: + str q0, [x11] + str q1, [x12] + str q2, [x13] + ldr x13, [sp, #336] 
// 8-byte Folded Reload + str q3, [x14] + ldr x10, [sp, #312] // 8-byte Folded Reload + cmp x13, x10 + b.ge .LBB0_46 +.LBB0_52: + mul x14, x13, x22 + add x12, x13, #1 + ldr x18, [sp, #328] // 8-byte Folded Reload + mov x16, x8 + mul x10, x13, x27 + ldr q4, [x16], #48 + mov x15, xzr + mul x11, x12, x27 + lsl x13, x14, #2 + add x10, x10, x25 + add x11, x11, x25 + ldr q3, [x9, x13] + mul x13, x12, x22 + add x10, x24, x10, lsl #2 + add x11, x24, x11, lsl #2 + ldr q0, [x10] + ldr q1, [x11] + lsl x17, x13, #2 + ldr q2, [x9, x17] + add x17, x18, x17 + ldr x18, [sp, #248] // 8-byte Folded Reload + cmp xzr, x23 + b.ge .LBB0_54 + .p2align 2 +.LBB0_53: // =>This Inner Loop Header: Depth=1 + add x5, x16, #32 + ldr x3, [sp, #344] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v3.s[0] + fmla v1.4s, v4.4s, v2.s[0] + prfm pldl1keep, [x5] + ldp q4, q5, [x16, #-32] + add x15, x15, #4 + add x1, x17, x3 + add x3, x18, x3 + add x17, x17, #16 + add x18, x18, #16 + add x2, x1, #32 + add x4, x3, #32 + fmla v0.4s, v4.4s, v3.s[1] + fmla v1.4s, v4.4s, v2.s[1] + fmla v0.4s, v5.4s, v3.s[2] + fmla v1.4s, v5.4s, v2.s[2] + ldp q5, q4, [x16], #64 + prfm pldl1keep, [x4] + fmla v0.4s, v5.4s, v3.s[3] + ldr q3, [x3, #16] + prfm pldl1keep, [x2] + fmla v1.4s, v5.4s, v2.s[3] + ldr q2, [x1, #16] + cmp x15, x23 + b.lt .LBB0_53 +.LBB0_54: + ldr x15, [sp, #400] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v3.s[0] + fmla v1.4s, v4.4s, v2.s[0] + ldr x16, [sp, #344] // 8-byte Folded Reload + add x13, x19, x13 + ldr x17, [sp, #328] // 8-byte Folded Reload + sub x13, x13, x30 + add x14, x19, x14 + mul x12, x22, x12 + ldr x18, [sp, #40] // 8-byte Folded Reload + ldr q5, [x8, x15, lsl #4] + ldr x15, [sp, #408] // 8-byte Folded Reload + add x13, x16, x13, lsl #2 + add x12, x16, x12, lsl #2 + add x12, x17, x12 + ldr q4, [x8, x15, lsl #4] + ldr x15, [sp, #416] // 8-byte Folded Reload + fmla v0.4s, v5.4s, v3.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldr q5, [x8, x15, lsl #4] + sub x15, x14, x30 + add x14, x13, x17 + ldr x13, 
[sp, #464] // 8-byte Folded Reload + add x15, x16, x15, lsl #2 + add x16, x16, x18, lsl #4 + add x14, x14, #4 + fmla v0.4s, v4.4s, v3.s[2] + fmla v1.4s, v4.4s, v2.s[2] + add x15, x15, x17 + add x16, x17, x16 + mov x17, x29 + sub x13, x13, x30, lsl #4 + add x15, x15, #4 + fmla v0.4s, v5.4s, v3.s[3] + fmla v1.4s, v5.4s, v2.s[3] + add x13, x13, #16 + cmp x29, x19 + b.ge .LBB0_56 + .p2align 2 +.LBB0_55: // =>This Inner Loop Header: Depth=1 + add x18, x8, x13 + prfm pldl1keep, [x15] + ldr s2, [x16, x17, lsl #2] + prfm pldl1keep, [x14] + ldr s3, [x12, x17, lsl #2] + add x13, x13, #16 + prfm pldl1keep, [x18] + ldr q4, [x8, x17, lsl #4] + add x17, x17, #1 + add x14, x14, #4 + add x15, x15, #4 + fmla v0.4s, v4.4s, v2.s[0] + fmla v1.4s, v4.4s, v3.s[0] + cmp x17, x19 + b.lt .LBB0_55 +.LBB0_56: + str q0, [x10] + str q1, [x11] + ldr x10, [sp, #288] // 8-byte Folded Reload + ldr x11, [sp, #312] // 8-byte Folded Reload + cmp x11, x10 + b.ge .LBB0_62 +.LBB0_57: + ldr x11, [sp, #312] // 8-byte Folded Reload + mov x13, x8 + mov x12, xzr + mul x10, x11, x27 + mul x11, x11, x22 + ldr q2, [x13], #48 + lsl x14, x11, #2 + add x10, x10, x25 + ldr q1, [x9, x14] + ldr x9, [sp, #48] // 8-byte Folded Reload + ldp x15, x14, [sp, #320] // 16-byte Folded Reload + add x10, x24, x10, lsl #2 + ldr q0, [x10] + lsl x9, x9, #3 + add x9, x9, x15, lsl #2 + add x9, x9, x14 + add x9, x9, #32 + cmp xzr, x23 + b.ge .LBB0_59 + .p2align 2 +.LBB0_58: // =>This Inner Loop Header: Depth=1 + add x14, x13, #32 + fmla v0.4s, v2.4s, v1.s[0] + add x12, x12, #4 + prfm pldl1keep, [x14] + ldp q2, q3, [x13, #-32] + fmla v0.4s, v2.4s, v1.s[1] + fmla v0.4s, v3.4s, v1.s[2] + ldp q3, q2, [x13], #64 + prfm pldl1keep, [x9] + fmla v0.4s, v3.4s, v1.s[3] + ldur q1, [x9, #-16] + add x9, x9, #16 + cmp x12, x23 + b.lt .LBB0_58 +.LBB0_59: + ldr x9, [sp, #400] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[0] + ldr x12, [sp, #328] // 8-byte Folded Reload + ldr x13, [sp, #24] // 8-byte Folded Reload + ldr q3, [x8, x9, lsl #4] + ldr 
x9, [sp, #408] // 8-byte Folded Reload + fmla v0.4s, v3.4s, v1.s[1] + ldr q2, [x8, x9, lsl #4] + ldr x9, [sp, #416] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[2] + ldr q4, [x8, x9, lsl #4] + add x9, x19, x11 + ldr x11, [sp, #344] // 8-byte Folded Reload + sub x9, x9, x30 + add x9, x11, x9, lsl #2 + add x11, x9, x12 + ldr x9, [sp, #464] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v1.s[3] + add x12, x12, x13 + mov x13, x29 + add x11, x11, #4 + sub x9, x9, x30, lsl #4 + add x9, x9, #16 + cmp x29, x19 + b.ge .LBB0_61 + .p2align 2 +.LBB0_60: // =>This Inner Loop Header: Depth=1 + add x14, x8, x9 + prfm pldl1keep, [x11] + ldr s1, [x12, x13, lsl #2] + prfm pldl1keep, [x14] + ldr q2, [x8, x13, lsl #4] + add x13, x13, #1 + add x9, x9, #16 + add x11, x11, #4 + fmla v0.4s, v2.4s, v1.s[0] + cmp x13, x19 + b.lt .LBB0_60 +.LBB0_61: + str q0, [x10] +.LBB0_62: + bl free + ldp x20, x29, [sp, #472] // 16-byte Folded Reload +.LBB0_63: + ldr x8, [sp, #112] // 8-byte Folded Reload + ldr x9, [sp, #128] // 8-byte Folded Reload + add x8, x8, x8, lsr #63 + ldr x25, [sp, #488] // 8-byte Folded Reload + cmp x9, #0 + asr x8, x8, #1 + cinv x8, x8, lt + str x8, [sp, #464] // 8-byte Folded Spill + lsl x8, x8, #1 + cmp x25, x8 + str x8, [sp, #480] // 8-byte Folded Spill + b.ge .LBB0_94 +// %bb.64: + lsl x8, x19, #3 + add x0, x8, #64 + bl malloc + add x8, x27, x25 + add x10, x25, x27, lsl #1 + ldp x6, x5, [sp, #296] // 16-byte Folded Reload + add x11, x25, x27, lsl #2 + lsl x8, x8, #2 + ldr x18, [sp, #328] // 8-byte Folded Reload + ldr x1, [sp, #344] // 8-byte Folded Reload + ldr d5, [x24, x8] + lsl x8, x10, #2 + mov w4, #12 // =0xc + ldr d0, [x24, x8] + lsl x8, x11, #2 + add x10, x10, x27 + ldr d3, [x24, x8] + add x8, x11, x27 + mul x11, x22, x4 + lsl x8, x8, #2 + lsl x10, x10, #2 + lsl x9, x25, #2 + add x13, x0, #63 + ldr d4, [x24, x8] + mov w8, #6 // =0x6 + mov w16, #20 // =0x14 + madd x8, x27, x8, x25 + ldr d2, [x24, x10] + add x10, x6, x21 + ldr d1, [x24, x9] + ldr d23, [x10, x9] + 
add x9, x18, x1 + mov w15, #24 // =0x18 + ldr q19, [x9, x11] + mul x11, x22, x16 + ldr q17, [x9, x28] + ldr x7, [sp, #64] // 8-byte Folded Reload + lsl x8, x8, #2 + ldr q18, [x9, x20] + ldr q16, [x9] + ldr d6, [x24, x8] + sub x8, x25, x27 + ldr x30, [sp, #104] // 8-byte Folded Reload + add x8, x8, x27, lsl #3 + ldr q21, [x9, x11] + lsl x11, x29, #4 + ldr q20, [x9, x22, lsl #4] + add x14, x11, x5, lsl #5 + madd x16, x5, x16, x11 + lsl x8, x8, #2 + madd x4, x5, x4, x11 + ldr x29, [sp, #504] // 8-byte Folded Reload + add x2, x11, x5, lsl #3 + ldr d7, [x24, x8] + and x8, x13, #0xffffffffffffffc0 + mul x13, x22, x15 + madd x15, x5, x15, x11 + mov x12, xzr + add x16, x6, x16 + add x2, x6, x2 + orr x3, x8, #0x10 + add x4, x6, x4 + ldr q22, [x9, x13] + mov w13, #28 // =0x1c + add x15, x6, x15 + madd x17, x5, x13, x11 + add x13, x6, x14 + add x14, x6, x17 + add x17, x1, x18 + add x1, x11, x5, lsl #2 + add x18, x6, x26 + mov w5, #16 // =0x10 + add x17, x7, x17 + add x18, x18, x11 + sub x5, x5, x7 + add x17, x17, #16 + add x1, x6, x1 + prfm pldl1keep, [x17] + ldur q24, [x17, #-16] + fmla v1.2s, v23.2s, v16.s[0] + fmla v5.2s, v23.2s, v17.s[0] + cmp xzr, x23 + b.ge .LBB0_66 + .p2align 2 +.LBB0_65: // =>This Inner Loop Header: Depth=1 + add x6, x16, x21 + stur d23, [x3, #-16] + fmla v0.2s, v23.2s, v18.s[0] + fmla v2.2s, v23.2s, v19.s[0] + prfm pldl1keep, [x6] + ldr d25, [x1, x21] + fmla v3.2s, v23.2s, v20.s[0] + fmla v4.2s, v23.2s, v21.s[0] + fmla v6.2s, v23.2s, v22.s[0] + fmla v7.2s, v23.2s, v24.s[0] + add x6, x15, x21 + add x7, x17, x5 + add x20, x7, x28 + add x25, x20, x28 + add x12, x12, #4 + add x15, x15, x26 + add x16, x16, x26 + add x17, x17, #16 + add x1, x1, x26 + stur d25, [x3, #-8] + prfm pldl1keep, [x6] + ldr d23, [x2, x21] + fmla v1.2s, v25.2s, v16.s[1] + fmla v5.2s, v25.2s, v17.s[1] + fmla v0.2s, v25.2s, v18.s[1] + fmla v2.2s, v25.2s, v19.s[1] + fmla v3.2s, v25.2s, v20.s[1] + fmla v4.2s, v25.2s, v21.s[1] + fmla v6.2s, v25.2s, v22.s[1] + fmla v7.2s, v25.2s, v24.s[1] 
+ add x6, x14, x21 + add x14, x14, x26 + add x2, x2, x26 + fmla v1.2s, v23.2s, v16.s[2] + fmla v5.2s, v23.2s, v17.s[2] + fmla v0.2s, v23.2s, v18.s[2] + fmla v2.2s, v23.2s, v19.s[2] + fmla v3.2s, v23.2s, v20.s[2] + fmla v4.2s, v23.2s, v21.s[2] + fmla v6.2s, v23.2s, v22.s[2] + fmla v7.2s, v23.2s, v24.s[2] + str d23, [x3] + prfm pldl1keep, [x6] + ldr d23, [x4, x21] + add x6, x13, x21 + add x13, x13, x26 + add x4, x4, x26 + str d23, [x3, #8] + prfm pldl1keep, [x6] + add x6, x25, x28 + fmla v1.2s, v23.2s, v16.s[3] + fmla v5.2s, v23.2s, v17.s[3] + fmla v0.2s, v23.2s, v18.s[3] + fmla v2.2s, v23.2s, v19.s[3] + fmla v3.2s, v23.2s, v20.s[3] + fmla v4.2s, v23.2s, v21.s[3] + fmla v6.2s, v23.2s, v22.s[3] + fmla v7.2s, v23.2s, v24.s[3] + ldr d23, [x18, x21] + prfm pldl1keep, [x7] + ldur q16, [x7, #-16] + prfm pldl1keep, [x20] + ldur q17, [x20, #-16] + prfm pldl1keep, [x25] + ldur q18, [x25, #-16] + add x18, x18, x26 + add x3, x3, #32 + prfm pldl1keep, [x6] + ldur q19, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q20, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q21, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q22, [x6, #-16] + prfm pldl1keep, [x17] + ldur q24, [x17, #-16] + fmla v1.2s, v23.2s, v16.s[0] + fmla v5.2s, v23.2s, v17.s[0] + cmp x12, x23 + b.lt .LBB0_65 +.LBB0_66: + ldp x13, x14, [sp, #400] // 16-byte Folded Reload + ldr x15, [sp, #304] // 8-byte Folded Reload + fmla v0.2s, v23.2s, v18.s[0] + ldr x20, [sp, #488] // 8-byte Folded Reload + str d23, [x8, x23, lsl #3] + fmla v2.2s, v23.2s, v19.s[0] + fmla v3.2s, v23.2s, v20.s[0] + fmla v4.2s, v23.2s, v21.s[0] + fmla v6.2s, v23.2s, v22.s[0] + fmla v7.2s, v23.2s, v24.s[0] + ldr x7, [sp, #520] // 8-byte Folded Reload + madd x12, x13, x15, x20 + lsl x12, x12, #2 + ldr d23, [x10, x12] + madd x12, x14, x15, x20 + lsl x12, x12, #2 + str d23, [x8, x13, lsl #3] + ldr x13, [sp, #416] // 8-byte Folded Reload + fmla v1.2s, v23.2s, v16.s[1] + fmla v5.2s, v23.2s, v17.s[1] + fmla v0.2s, v23.2s, 
v18.s[1] + fmla v2.2s, v23.2s, v19.s[1] + fmla v3.2s, v23.2s, v20.s[1] + fmla v4.2s, v23.2s, v21.s[1] + fmla v6.2s, v23.2s, v22.s[1] + fmla v7.2s, v23.2s, v24.s[1] + ldr d23, [x10, x12] + madd x12, x13, x15, x20 + fmla v1.2s, v23.2s, v16.s[2] + str d23, [x8, x14, lsl #3] + fmla v5.2s, v23.2s, v17.s[2] + fmla v0.2s, v23.2s, v18.s[2] + fmla v2.2s, v23.2s, v19.s[2] + fmla v3.2s, v23.2s, v20.s[2] + fmla v4.2s, v23.2s, v21.s[2] + fmla v6.2s, v23.2s, v22.s[2] + fmla v7.2s, v23.2s, v24.s[2] + mov x14, x29 + lsl x12, x12, #2 + ldr d23, [x10, x12] + ldr x10, [sp, #72] // 8-byte Folded Reload + add x12, x10, #4 + ldp x17, x10, [sp, #272] // 16-byte Folded Reload + str d23, [x8, x13, lsl #3] + ldr x13, [sp, #120] // 8-byte Folded Reload + fmla v1.2s, v23.2s, v16.s[3] + fmla v5.2s, v23.2s, v17.s[3] + fmla v0.2s, v23.2s, v18.s[3] + fmla v2.2s, v23.2s, v19.s[3] + fmla v3.2s, v23.2s, v20.s[3] + fmla v4.2s, v23.2s, v21.s[3] + fmla v6.2s, v23.2s, v22.s[3] + fmla v7.2s, v23.2s, v24.s[3] + add x10, x11, x10, lsl #2 + ldr x11, [sp, #296] // 8-byte Folded Reload + add x10, x11, x10 + ldr x11, [sp, #320] // 8-byte Folded Reload + add x11, x13, x11, lsl #2 + ldr x13, [sp, #80] // 8-byte Folded Reload + sub x11, x11, x13 + ldr x13, [sp, #328] // 8-byte Folded Reload + add x13, x11, x13 + mul x11, x15, x12 + add x12, x13, #4 + ldr x13, [sp, #96] // 8-byte Folded Reload + lsl x13, x13, #2 + cmp x29, x19 + b.ge .LBB0_68 + .p2align 2 +.LBB0_67: // =>This Inner Loop Header: Depth=1 + add x16, x12, x28 + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x15, x10, x11 + prfm pldl1keep, [x16] + ldur s17, [x16, #-4] + add x16, x16, x28 + add x12, x12, #4 + prfm pldl1keep, [x16] + ldur s18, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s19, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s20, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s21, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s22, [x16, #-4] + add x16, x16, x28 + prfm 
pldl1keep, [x16] + ldur s23, [x16, #-4] + prfm pldl1keep, [x15] + ldr d24, [x10, x13] + add x10, x10, x17 + fmla v1.2s, v24.2s, v16.s[0] + str d24, [x8, x14, lsl #3] + add x14, x14, #1 + fmla v5.2s, v24.2s, v17.s[0] + fmla v0.2s, v24.2s, v18.s[0] + fmla v2.2s, v24.2s, v19.s[0] + fmla v3.2s, v24.2s, v20.s[0] + fmla v4.2s, v24.2s, v21.s[0] + fmla v6.2s, v24.2s, v22.s[0] + fmla v7.2s, v24.2s, v23.s[0] + cmp x14, x19 + b.lt .LBB0_67 +.LBB0_68: // %.preheader25 + ldr x10, [sp, #344] // 8-byte Folded Reload + ldr x11, [sp, #496] // 8-byte Folded Reload + mov x6, xzr + mov w3, #7 // =0x7 + ldr x15, [sp, #328] // 8-byte Folded Reload + ldr x12, [sp, #88] // 8-byte Folded Reload + mov w2, #6 // =0x6 + mov w16, #5 // =0x5 + mov w18, #4 // =0x4 + mov w17, #3 // =0x3 + mov w1, #2 // =0x2 + mov w4, #1 // =0x1 + add x13, x11, x10 + sub x10, x8, x30, lsl #3 + add x14, x12, x15 + add x11, x8, #24 + mov w12, #8 // =0x8 + add x15, x13, x15 + add x13, x14, #4 + add x10, x10, x19, lsl #3 + add x14, x15, #32 + add x15, x10, #8 + b .LBB0_70 + .p2align 2 +.LBB0_69: // %.loopexit24 + // in Loop: Header=BB0_70 Depth=1 + ldp x20, x6, [sp, #488] // 16-byte Folded Reload + ldr x7, [sp, #520] // 8-byte Folded Reload + add x14, x14, x6 + add x13, x13, x6 + mov x6, x12 + mov x12, x5 +.LBB0_70: // =>This Loop Header: Depth=1 + // Child Loop BB0_72 Depth 2 + // Child Loop BB0_74 Depth 2 + madd x5, x6, x27, x20 + cmp x12, x7 + lsl x5, x5, #2 + madd x4, x4, x27, x20 + madd x1, x1, x27, x20 + madd x17, x17, x27, x20 + madd x18, x18, x27, x20 + lsl x4, x4, #2 + lsl x1, x1, #2 + lsl x17, x17, #2 + lsl x18, x18, #2 + madd x16, x16, x27, x20 + lsl x16, x16, #2 + str d1, [x24, x5] + str d5, [x24, x4] + str d0, [x24, x1] + str d2, [x24, x17] + str d3, [x24, x18] + str d4, [x24, x16] + madd x16, x2, x27, x20 + lsl x16, x16, #2 + str d6, [x24, x16] + madd x16, x3, x27, x20 + lsl x16, x16, #2 + str d7, [x24, x16] + b.ge .LBB0_75 +// %bb.71: // in Loop: Header=BB0_70 Depth=1 + add x17, x12, #3 + add x4, x12, 
#1 + add x1, x12, #2 + madd x2, x12, x27, x20 + madd x7, x17, x27, x20 + add x18, x12, #4 + add x16, x12, #5 + mov x25, x20 + madd x3, x4, x27, x20 + ldr d24, [x8] + mov x6, xzr + lsl x2, x2, #2 + madd x5, x1, x27, x20 + madd x20, x18, x27, x20 + lsl x7, x7, #2 + lsl x3, x3, #2 + ldr d1, [x24, x2] + madd x2, x16, x27, x25 + lsl x5, x5, #2 + lsl x20, x20, #2 + ldr d2, [x24, x7] + mul x7, x12, x22 + ldr d5, [x24, x3] + ldr d0, [x24, x5] + ldr d3, [x24, x20] + mov x20, x14 + lsl x2, x2, #2 + lsl x7, x7, #2 + ldr d4, [x24, x2] + add x2, x12, #6 + ldr q23, [x9, x7] + mul x7, x4, x22 + madd x3, x2, x27, x25 + lsl x7, x7, #2 + lsl x3, x3, #2 + ldr q22, [x9, x7] + mul x7, x1, x22 + ldr d6, [x24, x3] + add x3, x12, #7 + madd x5, x3, x27, x25 + lsl x7, x7, #2 + ldr q21, [x9, x7] + mul x7, x17, x22 + lsl x5, x5, #2 + ldr d7, [x24, x5] + add x5, x12, #8 + lsl x7, x7, #2 + ldr q20, [x9, x7] + mul x7, x18, x22 + lsl x7, x7, #2 + ldr q19, [x9, x7] + mul x7, x16, x22 + lsl x7, x7, #2 + ldr q18, [x9, x7] + mul x7, x2, x22 + lsl x7, x7, #2 + ldr q17, [x9, x7] + mul x7, x3, x22 + lsl x7, x7, #2 + ldr q16, [x9, x7] + mov x7, x11 + cmp xzr, x23 + b.ge .LBB0_73 + .p2align 2 +.LBB0_72: // Parent Loop BB0_70 Depth=1 + // => This Inner Loop Header: Depth=2 + add x25, x7, #16 + fmla v1.2s, v24.2s, v23.s[0] + fmla v5.2s, v24.2s, v22.s[0] + add x6, x6, #4 + fmla v0.2s, v24.2s, v21.s[0] + fmla v2.2s, v24.2s, v20.s[0] + prfm pldl1keep, [x25] + add x25, x20, x28 + fmla v3.2s, v24.2s, v19.s[0] + fmla v4.2s, v24.2s, v18.s[0] + fmla v6.2s, v24.2s, v17.s[0] + fmla v7.2s, v24.2s, v16.s[0] + ldp d24, d25, [x7, #-16] + fmla v1.2s, v24.2s, v23.s[1] + fmla v5.2s, v24.2s, v22.s[1] + fmla v0.2s, v24.2s, v21.s[1] + fmla v2.2s, v24.2s, v20.s[1] + fmla v3.2s, v24.2s, v19.s[1] + fmla v4.2s, v24.2s, v18.s[1] + fmla v6.2s, v24.2s, v17.s[1] + fmla v7.2s, v24.2s, v16.s[1] + fmla v1.2s, v25.2s, v23.s[2] + fmla v5.2s, v25.2s, v22.s[2] + ldp d26, d24, [x7], #32 + fmla v0.2s, v25.2s, v21.s[2] + fmla v2.2s, v25.2s, 
v20.s[2] + fmla v3.2s, v25.2s, v19.s[2] + prfm pldl1keep, [x20] + fmla v4.2s, v25.2s, v18.s[2] + fmla v6.2s, v25.2s, v17.s[2] + fmla v7.2s, v25.2s, v16.s[2] + fmla v1.2s, v26.2s, v23.s[3] + ldur q23, [x20, #-16] + prfm pldl1keep, [x25] + fmla v5.2s, v26.2s, v22.s[3] + ldur q22, [x25, #-16] + add x25, x25, x28 + fmla v0.2s, v26.2s, v21.s[3] + fmla v2.2s, v26.2s, v20.s[3] + fmla v3.2s, v26.2s, v19.s[3] + fmla v4.2s, v26.2s, v18.s[3] + add x20, x20, #16 + prfm pldl1keep, [x25] + ldur q21, [x25, #-16] + add x25, x25, x28 + fmla v6.2s, v26.2s, v17.s[3] + fmla v7.2s, v26.2s, v16.s[3] + prfm pldl1keep, [x25] + ldur q20, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q19, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q18, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q17, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q16, [x25, #-16] + cmp x6, x23 + b.lt .LBB0_72 +.LBB0_73: // in Loop: Header=BB0_70 Depth=1 + ldp x7, x6, [sp, #400] // 16-byte Folded Reload + fmla v1.2s, v24.2s, v23.s[0] + fmla v5.2s, v24.2s, v22.s[0] + fmla v0.2s, v24.2s, v21.s[0] + fmla v2.2s, v24.2s, v20.s[0] + mov x20, x29 + fmla v3.2s, v24.2s, v19.s[0] + fmla v4.2s, v24.2s, v18.s[0] + ldr d25, [x8, x7, lsl #3] + fmla v6.2s, v24.2s, v17.s[0] + fmla v7.2s, v24.2s, v16.s[0] + ldr d24, [x8, x6, lsl #3] + ldr x6, [sp, #416] // 8-byte Folded Reload + mov x7, x15 + ldr d26, [x8, x6, lsl #3] + mov x6, x13 + fmla v1.2s, v25.2s, v23.s[1] + fmla v5.2s, v25.2s, v22.s[1] + fmla v0.2s, v25.2s, v21.s[1] + fmla v2.2s, v25.2s, v20.s[1] + fmla v3.2s, v25.2s, v19.s[1] + fmla v4.2s, v25.2s, v18.s[1] + fmla v6.2s, v25.2s, v17.s[1] + fmla v7.2s, v25.2s, v16.s[1] + fmla v1.2s, v24.2s, v23.s[2] + fmla v5.2s, v24.2s, v22.s[2] + fmla v0.2s, v24.2s, v21.s[2] + fmla v2.2s, v24.2s, v20.s[2] + fmla v3.2s, v24.2s, v19.s[2] + fmla v4.2s, v24.2s, v18.s[2] + fmla v6.2s, v24.2s, v17.s[2] + fmla v7.2s, v24.2s, v16.s[2] + fmla v1.2s, v26.2s, v23.s[3] + fmla v5.2s, v26.2s, 
v22.s[3] + fmla v0.2s, v26.2s, v21.s[3] + fmla v2.2s, v26.2s, v20.s[3] + fmla v3.2s, v26.2s, v19.s[3] + fmla v4.2s, v26.2s, v18.s[3] + fmla v6.2s, v26.2s, v17.s[3] + fmla v7.2s, v26.2s, v16.s[3] + cmp x29, x19 + b.ge .LBB0_69 + .p2align 2 +.LBB0_74: // Parent Loop BB0_70 Depth=1 + // => This Inner Loop Header: Depth=2 + add x25, x6, x28 + prfm pldl1keep, [x6] + ldur s16, [x6, #-4] + add x20, x20, #1 + prfm pldl1keep, [x25] + ldur s17, [x25, #-4] + add x25, x25, x28 + add x6, x6, #4 + prfm pldl1keep, [x25] + ldur s18, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s19, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s20, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s21, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s22, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s23, [x25, #-4] + prfm pldl1keep, [x7] + ldur d24, [x7, #-8] + add x7, x7, #8 + fmla v1.2s, v24.2s, v16.s[0] + fmla v5.2s, v24.2s, v17.s[0] + fmla v0.2s, v24.2s, v18.s[0] + fmla v2.2s, v24.2s, v19.s[0] + fmla v3.2s, v24.2s, v20.s[0] + fmla v4.2s, v24.2s, v21.s[0] + fmla v6.2s, v24.2s, v22.s[0] + fmla v7.2s, v24.2s, v23.s[0] + cmp x20, x19 + b.lt .LBB0_74 + b .LBB0_69 +.LBB0_75: + ldr x13, [sp, #336] // 8-byte Folded Reload + cmp x7, x13 + b.lt .LBB0_78 +// %bb.76: + ldr x10, [sp, #312] // 8-byte Folded Reload + cmp x13, x10 + b.lt .LBB0_83 +.LBB0_77: + ldr x10, [sp, #288] // 8-byte Folded Reload + ldr x11, [sp, #312] // 8-byte Folded Reload + cmp x11, x10 + b.lt .LBB0_88 + b .LBB0_93 +.LBB0_78: + add x18, x7, #1 + add x1, x7, #2 + mul x15, x7, x22 + add x2, x7, #3 + madd x12, x18, x27, x20 + mov x17, x8 + ldr d16, [x17], #24 + mul x18, x18, x22 + mov x16, xzr + lsl x14, x15, #2 + mul x11, x7, x27 + madd x13, x1, x27, x20 + add x11, x11, x20 + lsl x18, x18, #2 + add x11, x24, x11, lsl #2 + ldr q5, [x9, x14] + ldr q7, [x9, x18] + mul x18, x1, x22 + ldr d0, [x11] + madd x14, x2, x27, x20 + lsl x18, x18, #2 + add x12, x24, x12, lsl 
#2 + add x13, x24, x13, lsl #2 + add x14, x24, x14, lsl #2 + ldr d1, [x12] + ldr d2, [x13] + ldr q6, [x9, x18] + mul x18, x2, x22 + ldp x2, x1, [sp, #320] // 16-byte Folded Reload + ldr d3, [x14] + lsl x18, x18, #2 + ldr q4, [x9, x18] + ldr x18, [sp, #32] // 8-byte Folded Reload + lsl x18, x18, #5 + add x18, x18, x2, lsl #2 + add x18, x18, x1 + add x18, x18, #32 + cmp xzr, x23 + b.ge .LBB0_80 + .p2align 2 +.LBB0_79: // =>This Inner Loop Header: Depth=1 + add x1, x17, #16 + fmla v0.2s, v16.2s, v5.s[0] + fmla v1.2s, v16.2s, v7.s[0] + add x16, x16, #4 + fmla v2.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v4.s[0] + prfm pldl1keep, [x1] + add x1, x18, x28 + ldp d16, d17, [x17, #-16] + fmla v0.2s, v16.2s, v5.s[1] + fmla v1.2s, v16.2s, v7.s[1] + fmla v2.2s, v16.2s, v6.s[1] + fmla v3.2s, v16.2s, v4.s[1] + fmla v0.2s, v17.2s, v5.s[2] + fmla v1.2s, v17.2s, v7.s[2] + fmla v2.2s, v17.2s, v6.s[2] + fmla v3.2s, v17.2s, v4.s[2] + ldp d17, d16, [x17], #32 + prfm pldl1keep, [x18] + fmla v0.2s, v17.2s, v5.s[3] + ldur q5, [x18, #-16] + prfm pldl1keep, [x1] + fmla v1.2s, v17.2s, v7.s[3] + ldur q7, [x1, #-16] + add x1, x1, x28 + fmla v2.2s, v17.2s, v6.s[3] + fmla v3.2s, v17.2s, v4.s[3] + add x18, x18, #16 + prfm pldl1keep, [x1] + ldur q6, [x1, #-16] + add x1, x1, x28 + prfm pldl1keep, [x1] + ldur q4, [x1, #-16] + cmp x16, x23 + b.lt .LBB0_79 +.LBB0_80: + ldp x17, x16, [sp, #400] // 16-byte Folded Reload + fmla v0.2s, v16.2s, v5.s[0] + fmla v1.2s, v16.2s, v7.s[0] + fmla v2.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v4.s[0] + add x15, x19, x15 + sub x15, x15, x30 + add x10, x10, #8 + ldr d17, [x8, x17, lsl #3] + fmla v0.2s, v17.2s, v5.s[1] + ldr d16, [x8, x16, lsl #3] + ldr x16, [sp, #416] // 8-byte Folded Reload + fmla v1.2s, v17.2s, v7.s[1] + fmla v2.2s, v17.2s, v6.s[1] + fmla v3.2s, v17.2s, v4.s[1] + ldr d18, [x8, x16, lsl #3] + ldr x16, [sp, #344] // 8-byte Folded Reload + add x15, x16, x15, lsl #2 + ldr x16, [sp, #328] // 8-byte Folded Reload + fmla v0.2s, v16.2s, v5.s[2] + fmla v1.2s, 
v16.2s, v7.s[2] + fmla v2.2s, v16.2s, v6.s[2] + fmla v3.2s, v16.2s, v4.s[2] + add x15, x15, x16 + mov x16, x29 + add x15, x15, #4 + fmla v0.2s, v18.2s, v5.s[3] + fmla v1.2s, v18.2s, v7.s[3] + fmla v2.2s, v18.2s, v6.s[3] + fmla v3.2s, v18.2s, v4.s[3] + cmp x29, x19 + b.ge .LBB0_82 + .p2align 2 +.LBB0_81: // =>This Inner Loop Header: Depth=1 + add x17, x15, x28 + prfm pldl1keep, [x15] + ldur s4, [x15, #-4] + add x16, x16, #1 + prfm pldl1keep, [x17] + ldur s5, [x17, #-4] + add x17, x17, x28 + add x15, x15, #4 + prfm pldl1keep, [x17] + ldur s6, [x17, #-4] + add x17, x17, x28 + prfm pldl1keep, [x17] + ldur s7, [x17, #-4] + prfm pldl1keep, [x10] + ldur d16, [x10, #-8] + add x10, x10, #8 + fmla v0.2s, v16.2s, v4.s[0] + fmla v1.2s, v16.2s, v5.s[0] + fmla v2.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v7.s[0] + cmp x16, x19 + b.lt .LBB0_81 +.LBB0_82: + str d0, [x11] + str d1, [x12] + str d2, [x13] + ldr x13, [sp, #336] // 8-byte Folded Reload + str d3, [x14] + ldr x10, [sp, #312] // 8-byte Folded Reload + cmp x13, x10 + b.ge .LBB0_77 +.LBB0_83: + mul x10, x13, x27 + add x12, x13, #1 + mov x16, x8 + ldr x18, [sp, #328] // 8-byte Folded Reload + mul x13, x13, x22 + ldr d4, [x16], #24 + mov x15, xzr + madd x11, x12, x27, x20 + lsl x14, x13, #2 + add x10, x10, x20 + add x10, x24, x10, lsl #2 + ldr q3, [x9, x14] + mul x14, x12, x22 + add x11, x24, x11, lsl #2 + ldr d0, [x10] + ldr d1, [x11] + lsl x17, x14, #2 + ldr q2, [x9, x17] + add x17, x18, x17 + cmp xzr, x23 + b.ge .LBB0_85 + .p2align 2 +.LBB0_84: // =>This Inner Loop Header: Depth=1 + add x4, x16, #16 + ldr x2, [sp, #344] // 8-byte Folded Reload + ldr x5, [sp, #248] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[0] + prfm pldl1keep, [x4] + fmla v1.2s, v4.2s, v2.s[0] + ldp d4, d5, [x16, #-16] + add x15, x15, #4 + add x18, x17, x2 + add x2, x5, x2 + add x5, x5, #16 + add x17, x17, #16 + add x1, x18, #32 + add x3, x2, #32 + fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v5.2s, v3.s[2] + fmla v1.2s, 
v5.2s, v2.s[2] + ldp d5, d4, [x16], #32 + prfm pldl1keep, [x3] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x2, #16] + prfm pldl1keep, [x1] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x18, #16] + str x5, [sp, #248] // 8-byte Folded Spill + cmp x15, x23 + b.lt .LBB0_84 +.LBB0_85: + ldr x15, [sp, #400] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[0] + fmla v1.2s, v4.2s, v2.s[0] + ldr x17, [sp, #344] // 8-byte Folded Reload + add x13, x19, x13 + mul x12, x22, x12 + ldr x16, [sp, #328] // 8-byte Folded Reload + add x12, x17, x12, lsl #2 + ldr d5, [x8, x15, lsl #3] + ldr x15, [sp, #408] // 8-byte Folded Reload + add x12, x16, x12 + ldr d4, [x8, x15, lsl #3] + ldr x15, [sp, #416] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v3.s[1] + fmla v1.2s, v5.2s, v2.s[1] + ldr d5, [x8, x15, lsl #3] + sub x15, x13, x30 + add x13, x19, x14 + add x14, x17, x15, lsl #2 + ldr x15, [sp, #40] // 8-byte Folded Reload + sub x13, x13, x30 + fmla v0.2s, v4.2s, v3.s[2] + fmla v1.2s, v4.2s, v2.s[2] + add x13, x17, x13, lsl #2 + add x14, x14, x16 + add x13, x13, x16 + add x14, x14, #4 + add x15, x17, x15, lsl #4 + add x13, x13, #4 + fmla v0.2s, v5.2s, v3.s[3] + fmla v1.2s, v5.2s, v2.s[3] + add x15, x16, x15 + mov x16, x29 + cmp x29, x19 + b.ge .LBB0_87 + .p2align 2 +.LBB0_86: // =>This Inner Loop Header: Depth=1 + add x17, x8, x16, lsl #3 + prfm pldl1keep, [x14] + ldr s2, [x15, x16, lsl #2] + prfm pldl1keep, [x13] + ldr s3, [x12, x16, lsl #2] + add x13, x13, #4 + add x17, x17, #8 + add x14, x14, #4 + prfm pldl1keep, [x17] + ldr d4, [x8, x16, lsl #3] + add x16, x16, #1 + fmla v0.2s, v4.2s, v2.s[0] + fmla v1.2s, v4.2s, v3.s[0] + cmp x16, x19 + b.lt .LBB0_86 +.LBB0_87: + str d0, [x10] + str d1, [x11] + ldr x10, [sp, #288] // 8-byte Folded Reload + ldr x11, [sp, #312] // 8-byte Folded Reload + cmp x11, x10 + b.ge .LBB0_93 +.LBB0_88: + ldr x11, [sp, #312] // 8-byte Folded Reload + mov x13, x8 + mov x12, xzr + mul x10, x11, x27 + mul x11, x11, x22 + ldr d2, [x13], #24 + lsl x14, x11, #2 + add x10, x10, 
x20 + ldr q1, [x9, x14] + ldr x9, [sp, #48] // 8-byte Folded Reload + ldp x15, x14, [sp, #320] // 16-byte Folded Reload + add x10, x24, x10, lsl #2 + ldr d0, [x10] + lsl x9, x9, #3 + add x9, x9, x15, lsl #2 + add x9, x9, x14 + add x9, x9, #32 + cmp xzr, x23 + b.ge .LBB0_90 + .p2align 2 +.LBB0_89: // =>This Inner Loop Header: Depth=1 + add x14, x13, #16 + fmla v0.2s, v2.2s, v1.s[0] + add x12, x12, #4 + prfm pldl1keep, [x14] + ldp d2, d3, [x13, #-16] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v3.2s, v1.s[2] + ldp d3, d2, [x13], #32 + prfm pldl1keep, [x9] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x9, #-16] + add x9, x9, #16 + cmp x12, x23 + b.lt .LBB0_89 +.LBB0_90: + ldr x9, [sp, #400] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[0] + ldr x12, [sp, #24] // 8-byte Folded Reload + ldr d3, [x8, x9, lsl #3] + ldr x9, [sp, #408] // 8-byte Folded Reload + fmla v0.2s, v3.2s, v1.s[1] + ldr d2, [x8, x9, lsl #3] + ldr x9, [sp, #416] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[2] + ldr d3, [x8, x9, lsl #3] + add x9, x19, x11 + ldr x11, [sp, #344] // 8-byte Folded Reload + sub x9, x9, x30 + add x9, x11, x9, lsl #2 + ldr x11, [sp, #328] // 8-byte Folded Reload + fmla v0.2s, v3.2s, v1.s[3] + add x9, x9, x11 + add x11, x11, x12 + mov x12, x29 + add x9, x9, #4 + cmp x29, x19 + b.ge .LBB0_92 + .p2align 2 +.LBB0_91: // =>This Inner Loop Header: Depth=1 + add x13, x8, x12, lsl #3 + prfm pldl1keep, [x9] + ldr s1, [x11, x12, lsl #2] + add x9, x9, #4 + add x13, x13, #8 + prfm pldl1keep, [x13] + ldr d2, [x8, x12, lsl #3] + add x12, x12, #1 + fmla v0.2s, v2.2s, v1.s[0] + cmp x12, x19 + b.lt .LBB0_91 +.LBB0_92: + str d0, [x10] +.LBB0_93: + bl free + ldr x20, [sp, #472] // 8-byte Folded Reload +.LBB0_94: + ldr x8, [sp, #128] // 8-byte Folded Reload + ldr x25, [sp, #480] // 8-byte Folded Reload + cmp x25, x8 + b.ge .LBB0_126 +// %bb.95: + ldr x8, [sp, #120] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + add x10, x25, x27, lsl #2 + ldr x18, [sp, #328] // 8-byte Folded Reload + 
ldr x1, [sp, #344] // 8-byte Folded Reload + mov w5, #12 // =0xc + add x9, x25, x27, lsl #1 + sub x13, x25, x27 + mov w11, #6 // =0x6 + add x8, x27, x25 + add x16, x10, x27 + ldr s2, [x24, x10, lsl #2] + mul x10, x22, x5 + add x15, x9, x27 + ldr s5, [x24, x9, lsl #2] + add x13, x13, x27, lsl #3 + ldr s3, [x24, x16, lsl #2] + add x9, x18, x1 + mov w16, #20 // =0x14 + ldr s4, [x24, x15, lsl #2] + mov w15, #24 // =0x18 + madd x11, x27, x11, x25 + ldr q19, [x9, x10] + mul x10, x22, x16 + add x14, x0, #63 + ldr s0, [x24, x13, lsl #2] + mul x13, x22, x15 + ldr x4, [sp, #64] // 8-byte Folded Reload + ldp x7, x6, [sp, #296] // 16-byte Folded Reload + ldr q22, [x9, x13] + mov w13, #28 // =0x1c + ldr s1, [x24, x11, lsl #2] + add x11, x7, x21 + ldr q21, [x9, x10] + ldr x10, [sp, #464] // 8-byte Folded Reload + ldr q16, [x9] + ldr s7, [x24, x8, lsl #2] + ldr s6, [x24, x25, lsl #2] + ldr s23, [x11, x25, lsl #2] + ldr q17, [x9, x28] + ldr q18, [x9, x20] + ldr q20, [x9, x22, lsl #4] + lsl x10, x10, #3 + and x8, x14, #0xffffffffffffffc0 + ldr x29, [sp, #504] // 8-byte Folded Reload + ldr x30, [sp, #104] // 8-byte Folded Reload + madd x17, x6, x13, x10 + add x14, x10, x6, lsl #5 + madd x15, x6, x15, x10 + madd x16, x6, x16, x10 + madd x5, x6, x5, x10 + add x2, x10, x6, lsl #3 + mov w3, #16 // =0x10 + mov x12, xzr + add x13, x7, x14 + add x2, x7, x2 + sub x3, x3, x4 + add x14, x7, x17 + add x17, x1, x18 + add x1, x10, x6, lsl #2 + add x18, x26, x10 + add x15, x7, x15 + add x16, x7, x16 + add x5, x7, x5 + add x17, x4, x17 + add x18, x7, x18 + orr x4, x8, #0x8 + add x17, x17, #16 + add x1, x7, x1 + .p2align 2 +.LBB0_96: // =>This Inner Loop Header: Depth=1 + prfm pldl1keep, [x17] + ldur q24, [x17, #-16] + ext v31.16b, v16.16b, v16.16b, #8 + ext v8.16b, v17.16b, v17.16b, #8 + cmp x12, x23 + ext v30.16b, v18.16b, v18.16b, #8 + ext v29.16b, v19.16b, v19.16b, #8 + ext v28.16b, v20.16b, v20.16b, #8 + ext v27.16b, v21.16b, v21.16b, #8 + ext v26.16b, v22.16b, v22.16b, #8 + ext v25.16b, 
v24.16b, v24.16b, #8 + b.ge .LBB0_98 +// %bb.97: // in Loop: Header=BB0_96 Depth=1 + add x6, x16, x21 + stur s23, [x4, #-8] + fmla v4.2s, v23.2s, v19.2s + fmla v6.2s, v23.2s, v16.2s + prfm pldl1keep, [x6] + ldr s9, [x1, x21] + add x6, x15, x21 + fmla v7.2s, v23.2s, v17.2s + fmla v5.2s, v23.2s, v18.2s + fmla v2.2s, v23.2s, v20.2s + fmla v3.2s, v23.2s, v21.2s + fmla v1.2s, v23.2s, v22.2s + fmla v0.2s, v23.2s, v24.2s + add x7, x14, x21 + add x20, x17, x3 + add x25, x20, x28 + add x12, x12, #4 + add x14, x14, x26 + add x15, x15, x26 + add x16, x16, x26 + add x17, x17, #16 + add x1, x1, x26 + stur s9, [x4, #-4] + prfm pldl1keep, [x6] + ldr s23, [x2, x21] + fmla v4.2s, v9.2s, v19.s[1] + fmla v6.2s, v9.2s, v16.s[1] + fmla v7.2s, v9.2s, v17.s[1] + fmla v5.2s, v9.2s, v18.s[1] + fmla v2.2s, v9.2s, v20.s[1] + fmla v3.2s, v9.2s, v21.s[1] + fmla v1.2s, v9.2s, v22.s[1] + fmla v0.2s, v9.2s, v24.s[1] + add x6, x13, x21 + add x13, x13, x26 + add x2, x2, x26 + str s23, [x4] + prfm pldl1keep, [x7] + fmla v4.2s, v23.2s, v29.2s + ldr s29, [x5, x21] + fmla v6.2s, v23.2s, v31.2s + fmla v7.2s, v23.2s, v8.2s + fmla v5.2s, v23.2s, v30.2s + fmla v2.2s, v23.2s, v28.2s + add x7, x25, x28 + fmla v3.2s, v23.2s, v27.2s + fmla v1.2s, v23.2s, v26.2s + fmla v0.2s, v23.2s, v25.2s + add x5, x5, x26 + str s29, [x4, #4] + prfm pldl1keep, [x6] + add x6, x7, x28 + fmla v6.2s, v29.2s, v16.s[3] + fmla v7.2s, v29.2s, v17.s[3] + fmla v5.2s, v29.2s, v18.s[3] + fmla v4.2s, v29.2s, v19.s[3] + fmla v2.2s, v29.2s, v20.s[3] + ldr s23, [x18, x21] + prfm pldl1keep, [x20] + ldur q16, [x20, #-16] + prfm pldl1keep, [x25] + ldur q17, [x25, #-16] + prfm pldl1keep, [x7] + ldur q18, [x7, #-16] + fmla v3.2s, v29.2s, v21.s[3] + fmla v1.2s, v29.2s, v22.s[3] + fmla v0.2s, v29.2s, v24.s[3] + add x18, x18, x26 + add x4, x4, #16 + prfm pldl1keep, [x6] + ldur q19, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q20, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q21, [x6, #-16] + add x6, x6, x28 + prfm 
pldl1keep, [x6] + ldur q22, [x6, #-16] + b .LBB0_96 +.LBB0_98: + ldp x13, x14, [sp, #400] // 16-byte Folded Reload + ldr x15, [sp, #304] // 8-byte Folded Reload + fmla v6.2s, v23.2s, v16.2s + ldr x25, [sp, #480] // 8-byte Folded Reload + str s23, [x8, x23, lsl #2] + fmla v7.2s, v23.2s, v17.2s + fmla v5.2s, v23.2s, v18.2s + fmla v4.2s, v23.2s, v19.2s + fmla v2.2s, v23.2s, v20.2s + fmla v3.2s, v23.2s, v21.2s + fmla v1.2s, v23.2s, v22.2s + fmla v0.2s, v23.2s, v24.2s + ldr x7, [sp, #520] // 8-byte Folded Reload + ldr x16, [sp, #96] // 8-byte Folded Reload + madd x12, x13, x15, x25 + ldr s23, [x11, x12, lsl #2] + madd x12, x14, x15, x25 + str s23, [x8, x13, lsl #2] + ldr x13, [sp, #416] // 8-byte Folded Reload + fmla v6.2s, v23.2s, v16.s[1] + fmla v7.2s, v23.2s, v17.s[1] + fmla v5.2s, v23.2s, v18.s[1] + fmla v4.2s, v23.2s, v19.s[1] + fmla v2.2s, v23.2s, v20.s[1] + fmla v3.2s, v23.2s, v21.s[1] + fmla v1.2s, v23.2s, v22.s[1] + fmla v0.2s, v23.2s, v24.s[1] + ldr s23, [x11, x12, lsl #2] + madd x12, x13, x15, x25 + fmla v6.2s, v23.2s, v31.2s + str s23, [x8, x14, lsl #2] + fmla v7.2s, v23.2s, v8.2s + fmla v5.2s, v23.2s, v30.2s + fmla v4.2s, v23.2s, v29.2s + fmla v2.2s, v23.2s, v28.2s + fmla v3.2s, v23.2s, v27.2s + fmla v1.2s, v23.2s, v26.2s + fmla v0.2s, v23.2s, v25.2s + ldr s31, [x11, x12, lsl #2] + ldr x11, [sp, #72] // 8-byte Folded Reload + ldp x17, x12, [sp, #272] // 16-byte Folded Reload + add x10, x10, x12, lsl #2 + ldr x12, [sp, #296] // 8-byte Folded Reload + add x11, x11, #4 + mul x11, x15, x11 + str s31, [x8, x13, lsl #2] + ldr x13, [sp, #120] // 8-byte Folded Reload + fmla v6.2s, v31.2s, v16.s[3] + fmla v7.2s, v31.2s, v17.s[3] + fmla v5.2s, v31.2s, v18.s[3] + fmla v4.2s, v31.2s, v19.s[3] + fmla v2.2s, v31.2s, v20.s[3] + fmla v3.2s, v31.2s, v21.s[3] + fmla v1.2s, v31.2s, v22.s[3] + fmla v0.2s, v31.2s, v24.s[3] + add x10, x12, x10 + ldr x12, [sp, #320] // 8-byte Folded Reload + add x12, x13, x12, lsl #2 + ldr x13, [sp, #80] // 8-byte Folded Reload + sub x12, x12, 
x13 + ldr x13, [sp, #328] // 8-byte Folded Reload + add x12, x12, x13 + mov x13, x29 + add x12, x12, #4 + cmp x29, x19 + b.ge .LBB0_100 + .p2align 2 +.LBB0_99: // =>This Inner Loop Header: Depth=1 + add x15, x12, x28 + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x14, x10, x11 + prfm pldl1keep, [x15] + ldur s17, [x15, #-4] + add x15, x15, x28 + add x12, x12, #4 + prfm pldl1keep, [x15] + ldur s18, [x15, #-4] + add x15, x15, x28 + prfm pldl1keep, [x15] + ldur s19, [x15, #-4] + add x15, x15, x28 + prfm pldl1keep, [x15] + ldur s20, [x15, #-4] + add x15, x15, x28 + prfm pldl1keep, [x15] + ldur s21, [x15, #-4] + add x15, x15, x28 + prfm pldl1keep, [x15] + ldur s22, [x15, #-4] + add x15, x15, x28 + prfm pldl1keep, [x15] + ldur s23, [x15, #-4] + prfm pldl1keep, [x14] + ldr s24, [x10, x16, lsl #2] + add x10, x10, x17 + fmla v6.2s, v24.2s, v16.2s + str s24, [x8, x13, lsl #2] + add x13, x13, #1 + fmla v7.2s, v24.2s, v17.2s + fmla v5.2s, v24.2s, v18.2s + fmla v4.2s, v24.2s, v19.2s + fmla v2.2s, v24.2s, v20.2s + fmla v3.2s, v24.2s, v21.2s + fmla v1.2s, v24.2s, v22.2s + fmla v0.2s, v24.2s, v23.2s + cmp x13, x19 + b.lt .LBB0_99 +.LBB0_100: // %.preheader + ldr x10, [sp, #344] // 8-byte Folded Reload + ldr x11, [sp, #496] // 8-byte Folded Reload + mov x6, xzr + mov w16, #7 // =0x7 + ldr x15, [sp, #328] // 8-byte Folded Reload + ldr x12, [sp, #88] // 8-byte Folded Reload + mov w17, #6 // =0x6 + mov w18, #5 // =0x5 + mov w1, #4 // =0x4 + mov w2, #3 // =0x3 + mov w3, #2 // =0x2 + mov w4, #1 // =0x1 + add x13, x11, x10 + sub x10, x8, x30, lsl #2 + add x14, x12, x15 + add x11, x8, #12 + mov w12, #8 // =0x8 + add x15, x13, x15 + add x13, x14, #4 + add x10, x10, x19, lsl #2 + add x14, x15, #32 + add x15, x10, #4 + b .LBB0_102 + .p2align 2 +.LBB0_101: // %.loopexit + // in Loop: Header=BB0_102 Depth=1 + ldr x6, [sp, #496] // 8-byte Folded Reload + ldr x7, [sp, #520] // 8-byte Folded Reload + add x14, x14, x6 + add x13, x13, x6 + mov x6, x12 + mov x12, x5 +.LBB0_102: // =>This Loop 
Header: Depth=1 + // Child Loop BB0_104 Depth 2 + // Child Loop BB0_106 Depth 2 + madd x5, x6, x27, x25 + cmp x12, x7 + str s6, [x24, x5, lsl #2] + madd x4, x4, x27, x25 + madd x3, x3, x27, x25 + madd x2, x2, x27, x25 + madd x1, x1, x27, x25 + str s7, [x24, x4, lsl #2] + str s5, [x24, x3, lsl #2] + str s4, [x24, x2, lsl #2] + str s2, [x24, x1, lsl #2] + madd x18, x18, x27, x25 + str s3, [x24, x18, lsl #2] + madd x17, x17, x27, x25 + str s1, [x24, x17, lsl #2] + madd x16, x16, x27, x25 + str s0, [x24, x16, lsl #2] + b.ge .LBB0_107 +// %bb.103: // in Loop: Header=BB0_102 Depth=1 + madd x3, x12, x27, x25 + add x2, x12, #3 + add x18, x12, #5 + add x1, x12, #4 + madd x4, x2, x27, x25 + add x17, x12, #6 + add x16, x12, #7 + ldr s24, [x8] + madd x7, x18, x27, x25 + mov x6, xzr + ldr s6, [x24, x3, lsl #2] + add x3, x12, #2 + madd x5, x1, x27, x25 + madd x20, x17, x27, x25 + madd x21, x16, x27, x25 + ldr s3, [x24, x7, lsl #2] + ldr s4, [x24, x4, lsl #2] + ldr s2, [x24, x5, lsl #2] + ldr s0, [x24, x21, lsl #2] + madd x4, x3, x27, x25 + ldr s1, [x24, x20, lsl #2] + mov x20, x14 + mul x7, x12, x22 + ldr s5, [x24, x4, lsl #2] + add x4, x12, #1 + lsl x7, x7, #2 + ldr q23, [x9, x7] + mul x7, x4, x22 + madd x5, x4, x27, x25 + lsl x7, x7, #2 + ldr s7, [x24, x5, lsl #2] + add x5, x12, #8 + ldr q22, [x9, x7] + mul x7, x3, x22 + lsl x7, x7, #2 + ldr q21, [x9, x7] + mul x7, x2, x22 + lsl x7, x7, #2 + ldr q20, [x9, x7] + mul x7, x1, x22 + lsl x7, x7, #2 + ldr q19, [x9, x7] + mul x7, x18, x22 + lsl x7, x7, #2 + ldr q18, [x9, x7] + mul x7, x17, x22 + lsl x7, x7, #2 + ldr q17, [x9, x7] + mul x7, x16, x22 + lsl x7, x7, #2 + ldr q16, [x9, x7] + mov x7, x11 + fmla v6.2s, v24.2s, v23.2s + cmp xzr, x23 + b.ge .LBB0_105 + .p2align 2 +.LBB0_104: // Parent Loop BB0_102 Depth=1 + // => This Inner Loop Header: Depth=2 + add x21, x7, #8 + fmla v4.2s, v24.2s, v20.2s + fmla v7.2s, v24.2s, v22.2s + add x6, x6, #4 + prfm pldl1keep, [x21] + ldp s27, s25, [x7, #-8] + fmla v5.2s, v24.2s, v21.2s + fmla 
v2.2s, v24.2s, v19.2s + fmla v3.2s, v24.2s, v18.2s + fmla v1.2s, v24.2s, v17.2s + add x21, x20, x28 + ext v28.16b, v20.16b, v20.16b, #8 + fmla v0.2s, v24.2s, v16.2s + fmla v4.2s, v27.2s, v20.s[1] + fmla v6.2s, v27.2s, v23.s[1] + fmla v7.2s, v27.2s, v22.s[1] + fmla v5.2s, v27.2s, v21.s[1] + fmla v2.2s, v27.2s, v19.s[1] + ldp s26, s24, [x7], #16 + prfm pldl1keep, [x20] + fmla v3.2s, v27.2s, v18.s[1] + fmla v1.2s, v27.2s, v17.s[1] + fmla v0.2s, v27.2s, v16.s[1] + fmla v4.2s, v25.2s, v28.2s + ext v30.16b, v23.16b, v23.16b, #8 + ext v31.16b, v22.16b, v22.16b, #8 + fmla v6.2s, v25.2s, v30.2s + fmla v7.2s, v25.2s, v31.2s + fmla v6.2s, v26.2s, v23.s[3] + ldur q23, [x20, #-16] + ext v29.16b, v21.16b, v21.16b, #8 + ext v28.16b, v19.16b, v19.16b, #8 + fmla v5.2s, v25.2s, v29.2s + prfm pldl1keep, [x21] + add x20, x20, #16 + fmla v2.2s, v25.2s, v28.2s + fmla v7.2s, v26.2s, v22.s[3] + ldur q22, [x21, #-16] + add x21, x21, x28 + prfm pldl1keep, [x21] + fmla v5.2s, v26.2s, v21.s[3] + ldur q21, [x21, #-16] + add x21, x21, x28 + prfm pldl1keep, [x21] + ext v28.16b, v18.16b, v18.16b, #8 + fmla v3.2s, v25.2s, v28.2s + fmla v4.2s, v26.2s, v20.s[3] + ldur q20, [x21, #-16] + add x21, x21, x28 + prfm pldl1keep, [x21] + fmla v2.2s, v26.2s, v19.s[3] + ldur q19, [x21, #-16] + add x21, x21, x28 + prfm pldl1keep, [x21] + fmla v3.2s, v26.2s, v18.s[3] + ldur q18, [x21, #-16] + add x21, x21, x28 + prfm pldl1keep, [x21] + ext v28.16b, v17.16b, v17.16b, #8 + fmla v1.2s, v25.2s, v28.2s + fmla v1.2s, v26.2s, v17.s[3] + ldur q17, [x21, #-16] + add x21, x21, x28 + prfm pldl1keep, [x21] + ext v27.16b, v16.16b, v16.16b, #8 + fmla v0.2s, v25.2s, v27.2s + fmla v0.2s, v26.2s, v16.s[3] + ldur q16, [x21, #-16] + fmla v6.2s, v24.2s, v23.2s + cmp x6, x23 + b.lt .LBB0_104 +.LBB0_105: // in Loop: Header=BB0_102 Depth=1 + ldp x7, x6, [sp, #400] // 16-byte Folded Reload + fmla v7.2s, v24.2s, v22.2s + fmla v4.2s, v24.2s, v20.2s + fmla v5.2s, v24.2s, v21.2s + fmla v2.2s, v24.2s, v19.2s + mov x20, x29 + fmla v3.2s, 
v24.2s, v18.2s + fmla v1.2s, v24.2s, v17.2s + ldr s26, [x8, x7, lsl #2] + fmla v0.2s, v24.2s, v16.2s + ldr s27, [x8, x6, lsl #2] + ext v24.16b, v23.16b, v23.16b, #8 + ldr x6, [sp, #416] // 8-byte Folded Reload + mov x7, x15 + ldr s25, [x8, x6, lsl #2] + mov x6, x13 + fmla v6.2s, v26.2s, v23.s[1] + fmla v7.2s, v26.2s, v22.s[1] + fmla v4.2s, v26.2s, v20.s[1] + fmla v2.2s, v26.2s, v19.s[1] + fmla v5.2s, v26.2s, v21.s[1] + fmla v3.2s, v26.2s, v18.s[1] + fmla v1.2s, v26.2s, v17.s[1] + fmla v0.2s, v26.2s, v16.s[1] + ext v26.16b, v21.16b, v21.16b, #8 + fmla v6.2s, v27.2s, v24.2s + ext v24.16b, v22.16b, v22.16b, #8 + fmla v5.2s, v27.2s, v26.2s + fmla v7.2s, v27.2s, v24.2s + ext v24.16b, v20.16b, v20.16b, #8 + ext v26.16b, v17.16b, v17.16b, #8 + fmla v1.2s, v27.2s, v26.2s + fmla v4.2s, v27.2s, v24.2s + ext v24.16b, v19.16b, v19.16b, #8 + fmla v6.2s, v25.2s, v23.s[3] + fmla v5.2s, v25.2s, v21.s[3] + fmla v2.2s, v27.2s, v24.2s + fmla v7.2s, v25.2s, v22.s[3] + ext v24.16b, v18.16b, v18.16b, #8 + fmla v1.2s, v25.2s, v17.s[3] + fmla v3.2s, v27.2s, v24.2s + ext v24.16b, v16.16b, v16.16b, #8 + fmla v4.2s, v25.2s, v20.s[3] + fmla v0.2s, v27.2s, v24.2s + fmla v2.2s, v25.2s, v19.s[3] + fmla v3.2s, v25.2s, v18.s[3] + fmla v0.2s, v25.2s, v16.s[3] + cmp x29, x19 + b.ge .LBB0_101 + .p2align 2 +.LBB0_106: // Parent Loop BB0_102 Depth=1 + // => This Inner Loop Header: Depth=2 + add x21, x6, x28 + prfm pldl1keep, [x6] + ldur s16, [x6, #-4] + add x20, x20, #1 + prfm pldl1keep, [x21] + ldur s17, [x21, #-4] + add x21, x21, x28 + add x6, x6, #4 + prfm pldl1keep, [x21] + ldur s18, [x21, #-4] + add x21, x21, x28 + prfm pldl1keep, [x21] + ldur s19, [x21, #-4] + add x21, x21, x28 + prfm pldl1keep, [x21] + ldur s20, [x21, #-4] + add x21, x21, x28 + prfm pldl1keep, [x21] + ldur s21, [x21, #-4] + add x21, x21, x28 + prfm pldl1keep, [x21] + ldur s22, [x21, #-4] + add x21, x21, x28 + prfm pldl1keep, [x21] + ldur s23, [x21, #-4] + prfm pldl1keep, [x7] + ldur s24, [x7, #-4] + add x7, x7, #4 + fmla v6.2s, 
v24.2s, v16.2s + fmla v7.2s, v24.2s, v17.2s + fmla v5.2s, v24.2s, v18.2s + fmla v4.2s, v24.2s, v19.2s + fmla v2.2s, v24.2s, v20.2s + fmla v3.2s, v24.2s, v21.2s + fmla v1.2s, v24.2s, v22.2s + fmla v0.2s, v24.2s, v23.2s + cmp x20, x19 + b.lt .LBB0_106 + b .LBB0_101 +.LBB0_107: + ldr x13, [sp, #336] // 8-byte Folded Reload + cmp x7, x13 + b.lt .LBB0_110 +// %bb.108: + ldr x11, [sp, #312] // 8-byte Folded Reload + cmp x13, x11 + b.lt .LBB0_115 +.LBB0_109: + ldr x11, [sp, #288] // 8-byte Folded Reload + ldr x12, [sp, #312] // 8-byte Folded Reload + cmp x12, x11 + b.lt .LBB0_120 + b .LBB0_125 +.LBB0_110: + add x18, x7, #1 + add x1, x7, #2 + add x2, x7, #3 + mul x14, x7, x27 + madd x13, x18, x27, x25 + mov x16, xzr + add x14, x14, x25 + mul x18, x18, x22 + mul x15, x7, x22 + madd x12, x1, x27, x25 + lsl x17, x15, #2 + lsl x18, x18, #2 + madd x11, x2, x27, x25 + ldr s2, [x24, x14, lsl #2] + ldr s0, [x24, x11, lsl #2] + ldr s1, [x24, x12, lsl #2] + ldr s3, [x24, x13, lsl #2] + ldr q6, [x9, x17] + ldr q7, [x9, x18] + mul x18, x1, x22 + mov x17, x8 + ldr s16, [x17], #12 + lsl x18, x18, #2 + ldr q5, [x9, x18] + mul x18, x2, x22 + lsl x18, x18, #2 + ldp x2, x1, [sp, #320] // 16-byte Folded Reload + ldr q4, [x9, x18] + ldr x18, [sp, #32] // 8-byte Folded Reload + lsl x18, x18, #5 + add x18, x18, x2, lsl #2 + add x18, x18, x1 + add x18, x18, #32 + ext v20.16b, v6.16b, v6.16b, #8 + cmp xzr, x23 + ext v19.16b, v7.16b, v7.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.ge .LBB0_112 + .p2align 2 +.LBB0_111: // =>This Inner Loop Header: Depth=1 + add x1, x17, #8 + fmla v2.2s, v16.2s, v6.2s + fmla v3.2s, v16.2s, v7.2s + add x16, x16, #4 + fmla v1.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + prfm pldl1keep, [x1] + add x1, x18, x28 + ldp s16, s21, [x17, #-8] + fmla v0.2s, v16.2s, v4.s[1] + fmla v2.2s, v16.2s, v6.s[1] + fmla v3.2s, v16.2s, v7.s[1] + fmla v1.2s, v16.2s, v5.s[1] + fmla v0.2s, v21.2s, v17.2s + fmla v2.2s, v21.2s, v20.2s + ldp s17, s16, 
[x17], #16 + fmla v3.2s, v21.2s, v19.2s + fmla v1.2s, v21.2s, v18.2s + prfm pldl1keep, [x18] + fmla v2.2s, v17.2s, v6.s[3] + ldur q6, [x18, #-16] + prfm pldl1keep, [x1] + fmla v3.2s, v17.2s, v7.s[3] + ldur q7, [x1, #-16] + add x1, x1, x28 + fmla v1.2s, v17.2s, v5.s[3] + fmla v0.2s, v17.2s, v4.s[3] + add x18, x18, #16 + prfm pldl1keep, [x1] + ldur q5, [x1, #-16] + add x1, x1, x28 + prfm pldl1keep, [x1] + ldur q4, [x1, #-16] + ext v20.16b, v6.16b, v6.16b, #8 + cmp x16, x23 + ext v19.16b, v7.16b, v7.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.lt .LBB0_111 +.LBB0_112: + ldp x17, x16, [sp, #400] // 16-byte Folded Reload + fmla v2.2s, v16.2s, v6.2s + fmla v3.2s, v16.2s, v7.2s + fmla v1.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + add x15, x19, x15 + ldr s21, [x8, x17, lsl #2] + ldr s16, [x8, x16, lsl #2] + ldr x16, [sp, #416] // 8-byte Folded Reload + ldr x17, [sp, #344] // 8-byte Folded Reload + ldr s22, [x8, x16, lsl #2] + sub x16, x15, x30 + add x15, x10, #4 + add x16, x17, x16, lsl #2 + ldr x17, [sp, #328] // 8-byte Folded Reload + fmla v2.2s, v21.2s, v6.s[1] + fmla v3.2s, v21.2s, v7.s[1] + fmla v1.2s, v21.2s, v5.s[1] + fmla v0.2s, v21.2s, v4.s[1] + add x16, x16, x17 + mov x17, x29 + fmla v2.2s, v16.2s, v20.2s + fmla v3.2s, v16.2s, v19.2s + fmla v1.2s, v16.2s, v18.2s + fmla v0.2s, v16.2s, v17.2s + add x16, x16, #4 + fmla v2.2s, v22.2s, v6.s[3] + fmla v3.2s, v22.2s, v7.s[3] + fmla v1.2s, v22.2s, v5.s[3] + fmla v0.2s, v22.2s, v4.s[3] + cmp x29, x19 + b.ge .LBB0_114 + .p2align 2 +.LBB0_113: // =>This Inner Loop Header: Depth=1 + add x18, x16, x28 + prfm pldl1keep, [x16] + ldur s4, [x16, #-4] + add x17, x17, #1 + prfm pldl1keep, [x18] + ldur s5, [x18, #-4] + add x18, x18, x28 + add x16, x16, #4 + prfm pldl1keep, [x18] + ldur s6, [x18, #-4] + add x18, x18, x28 + prfm pldl1keep, [x18] + ldur s7, [x18, #-4] + prfm pldl1keep, [x15] + ldur s16, [x15, #-4] + add x15, x15, #4 + fmla v2.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + fmla 
v1.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + cmp x17, x19 + b.lt .LBB0_113 +.LBB0_114: + str s2, [x24, x14, lsl #2] + str s3, [x24, x13, lsl #2] + ldr x13, [sp, #336] // 8-byte Folded Reload + str s1, [x24, x12, lsl #2] + str s0, [x24, x11, lsl #2] + ldr x11, [sp, #312] // 8-byte Folded Reload + cmp x13, x11 + b.ge .LBB0_109 +.LBB0_115: + mul x11, x13, x27 + add x14, x13, #1 + ldr x1, [sp, #328] // 8-byte Folded Reload + ldr x2, [sp, #344] // 8-byte Folded Reload + mul x13, x13, x22 + ldr s4, [x8] + mov x15, xzr + mov x16, xzr + madd x12, x14, x27, x25 + lsl x17, x13, #2 + mul x14, x14, x22 + add x18, x1, x2 + add x11, x11, x25 + ldr q3, [x9, x17] + lsl x17, x14, #2 + ldr s0, [x24, x11, lsl #2] + ldr s1, [x24, x12, lsl #2] + ldr q2, [x9, x17] + add x17, x18, x17 + ldr x18, [sp, #40] // 8-byte Folded Reload + add x18, x2, x18, lsl #4 + add x18, x1, x18 + ext v6.16b, v3.16b, v3.16b, #8 + cmp xzr, x23 + ext v5.16b, v2.16b, v2.16b, #8 + b.ge .LBB0_117 + .p2align 2 +.LBB0_116: // =>This Inner Loop Header: Depth=1 + add x5, x8, x15 + fmla v0.2s, v4.2s, v3.2s + fmla v1.2s, v4.2s, v2.2s + add x1, x17, x15 + add x6, x5, #20 + add x3, x18, x15 + add x2, x1, #32 + add x4, x3, #32 + prfm pldl1keep, [x6] + ldp s4, s7, [x5, #4] + add x16, x16, #4 + add x15, x15, #16 + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v7.2s, v5.2s + ldp s5, s4, [x5, #12] + fmla v0.2s, v7.2s, v6.2s + prfm pldl1keep, [x4] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x3, #16] + prfm pldl1keep, [x2] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x1, #16] + ext v6.16b, v3.16b, v3.16b, #8 + cmp x16, x23 + ext v5.16b, v2.16b, v2.16b, #8 + b.lt .LBB0_116 +.LBB0_117: + ldp x17, x16, [sp, #400] // 16-byte Folded Reload + fmla v0.2s, v4.2s, v3.2s + fmla v1.2s, v4.2s, v2.2s + add x13, x19, x13 + mov x15, xzr + ldr s7, [x8, x17, lsl #2] + ldr s4, [x8, x16, lsl #2] + ldr x16, [sp, #416] // 8-byte Folded Reload + ldr x17, [sp, #328] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v3.s[1] + fmla 
v1.2s, v7.2s, v2.s[1] + ldr s7, [x8, x16, lsl #2] + sub x16, x13, x30 + add x13, x19, x14 + ldr x14, [sp, #344] // 8-byte Folded Reload + sub x13, x13, x30 + fmla v0.2s, v4.2s, v6.2s + fmla v1.2s, v4.2s, v5.2s + add x13, x14, x13, lsl #2 + add x14, x14, x16, lsl #2 + mov x16, x29 + add x13, x17, x13 + add x14, x17, x14 + fmla v0.2s, v7.2s, v3.s[3] + fmla v1.2s, v7.2s, v2.s[3] + cmp x29, x19 + b.ge .LBB0_119 + .p2align 2 +.LBB0_118: // =>This Inner Loop Header: Depth=1 + add x17, x10, x15 + add x18, x13, x15 + add x1, x14, x15 + add x16, x16, #1 + add x17, x17, #4 + add x18, x18, #4 + add x1, x1, #4 + prfm pldl1keep, [x1] + prfm pldl1keep, [x18] + ldr s2, [x14, x15] + prfm pldl1keep, [x17] + ldr s3, [x10, x15] + fmla v0.2s, v3.2s, v2.2s + ldr s2, [x13, x15] + add x15, x15, #4 + fmla v1.2s, v3.2s, v2.2s + cmp x16, x19 + b.lt .LBB0_118 +.LBB0_119: + str s0, [x24, x11, lsl #2] + str s1, [x24, x12, lsl #2] + ldr x11, [sp, #288] // 8-byte Folded Reload + ldr x12, [sp, #312] // 8-byte Folded Reload + cmp x12, x11 + b.ge .LBB0_125 +.LBB0_120: + mul x11, x12, x27 + mov x14, x8 + mov x13, xzr + add x11, x11, x25 + mul x12, x12, x22 + lsl x15, x12, #2 + ldr s2, [x14], #12 + ldr q1, [x9, x15] + ldr x9, [sp, #48] // 8-byte Folded Reload + ldr x15, [sp, #320] // 8-byte Folded Reload + ldr s0, [x24, x11, lsl #2] + lsl x9, x9, #3 + add x9, x9, x15, lsl #2 + ldr x15, [sp, #328] // 8-byte Folded Reload + add x9, x9, x15 + add x9, x9, #32 + ext v3.16b, v1.16b, v1.16b, #8 + cmp xzr, x23 + b.ge .LBB0_122 + .p2align 2 +.LBB0_121: // =>This Inner Loop Header: Depth=1 + add x15, x14, #8 + fmla v0.2s, v2.2s, v1.2s + add x13, x13, #4 + prfm pldl1keep, [x15] + ldp s2, s4, [x14, #-8] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v4.2s, v3.2s + ldp s3, s2, [x14], #16 + prfm pldl1keep, [x9] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x9, #-16] + add x9, x9, #16 + ext v3.16b, v1.16b, v1.16b, #8 + cmp x13, x23 + b.lt .LBB0_121 +.LBB0_122: + ldp x14, x13, [sp, #400] // 16-byte Folded Reload + fmla 
v0.2s, v2.2s, v1.2s + mov x9, xzr + ldr s4, [x8, x14, lsl #2] + ldr s2, [x8, x13, lsl #2] + ldr x13, [sp, #416] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[1] + ldr s4, [x8, x13, lsl #2] + add x8, x19, x12 + ldr x12, [sp, #344] // 8-byte Folded Reload + sub x8, x8, x30 + fmla v0.2s, v2.2s, v3.2s + add x8, x12, x8, lsl #2 + ldr x12, [sp, #328] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[3] + add x8, x12, x8 + cmp x29, x19 + b.ge .LBB0_124 + .p2align 2 +.LBB0_123: // =>This Inner Loop Header: Depth=1 + add x12, x10, x9 + add x13, x8, x9 + add x29, x29, #1 + add x12, x12, #4 + add x13, x13, #4 + prfm pldl1keep, [x13] + prfm pldl1keep, [x12] + ldr s1, [x10, x9] + ldr s2, [x8, x9] + add x9, x9, #4 + fmla v0.2s, v1.2s, v2.2s + cmp x29, x19 + b.lt .LBB0_123 +.LBB0_124: + str s0, [x24, x11, lsl #2] +.LBB0_125: + bl free +.LBB0_126: + add sp, sp, #512 + ldp d9, d8, [sp, #32] // 16-byte Folded Reload + ldp d11, d10, [sp, #16] // 16-byte Folded Reload + ldp x20, x19, [sp, #128] // 16-byte Folded Reload + ldp x22, x21, [sp, #112] // 16-byte Folded Reload + ldp x24, x23, [sp, #96] // 16-byte Folded Reload + ldp x26, x25, [sp, #80] // 16-byte Folded Reload + ldp x28, x27, [sp, #64] // 16-byte Folded Reload + ldp x29, x30, [sp, #48] // 16-byte Folded Reload + ldr d12, [sp], #144 // 8-byte Folded Reload + ret +.Lfunc_end0: + .size sgemm_nn_alpha1_beta1_mlir, .Lfunc_end0-sgemm_nn_alpha1_beta1_mlir + .cfi_endproc + // -- End function + .section ".note.GNU-stack","",@progbits diff --git a/third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemv_n_alpha1_beta1_mlir.s b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemv_n_alpha1_beta1_mlir.s new file mode 100644 index 00000000000000..ffd32ba76066c8 --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemv_n_alpha1_beta1_mlir.s @@ -0,0 +1,709 @@ + .text + .file "LLVMDialectModule" + .globl sgemv_n_alpha1_beta1_mlir // -- Begin function sgemv_n_alpha1_beta1_mlir + .p2align 4 + .type 
sgemv_n_alpha1_beta1_mlir,@function +sgemv_n_alpha1_beta1_mlir: // @sgemv_n_alpha1_beta1_mlir + .cfi_startproc +// %bb.0: + sub sp, sp, #112 + stp x29, x30, [sp, #16] // 16-byte Folded Spill + stp x28, x27, [sp, #32] // 16-byte Folded Spill + stp x26, x25, [sp, #48] // 16-byte Folded Spill + stp x24, x23, [sp, #64] // 16-byte Folded Spill + stp x22, x21, [sp, #80] // 16-byte Folded Spill + stp x20, x19, [sp, #96] // 16-byte Folded Spill + .cfi_def_cfa_offset 112 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + cmp x4, #0 + ldr x9, [sp, #112] + lsl x7, x5, #2 + mov x27, xzr + cinv x8, x4, lt + add x0, x1, #448 + lsl x2, x5, #4 + add x10, x8, x8, lsr #63 + add x11, x8, #3 + add x12, x8, #7 + add x19, x7, #448 + asr x10, x10, #1 + mov x15, x9 + cinv x10, x10, lt + cmp x8, #0 + csel x11, x11, x8, lt + csel x8, x12, x8, lt + cmp x4, #0 + asr x11, x11, #2 + asr x8, x8, #3 + cinv x12, x11, lt + cinv x14, x8, lt + cmp x3, #0 + lsl x11, x10, #3 + cinv x6, x3, lt + lsl x13, x12, #4 + lsl x16, x14, #5 + add x21, x9, x11 + lsl x12, x12, #2 + lsl x14, x14, #3 + add x8, x6, #3 + cmp x6, #0 + csel x8, x8, x6, lt + cmp x3, #0 + asr x8, x8, #2 + cinv x8, x8, lt + stp x8, x9, [sp] // 16-byte Folded Spill + lsl x17, x8, #2 + add x8, x11, x1 + add x20, x8, #72 + lsl x8, x10, #1 + add x10, x13, #128 + add x22, x1, x10 + add x23, x9, x10 + add x10, x16, #256 + add x24, x1, x10 + add x25, x9, x10 + ldr x10, [sp, #152] + b .LBB0_2 + .p2align 2 +.LBB0_1: // in Loop: Header=BB0_2 Depth=1 + mov s5, v0.s[2] + fadd s2, s2, s0 + add x0, x0, x2 + add x20, x20, x2 + add x24, x24, x2 + add x22, x22, x2 + fadd s3, s3, s5 + mov s5, v0.s[1] + mov s0, v0.s[3] + fadd s4, s4, s5 + fadd s0, s1, s0 + mov v2.s[1], v4.s[0] + mov v2.s[2], v3.s[0] + mov v2.s[3], v0.s[0] + str q2, [x26] 
+.LBB0_2: // =>This Loop Header: Depth=1 + // Child Loop BB0_4 Depth 2 + // Child Loop BB0_6 Depth 2 + // Child Loop BB0_8 Depth 2 + // Child Loop BB0_10 Depth 2 + cmp x27, x17 + b.ge .LBB0_11 +// %bb.3: // in Loop: Header=BB0_2 Depth=1 + add x26, x10, x27, lsl #2 + movi v4.2d, #0000000000000000 + movi v3.2d, #0000000000000000 + movi v5.2d, #0000000000000000 + movi v0.2d, #0000000000000000 + mov x28, x0 + mov x29, xzr + mov x30, x15 + ldr q1, [x26] + movi v7.2d, #0000000000000000 + movi v16.2d, #0000000000000000 + add x27, x27, #4 + movi v2.2d, #0000000000000000 + movi v6.2d, #0000000000000000 + cmp xzr, x14 + b.ge .LBB0_5 + .p2align 2 +.LBB0_4: // Parent Loop BB0_2 Depth=1 + // => This Inner Loop Header: Depth=2 + sub x18, x28, #448 + prfm pldl1keep, [x28] + add x28, x28, #32 + add x29, x29, #8 + ldp q18, q17, [x18] + add x18, x18, x19 + prfm pldl1keep, [x18] + sub x18, x18, #448 + add x9, x18, x19 + ldp q20, q19, [x18] + prfm pldl1keep, [x9] + sub x9, x9, #448 + add x18, x9, x19 + ldp q22, q21, [x9] + add x9, x30, #448 + prfm pldl1keep, [x18] + ldp q23, q24, [x18, #-448]! 
+ prfm pldl1keep, [x9] + ldp q26, q25, [x30], #32 + fmla v6.4s, v21.4s, v25.4s + fmla v3.4s, v19.4s, v25.4s + fmla v7.4s, v24.4s, v25.4s + fmla v0.4s, v17.4s, v25.4s + fmla v5.4s, v22.4s, v26.4s + fmla v4.4s, v20.4s, v26.4s + fmla v2.4s, v18.4s, v26.4s + fmla v16.4s, v23.4s, v26.4s + cmp x29, x14 + b.lt .LBB0_4 +.LBB0_5: // in Loop: Header=BB0_2 Depth=1 + mov s17, v1.s[3] + mov s18, v16.s[1] + mov x28, x25 + mov x29, x14 + mov x30, x24 + fadd s17, s17, s16 + fadd s17, s17, s18 + mov s18, v16.s[2] + mov s16, v16.s[3] + fadd s17, s17, s18 + fadd s16, s17, s16 + mov s17, v7.s[1] + fadd s16, s16, s7 + fadd s16, s16, s17 + mov s17, v7.s[2] + mov s7, v7.s[3] + fadd s16, s16, s17 + mov s17, v5.s[1] + fadd s7, s16, s7 + mov s16, v1.s[2] + fadd s16, s16, s5 + fadd s16, s16, s17 + mov s17, v5.s[2] + mov s5, v5.s[3] + fadd s16, s16, s17 + fadd s5, s16, s5 + mov s16, v6.s[1] + fadd s5, s5, s6 + fadd s5, s5, s16 + mov s16, v6.s[2] + mov s6, v6.s[3] + fadd s5, s5, s16 + mov s16, v4.s[1] + fadd s5, s5, s6 + mov s6, v1.s[1] + fadd s1, s1, s2 + fadd s6, s6, s4 + fadd s6, s6, s16 + mov s16, v4.s[2] + mov s4, v4.s[3] + fadd s6, s6, s16 + fadd s4, s6, s4 + mov s6, v3.s[1] + fadd s4, s4, s3 + fadd s4, s4, s6 + mov s6, v3.s[2] + mov s3, v3.s[3] + fadd s4, s4, s6 + fadd s3, s4, s3 + mov s4, v2.s[1] + fadd s1, s1, s4 + mov s4, v2.s[2] + mov s2, v2.s[3] + fadd s1, s1, s4 + movi v4.2d, #0000000000000000 + fadd s1, s1, s2 + mov s2, v0.s[1] + fadd s1, s1, s0 + fadd s1, s1, s2 + mov s2, v0.s[2] + mov s0, v0.s[3] + fadd s1, s1, s2 + movi v2.2d, #0000000000000000 + fadd s0, s1, s0 + movi v1.2d, #0000000000000000 + mov v0.s[1], v3.s[0] + movi v3.2d, #0000000000000000 + mov v0.s[2], v5.s[0] + mov v0.s[3], v7.s[0] + str q0, [x26] + cmp x14, x12 + b.ge .LBB0_7 + .p2align 2 +.LBB0_6: // Parent Loop BB0_2 Depth=1 + // => This Inner Loop Header: Depth=2 + add x9, x30, x7 + prfm pldl1keep, [x30] + ldur q5, [x30, #-256] + add x30, x30, #16 + add x18, x9, x7 + prfm pldl1keep, [x9] + ldur q6, [x9, #-256] + 
add x29, x29, #4 + add x9, x18, x7 + prfm pldl1keep, [x18] + ldur q7, [x18, #-256] + prfm pldl1keep, [x9] + ldur q16, [x9, #-256] + prfm pldl1keep, [x28] + ldur q17, [x28, #-256] + add x28, x28, #16 + fmla v3.4s, v7.4s, v17.4s + fmla v1.4s, v16.4s, v17.4s + fmla v4.4s, v6.4s, v17.4s + fmla v2.4s, v5.4s, v17.4s + cmp x29, x12 + b.lt .LBB0_6 +.LBB0_7: // in Loop: Header=BB0_2 Depth=1 + mov s5, v0.s[1] + mov s6, v4.s[1] + mov x28, x23 + mov x29, x22 + mov s7, v1.s[1] + mov x30, x12 + fadd s5, s5, s4 + fadd s5, s5, s6 + mov s6, v4.s[2] + mov s4, v4.s[3] + fadd s5, s5, s6 + fadd s6, s0, s2 + fadd s4, s5, s4 + mov s5, v2.s[1] + fadd s5, s6, s5 + mov s6, v2.s[2] + mov s2, v2.s[3] + fadd s5, s5, s6 + movi d6, #0000000000000000 + fadd s2, s5, s2 + mov s5, v3.s[1] + mov v2.s[1], v4.s[0] + mov s4, v0.s[2] + mov s0, v0.s[3] + fadd s4, s4, s3 + fadd s0, s0, s1 + fadd s4, s4, s5 + mov s5, v3.s[2] + fadd s0, s0, s7 + mov s7, v1.s[2] + mov s3, v3.s[3] + mov s1, v1.s[3] + fadd s4, s4, s5 + fadd s0, s0, s7 + movi d5, #0000000000000000 + fadd s3, s4, s3 + fadd s0, s0, s1 + movi d4, #0000000000000000 + mov v2.s[2], v3.s[0] + movi d3, #0000000000000000 + mov v2.s[3], v0.s[0] + str q2, [x26] + cmp x12, x8 + b.ge .LBB0_9 + .p2align 2 +.LBB0_8: // Parent Loop BB0_2 Depth=1 + // => This Inner Loop Header: Depth=2 + add x9, x29, x7 + prfm pldl1keep, [x29] + ldur d0, [x29, #-128] + add x29, x29, #8 + add x18, x9, x7 + prfm pldl1keep, [x9] + ldur d1, [x9, #-128] + add x30, x30, #2 + add x9, x18, x7 + prfm pldl1keep, [x18] + ldur d7, [x18, #-128] + prfm pldl1keep, [x9] + ldur d16, [x9, #-128] + prfm pldl1keep, [x28] + ldur d17, [x28, #-128] + add x28, x28, #8 + fmla v5.2s, v7.2s, v17.2s + fmla v6.2s, v16.2s, v17.2s + fmla v4.2s, v1.2s, v17.2s + fmla v3.2s, v0.2s, v17.2s + cmp x30, x8 + b.lt .LBB0_8 +.LBB0_9: // in Loop: Header=BB0_2 Depth=1 + mov s0, v2.s[3] + mov s1, v6.s[1] + mov x28, x21 + mov x29, x20 + mov x30, x8 + fadd s0, s0, s6 + mov s6, v2.s[2] + fadd s6, s6, s5 + fadd s1, s0, s1 + 
mov s0, v2.s[1] + fadd s2, s2, s3 + fadd s0, s0, s4 + mov s4, v4.s[1] + fadd s4, s0, s4 + mov s0, v5.s[1] + fadd s5, s6, s0 + mov s0, v3.s[1] + movi d3, #0000000000000000 + fadd s0, s2, s0 + movi d2, #0000000000000000 + mov v0.s[1], v4.s[0] + movi d4, #0000000000000000 + mov v0.s[2], v5.s[0] + mov v0.s[3], v1.s[0] + movi d1, #0000000000000000 + str q0, [x26] + cmp x8, x4 + b.ge .LBB0_1 + .p2align 2 +.LBB0_10: // Parent Loop BB0_2 Depth=1 + // => This Inner Loop Header: Depth=2 + add x9, x29, x7 + prfm pldl1keep, [x29] + ldur s5, [x29, #-72] + add x29, x29, #4 + add x18, x9, x7 + prfm pldl1keep, [x9] + ldur s6, [x9, #-72] + add x30, x30, #1 + prfm pldl1keep, [x18] + add x9, x18, x7 + ldur s7, [x18, #-72] + add x18, x28, #72 + prfm pldl1keep, [x9] + ldur s16, [x9, #-72] + prfm pldl1keep, [x18] + ldr s17, [x28], #4 + fmul s7, s7, s17 + fmul s6, s6, s17 + fmul s5, s5, s17 + fadd v3.2s, v3.2s, v7.2s + fmul s7, s16, s17 + fadd v4.2s, v4.2s, v6.2s + fadd v2.2s, v2.2s, v5.2s + fadd v1.2s, v1.2s, v7.2s + cmp x30, x4 + b.lt .LBB0_10 + b .LBB0_1 +.LBB0_11: + add x9, x6, x6, lsr #63 + cmp x3, #0 + asr x9, x9, #1 + cinv x2, x9, lt + lsl x0, x2, #1 + cmp x17, x0 + b.ge .LBB0_21 +// %bb.12: + ldr x9, [sp] // 8-byte Folded Reload + movi v4.2d, #0000000000000000 + movi v2.2d, #0000000000000000 + mov x7, x15 + movi v3.2d, #0000000000000000 + movi v0.2d, #0000000000000000 + mov x19, xzr + mul x9, x9, x5 + add x18, x1, x9, lsl #4 + add x9, x17, #1 + add x17, x10, x17, lsl #2 + mul x9, x5, x9 + ldr d1, [x17] + mov x20, x18 + add x6, x1, x9, lsl #2 + mov x21, x6 + cmp xzr, x14 + b.ge .LBB0_14 + .p2align 2 +.LBB0_13: // =>This Inner Loop Header: Depth=1 + add x9, x20, #736 + add x19, x19, #8 + prfm pldl1keep, [x9] + add x9, x21, #736 + ldp q6, q5, [x20], #32 + prfm pldl1keep, [x9] + add x9, x7, #736 + ldp q16, q7, [x21], #32 + prfm pldl1keep, [x9] + ldr q17, [x7, #16] + fmla v0.4s, v5.4s, v17.4s + fmla v3.4s, v7.4s, v17.4s + ldr q5, [x7], #32 + fmla v4.4s, v16.4s, v5.4s + fmla v2.4s, 
v6.4s, v5.4s + cmp x19, x14 + b.lt .LBB0_13 +.LBB0_14: + mov s5, v1.s[1] + mov s6, v4.s[1] + mov x7, x18 + mov x19, x6 + fadd s1, s1, s2 + mov x20, x14 + mov x21, x15 + fadd s5, s5, s4 + fadd s5, s5, s6 + mov s6, v4.s[2] + mov s4, v4.s[3] + fadd s5, s5, s6 + fadd s4, s5, s4 + mov s5, v3.s[1] + fadd s4, s4, s3 + fadd s4, s4, s5 + mov s5, v3.s[2] + mov s3, v3.s[3] + fadd s4, s4, s5 + fadd s3, s4, s3 + mov s4, v2.s[1] + fadd s1, s1, s4 + mov s4, v2.s[2] + mov s2, v2.s[3] + fadd s1, s1, s4 + fadd s1, s1, s2 + mov s2, v0.s[1] + fadd s1, s1, s0 + fadd s1, s1, s2 + mov s2, v0.s[2] + mov s0, v0.s[3] + fadd s1, s1, s2 + movi v2.2d, #0000000000000000 + fadd s0, s1, s0 + movi v1.2d, #0000000000000000 + mov v0.s[1], v3.s[0] + str d0, [x17] + cmp x14, x12 + b.ge .LBB0_16 + .p2align 2 +.LBB0_15: // =>This Inner Loop Header: Depth=1 + add x9, x7, x16 + add x20, x20, #4 + add x9, x9, #432 + prfm pldl1keep, [x9] + add x9, x19, x16 + ldr q3, [x7, x16] + add x7, x7, #16 + add x9, x9, #432 + prfm pldl1keep, [x9] + add x9, x21, x16 + add x9, x9, #432 + ldr q4, [x19, x16] + add x19, x19, #16 + prfm pldl1keep, [x9] + ldr q5, [x21, x16] + add x21, x21, #16 + fmla v2.4s, v4.4s, v5.4s + fmla v1.4s, v3.4s, v5.4s + cmp x20, x12 + b.lt .LBB0_15 +.LBB0_16: + mov s3, v0.s[1] + mov s4, v2.s[1] + mov x7, x18 + mov x19, x6 + fadd s0, s0, s1 + mov x20, x15 + mov x21, x12 + fadd s3, s3, s2 + fadd s3, s3, s4 + mov s4, v2.s[2] + mov s2, v2.s[3] + fadd s3, s3, s4 + fadd s2, s3, s2 + mov s3, v1.s[1] + fadd s0, s0, s3 + mov s3, v1.s[2] + mov s1, v1.s[3] + fadd s0, s0, s3 + fadd s0, s0, s1 + movi d1, #0000000000000000 + mov v0.s[1], v2.s[0] + movi d2, #0000000000000000 + str d0, [x17] + cmp x12, x8 + b.ge .LBB0_18 + .p2align 2 +.LBB0_17: // =>This Inner Loop Header: Depth=1 + add x9, x7, x13 + add x21, x21, #2 + add x9, x9, #216 + prfm pldl1keep, [x9] + add x9, x19, x13 + ldr d3, [x7, x13] + add x7, x7, #8 + add x9, x9, #216 + prfm pldl1keep, [x9] + add x9, x20, x13 + add x9, x9, #216 + ldr d4, [x19, x13] 
+ add x19, x19, #8 + prfm pldl1keep, [x9] + ldr d5, [x20, x13] + add x20, x20, #8 + fmla v2.2s, v4.2s, v5.2s + fmla v1.2s, v3.2s, v5.2s + cmp x21, x8 + b.lt .LBB0_17 +.LBB0_18: + mov s3, v0.s[1] + fadd s0, s0, s1 + mov x7, x15 + mov x19, x8 + fadd s3, s3, s2 + mov s2, v2.s[1] + fadd s2, s3, s2 + mov s3, v1.s[1] + movi d1, #0000000000000000 + fadd s0, s0, s3 + mov v0.s[1], v2.s[0] + movi d2, #0000000000000000 + str d0, [x17] + cmp x8, x4 + b.ge .LBB0_20 + .p2align 2 +.LBB0_19: // =>This Inner Loop Header: Depth=1 + add x9, x18, x11 + add x19, x19, #1 + add x9, x9, #128 + prfm pldl1keep, [x9] + add x9, x6, x11 + ldr s3, [x18, x11] + add x18, x18, #4 + add x9, x9, #128 + prfm pldl1keep, [x9] + add x9, x7, x11 + add x9, x9, #128 + ldr s4, [x6, x11] + add x6, x6, #4 + prfm pldl1keep, [x9] + ldr s5, [x7, x11] + add x7, x7, #4 + fmul s4, s4, s5 + fmul s3, s3, s5 + fadd v1.2s, v1.2s, v4.2s + fadd v2.2s, v2.2s, v3.2s + cmp x19, x4 + b.lt .LBB0_19 +.LBB0_20: + mov s3, v0.s[1] + fadd s0, s2, s0 + fadd s1, s1, s3 + mov v0.s[1], v1.s[0] + str d0, [x17] +.LBB0_21: + cmp x0, x3 + b.ge .LBB0_31 +// %bb.22: + mul x17, x2, x5 + ldr s2, [x10, x0, lsl #2] + movi v0.2d, #0000000000000000 + movi v1.2d, #0000000000000000 + mov x18, xzr + add x2, x1, x17, lsl #3 + cmp xzr, x14 + b.ge .LBB0_24 + .p2align 2 +.LBB0_23: // =>This Inner Loop Header: Depth=1 + add x9, x2, #1152 + add x18, x18, #8 + prfm pldl1keep, [x9] + add x9, x15, #1152 + ldp q3, q4, [x2], #32 + prfm pldl1keep, [x9] + ldr q5, [x15] + fmla v1.4s, v3.4s, v5.4s + ldr q3, [x15, #16] + add x15, x15, #32 + fmla v0.4s, v4.4s, v3.4s + cmp x18, x14 + b.lt .LBB0_23 +.LBB0_24: + fadd s2, s2, s1 + mov s3, v1.s[1] + ldr x18, [sp, #8] // 8-byte Folded Reload + add x9, x16, x17, lsl #3 + add x15, x1, x9 + add x16, x18, x16 + fadd s2, s2, s3 + mov s3, v1.s[2] + mov s1, v1.s[3] + fadd s2, s2, s3 + fadd s1, s2, s1 + mov s2, v0.s[1] + fadd s1, s1, s0 + fadd s1, s1, s2 + mov s2, v0.s[2] + mov s0, v0.s[3] + fadd s1, s1, s2 + fadd s0, s1, s0 + 
movi v1.2d, #0000000000000000 + str s0, [x10, x0, lsl #2] + cmp x14, x12 + b.ge .LBB0_26 + .p2align 2 +.LBB0_25: // =>This Inner Loop Header: Depth=1 + add x9, x15, #672 + add x14, x14, #4 + prfm pldl1keep, [x9] + add x9, x16, #672 + ldr q2, [x15], #16 + prfm pldl1keep, [x9] + ldr q3, [x16], #16 + fmla v1.4s, v2.4s, v3.4s + cmp x14, x12 + b.lt .LBB0_25 +.LBB0_26: + fadd s0, s0, s1 + mov s2, v1.s[1] + add x9, x13, x17, lsl #3 + add x13, x18, x13 + add x14, x1, x9 + fadd s0, s0, s2 + mov s2, v1.s[2] + mov s1, v1.s[3] + fadd s0, s0, s2 + fadd s0, s0, s1 + movi d1, #0000000000000000 + str s0, [x10, x0, lsl #2] + cmp x12, x8 + b.ge .LBB0_28 + .p2align 2 +.LBB0_27: // =>This Inner Loop Header: Depth=1 + add x9, x14, #336 + add x12, x12, #2 + prfm pldl1keep, [x9] + add x9, x13, #336 + ldr d2, [x14], #8 + prfm pldl1keep, [x9] + ldr d3, [x13], #8 + fmla v1.2s, v2.2s, v3.2s + cmp x12, x8 + b.lt .LBB0_27 +.LBB0_28: + fadd s0, s0, s1 + mov s2, v1.s[1] + add x9, x11, x17, lsl #3 + add x12, x1, x9 + movi d1, #0000000000000000 + add x9, x18, x11 + fadd s0, s0, s2 + str s0, [x10, x0, lsl #2] + cmp x8, x4 + b.ge .LBB0_30 + .p2align 2 +.LBB0_29: // =>This Inner Loop Header: Depth=1 + add x11, x12, #200 + add x8, x8, #1 + prfm pldl1keep, [x11] + add x11, x9, #200 + ldr s2, [x12], #4 + prfm pldl1keep, [x11] + ldr s3, [x9], #4 + fmul s2, s2, s3 + fadd v1.2s, v1.2s, v2.2s + cmp x8, x4 + b.lt .LBB0_29 +.LBB0_30: + fadd s0, s1, s0 + str s0, [x10, x0, lsl #2] +.LBB0_31: + ldp x20, x19, [sp, #96] // 16-byte Folded Reload + ldp x22, x21, [sp, #80] // 16-byte Folded Reload + ldp x24, x23, [sp, #64] // 16-byte Folded Reload + ldp x26, x25, [sp, #48] // 16-byte Folded Reload + ldp x28, x27, [sp, #32] // 16-byte Folded Reload + ldp x29, x30, [sp, #16] // 16-byte Folded Reload + add sp, sp, #112 + ret +.Lfunc_end0: + .size sgemv_n_alpha1_beta1_mlir, .Lfunc_end0-sgemv_n_alpha1_beta1_mlir + .cfi_endproc + // -- End function + .section ".note.GNU-stack","",@progbits diff --git 
a/third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_3d.cpp b/third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_3d.cpp new file mode 100644 index 00000000000000..c65157a12444ae --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_3d.cpp @@ -0,0 +1,46 @@ +#include +#include + +#include +#include +#include + +extern "C" void sbatch_matmul_3d_nn_mlir( + /* A */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, + /* B */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, + /* C */ float *, float *, int64_t, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t); + +extern "C" void sbatch_matmul_3d_nt_mlir( + /* A */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, + /* B */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, + /* C */ float *, float *, int64_t, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t); + +// C interface +extern "C" void cblas_sbatch_matmul_mlir( + const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const BLASINT BATCH, const BLASINT M, + const BLASINT N, const BLASINT K, const float *A, const BLASINT LDA, + const float *B, const BLASINT LDB, float *C, const BLASINT LDC) { + + // For the mini lib we only have nn,nt + assert(Order == CblasRowMajor); + assert(TransA == CblasNoTrans); + + memset(C, 0, BATCH * M * N * sizeof(float)); + + if (TransB == CblasTrans) { + sbatch_matmul_3d_nt_mlir(/* A */ Memref_3D_Args(A, BATCH, M, K, LDA), + /* B */ Memref_3D_Args(B, BATCH, N, K, LDB), + /* C */ Memref_3D_Args(C, BATCH, M, N, LDC)); + } else { + sbatch_matmul_3d_nn_mlir(/* A */ Memref_3D_Args(A, BATCH, M, K, LDA), + /* B */ Memref_3D_Args(B, BATCH, K, N, LDB), + /* C */ Memref_3D_Args(C, BATCH, M, N, LDC)); + } +} diff --git 
a/third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_4d.cpp b/third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_4d.cpp new file mode 100644 index 00000000000000..f92e217d3a1693 --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_4d.cpp @@ -0,0 +1,49 @@ +#include +#include + +#include +#include +#include + +extern "C" void sbatch_matmul_4d_nn_mlir( + /* A */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, int64_t, int64_t, + /* B */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, int64_t, int64_t, + /* C */ float *, float *, int64_t, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, int64_t); + +extern "C" void sbatch_matmul_4d_nt_mlir( + /* A */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, int64_t, int64_t, + /* B */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, int64_t, int64_t, + /* C */ float *, float *, int64_t, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, int64_t); + +// C interface +extern "C" void cblas_sbatch_matmul_4d_mlir( + const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const BLASINT BATCH1, + const BLASINT BATCH2, const BLASINT M, const BLASINT N, const BLASINT K, + const float *A, const BLASINT LDA, const float *B, const BLASINT LDB, + float *C, const BLASINT LDC) { + + // For the mini lib we only have nn,nt + assert(Order == CblasRowMajor); + assert(TransA == CblasNoTrans); + + memset(C, 0, BATCH1 * BATCH2 * M * N * sizeof(float)); + + if (TransB == CblasTrans) { + sbatch_matmul_4d_nt_mlir( + /* A */ Memref_4D_Args(A, BATCH1, BATCH2, M, K, LDA), + /* B */ Memref_4D_Args(B, BATCH1, BATCH2, N, K, LDB), + /* C */ Memref_4D_Args(C, BATCH1, BATCH2, M, N, LDC)); + } else { + sbatch_matmul_4d_nn_mlir( + /* A */ 
Memref_4D_Args(A, BATCH1, BATCH2, M, K, LDA), + /* B */ Memref_4D_Args(B, BATCH1, BATCH2, K, N, LDB), + /* C */ Memref_4D_Args(C, BATCH1, BATCH2, M, N, LDC)); + } +} diff --git a/third_party/xla/xla/service/libs/libblas_mlir/src/sgemm.cpp b/third_party/xla/xla/service/libs/libblas_mlir/src/sgemm.cpp new file mode 100644 index 00000000000000..b51efca3f51b71 --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/src/sgemm.cpp @@ -0,0 +1,43 @@ +#include +#include + +#include +#include + +#include + +extern "C" void sgemm_nn_alpha1_beta1_mlir( + /* alpha */ float, + /* beta */ float, + /* A */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, + /* B */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, + /* C */ float *, float *, int64_t, int64_t, int64_t, int64_t, int64_t); + +// C interface +extern "C" void cblas_sgemm_mlir( + const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const BLASINT M, const BLASINT N, + const BLASINT K, const float alpha, const float *A, const BLASINT LDA, + const float *B, const BLASINT LDB, const float beta, float *C, + const BLASINT LDC) { + // For the mini lib we only have nn, alpha=1, beta=1 or beta=0. 
+ assert(Order == CblasRowMajor); + assert(TransA == CblasNoTrans); + assert(TransB == CblasNoTrans); + assert(alpha == 1.0); + assert(beta == 1.0 || beta == 0.0); + + // This is faster + if (beta == 0.0) { + memset(C, 0, M * N * sizeof(float)); + } + + // Call MLIR kernel + sgemm_nn_alpha1_beta1_mlir(/* alpha */ 1.0, + /* beta */ 1.0, + /* A */ Memref_2D_Args(A, M, K, LDA), + /* B */ Memref_2D_Args(B, K, N, LDB), + /* C */ Memref_2D_Args(C, M, N, LDC)); +} diff --git a/third_party/xla/xla/service/libs/libblas_mlir/src/sgemv.cpp b/third_party/xla/xla/service/libs/libblas_mlir/src/sgemv.cpp new file mode 100644 index 00000000000000..4ee3441735218a --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/src/sgemv.cpp @@ -0,0 +1,43 @@ +#include +#include +#include + +#include +#include + +extern "C" void sgemv_n_alpha1_beta1_mlir(/* alpha */ float, + /* beta */ float, + /* A */ const float *, const float *, + int64_t, int64_t, int64_t, int64_t, + int64_t, + /* X */ const float *, const float *, + int64_t, int64_t, int64_t, + /* Y */ float *, float *, int64_t, + int64_t, int64_t); + +// C interface +extern "C" void cblas_sgemv_mlir(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, + const BLASINT M, const BLASINT N, + const float alpha, const float *A, + const BLASINT LDA, const float *X, + const BLASINT INCX, const float beta, float *Y, + const BLASINT INCY) { + // For the mini lib we only have nn, alpha=1, beta=0 or beta=1. 
+ assert(TransA == CblasNoTrans); + assert(Order == CblasRowMajor); + assert(alpha == 1.0); + assert(beta == 1.0 || beta == 0.0); + + // This is faster + if (beta == 0.0) { + memset(Y, 0, M * sizeof(float)); + } + + // Call MLIR kernel + sgemv_n_alpha1_beta1_mlir(/* alpha */ 1.0, + /* beta */ 1.0, + /* A */ Memref_2D_Args(A, M, N, LDA), + /* X */ Memref_1D_Args(X, N, INCX), + /* Y */ Memref_1D_Args(Y, M, INCY)); +} diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto index 854eed7235720a..7d2a49a59c2f75 100644 --- a/third_party/xla/xla/xla.proto +++ b/third_party/xla/xla/xla.proto @@ -223,6 +223,7 @@ message DebugOptions { bool xla_cpu_use_xnnpack = 359; bool xla_cpu_enable_xnnpack = 389; + bool xla_cpu_use_kernel_selector = 390; // Enabling this will enable optimizations that ignore the possibility of NaN. bool xla_enable_fast_math = 335; @@ -1210,7 +1211,7 @@ message DebugOptions { // Note: when adding a new flag, please add it to one of the hardware-specific // or hardware-agnostic sections at the top of this proto message. - // Next id: 389 + // Next id: 391 // Extra options to pass to the compilation backend (e.g. LLVM); specific // interpretation of these values is left to the backend. 
From e5dedb2a04bbbec73a03e852e424fe6cf405f31e Mon Sep 17 00:00:00 2001 From: Wen Di Date: Mon, 12 Jan 2026 17:38:20 +0800 Subject: [PATCH 3/3] add env to set cpu instructions fusion not duplicate --- .../xla/third_party/openblas/workspace.bzl | 4 +- third_party/xla/xla/debug_options_flags.cc | 4 +- third_party/xla/xla/service/cpu/BUILD.orig | 2224 -------------- .../xla/xla/service/cpu/cpu_compiler.cc | 19 +- .../xla/xla/service/cpu/cpu_compiler.cc.orig | 2720 ----------------- .../xla/service/cpu/cpu_instruction_fusion.h | 4 +- 6 files changed, 17 insertions(+), 4958 deletions(-) delete mode 100644 third_party/xla/xla/service/cpu/BUILD.orig delete mode 100644 third_party/xla/xla/service/cpu/cpu_compiler.cc.orig diff --git a/third_party/xla/third_party/openblas/workspace.bzl b/third_party/xla/third_party/openblas/workspace.bzl index 6728207dbfe58f..74367fa1a8801d 100644 --- a/third_party/xla/third_party/openblas/workspace.bzl +++ b/third_party/xla/third_party/openblas/workspace.bzl @@ -3,8 +3,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") def repo(): tf_http_archive( name = "openblas", - strip_prefix = "OpenBLAS-0.3.29", - sha256 = "38240eee1b29e2bde47ebb5d61160207dc68668a54cac62c076bb5032013b1eb", + strip_prefix = "OpenBLAS-8795fc7985635de1ecf674b87e2008a15097ffab", + sha256 = "f5ff825b3a82417d47c2ba97606ce8a5d868f863e555025f5d4112e6dfd62e2f", urls = tf_mirror_urls("https://github.com/OpenMathLib/OpenBLAS/archive/8795fc7985635de1ecf674b87e2008a15097ffab.tar.gz"), build_file = "//third_party/openblas:openblas.BUILD", ) diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc index 7792ab22f7f929..6cdfcfa8241c7c 100644 --- a/third_party/xla/xla/debug_options_flags.cc +++ b/third_party/xla/xla/debug_options_flags.cc @@ -100,8 +100,8 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { #ifdef XLA_CPU_USE_ACL opts.set_xla_cpu_use_acl(true); #endif - opts.set_xla_cpu_use_fusion_emitters(false); - 
opts.set_xla_cpu_use_thunk_runtime(false); + opts.set_xla_cpu_use_fusion_emitters(true); + opts.set_xla_cpu_use_thunk_runtime(true); opts.set_xla_cpu_use_xnnpack(false); opts.set_xla_cpu_enable_xnnpack(false); // For softmax opts.set_xla_cpu_use_kernel_selector(false); diff --git a/third_party/xla/xla/service/cpu/BUILD.orig b/third_party/xla/xla/service/cpu/BUILD.orig deleted file mode 100644 index f951a6ac93b626..00000000000000 --- a/third_party/xla/xla/service/cpu/BUILD.orig +++ /dev/null @@ -1,2224 +0,0 @@ -# Description: -# LLVM-based CPU backend for XLA. - -load("@bazel_skylib//rules:build_test.bzl", "build_test") -load( - "//third_party/compute_library:build_defs.bzl", - "acl_deps", - "if_enable_acl", -) -load( - "//xla:xla.default.bzl", - "xla_cc_binary", - "xla_cc_test", -) -load("//xla/tests:build_defs.bzl", "xla_test") -load("//xla/tsl:tsl.bzl", "internal_visibility", "tf_openmp_copts", "tsl_copts") -load("//xla/tsl:tsl.default.bzl", "filegroup", "get_compatible_with_portable") -load("//xla/tsl/mkl:build_defs.bzl", "if_graph_api", "mkl_deps") -load("//xla/tsl/platform:build_config.bzl", "tf_proto_library") -load( - "//xla/tsl/platform:build_config_root.bzl", - "if_llvm_aarch64_available", - "if_llvm_powerpc_available", - "if_llvm_system_z_available", - "if_llvm_x86_available", -) -load("//xla/tsl/platform:rules_cc.bzl", "cc_library") -load(":build_defs.bzl", "runtime_copts") - -package( - # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], - default_visibility = internal_visibility([":friends"]), - licenses = ["notice"], -) - -package_group( - name = "friends", - includes = [ - "//xla:friends", - ], -) - -# Filegroup used to collect source files for dependency checking. 
-filegroup( - name = "c_srcs", - data = glob([ - "**/*.cc", - "**/*.h", - ]), -) - -cc_library( - name = "test_header_helper", - testonly = True, - hdrs = ["test_target_triple_helper.h"], -) - -filegroup( - name = "runtime_srcs", - srcs = [ - # Single-threaded support. - "runtime_custom_call_status.cc", - "runtime_fp16.cc", - "runtime_key_value_sort.cc", - "runtime_pow.cc", - "runtime_single_threaded_conv2d.cc", - "runtime_single_threaded_conv3d.cc", - "runtime_single_threaded_fft.cc", - "runtime_single_threaded_matmul_c128.cc", - "runtime_single_threaded_matmul_c64.cc", - "runtime_single_threaded_matmul_common.h", - "runtime_single_threaded_matmul_f8.cc", - "runtime_single_threaded_matmul_f16.cc", - "runtime_single_threaded_matmul_f32.cc", - "runtime_single_threaded_matmul_f64.cc", - "runtime_single_threaded_matmul_s32.cc", - "runtime_single_threaded_matmul_u8.cc", - "runtime_topk.cc", - "xnnpack_ops.cc", - # Multi-threaded support. - "runtime_conv2d.cc", - "runtime_conv3d.cc", - "runtime_fft.cc", - "runtime_matmul_c128.cc", - "runtime_matmul_c64.cc", - "runtime_matmul_common.h", - "runtime_matmul_f16.cc", - "runtime_matmul_f32.cc", - "runtime_matmul_f64.cc", - "runtime_matmul_s32.cc", - "runtime_fork_join.cc", - "//xla/backends/cpu/runtime:runtime_srcs", - #"runtime_handle_ffi_call.cc", # TODO(b/338344732): Add "runtime_handle_ffi_call.cc". - ], - visibility = internal_visibility([":friends"]), -) - -filegroup( - name = "runtime_hdrs", - srcs = [ - # XLA Runtime support. - "buffer_desc.h", - # Single-threaded support. - "runtime_custom_call_status.h", - "runtime_fp16.h", - "runtime_key_value_sort.h", - "runtime_pow.h", - "runtime_single_threaded_conv2d.h", - "runtime_single_threaded_conv3d.h", - "runtime_single_threaded_fft.h", - "runtime_single_threaded_matmul.h", - "runtime_topk.h", - "xnnpack_ops.h", - # Multi-threaded support. 
- "runtime_conv2d.h", - "runtime_conv3d.h", - "runtime_fft.h", - "runtime_fork_join.h", - "runtime_lightweight_check.h", - "runtime_matmul.h", - "//xla/backends/cpu/runtime:runtime_hdrs", - #"runtime_handle_ffi_call.h", # TODO(b/338344732): Add "runtime_handle_ffi_call.h" - ], - visibility = internal_visibility([":friends"]), -) - -cc_library( - name = "cpu_xfeed", - srcs = ["cpu_xfeed.cc"], - hdrs = ["cpu_xfeed.h"], - deps = [ - ":cpu_runtime", - "//xla:literal", - "//xla:literal_util", - "//xla:shape_util", - "//xla:status_macros", - "//xla:types", - "//xla:util", - "//xla/service:hlo_cost_analysis", - "//xla/service:shaped_buffer", - "@com_google_absl//absl/base", - "@com_google_absl//absl/cleanup", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/types:span", - "@local_tsl//tsl/platform:errors", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:notification", - ], -) - -cc_library( - name = "cpu_transfer_manager", - srcs = ["cpu_transfer_manager.cc"], - hdrs = ["cpu_transfer_manager.h"], - deps = [ - ":cpu_runtime", - ":cpu_xfeed", - "//xla:literal", - "//xla:literal_util", - "//xla:shape_util", - "//xla:status_macros", - "//xla:types", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla/service:compiler", - "//xla/service:generic_transfer_manager", - "//xla/service:transfer_manager", - "//xla/stream_executor:device_memory", - "//xla/stream_executor:platform_manager", - "//xla/stream_executor:stream_executor_h", - "//xla/stream_executor/host:host_platform_id", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/types:span", - "@local_tsl//tsl/platform:errors", - "@local_tsl//tsl/platform:logging", - ], - alwayslink = True, # Contains per-platform transfer manager registration -) - -cc_library( - name = "buffer_info_util", - srcs = ["buffer_info_util.cc"], - hdrs = ["buffer_info_util.h"], - deps = [ - "//xla:cpu_function_runtime", - 
"//xla/hlo/ir:hlo", - "//xla/service:buffer_assignment", - "@com_google_absl//absl/types:span", - ], -) - -cc_library( - name = "cpu_compiler_pure", - srcs = ["cpu_compiler.cc"], - hdrs = ["cpu_compiler.h"], - copts = tsl_copts(), - deps = [ - ":buffer_info_util", - ":conv_canonicalization", - ":cpu_aot_compilation_result", - ":cpu_executable", - ":cpu_float_support", - ":cpu_instruction_fusion", - ":cpu_layout_assignment", - ":cpu_options", - ":dot_op_emitter", - ":executable_proto_cc", - ":fusion_wrapper", - ":ir_emission_utils", - ":ir_emitter", - ":ir_emitter2", - ":metrics", - ":onednn_contraction_rewriter", - ":onednn_float_support", - ":onednn_ops_rewriter", - ":parallel_task_assignment", - ":runtime_symbol_generator", - ":small_while_loop_hoisting_pass", - ":thunk_emitter", - ":xla_framework", - ":xnnpack_ops_rewriter", - "//xla:cpu_function_runtime", - "//xla:debug_options_flags", - "//xla:literal", - "//xla:literal_pool", - "//xla:protobuf_util", - "//xla:shape_util", - "//xla:status_macros", - "//xla:types", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla:xla_proto_cc", - "//xla/backends/cpu:constant_allocation", - "//xla/backends/cpu:xnn_fusion", - "//xla/backends/cpu/codegen:compiled_function_library", - "//xla/backends/cpu/codegen:cpu_features", - "//xla/backends/cpu/codegen:execution_engine", - "//xla/backends/cpu/codegen:ir_compiler", - "//xla/backends/cpu/codegen:jit_compiler", - "//xla/backends/cpu/codegen:object_loader", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/backends/cpu/codegen/emitters:cpu_fusion_emitter_config", - "//xla/backends/cpu/runtime:function_library", - "//xla/backends/cpu/runtime:kernel_thunk", - "//xla/backends/cpu/runtime:thunk", - "//xla/backends/cpu/runtime:thunk_proto_cc_impl", - "//xla/backends/cpu/runtime:thunk_proto_serdes", - "//xla/backends/cpu/transforms:xnn_graph_fusion", - "//xla/hlo/analysis:hlo_ordering", - "//xla/hlo/analysis:indexed_array_analysis", - "//xla/hlo/ir:hlo", - 
"//xla/hlo/ir:hlo_module_group", - "//xla/hlo/pass:hlo_pass", - "//xla/hlo/pass:hlo_pass_pipeline", - "//xla/hlo/transforms:literal_canonicalizer", - "//xla/hlo/transforms:operand_upcaster", - "//xla/hlo/transforms:while_loop_trip_count_annotator", - "//xla/hlo/transforms/expanders:bitcast_dtypes_expander", - "//xla/hlo/transforms/expanders:cholesky_expander", - "//xla/hlo/transforms/expanders:comparison_expander", - "//xla/hlo/transforms/expanders:dot_decomposer", - "//xla/hlo/transforms/expanders:dynamic_index_splitter", - "//xla/hlo/transforms/expanders:eigh_expander", - "//xla/hlo/transforms/expanders:logistic_expander", - "//xla/hlo/transforms/expanders:optimization_barrier_expander", - "//xla/hlo/transforms/expanders:qr_expander", - "//xla/hlo/transforms/expanders:reduce_decomposer", - "//xla/hlo/transforms/expanders:reshape_decomposer", - "//xla/hlo/transforms/expanders:rng_bit_generator_expander", - "//xla/hlo/transforms/expanders:rng_expander", - "//xla/hlo/transforms/expanders:stochastic_convert_decomposer", - "//xla/hlo/transforms/simplifiers:algebraic_simplifier", - "//xla/hlo/transforms/simplifiers:batch_dot_simplification", - "//xla/hlo/transforms/simplifiers:broadcast_canonicalizer", - "//xla/hlo/transforms/simplifiers:conditional_canonicalizer", - "//xla/hlo/transforms/simplifiers:convolution_group_converter", - "//xla/hlo/transforms/simplifiers:dynamic_dimension_simplifier", - "//xla/hlo/transforms/simplifiers:flatten_call_graph", - "//xla/hlo/transforms/simplifiers:float_normalization", - "//xla/hlo/transforms/simplifiers:gather_simplifier", - "//xla/hlo/transforms/simplifiers:hlo_constant_folding", - "//xla/hlo/transforms/simplifiers:hlo_dce", - "//xla/hlo/transforms/simplifiers:hlo_memory_scheduler", - "//xla/hlo/transforms/simplifiers:optimize_input_output_buffer_alias", - "//xla/hlo/transforms/simplifiers:reduce_window_rewriter", - "//xla/hlo/transforms/simplifiers:reshape_mover", - "//xla/hlo/transforms/simplifiers:result_caster", - 
"//xla/hlo/transforms/simplifiers:simplify_fp_conversions", - "//xla/hlo/transforms/simplifiers:slice_sinker", - "//xla/hlo/transforms/simplifiers:sort_simplifier", - "//xla/hlo/transforms/simplifiers:sub_byte_normalization", - "//xla/hlo/transforms/simplifiers:tree_reduction_rewriter", - "//xla/hlo/transforms/simplifiers:tuple_simplifier", - "//xla/hlo/transforms/simplifiers:zero_sized_hlo_elimination", - "//xla/mlir_hlo", - "//xla/mlir_hlo:all_passes", - "//xla/mlir_hlo:transforms_passes", - "//xla/service:all_reduce_promotion", - "//xla/service:outer_dimension_propagation", - "//xla/service:get_outer_batch_value_simplifier", - "//xla/service:all_to_all_decomposer", - "//xla/service:batched_gather_scatter_normalizer", - "//xla/service:batchnorm_expander", - "//xla/service:buffer_assignment", - "//xla/service:call_graph", - "//xla/service:call_inliner", - "//xla/service:change_op_data_type", - "//xla/service:compiler", - "//xla/service:conditional_simplifier", - "//xla/service:conditional_to_select", - "//xla/service:copy_insertion", - "//xla/service:cpu_gpu_shape_verifier", - "//xla/service:dump", - "//xla/service:dynamic_dimension_inference", - "//xla/service:dynamic_padder", - "//xla/service:executable", - "//xla/service:float_support", - "//xla/service:gather_expander", - "//xla/service:hlo_cost_analysis", - "//xla/service:hlo_cse", - "//xla/service:hlo_execution_profile", - "//xla/service:hlo_module_config", - "//xla/service:hlo_profile_printer_data_cc", - "//xla/service:hlo_proto_cc", - "//xla/service:hlo_proto_util", - "//xla/service:hlo_verifier", - "//xla/service:layout_assignment", - "//xla/service:llvm_compiler", - "//xla/service:logical_buffer", - "//xla/service:map_inliner", - "//xla/service:scatter_expander", - "//xla/service:scatter_simplifier", - "//xla/service:select_and_scatter_expander", - "//xla/service:sharding_propagation", - "//xla/service:sharding_remover", - "//xla/service:slow_operation_alarm", - "//xla/service:topk_rewriter", - 
"//xla/service:transpose_folding", - "//xla/service:triangular_solve_expander", - "//xla/service:while_loop_constant_sinking", - "//xla/service:while_loop_invariant_code_motion", - "//xla/service:while_loop_simplifier", - "//xla/service/llvm_ir:llvm_command_line_options", - "//xla/service/llvm_ir:llvm_util", - "//xla/service/spmd:stateful_rng_spmd_partitioner", - "//xla/service/spmd/shardy:shardy_xla_pass", - "//xla/stream_executor:platform", - "//xla/stream_executor:stream_executor_h", - "//xla/stream_executor/host:host_platform_id", - "//xla/tsl/concurrency:async_value", - "//xla/tsl/platform:env", - "//xla/tsl/platform:status", - "//xla/tsl/platform:statusor", - "//xla/tsl/protobuf:error_codes_proto_impl_cc", - "@com_google_absl//absl/base", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@com_google_absl//absl/cleanup", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/functional:any_invocable", - "@com_google_absl//absl/log", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:BitReader", - "@llvm-project//llvm:BitWriter", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:Linker", - "@llvm-project//llvm:MC", - "@llvm-project//llvm:Object", - "@llvm-project//llvm:OrcJIT", - "@llvm-project//llvm:Support", - "@llvm-project//llvm:Target", - "@llvm-project//llvm:TargetParser", - "@llvm-project//llvm:TransformUtils", - "@llvm-project//mlir:AffineDialect", - "@llvm-project//mlir:AffineToStandard", - "@llvm-project//mlir:ArithDialect", - "@llvm-project//mlir:ArithTransforms", - "@llvm-project//mlir:BufferizationTransforms", - "@llvm-project//mlir:BuiltinToLLVMIRTranslation", - 
"@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:LLVMDialect", - "@llvm-project//mlir:LLVMToLLVMIRTranslation", - "@llvm-project//mlir:LinalgDialect", - "@llvm-project//mlir:LinalgTransforms", - "@llvm-project//mlir:MemRefTransforms", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:ReconcileUnrealizedCasts", - "@llvm-project//mlir:SCFDialect", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:TensorDialect", - "@llvm-project//mlir:ToLLVMIRTranslation", - "@llvm-project//mlir:TransformUtils", - "@llvm-project//mlir:Transforms", - "@llvm-project//mlir:VectorDialect", - "@local_tsl//tsl/platform:casts", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:errors", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:platform_port", - "@local_tsl//tsl/platform:status", - "@local_tsl//tsl/platform:statusor", - "@local_tsl//tsl/platform:threadpool_async_executor", - "@local_tsl//tsl/profiler/lib:traceme", - "@local_tsl//tsl/profiler/lib:traceme_encode", - ] + if_llvm_aarch64_available([ - "@llvm-project//llvm:AArch64CodeGen", # fixdeps: keep - ]) + if_llvm_powerpc_available([ - "@llvm-project//llvm:PowerPCCodeGen", # fixdeps: keep - ]) + if_llvm_system_z_available([ - "@llvm-project//llvm:SystemZCodeGen", # fixdeps: keep - ]) + if_llvm_x86_available([ - "@llvm-project//llvm:X86CodeGen", # fixdeps: keep - ]), -) - -cc_library( - name = "cpu_aot_compilation_result", - srcs = ["cpu_aot_compilation_result.cc"], - hdrs = ["cpu_aot_compilation_result.h"], - deps = [ - ":buffer_info_util", - ":cpu_executable", - ":executable_proto_cc", - "//xla:cpu_function_runtime", - "//xla:util", - "//xla/backends/cpu:constant_allocation", - "//xla/backends/cpu/runtime:function_library", - "//xla/backends/cpu/runtime:thunk", - "//xla/backends/cpu/runtime:thunk_proto_cc", - "//xla/backends/cpu/runtime:thunk_proto_serdes", - "//xla/hlo/ir:hlo", - "//xla/service:buffer_assignment", - "//xla/service:buffer_value", - 
"//xla/service:compiler", - "//xla/service:executable", - "//xla/service:hlo_cost_analysis", - "//xla/service:hlo_module_config", - "//xla/service:hlo_profile_printer_data_cc", - "//xla/service:hlo_proto_cc", - "//xla/stream_executor:platform", - "//xla/stream_executor/host:host_platform_id", - "//xla/tsl/platform:statusor", - "@com_google_absl//absl/log", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings:string_view", - ], -) - -cc_library( - # The old target name will still be used so that dependencies won't break. - # In the future, dependencies should be cleaned up and relinked to the above - # target if registration is not necesary. - name = "cpu_compiler", - srcs = ["cpu_compiler_registerer.cc"], - hdrs = ["cpu_compiler.h"], - deps = [ - "cpu_compiler_pure", - ":cpu_aot_compilation_result", - ":executable_proto_cc", - "//xla:util", - "//xla/backends/cpu/codegen:ir_compiler", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/ir:hlo", - "//xla/hlo/ir:hlo_module_group", - "//xla/service:buffer_assignment", - "//xla/service:compiler", - "//xla/service:executable", - "//xla/service:hlo_cost_analysis", - "//xla/service:hlo_profile_printer_data_cc", - "//xla/service:hlo_proto_cc", - "//xla/service:llvm_compiler", - "//xla/stream_executor:platform", - "//xla/stream_executor:stream_executor_h", - "//xla/stream_executor/host:host_platform_id", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@llvm-project//llvm:Support", - "@llvm-project//llvm:Target", - "@llvm-project//llvm:TargetParser", - ], - alwayslink = True, # Contains compiler registration -) - -xla_test( - name = "cpu_compiler_test", - srcs = ["cpu_compiler_test.cc"], - backends = [ - "cpu", - ], - tags = [ - "test_migrated_to_hlo_runner_pjrt", - "test_xla_cpu_no_thunks", - ], - deps = [ - "//xla/hlo/testlib:verified_hlo_module", - 
"//xla/tests:hlo_pjrt_test_base", - "//xla/tests:xla_internal_test_main", - "//xla/tsl/lib/monitoring:collected_metrics", - "//xla/tsl/lib/monitoring:collection_registry", - "@com_google_absl//absl/strings:string_view", - "@local_tsl//tsl/platform:statusor", - "@local_tsl//tsl/platform:test", - ], -) - -xla_test( - name = "cpu_compiler_internals_test", - srcs = ["cpu_compiler_internals_test.cc"], - backends = [ - "cpu", - ], - deps = [ - "//xla/backends/cpu/codegen/emitters:cpu_fusion_emitter_config", - "//xla/hlo/ir:hlo", - "//xla/hlo/testlib:verified_hlo_module", - "//xla/service:llvm_compiler", - "//xla/tests:hlo_test_base", - "//xla/tests:xla_internal_test_main", - "//xla/tsl/platform:statusor", - "//xla/tsl/platform:test", - "@com_google_absl//absl/base:nullability", - "@com_google_absl//absl/strings:string_view", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:Support", - ], -) - -xla_test( - name = "cpu_aot_compiler_test", - srcs = ["cpu_aot_compiler_test.cc"], - backends = [ - "cpu", - ], - deps = [ - ":cpu_aot_compilation_result", - ":test_header_helper", - "//xla:literal", - "//xla:literal_util", - "//xla/hlo/ir:hlo", - "//xla/hlo/ir:hlo_module_group", - "//xla/service:compiler", - "//xla/service:executable", - "//xla/service:hlo_runner", - "//xla/service:hlo_runner_interface", - "//xla/stream_executor:platform", - "//xla/stream_executor:platform_manager", - "//xla/tests:hlo_test_base", - "//xla/tests:literal_test_util", - "//xla/tests:xla_internal_test_main", - "//xla/tsl/platform:statusor", - "//xla/tsl/platform:test", - "@com_google_absl//absl/strings:string_view", - ], -) - -tf_proto_library( - name = "executable_proto", - srcs = ["executable.proto"], - protodeps = [ - ":xla_framework_proto", - "//xla/service:hlo_proto", - "//xla:xla_proto", - "//xla/backends/cpu/runtime:thunk_proto", - ], -) - -tf_proto_library( - name = "xla_framework_proto", - srcs = ["xla_framework.proto"], -) - -cc_library( - name = "xla_framework", - hdrs = 
["xla_framework.h"], - deps = [":xla_framework_proto_cc"], -) - -cc_library( - name = "runtime_symbol_generator", - srcs = [ - "runtime_symbol_generator.cc", - "windows_compatibility.cc", - "windows_compatibility.h", - ], - hdrs = ["runtime_symbol_generator.h"], - copts = if_enable_acl(["-DXLA_CPU_USE_ACL=1"]) + tsl_copts(), - deps = [ - ":cpu_runtime", - ":onednn_convolution", - ":onednn_layer_norm", - ":onednn_matmul", - ":onednn_softmax", - ":runtime_conv2d", - ":runtime_conv2d_acl", - ":runtime_conv2d_mkl", - ":runtime_conv3d", - ":runtime_custom_call_status", - ":runtime_fft", - ":runtime_fork_join", - ":runtime_fp16", - ":runtime_handle_ffi_call", - ":runtime_key_value_sort", - ":runtime_matmul", - ":runtime_matmul_acl", - ":runtime_pow", - ":runtime_single_threaded_conv2d", - ":runtime_single_threaded_conv3d", - ":runtime_single_threaded_fft", - ":runtime_single_threaded_matmul", - ":runtime_topk", - ":xnnpack_ops", - "//xla/service:custom_call_target_registry", - "@com_google_absl//absl/functional:any_invocable", - "@com_google_absl//absl/strings:string_view", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:OrcJIT", - "@llvm-project//llvm:OrcShared", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:mlir_c_runner_utils", - "@local_tsl//tsl/platform:logging", - ], -) - -cc_library( - name = "runtime_lightweight_check", - hdrs = ["runtime_lightweight_check.h"], - compatible_with = get_compatible_with_portable(), - copts = runtime_copts(), -) - -cc_library( - name = "runtime_fp16", - srcs = [ - "runtime_fp16.cc", - ], - hdrs = [ - "runtime_fp16.h", - ], - copts = runtime_copts(), - deps = ["@com_google_absl//absl/base:core_headers"], -) - -cc_library( - name = "runtime_pow", - srcs = [ - "runtime_pow.cc", - ], - hdrs = [ - "runtime_pow.h", - ], - copts = runtime_copts(), - deps = ["@com_google_absl//absl/base:core_headers"], -) - -cc_library( - name = "buffer_desc", - hdrs = ["buffer_desc.h"], -) - -cc_library( - name = "cpu_executable", - srcs = 
["cpu_executable.cc"], - hdrs = ["cpu_executable.h"], - deps = [ - ":cpu_runtime", - ":executable_proto_cc", - "//xla:executable_run_options", - "//xla:literal", - "//xla:shape_tree", - "//xla:shape_util", - "//xla:status_macros", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla/backends/cpu:constant_allocation", - "//xla/backends/cpu/runtime:buffer_allocations", - "//xla/backends/cpu/runtime:function_library", - "//xla/backends/cpu/runtime:thread_pool_task_runner", - "//xla/backends/cpu/runtime:thunk", - "//xla/backends/cpu/runtime:thunk_executor", - "//xla/hlo/ir:hlo", - "//xla/service:buffer_assignment", - "//xla/service:custom_call_status", - "//xla/service:custom_call_status_internal", - "//xla/service:executable", - "//xla/service:hlo_execution_profile", - "//xla/service:hlo_profile_printer_data_cc", - "//xla/service:hlo_value", - "//xla/service:maybe_owning_device_memory", - "//xla/service:shaped_buffer", - "//xla/service:xla_debug_info_manager", - "//xla/stream_executor:device_memory", - "//xla/stream_executor:device_memory_allocator", - "//xla/stream_executor/host:host_stream", - "//xla/tsl/concurrency:async_value", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/types:span", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:statusor", - ], -) - -cc_library( - name = "elemental_math_emitter", - srcs = ["elemental_math_emitter.cc"], - hdrs = ["elemental_math_emitter.h"], - deps = [ - "//xla:xla_data_proto_cc", - "//xla/service/llvm_ir:math_ops", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:Support", - ], 
-) - -cc_library( - name = "ir_emitter2", - srcs = ["ir_emitter2.cc"], - hdrs = ["ir_emitter2.h"], - deps = [ - ":backend_config_proto_cc", - ":dot_op_emitter", - ":elemental_ir_emitter", - ":ir_emitter", - ":parallel_loop_emitter", - "//xla:shape_util", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla:xla_proto_cc", - "//xla/backends/cpu/codegen:fusion_compiler", - "//xla/backends/cpu/codegen:kernel_api_ir_builder", - "//xla/backends/cpu/codegen:symbol_name_util", - "//xla/backends/cpu/codegen/emitters:cpu_fusion_emitter_config", - "//xla/backends/cpu/codegen/emitters:cpu_fusion_emitters", - "//xla/hlo/ir:hlo", - "//xla/service:buffer_assignment", - "//xla/service:hlo_module_config", - "//xla/service/llvm_ir:dynamic_update_slice_util", - "//xla/service/llvm_ir:fused_ir_emitter", - "//xla/service/llvm_ir:ir_array", - "//xla/service/llvm_ir:llvm_util", - "//xla/service/llvm_ir:loop_emitter", - "//xla/stream_executor:launch_dim", - "//xla/tsl/platform:errors", - "//xla/tsl/platform:statusor", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/log", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/strings:string_view", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:Linker", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:IR", - ], -) - -xla_cc_test( - name = "ir_emitter_test", - srcs = ["ir_emitter_test.cc"], - deps = [ - ":cpu_compiler", - ":cpu_executable", - ":cpu_options", - ":ir_emitter", - ":ir_function", - ":runtime_symbol_generator", - ":target_machine_features_stub", - "//xla:cpu_function_runtime", - "//xla:shape_util", - "//xla/backends/cpu/codegen:cpu_features", - "//xla/backends/cpu/codegen:execution_engine", - "//xla/backends/cpu/codegen:ir_compiler", 
- "//xla/backends/cpu/codegen:jit_compiler", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/analysis:hlo_ordering", - "//xla/hlo/ir:hlo", - "//xla/hlo/parser:hlo_parser", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/hlo/transforms/simplifiers:hlo_memory_scheduler", - "//xla/service:buffer_assignment", - "//xla/service:buffer_value", - "//xla/service:hlo_module_config", - "//xla/service:logical_buffer", - "//xla/service/llvm_ir:llvm_util", - "//xla/tsl/lib/core:status_test_util", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings:string_view", - "@com_google_googletest//:gtest", - "@com_google_googletest//:gtest_main", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:Support", - "@llvm-project//llvm:Target", - "@llvm-project//mlir:IR", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:errors", - "@local_tsl//tsl/platform:statusor", - "@local_tsl//tsl/platform:test", - ], -) - -cc_library( - name = "ir_emitter", - srcs = ["ir_emitter.cc"], - hdrs = ["ir_emitter.h"], - copts = tsl_copts(), - deps = [ - ":backend_config_proto_cc", - ":cpu_instruction_fusion", - ":cpu_options", - ":cpu_runtime", - ":dot_op_emitter", - ":elemental_ir_emitter", - ":ir_emission_utils", - ":ir_function", - ":onednn_config_proto_cc", - ":onednn_memory_util", - ":parallel_loop_emitter", - ":xnnpack_ops_rewriter", - ":xnnpack_ops", - "//xla:literal", - "//xla:literal_util", - "//xla:shape_util", - "//xla:status_macros", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/ir:hlo", - "//xla/service:buffer_assignment", - "//xla/service:collective_ops_utils", - "//xla/service:elemental_ir_emitter", - "//xla/service:hlo_module_config", - "//xla/service:name_uniquer", - "//xla/service/llvm_ir:alias_analysis", - "//xla/service/llvm_ir:buffer_assignment_util", - 
"//xla/service/llvm_ir:dynamic_update_slice_util", - "//xla/service/llvm_ir:fused_ir_emitter", - "//xla/service/llvm_ir:ir_array", - "//xla/service/llvm_ir:ir_builder_mixin", - "//xla/service/llvm_ir:llvm_loop", - "//xla/service/llvm_ir:llvm_type_conversion_util", - "//xla/service/llvm_ir:llvm_util", - "//xla/service/llvm_ir:loop_emitter", - "//xla/service/llvm_ir:tuple_ops", - "//xla/tsl/lib/math:math_util", - "//xla/tsl/platform:errors", - "//xla/tsl/platform:status", - "//xla/tsl/platform:statusor", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/cleanup", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/container:inlined_vector", - "@com_google_absl//absl/log", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/meta:type_traits", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:Support", - "@llvm-project//llvm:TargetParser", - "@llvm-project//mlir:IR", - ], -) - -cc_library( - name = "target_machine_features_stub", - testonly = 1, - hdrs = ["target_machine_features_stub.h"], - deps = [ - "//xla/backends/cpu/codegen:target_machine_features", - "@llvm-project//llvm:Core", - "@local_tsl//tsl/platform:logging", - ], -) - -cc_library( - name = "ir_function", - srcs = ["ir_function.cc"], - hdrs = ["ir_function.h"], - deps = [ - ":cpu_runtime", - ":ir_emission_utils", - "//xla:shape_util", - "//xla:status_macros", - "//xla:types", - "//xla/service:hlo_module_config", - "//xla/service/llvm_ir:llvm_util", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:Core", - ], -) - -cc_library( - 
name = "parallel_loop_emitter", - srcs = ["parallel_loop_emitter.cc"], - hdrs = ["parallel_loop_emitter.h"], - deps = [ - ":ir_emission_utils", - "//xla/service/llvm_ir:ir_array", - "//xla/service/llvm_ir:llvm_loop", - "//xla/service/llvm_ir:llvm_util", - "//xla/service/llvm_ir:loop_emitter", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/strings:string_view", - "@llvm-project//llvm:Core", - ], -) - -cc_library( - name = "thunk_emitter", - srcs = ["thunk_emitter.cc"], - hdrs = ["thunk_emitter.h"], - local_defines = if_graph_api(["XLA_ONEDNN_USE_GRAPH_API=1"]), - deps = [ - ":backend_config_proto_cc", - ":dot_op_emitter", - ":ir_emission_utils", - ":ir_emitter2", - "//xla:comparison_util", - "//xla:cpu_function_runtime", - "//xla:shape_util", - "//xla:status_macros", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla/backends/cpu:onednn_emitter", - "//xla/backends/cpu:onednn_fusion", - "//xla/backends/cpu:xnn_emitter", - "//xla/backends/cpu:xnn_fusion", - "//xla/backends/cpu/codegen:computation_kernel_emitter", - "//xla/backends/cpu/codegen:fusion_compiler", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/backends/cpu/codegen/dot:dot_kernel_emitter", - "//xla/backends/cpu/codegen/elemental:concatenate_kernel_emitter", - "//xla/backends/cpu/codegen/elemental:elemental_kernel_emitter", - "//xla/backends/cpu/codegen/emitters:cpu_fusion_emitters", - "//xla/backends/cpu/runtime:all_gather_thunk", - "//xla/backends/cpu/runtime:all_reduce_thunk", - "//xla/backends/cpu/runtime:all_to_all_thunk", - "//xla/backends/cpu/runtime:call_thunk", - "//xla/backends/cpu/runtime:collective_permute_thunk", - "//xla/backends/cpu/runtime:collective_thunk", - "//xla/backends/cpu/runtime:conditional_thunk", - "//xla/backends/cpu/runtime:convolution_thunk", - "//xla/backends/cpu/runtime:copy_thunk", - "//xla/backends/cpu/runtime:custom_call_thunk", - "//xla/backends/cpu/runtime:dot_thunk", - 
"//xla/backends/cpu/runtime:fft_thunk", - "//xla/backends/cpu/runtime:infeed_thunk", - "//xla/backends/cpu/runtime:kernel_thunk", - "//xla/backends/cpu/runtime:logical_id_thunk", - "//xla/backends/cpu/runtime:outfeed_thunk", - "//xla/backends/cpu/runtime:reduce_scatter_thunk", - "//xla/backends/cpu/runtime:rng_state_thunk", - "//xla/backends/cpu/runtime:sort_thunk", - "//xla/backends/cpu/runtime:thunk", - "//xla/backends/cpu/runtime:topk_thunk", - "//xla/backends/cpu/runtime:while_thunk", - "//xla/backends/cpu/runtime/onednn:onednn_fusion_thunk", - "//xla/backends/cpu/runtime/xnnpack:xnn_dot_thunk", - "//xla/backends/cpu/runtime/xnnpack:xnn_fusion_thunk", - "//xla/codegen:kernel_definition", - "//xla/codegen:kernel_spec", - "//xla/codegen:llvm_ir_kernel_source", - "//xla/codegen:mlir_kernel_source", - "//xla/hlo/ir:hlo", - "//xla/runtime:resource_use", - "//xla/service:buffer_assignment", - "//xla/service:collective_ops_utils", - "//xla/service:hlo_module_config", - "//xla/service:hlo_proto_cc", - "//xla/service:pattern_matcher", - "//xla/tsl/platform:errors", - "//xla/tsl/platform:logging", - "//xla/tsl/platform:statusor", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:JITLink", - "@llvm-project//llvm:ir_headers", - "@local_tsl//tsl/platform:casts", - "@local_tsl//tsl/profiler/lib:traceme", - ], -) - -cc_library( - name = "tiled_dot_emitter", - srcs = ["tiled_dot_emitter.cc"], - hdrs = ["tiled_dot_emitter.h"], - deps = [ - "//xla:xla_data_proto_cc", - "//xla/backends/cpu/codegen:vector_ir_builder", - "//xla/service:hlo_module_config", - "//xla/service/llvm_ir:kernel_support_library", - "@com_google_absl//absl/log:check", - 
"@com_google_absl//absl/numeric:bits", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:Core", - ], -) - -cc_library( - name = "dot_op_emitter", - srcs = ["dot_op_emitter.cc"], - hdrs = [ - "dot_op_emitter.h", - ], - deps = [ - ":backend_config_proto_cc", - ":cpu_options", - ":cpu_runtime", - ":tiled_dot_emitter", - "//xla:shape_util", - "//xla:status_macros", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/ir:hlo", - "//xla/service:hlo_module_config", - "//xla/service/llvm_ir:ir_array", - "//xla/service/llvm_ir:kernel_support_library", - "//xla/service/llvm_ir:llvm_loop", - "//xla/service/llvm_ir:llvm_util", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/status", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:Support", - "@local_tsl//tsl/platform:errors", - "@local_tsl//tsl/platform:logging", - ], -) - -build_test( - name = "sample_harness_build_test", - targets = [ - ":sample_harness", - ], -) - -xla_cc_binary( - name = "sample_harness", - srcs = ["sample_harness.cc"], - deps = [ - "//xla:array4d", - "//xla:literal", - "//xla:types", - "//xla:xla_data_proto_cc", - "//xla/client", - "//xla/client:client_library", - "//xla/client:local_client", - "//xla/hlo/builder:xla_builder", - "//xla/hlo/builder:xla_computation", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings:str_format", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:platform_port", - ], -) - -cc_library( - name = "cpu_runtime", - srcs = [ - "cpu_runtime.cc", - "xfeed_manager.cc", - ], - hdrs = [ - "cpu_runtime.h", - "xfeed_manager.h", - ], - copts = runtime_copts(), - deps = [ - ":cpu_executable_run_options", - "//xla:executable_run_options", - "//xla:shape_util", - "//xla:types", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla/backends/cpu/collectives:cpu_clique_key", - 
"//xla/backends/cpu/collectives:cpu_cliques", - "//xla/backends/cpu/collectives:cpu_collectives", - "//xla/backends/cpu/collectives:in_process_collectives", - "//xla/core/collectives:communicator", - "//xla/core/collectives:rank_id", - "//xla/hlo/parser:hlo_parser", - "//xla/service:collective_ops_utils", - "//xla/service:computation_placer", - "//xla/service:global_device_id", - "//xla/stream_executor:device_memory", - "//xla/stream_executor:stream_executor_h", - "//xla/tsl/concurrency:async_value", - "//xla/tsl/platform:errors", - "//xla/tsl/platform:logging", - "//xla/tsl/platform:status", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:inlined_vector", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/synchronization", - "@com_google_absl//absl/time", - "@com_google_absl//absl/types:span", - "@local_tsl//tsl/profiler/lib:traceme", - ], -) - -cc_library( - name = "runtime_conv2d", - srcs = ["runtime_conv2d.cc"], - hdrs = ["runtime_conv2d.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - ":runtime_lightweight_check", - "//xla:executable_run_options", - "//xla/backends/cpu/runtime:convolution_thunk_internal", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/synchronization", # build_cleaner: keep - "@eigen_archive//:eigen3", - ], -) - -cc_library( - name = "runtime_conv3d", - srcs = ["runtime_conv3d.cc"], - hdrs = ["runtime_conv3d.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - ":runtime_lightweight_check", - "//xla:executable_run_options", - "//xla/backends/cpu/runtime:convolution_thunk_internal", - "@com_google_absl//absl/base:core_headers", - 
"@com_google_absl//absl/synchronization", # build_cleaner: keep - "@eigen_archive//:eigen3", - ], -) - -cc_library( - name = "runtime_custom_call_status", - srcs = ["runtime_custom_call_status.cc"], - hdrs = ["runtime_custom_call_status.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - "//xla/service:custom_call_status_internal", - "@com_google_absl//absl/base:core_headers", - ], -) - -cc_library( - name = "runtime_conv2d_mkl", - srcs = [ - "runtime_conv2d_mkl.cc", - ], - hdrs = ["runtime_conv2d_mkl.h"], - copts = runtime_copts() + tf_openmp_copts(), - visibility = ["//visibility:public"], - deps = [ - ":runtime_conv2d", - ":runtime_single_threaded_conv2d", - "//xla:executable_run_options", - "//xla/tsl/framework/convolution:eigen_helpers", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@eigen_archive//:eigen3", - ] + mkl_deps(), -) - -cc_library( - name = "runtime_fft", - srcs = [ - "runtime_fft.cc", - ], - hdrs = ["runtime_fft.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - "//xla:executable_run_options", - "@com_google_absl//absl/base:core_headers", - "@ducc//:fft_wrapper", - "@eigen_archive//:eigen3", - ], -) - -cc_library( - name = "runtime_matmul", - srcs = [ - "runtime_matmul_c128.cc", - "runtime_matmul_c64.cc", - "runtime_matmul_common.h", - "runtime_matmul_f16.cc", - "runtime_matmul_f32.cc", - "runtime_matmul_f64.cc", - "runtime_matmul_s32.cc", - ], - hdrs = ["runtime_matmul.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - ":runtime_lightweight_check", - "//xla:executable_run_options", - "//xla/tsl/framework/contraction:eigen_contraction_kernel", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@com_google_absl//absl/synchronization", # build_cleaner: keep - "@eigen_archive//:eigen3", - ], -) - -cc_library( - name = "runtime_matmul_acl", - srcs 
= ["runtime_matmul_acl.cc"], - hdrs = ["runtime_matmul_acl.h"], - copts = tsl_copts(), - visibility = ["//visibility:public"], - deps = [ - ":runtime_lightweight_check", - ":runtime_matmul", - "//xla:executable_run_options", - "//xla/tsl/platform:dynamic_annotations", - "@com_google_absl//absl/base", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:types", - ] + acl_deps(), -) - -cc_library( - name = "runtime_conv2d_acl", - srcs = [ - "runtime_conv2d_acl.cc", - ], - hdrs = ["runtime_conv2d_acl.h"], - copts = tsl_copts(), - visibility = ["//visibility:public"], - deps = [ - ":runtime_conv2d", - ":runtime_lightweight_check", - ":runtime_single_threaded_conv2d", - "//xla:executable_run_options", - "//xla/tsl/framework/convolution:eigen_helpers", - "//xla/tsl/platform:dynamic_annotations", - "@com_google_absl//absl/base", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:types", - ] + acl_deps(), -) - -cc_library( - name = "runtime_single_threaded_conv2d", - srcs = ["runtime_single_threaded_conv2d.cc"], - hdrs = ["runtime_single_threaded_conv2d.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - "//xla/backends/cpu/runtime:convolution_thunk_internal", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/synchronization", # build_cleaner: keep - "@eigen_archive//:eigen3", - ], -) - -cc_library( - name = "runtime_single_threaded_conv3d", - srcs = ["runtime_single_threaded_conv3d.cc"], - hdrs = ["runtime_single_threaded_conv3d.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - "//xla/backends/cpu/runtime:convolution_thunk_internal", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/synchronization", # build_cleaner: keep - "@eigen_archive//:eigen3", - ], -) - -cc_library( - name = "runtime_single_threaded_fft", - srcs = [ - "runtime_single_threaded_fft.cc", - ], - hdrs = 
["runtime_single_threaded_fft.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - ":runtime_fft", - "@com_google_absl//absl/base:core_headers", - ], -) - -cc_library( - name = "runtime_single_threaded_matmul_impl", - srcs = [ - "runtime_single_threaded_matmul_c128.cc", - "runtime_single_threaded_matmul_c64.cc", - "runtime_single_threaded_matmul_common.h", - "runtime_single_threaded_matmul_f16.cc", - "runtime_single_threaded_matmul_f32.cc", - "runtime_single_threaded_matmul_f64.cc", - "runtime_single_threaded_matmul_f8.cc", - "runtime_single_threaded_matmul_s32.cc", - "runtime_single_threaded_matmul_u8.cc", - ], - hdrs = ["runtime_single_threaded_matmul.h"], - compatible_with = get_compatible_with_portable(), - copts = runtime_copts(), - linkstatic = 1, - visibility = ["//visibility:private"], - deps = [ - "//xla/tsl/framework/contraction:eigen_contraction_kernel_no_mkl", - "@com_google_absl//absl/base:core_headers", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:ml_dtypes", - ], -) - -cc_library( - name = "runtime_single_threaded_matmul", - hdrs = ["runtime_single_threaded_matmul.h"], - compatible_with = get_compatible_with_portable(), - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - ":runtime_single_threaded_matmul_impl", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:ml_dtypes", - ], -) - -cc_library( - name = "runtime_single_threaded_matmul_nomkl", - compatible_with = get_compatible_with_portable(), - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - ":runtime_single_threaded_matmul_impl", - "//xla/tsl/framework/contraction:eigen_contraction_kernel_no_mkl", - "@com_google_absl//absl/base:core_headers", - "@eigen_archive//:eigen3", - ], -) - -cc_library( - name = "runtime_key_value_sort", - srcs = ["runtime_key_value_sort.cc"], - hdrs = ["runtime_key_value_sort.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - 
"@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@eigen_archive//:eigen3", - ], -) - -cc_library( - name = "runtime_topk", - srcs = ["runtime_topk.cc"], - hdrs = ["runtime_topk.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - "@com_google_absl//absl/base", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - ], -) - -cc_library( - name = "runtime_fork_join", - srcs = ["runtime_fork_join.cc"], - hdrs = ["runtime_fork_join.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - "//xla:executable_run_options", - "//xla/service:custom_call_status_internal", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/synchronization", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:logging", - ], -) - -cc_library( - name = "runtime_handle_ffi_call", - srcs = ["runtime_handle_ffi_call.cc"], - hdrs = ["runtime_handle_ffi_call.h"], - copts = runtime_copts(), - visibility = ["//visibility:public"], - deps = [ - "//xla:executable_run_options", - "//xla:shape_util", - "//xla:xla_data_proto_cc", - "//xla/ffi:attribute_map", - "//xla/ffi:call_frame", - "//xla/ffi:execution_state", - "//xla/ffi:ffi_api", - "//xla/ffi/api:c_api", - "//xla/service:custom_call_status_public_headers", - "//xla/tsl/platform:errors", - "//xla/tsl/platform:statusor", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:span", - "@llvm-project//mlir:AsmParser", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Support", - ], -) - -xla_cc_test( - name = 
"cpu_runtime_test", - srcs = ["cpu_runtime_test.cc"], - shard_count = 10, - tags = ["optonly"], - deps = [ - ":cpu_runtime", - ":runtime_custom_call_status", - ":runtime_matmul", - ":runtime_matmul_acl", - ":runtime_single_threaded_matmul", - "//xla:array2d", - "//xla:executable_run_options", - "//xla:types", - "//xla/client:local_client", - "//xla/service:custom_call_status_internal", - "//xla/tests:xla_internal_test_main", - "@com_google_absl//absl/strings:str_format", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:test", - ], -) - -xla_cc_test( - name = "cpu_instruction_fusion_test", - srcs = ["cpu_instruction_fusion_test.cc"], - tags = ["not_run:arm"], - deps = [ - ":cpu_instruction_fusion", - "//xla:literal_util", - "//xla:shape_util", - "//xla:xla_data_proto_cc", - "//xla/hlo/ir:hlo", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/hlo/utils:hlo_matchers", - "//xla/service:transpose_folding", - "//xla/tests:test_utils", - "//xla/tests:xla_internal_test_main", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:span", - "@com_google_googletest//:gtest", - "@local_tsl//tsl/platform:statusor", - ], -) - -xla_cc_test( - name = "xfeed_manager_test", - size = "small", - srcs = ["xfeed_manager_test.cc"], - deps = [ - ":cpu_runtime", - "//xla:shape_util", - "//xla:xla_data_proto_cc", - "//xla/tests:xla_internal_test_main", - "//xla/tsl/lib/core:status_test_util", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status:statusor", - "@com_google_googletest//:gtest", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:test", - ], -) - -cc_library( - name = "cpu_instruction_fusion", - srcs = ["cpu_instruction_fusion.cc"], - hdrs = ["cpu_instruction_fusion.h"], - deps = [ - "//xla:shape_util", - "//xla/hlo/ir:hlo", - "//xla/service:fusion_node_indexing_evaluation", - 
"//xla/service:instruction_fusion", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/log", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings:string_view", - ], -) - -cc_library( - name = "fusion_wrapper", - srcs = ["fusion_wrapper.cc"], - hdrs = ["fusion_wrapper.h"], - deps = [ - "//xla/codegen/emitters:fusion_wrapper_base", - "//xla/hlo/ir:hlo", - "@com_google_absl//absl/strings:string_view", - ], -) - -xla_cc_test( - name = "fusion_wrapper_test", - srcs = ["fusion_wrapper_test.cc"], - deps = [ - ":fusion_wrapper", - "//xla/hlo/ir:hlo", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/tests:xla_internal_test_main", - "//xla/tsl/platform:statusor", - "@com_google_absl//absl/strings:string_view", - "@com_google_googletest//:gtest", - ], -) - -cc_library( - name = "ir_emission_utils", - srcs = ["ir_emission_utils.cc"], - hdrs = ["ir_emission_utils.h"], - deps = [ - ":cpu_runtime", - "//xla:shape_util", - "//xla:window_util", - "//xla:xla_data_proto_cc", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/ir:hlo", - "@com_google_absl//absl/log:check", - "@llvm-project//llvm:Core", - ], -) - -xla_cc_test( - name = "ir_emission_utils_test", - srcs = ["ir_emission_utils_test.cc"], - deps = [ - ":ir_emission_utils", - ":target_machine_features_stub", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/hlo/testlib:test", - "//xla/tests:xla_internal_test_main", - ], -) - -cc_library( - name = "cpu_layout_assignment", - srcs = ["cpu_layout_assignment.cc"], - hdrs = ["cpu_layout_assignment.h"], - deps = [ - ":dot_op_emitter", - ":ir_emission_utils", - "//xla:shape_util", - "//xla:util", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/ir:hlo", - "//xla/service:computation_layout", - "//xla/service:layout_assignment", - 
"@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status", - "@local_tsl//tsl/platform:errors", - "@local_tsl//tsl/platform:status", - ], -) - -xla_cc_test( - name = "cpu_layout_assignment_test", - size = "small", - srcs = ["cpu_layout_assignment_test.cc"], - deps = [ - ":cpu_layout_assignment", - ":target_machine_features_stub", - "//xla:literal", - "//xla:shape_layout", - "//xla:shape_util", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla/hlo/ir:hlo", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/hlo/testlib:test", - "//xla/hlo/testlib:test_helpers", - "//xla/hlo/utils:hlo_matchers", - "//xla/service:computation_layout", - "//xla/tests:test_utils", - "//xla/tests:xla_internal_test_main", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/types:span", - "@local_tsl//tsl/platform:status", - ], -) - -cc_library( - name = "conv_canonicalization", - srcs = ["conv_canonicalization.cc"], - hdrs = ["conv_canonicalization.h"], - deps = [ - ":cpu_runtime", - ":ir_emission_utils", - "//xla:permutation_util", - "//xla:shape_util", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/ir:hlo", - "//xla/hlo/pass:hlo_pass", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings:string_view", - "@local_tsl//tsl/platform:errors", - ], -) - -xla_cc_test( - name = "conv_canonicalization_test", - srcs = ["conv_canonicalization_test.cc"], - deps = [ - ":conv_canonicalization", - ":target_machine_features_stub", - "//xla:literal_util", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla/hlo/ir:hlo", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/hlo/testlib:test", - "//xla/hlo/testlib:test_helpers", - "//xla/tests:xla_internal_test_main", 
- ], -) - -cc_library( - name = "parallel_task_assignment", - srcs = ["parallel_task_assignment.cc"], - hdrs = ["parallel_task_assignment.h"], - deps = [ - ":backend_config_proto_cc", - ":ir_emission_utils", - "//xla:shape_util", - "//xla:util", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/ir:hlo", - "//xla/hlo/pass:hlo_pass", - "//xla/service:hlo_cost_analysis", - "//xla/service/llvm_ir:dynamic_update_slice_util", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:platform_port", - "@local_tsl//tsl/platform:status", - ], -) - -xla_cc_test( - name = "parallel_task_assignment_test", - srcs = ["parallel_task_assignment_test.cc"], - deps = [ - ":backend_config_proto_cc", - ":cpu_executable", - ":parallel_task_assignment", - ":target_machine_features_stub", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/ir:hlo", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/hlo/testlib:test", - "//xla/service:hlo_cost_analysis", - "//xla/tests:xla_internal_test_main", - "@com_google_absl//absl/status:statusor", - "@local_tsl//tsl/platform:statusor", - ], -) - -cc_library( - name = "cpu_options", - srcs = ["cpu_options.cc"], - hdrs = ["cpu_options.h"], - deps = [ - "//xla/service:hlo_module_config", - "@com_google_absl//absl/log:check", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - ], -) - -cc_library( - name = "orc_jit_memory_mapper", - srcs = ["orc_jit_memory_mapper.cc"], - hdrs = ["orc_jit_memory_mapper.h"], - deps = [ - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/synchronization", - "@llvm-project//llvm:ExecutionEngine", - 
"@local_tsl//tsl/platform:logging", - ], -) - -xla_cc_test( - name = "cpu_eigen_tensor_alignment_test", - size = "small", - srcs = ["cpu_eigen_tensor_alignment_test.cc"], - deps = [ - ":ir_emission_utils", - ":target_machine_features_stub", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/hlo/testlib:test", - "//xla/tests:xla_internal_test_main", - ], -) - -xla_cc_test( - name = "vectorized_reduce_with_no_vector_registers_test", - size = "small", - srcs = ["vectorized_reduce_with_no_vector_registers_test.cc"], - tags = ["not_run:arm"], - target_compatible_with = ["@platforms//cpu:x86_64"], - deps = [ - ":cpu_compiler", - ":cpu_transfer_manager", - ":test_header_helper", - "//xla:util", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/ir:hlo", - "//xla/hlo/ir:hlo_module_group", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/hlo/testlib:test", - "//xla/service:compiler", - "//xla/tests:xla_internal_test_main", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings:string_view", - "@com_google_googletest//:gtest", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:MC", - "@llvm-project//llvm:Target", - "@local_tsl//tsl/platform:statusor", - ], -) - -xla_cc_test( - name = "scoped_ir_builder_test", - srcs = ["scoped_ir_builder_test.cc"], - deps = [ - ":cpu_executable", - ":ir_emitter", - ":target_machine_features_stub", - "//xla/hlo/analysis:hlo_ordering", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/service:buffer_assignment", - "//xla/service:buffer_value", - "//xla/service:logical_buffer", - "@com_google_googletest//:gtest_main", - "@llvm-project//llvm:Core", - "@local_tsl//tsl/platform:test", - ], -) - -tf_proto_library( - name = "onednn_config_proto", - srcs = ["onednn_config.proto"], -) - -tf_proto_library( - name = "backend_config_proto", - srcs = ["backend_config.proto"], - protodeps = [ - ":onednn_config_proto", - ], -) - 
-cc_library( - name = "onednn_util", - srcs = ["onednn_util.cc"], - hdrs = [ - "onednn_util.h", - "//xla/tsl/util:onednn_util_hdrs", - ], - copts = runtime_copts() + tsl_copts(), - visibility = ["//visibility:public"], - deps = [ - ":backend_config_proto_cc", - ":onednn_config_proto_cc", - "//xla:xla_data_proto_cc", - "//xla/hlo/ir:hlo", - "//xla/tsl/platform:env", - "@com_google_absl//absl/synchronization", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:platform_port", - ] + mkl_deps(), -) - -cc_library( - name = "onednn_memory_util", - srcs = ["onednn_memory_util.cc"], - hdrs = ["onednn_memory_util.h"], - copts = runtime_copts() + tsl_copts(), - visibility = ["//visibility:public"], - deps = [ - ":runtime_lightweight_check", - "//xla:literal", - "//xla:shape_util", - "//xla:status_macros", - "//xla:types", - "//xla:util", - "//xla:xla_data_proto_cc", - "//xla/hlo/ir:hlo", - "//xla/service/llvm_ir:ir_array", - "//xla/service/llvm_ir:ir_builder_mixin", - "//xla/service/llvm_ir:llvm_util", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:Core", - "@llvm-project//llvm:TargetParser", - "@llvm-project//mlir:IR", - "@local_tsl//tsl/platform:errors", - "@local_tsl//tsl/platform:logging", - ] + mkl_deps(), -) - -cc_library( - name = "onednn_matmul", - srcs = ["onednn_matmul.cc"], - hdrs = ["onednn_matmul.h"], - copts = runtime_copts() + tsl_copts(), - visibility = ["//visibility:public"], - deps = [ - ":backend_config_proto_cc", - ":onednn_config_proto_cc", - ":onednn_memory_util", - ":onednn_util", - ":runtime_lightweight_check", - "//xla:executable_run_options", - "//xla:shape_util", - "//xla/hlo/ir:hlo", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@com_google_absl//absl/synchronization", - "@eigen_archive//:eigen3", 
- "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:platform_port", - ] + mkl_deps(), -) - -cc_library( - name = "onednn_convolution", - srcs = ["onednn_convolution.cc"], - hdrs = ["onednn_convolution.h"], - copts = runtime_copts() + tsl_copts(), - visibility = ["//visibility:public"], - deps = [ - ":backend_config_proto_cc", - ":onednn_config_proto_cc", - ":onednn_memory_util", - ":onednn_util", - ":runtime_lightweight_check", - "//xla:executable_run_options", - "//xla:shape_util", - "//xla/hlo/ir:hlo", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@com_google_absl//absl/synchronization", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:platform_port", - ] + mkl_deps(), -) - -cc_library( - name = "onednn_layer_norm", - srcs = ["onednn_layer_norm.cc"], - hdrs = [ - "onednn_layer_norm.h", - "//xla/tsl/util:onednn_util_hdrs", - ], - copts = runtime_copts() + tsl_copts(), - visibility = ["//visibility:public"], - deps = [ - ":backend_config_proto_cc", - ":onednn_config_proto_cc", - ":onednn_memory_util", - ":runtime_lightweight_check", - "//xla:executable_run_options", - "//xla/tsl/platform:env", - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@com_google_absl//absl/synchronization", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:platform_port", - ] + mkl_deps(), -) - -cc_library( - name = "onednn_softmax", - srcs = ["onednn_softmax.cc"], - hdrs = [ - "onednn_softmax.h", - "//xla/tsl/util:onednn_util_hdrs", - ], - copts = runtime_copts() + tsl_copts(), - visibility = ["//visibility:public"], - deps = [ - ":backend_config_proto_cc", - ":onednn_config_proto_cc", - ":onednn_memory_util", - ":runtime_lightweight_check", - "//xla:executable_run_options", - "//xla/tsl/platform:env", - 
"@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/base:dynamic_annotations", - "@com_google_absl//absl/synchronization", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:platform_port", - ] + mkl_deps(), -) - -cc_library( - name = "onednn_pattern_utils", - hdrs = ["onednn_pattern_utils.h"], - visibility = ["//visibility:public"], - deps = [ - ":onednn_util", - "//xla/hlo/ir:hlo", - "//xla/service:pattern_matcher", - ] + mkl_deps(), -) - -cc_library( - name = "onednn_contraction_rewriter", - srcs = ["onednn_contraction_rewriter.cc"], - hdrs = [ - "onednn_contraction_rewriter.h", - "onednn_convolution.h", - "onednn_matmul.h", - "//xla/tsl/util:onednn_util_hdrs", - ], - copts = tsl_copts(), - deps = [ - ":backend_config_proto_cc", - ":onednn_config_proto_cc", - ":onednn_convolution", - ":onednn_matmul", - ":onednn_memory_util", - ":onednn_pattern_utils", - ":onednn_util", - "//xla:executable_run_options", - "//xla:shape_util", - "//xla:status_macros", - "//xla:xla_data_proto_cc", - "//xla/hlo/evaluator:hlo_evaluator", - "//xla/hlo/ir:hlo", - "//xla/hlo/pass:hlo_pass", - "//xla/service:hlo_cost_analysis", - "//xla/service:hlo_creation_utils", - "//xla/service:pattern_matcher", - "//xla/tsl/platform:env", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/synchronization", - "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:platform_port", - ] + mkl_deps(), -) - -cc_library( - name = "onednn_ops_rewriter", - srcs = ["onednn_ops_rewriter.cc"], - hdrs = ["onednn_ops_rewriter.h"], - copts = tsl_copts(), - deps = [ - ":backend_config_proto_cc", - ":onednn_config_proto_cc", - ":onednn_memory_util", - ":onednn_pattern_utils", - ":onednn_util", - "//xla:literal_comparison", - "//xla:literal_util", - "//xla:status_macros", - "//xla:xla_data_proto_cc", - "//xla/hlo/ir:hlo", - 
"//xla/hlo/pass:hlo_pass", - "//xla/service:hlo_creation_utils", - "//xla/service:pattern_matcher", - "@com_google_absl//absl/algorithm:container", - "@local_tsl//tsl/platform:platform_port", - ] + mkl_deps(), -) - -cc_library( - name = "onednn_float_support", - srcs = ["onednn_float_support.cc"], - hdrs = ["onednn_float_support.h"], - copts = tsl_copts(), - deps = [ - ":onednn_contraction_rewriter", - "//xla/service:float_support", - ], -) - -cc_library( - name = "cpu_float_support", - hdrs = ["cpu_float_support.h"], - copts = tsl_copts(), - deps = [ - "//xla/backends/cpu:xnn_fusion", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/hlo/ir:hlo", - "//xla/service:float_support", - ], -) - -xla_cc_test( - name = "cpu_float_support_test", - srcs = ["cpu_float_support_test.cc"], - deps = [ - ":cpu_float_support", - "//xla:shape_util", - "//xla:xla_data_proto_cc", - "//xla/backends/cpu/codegen:target_machine_features", - "//xla/backends/cpu/codegen:target_machine_test_base", - "//xla/hlo/ir:hlo", - "//xla/hlo/testlib:verified_hlo_module", - "//xla/hlo/transforms/simplifiers:float_normalization", - "//xla/service:hlo_module_config", - "//xla/tsl/platform:statusor", - "//xla/tsl/platform:test", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:string_view", - "@com_google_googletest//:gtest_main", - ], -) - -cc_library( - name = "cpu_symbol_repository", - hdrs = ["cpu_symbol_repository.h"], - deps = [ - "//xla:xla_proto_cc", - "//xla/service:symbol_repository", - ], -) - -cc_library( - name = "cpu_executable_run_options", - hdrs = ["cpu_executable_run_options.h"], - deps = ["//xla/backends/cpu/collectives:cpu_collectives"], -) - -cc_library( - name = "metrics", - srcs = ["metrics.cc"], - hdrs = ["metrics.h"], - deps = [ - "//xla/tsl/lib/monitoring:counter", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:string_view", - "@local_tsl//tsl/platform:stacktrace", - "@local_tsl//tsl/profiler/lib:traceme", - ], -) - 
-cc_library( - name = "elemental_ir_emitter", - srcs = ["elemental_ir_emitter.cc"], - hdrs = ["elemental_ir_emitter.h"], - deps = [ - ":elemental_math_emitter", - "//xla/hlo/ir:hlo", - "//xla/service:elemental_ir_emitter", - "@com_google_absl//absl/functional:any_invocable", - "@com_google_absl//absl/status", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings:string_view", - "@com_google_absl//absl/types:span", - "@llvm-project//llvm:ir_headers", - ], -) - -cc_library( - name = "small_while_loop_hoisting_pass", - srcs = ["small_while_loop_hoisting_pass.cc"], - hdrs = ["small_while_loop_hoisting_pass.h"], - deps = [ - ":cpu_executable", - "//xla:util", - "//xla/hlo/ir:hlo", - "//xla/hlo/pass:hlo_pass", - "//xla/service:collective_ops_utils", - "//xla/service:hlo_cost_analysis", - "//xla/tsl/platform:errors", - "//xla/tsl/platform:statusor", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:string_view", - ], -) - -xla_cc_test( - name = "small_while_loop_hoisting_pass_test", - srcs = ["small_while_loop_hoisting_pass_test.cc"], - deps = [ - ":backend_config_proto_cc", - ":small_while_loop_hoisting_pass", - "//xla/hlo/ir:hlo", - "//xla/hlo/testlib:hlo_hardware_independent_test_base", - "//xla/hlo/testlib:test", - "//xla/tsl/platform:statusor", - "@com_google_absl//absl/status:statusor", - "@com_google_absl//absl/strings:string_view", - "@com_google_googletest//:gtest_main", - ], -) - -xla_cc_test( - name = "metrics_test", - srcs = ["metrics_test.cc"], - deps = [ - ":metrics", - "//xla/tests:xla_internal_test_main", - "//xla/tsl/lib/monitoring:collected_metrics", - "//xla/tsl/lib/monitoring:collection_registry", - "@local_tsl//tsl/platform:test", - ], -) - -cc_library( - name = "xnnpack_ops_rewriter", - srcs = 
["xnnpack_ops_rewriter.cc"], - hdrs = [ - "xnnpack_ops_rewriter.h", - "xnnpack_pattern_utils.h", - ], - visibility = ["//visibility:public"], - deps = [ - "//xla/hlo/ir:hlo", - "//xla:literal_comparison", - "//xla:literal_util", - "//xla:status_macros", - "//xla/hlo/pass:hlo_pass", - "//xla/service:pattern_matcher", - ], -) - -cc_library( - name = "xnnpack_ops", - srcs = ["xnnpack_ops.cc"], - hdrs = ["xnnpack_ops.h"], - visibility = ["//visibility:public"], - deps = [ - "@XNNPACK", - "@com_google_absl//absl/base", - ], -) diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc b/third_party/xla/xla/service/cpu/cpu_compiler.cc index c6d02568dfb9e4..9f2e6f5e6210d5 100644 --- a/third_party/xla/xla/service/cpu/cpu_compiler.cc +++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc @@ -599,13 +599,6 @@ absl::Status CpuCompiler::RunHloPassesThroughLayoutAssn( if (enable_xnnpack) pipeline.AddPass(); - bool use_kernel_selector = - xla::GetDebugOptionsFromFlags().xla_cpu_use_kernel_selector(); - if (use_kernel_selector) { - // This pass rewrites hlo.dot into custom calls. - pipeline.AddPass(); - } - // Expand random number generation. pipeline.AddPass(); pipeline.AddPass(RandomAlgorithm::RNG_PHILOX); @@ -846,6 +839,13 @@ absl::Status CpuCompiler::RunHloPassesAfterLayoutAssn( pipeline.AddPass(); + bool use_kernel_selector = + xla::GetDebugOptionsFromFlags().xla_cpu_use_kernel_selector(); + if (use_kernel_selector) { + // This pass rewrites hlo.dot into custom calls. + pipeline.AddPass(); + } + const int max_parallelism = module->config().intra_op_parallelism_threads() > 0 ? module->config().intra_op_parallelism_threads() @@ -878,7 +878,10 @@ absl::Status CpuCompiler::RunHloPassesAfterLayoutAssn( } // Add a fusion pass now that layout assignment is done. 
- pipeline.AddPass(); + if (getenv("SET_CPU_INS_FUSION_NOT_DUPLICATE") != NULL) + pipeline.AddPass(/*may_duplicate=*/false); + else + pipeline.AddPass(/*may_duplicate=*/true); if (is_fusion_emitters) { pipeline.AddPass(); } diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc.orig b/third_party/xla/xla/service/cpu/cpu_compiler.cc.orig deleted file mode 100644 index 4a1402c6934cba..00000000000000 --- a/third_party/xla/xla/service/cpu/cpu_compiler.cc.orig +++ /dev/null @@ -1,2720 +0,0 @@ -/* Copyright 2017 The OpenXLA Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "xla/service/cpu/cpu_compiler.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// IWYU pragma: no_include "llvm/Config/Disassemblers.def.inc" -// IWYU pragma: no_include "llvm/Config/Targets.def.inc" - -#include "absl/cleanup/cleanup.h" -#include "absl/container/flat_hash_map.h" -#include "absl/container/flat_hash_set.h" -#include "absl/log/check.h" -#include "absl/log/log.h" -#include "absl/memory/memory.h" -#include "absl/status/status.h" -#include "absl/status/statusor.h" -#include "absl/strings/str_cat.h" -#include "absl/strings/str_format.h" -#include "absl/strings/str_join.h" -#include "absl/strings/string_view.h" -#include "absl/types/span.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Bitcode/BitcodeReader.h" -#include "llvm/Bitcode/BitcodeWriter.h" -#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/GlobalVariable.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Mangler.h" -#include "llvm/IR/Metadata.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Verifier.h" -#include "llvm/Linker/Linker.h" -#include "llvm/MC/TargetRegistry.h" -#include "llvm/Object/ObjectFile.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/CodeGen.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/MemoryBufferRef.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/TargetParser/Host.h" -#include "llvm/TargetParser/Triple.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/SplitModule.h" -#include "llvm/Transforms/Utils/ValueMapper.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include 
"mlir/Dialect/Vector/IR/VectorOps.h" -#include "mlir/Pass/PassManager.h" -#include "mlir/Support/LLVM.h" -#include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h" -#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" -#include "mlir/Target/LLVMIR/Export.h" -#include "mlir/Transforms/DialectConversion.h" -#include "xla/backends/cpu/codegen/cpu_features.h" -#include "xla/backends/cpu/codegen/emitters/cpu_fusion_emitter_config.h" -#include "xla/backends/cpu/codegen/execution_engine.h" -#include "xla/backends/cpu/codegen/ir_compiler.h" -#include "xla/backends/cpu/codegen/jit_compiler.h" -#include "xla/backends/cpu/codegen/object_loader.h" -#include "xla/backends/cpu/codegen/target_machine_features.h" -#include "xla/backends/cpu/constant_allocation.h" -#include "xla/backends/cpu/runtime/function_library.h" -#include "xla/backends/cpu/runtime/thunk.h" -#include "xla/backends/cpu/runtime/thunk.pb.h" -#include "xla/backends/cpu/runtime/thunk_proto_serdes.h" -#include "xla/backends/cpu/transforms/xnn_graph_fusion.h" -#include "xla/backends/cpu/xnn_fusion.h" -#include "xla/cpu_function_runtime.h" -#include "xla/hlo/analysis/hlo_ordering.h" -#include "xla/hlo/analysis/indexed_array_analysis.h" -#include "xla/hlo/ir/dfs_hlo_visitor_with_default.h" -#include "xla/hlo/ir/hlo_casting_utils.h" -#include "xla/hlo/ir/hlo_computation.h" -#include "xla/hlo/ir/hlo_instruction.h" -#include "xla/hlo/ir/hlo_instructions.h" -#include "xla/hlo/ir/hlo_module.h" -#include "xla/hlo/ir/hlo_module_group.h" -#include "xla/hlo/ir/hlo_opcode.h" -#include "xla/hlo/ir/hlo_schedule.h" -#include "xla/hlo/pass/hlo_pass_fix.h" -#include "xla/hlo/pass/hlo_pass_pipeline.h" -#include "xla/hlo/transforms/expanders/bitcast_dtypes_expander.h" -#include "xla/hlo/transforms/expanders/cholesky_expander.h" -#include "xla/hlo/transforms/expanders/comparison_expander.h" -#include "xla/hlo/transforms/expanders/dot_decomposer.h" -#include 
"xla/hlo/transforms/expanders/dynamic_index_splitter.h" -#include "xla/hlo/transforms/expanders/eigh_expander.h" -#include "xla/hlo/transforms/expanders/logistic_expander.h" -#include "xla/hlo/transforms/expanders/optimization_barrier_expander.h" -#include "xla/hlo/transforms/expanders/qr_expander.h" -#include "xla/hlo/transforms/expanders/reduce_decomposer.h" -#include "xla/hlo/transforms/expanders/reshape_decomposer.h" -#include "xla/hlo/transforms/expanders/rng_bit_generator_expander.h" -#include "xla/hlo/transforms/expanders/rng_expander.h" -#include "xla/hlo/transforms/expanders/stochastic_convert_decomposer.h" -#include "xla/hlo/transforms/literal_canonicalizer.h" -#include "xla/hlo/transforms/operand_upcaster.h" -#include "xla/hlo/transforms/simplifiers/algebraic_simplifier.h" -#include "xla/hlo/transforms/simplifiers/batch_dot_simplification.h" -#include "xla/hlo/transforms/simplifiers/broadcast_canonicalizer.h" -#include "xla/hlo/transforms/simplifiers/conditional_canonicalizer.h" -#include "xla/hlo/transforms/simplifiers/convolution_group_converter.h" -#include "xla/hlo/transforms/simplifiers/dynamic_dimension_simplifier.h" -#include "xla/hlo/transforms/simplifiers/flatten_call_graph.h" -#include "xla/hlo/transforms/simplifiers/float_normalization.h" -#include "xla/hlo/transforms/simplifiers/gather_simplifier.h" -#include "xla/hlo/transforms/simplifiers/hlo_constant_folding.h" -#include "xla/hlo/transforms/simplifiers/hlo_dce.h" -#include "xla/hlo/transforms/simplifiers/hlo_memory_scheduler.h" -#include "xla/hlo/transforms/simplifiers/optimize_input_output_buffer_alias.h" -#include "xla/hlo/transforms/simplifiers/reduce_window_rewriter.h" -#include "xla/hlo/transforms/simplifiers/reshape_mover.h" -#include "xla/hlo/transforms/simplifiers/result_caster.h" -#include "xla/hlo/transforms/simplifiers/sort_simplifier.h" -#include "xla/hlo/transforms/simplifiers/sub_byte_normalization.h" -#include "xla/hlo/transforms/simplifiers/tree_reduction_rewriter.h" 
-#include "xla/hlo/transforms/simplifiers/tuple_simplifier.h" -#include "xla/hlo/transforms/simplifiers/zero_sized_hlo_elimination.h" -#include "xla/hlo/transforms/while_loop_trip_count_annotator.h" -#include "xla/literal_pool.h" -#include "xla/map_util.h" -#include "xla/mlir_hlo/transforms/passes.h" -#include "xla/service/all_reduce_promotion.h" -#include "xla/service/outer_dimension_propagation.h" -#include "xla/service/get_outer_batch_value_simplifier.h" -#include "xla/service/all_to_all_decomposer.h" -#include "xla/service/batched_gather_scatter_normalizer.h" -#include "xla/service/batchnorm_expander.h" -#include "xla/service/buffer_assignment.h" -#include "xla/service/call_graph.h" -#include "xla/service/call_inliner.h" -#include "xla/service/change_op_data_type.h" -#include "xla/service/compiler.h" -#include "xla/service/conditional_simplifier.h" -#include "xla/service/conditional_to_select.h" -#include "xla/service/copy_insertion.h" -#include "xla/service/cpu/buffer_info_util.h" -#include "xla/service/cpu/conv_canonicalization.h" -#include "xla/service/cpu/cpu_aot_compilation_result.h" -#include "xla/service/cpu/cpu_executable.h" -#include "xla/service/cpu/cpu_float_support.h" -#include "xla/service/cpu/cpu_instruction_fusion.h" -#include "xla/service/cpu/cpu_layout_assignment.h" -#include "xla/service/cpu/cpu_options.h" -#include "xla/service/cpu/dot_op_emitter.h" -#include "xla/service/cpu/executable.pb.h" -#include "xla/service/cpu/fusion_wrapper.h" -#include "xla/service/cpu/ir_emitter.h" -#include "xla/service/cpu/ir_emitter2.h" -#include "xla/service/cpu/metrics.h" -#include "xla/service/cpu/parallel_task_assignment.h" -#include "xla/service/cpu/runtime_symbol_generator.h" -#include "xla/service/cpu/small_while_loop_hoisting_pass.h" -#include "xla/service/cpu/thunk_emitter.h" -#include "xla/service/cpu_gpu_shape_verifier.h" -#include "xla/service/dump.h" -#include "xla/service/dynamic_dimension_inference.h" -#include "xla/service/dynamic_padder.h" 
-#include "xla/service/executable.h" -#include "xla/service/float_support.h" -#include "xla/service/gather_expander.h" -#include "xla/service/hlo.pb.h" -#include "xla/service/hlo_cost_analysis.h" -#include "xla/service/hlo_cse.h" -#include "xla/service/hlo_execution_profile.h" -#include "xla/service/hlo_module_config.h" -#include "xla/service/hlo_profile_printer_data.pb.h" -#include "xla/service/hlo_verifier.h" -#include "xla/service/layout_assignment.h" -#include "xla/service/llvm_compiler.h" -#include "xla/service/llvm_ir/llvm_command_line_options.h" -#include "xla/service/llvm_ir/llvm_util.h" -#include "xla/service/logical_buffer.h" -#include "xla/service/map_inliner.h" -#include "xla/service/scatter_expander.h" -#include "xla/service/scatter_simplifier.h" -#include "xla/service/select_and_scatter_expander.h" -#include "xla/service/sharding_propagation.h" -#include "xla/service/sharding_remover.h" -#include "xla/service/slow_operation_alarm.h" -#include "xla/service/spmd/shardy/shardy_xla_pass.h" -#include "xla/service/spmd/stateful_rng_spmd_partitioner.h" -#include "xla/service/topk_rewriter.h" -#include "xla/service/transpose_folding.h" -#include "xla/service/triangular_solve_expander.h" -#include "xla/service/while_loop_constant_sinking.h" -#include "xla/service/while_loop_invariant_code_motion.h" -#include "xla/service/while_loop_simplifier.h" -#include "xla/shape.h" -#include "xla/shape_util.h" -#include "xla/status_macros.h" -#include "xla/stream_executor/host/host_platform_id.h" -#include "xla/stream_executor/platform.h" -#include "xla/stream_executor/stream_executor.h" -#include "xla/tsl/platform/env.h" -#include "xla/tsl/platform/status.h" -#include "xla/tsl/platform/statusor.h" -#include "xla/tsl/platform/threadpool.h" -#include "xla/util.h" -#include "xla/xla.pb.h" -#include "xla/xla_data.pb.h" -#include "tsl/platform/casts.h" -#include "tsl/platform/cpu_info.h" -#include "tsl/platform/logging.h" // IWYU pragma: keep -#include 
"tsl/profiler/lib/traceme.h" -#include "tsl/profiler/lib/traceme_encode.h" - -#include "xnnpack_ops_rewriter.h" - -#ifdef TF_LLVM_X86_AVAILABLE -#include "llvm/TargetParser/X86TargetParser.h" -#endif - -#if defined(INTEL_MKL) -#include "xla/hlo/transforms/simplifiers/simplify_fp_conversions.h" -#include "xla/service/cpu/onednn_contraction_rewriter.h" -#include "xla/service/cpu/onednn_float_support.h" -#include "xla/service/cpu/onednn_ops_rewriter.h" -#endif - -namespace xla { -namespace { - -using tsl::profiler::TraceMe; -using tsl::profiler::TraceMeEncode; - -// A module identifier (prefix) for emitted LLVM modules. -static constexpr absl::string_view kXlaModuleIdentifier = "__compute_module"; - -// Returns a global (per-process) thread pool for XLA CPU compilation tasks. -static tsl::thread::ThreadPool* GetCompilationThreadPool() { - // LLVM compilation has a lot of memory-bound pointer chasing and not - // so much CPU-bound work. Based on profiling a few examples, 32 threads seems - // to be enough to achieve maximum parallel compilation speedup. - static constexpr int kMaxCompilationThreads = 32; - - // On Mac OS the default stack size is 512KiB, this is too small for compiling - // reasonably sized programs - tsl::ThreadOptions thread_options; - thread_options.stack_size = 4 * 1024 * 1024; // 4 MB - - static auto* const thread_pool = new tsl::thread::ThreadPool( - tsl::Env::Default(), thread_options, "xla-cpu-llvm-codegen", - std::min(kMaxCompilationThreads, tsl::port::MaxParallelism())); - return thread_pool; -} - -// Returns task runner that uses the global compilation thread pool. -static cpu::JitCompiler::TaskRunner GetCompilationTaskRunner() { - return [](cpu::JitCompiler::Task task) { - GetCompilationThreadPool()->Schedule(std::move(task)); - }; -} - -// For each computation in the module, determines whether that computation -// calls a custom-call function, either directly or indirectly (e.g. because it -// calls another computation that does). 
-absl::flat_hash_map -ModuleComputationsTransitivelyContainCustomCall(const HloModule& module) { - absl::flat_hash_map custom_call_map; - std::unique_ptr call_graph = CallGraph::Build(&module); - - // Can never fail because we always return an OK status from the visitor. - TF_CHECK_OK(call_graph->VisitNodes([&custom_call_map]( - const CallGraphNode& node) { - const HloComputation* computation = node.computation(); - - for (const HloInstruction* instruction : computation->instructions()) { - // The computation contains a custom-call instruction directly. - if (DynCast(instruction)) { - custom_call_map[computation] = true; - return absl::OkStatus(); - } - // The computation calls something that contains a custom-call - // instruction (directly or indirectly). This lookup relies on the call - // graph traversing callees before callers, so that the map is always - // populated for all callees at this point. - for (const HloComputation* callee : instruction->called_computations()) { - bool callee_contains_custom_call = FindOrDie(custom_call_map, callee); - if (callee_contains_custom_call) { - custom_call_map[computation] = true; - return absl::OkStatus(); - } - } - } - - custom_call_map[computation] = false; - return absl::OkStatus(); - })); - - return custom_call_map; -} - -} // namespace - -namespace cpu { - -CpuCompiler::CpuCompiler() { - // Initialize LLVM the first time the CpuCompiler is initialized. 
- static bool llvm_initialized = []() { - InitializeLLVMTarget(); - return true; - }(); - (void)llvm_initialized; -} - -absl::StatusOr>> CpuCompiler::Compile( - std::unique_ptr module_group, - std::vector> stream_execs, - const CompileOptions& options) { - for (const std::vector& se_vector : stream_execs) { - if (se_vector.size() != 1) { - return Unimplemented( - "Model partitioning not implemented for the CPU compiler"); - } - } - return LLVMCompiler::Compile(std::move(module_group), stream_execs, options); -} - -/* static */ void CpuCompiler::InitializeLLVMTarget() { - // Initialize LLVM's MC layer for the native target. - llvm::InitializeNativeTarget(); - llvm::InitializeNativeTargetAsmPrinter(); -} - -namespace { - -// This visitor records which HLO instructions should have profiling information -// recorded. -class CollectProfileCandidates : public DfsHloVisitorWithDefault { - public: - static absl::StatusOr> - GetCandidatesForComputation( - const HloComputation& computation, - const absl::flat_hash_map& - assigned_indices) { - absl::flat_hash_map hlo_to_profile_idx; - CollectProfileCandidates profile_candidates_for_computation( - &hlo_to_profile_idx, assigned_indices); - TF_RETURN_IF_ERROR(computation.Accept(&profile_candidates_for_computation)); - return hlo_to_profile_idx; - } - - private: - CollectProfileCandidates( - absl::flat_hash_map* hlo_to_profile_idx, - const absl::flat_hash_map& - assigned_indices) - : hlo_to_profile_idx_(hlo_to_profile_idx), - assigned_indices_(assigned_indices) {} - - absl::Status DefaultAction(HloInstruction* hlo_instruction) override { - hlo_to_profile_idx_->insert( - {hlo_instruction, FindOrDie(assigned_indices_, hlo_instruction)}); - return absl::OkStatus(); - } - - absl::Status HandleCall(HloInstruction* call) override { - TF_RETURN_IF_ERROR(DefaultAction(call)); - CollectProfileCandidates candidates_for_call(hlo_to_profile_idx_, - assigned_indices_); - TF_RETURN_IF_ERROR(call->to_apply()->Accept(&candidates_for_call)); - 
return absl::OkStatus(); - } - // Recurse into "conditional" so we can profile inside of it. - absl::Status HandleConditional(HloInstruction* conditional) override { - TF_RETURN_IF_ERROR(DefaultAction(conditional)); - - CollectProfileCandidates candidates_for_true(hlo_to_profile_idx_, - assigned_indices_); - TF_RETURN_IF_ERROR( - conditional->true_computation()->Accept(&candidates_for_true)); - - CollectProfileCandidates candidates_for_false(hlo_to_profile_idx_, - assigned_indices_); - TF_RETURN_IF_ERROR( - conditional->false_computation()->Accept(&candidates_for_false)); - - return absl::OkStatus(); - } - - // Skip constants, there is nothing to profile. - absl::Status HandleConstant(HloInstruction*) override { - return absl::OkStatus(); - } - // Skip parameters, they are a simple load. - absl::Status HandleParameter(HloInstruction*) override { - return absl::OkStatus(); - } - // It is important to recurse for "while" or else we risk overly coarse - // profiling information. - absl::Status HandleWhile(HloInstruction* xla_while) override { - TF_RETURN_IF_ERROR(DefaultAction(xla_while)); - - CollectProfileCandidates candidates_for_condition(hlo_to_profile_idx_, - assigned_indices_); - TF_RETURN_IF_ERROR( - xla_while->while_condition()->Accept(&candidates_for_condition)); - - CollectProfileCandidates candidates_for_body(hlo_to_profile_idx_, - assigned_indices_); - TF_RETURN_IF_ERROR(xla_while->while_body()->Accept(&candidates_for_body)); - - return absl::OkStatus(); - } - - absl::flat_hash_map* hlo_to_profile_idx_; - const absl::flat_hash_map& assigned_indices_; -}; - -// Adds the HloVerifier for CPU to the given pipeline. 
-void AddHloVerifier(HloPassPipeline* pipeline, HloVerifierOpts&& opts = {}, - bool debug_only = false) { - auto verifier_metadata = - std::make_unique(std::move(opts)); - - if (debug_only) { - pipeline->AddInvariantCheckerDebug( - std::move(verifier_metadata), "hlo verifier (debug)"); - } else { - pipeline->AddInvariantChecker(std::move(verifier_metadata), - "hlo verifier"); - } -} - -std::unique_ptr> CreateSimplificationPipeline( - absl::string_view name, HloModule* module, bool is_fusion_emitters) { - // Run the following passes to a fixed point. - auto pipeline = - std::make_unique>(std::string(name)); - AddHloVerifier(pipeline.get(), HloVerifierOpts{}, - /*debug_only=*/true); - - AlgebraicSimplifierOptions options; - options.set_enable_dot_strength_reduction(false); - // "slow" minmax means we propagate nan. - options.set_minmax_propagate_nan( - !module->config().debug_options().xla_cpu_enable_fast_min_max()); - options.set_supports_non_canonical_dots(false); - options.set_executing_on_cpu(true); - pipeline->AddPass(options); - pipeline->AddPass(); - pipeline->AddPass(); - pipeline->AddPass(GatherExpander::kEliminateSimpleGathers); - if (is_fusion_emitters) { - // Conversion to MLIR only works with simplified gathers. - pipeline->AddPass(); - } - - // Needs to happen after algebraic simplifier. - // pipeline->AddPass(); - - // BatchNormExpander can create zero-sized ops, so zero-sized HLO - // elimination has to come after that pass. - pipeline->AddPass(); - - pipeline->AddPass(); - pipeline->AddPass(); - pipeline->AddPass(); - pipeline->AddPass(); - - // TODO(b/134075051): Re-enable after b/134075051 is fixed. - // pipeline->AddPass(); - - pipeline->AddPass(); - pipeline->AddPass(); - pipeline->AddPass( - options::FoldAllConstants(module->config()) - ? 
HloConstantFolding::Level::kAggressive - : HloConstantFolding::Level::kDefault); - pipeline->AddPass(); - - return pipeline; -} - -} // namespace - -absl::Status CpuCompiler::RunHloPassesThroughLayoutAssn( - HloModule* module, bool is_aot_compile, - TargetMachineFeatures* target_machine_features) { - const int64_t num_partitions = module->config().num_partitions(); - const bool is_thunk_runtime = - module->config().debug_options().xla_cpu_use_thunk_runtime(); - const bool is_fusion_emitters = - is_thunk_runtime && - module->config().debug_options().xla_cpu_use_fusion_emitters(); - bool use_shardy_partitioner = module->config().use_shardy_partitioner(); - if (num_partitions > 1) { - if (!module->config().use_spmd_partitioning()) { - return InvalidArgument( - "num_partitions=%d but SPMD partitioning not enabled.", - num_partitions); - } - HloPassPipeline spmd_pipeline("spmd-partitioner"); - // Run some IR cleanup passes before running the SPMD partitioning - // passes. - AddHloVerifier(&spmd_pipeline); - spmd_pipeline.AddPass(); - spmd_pipeline.AddPass(); - spmd_pipeline.AddPass(); - spmd_pipeline.AddPass(); - if (use_shardy_partitioner) { - spmd_pipeline.AddPass(); - } else { - spmd_pipeline.AddPass( - /*is_spmd=*/true, /*propagate_metadata=*/false, - module->config().allow_spmd_sharding_propagation_to_output(), - module->config().allow_spmd_sharding_propagation_to_parameters()); - } - spmd_pipeline.AddPass( - num_partitions, module->config().replica_count()); - TF_RETURN_IF_ERROR(spmd_pipeline.Run(module).status()); - } else { - HloPassPipeline sharding_removal_pipeline("sharding-removal"); - AddHloVerifier(&sharding_removal_pipeline); - // Remove redundant sharding ops when partition_count == 1. - sharding_removal_pipeline.AddPass(); - // Run ShardyXLA without propagation, which enforces use-tuple-args. 
- if (use_shardy_partitioner) { - sharding_removal_pipeline.AddPass( - /*runSdyShardingPropagation=*/false); - } - sharding_removal_pipeline.AddPass(); - TF_RETURN_IF_ERROR(sharding_removal_pipeline.Run(module).status()); - } - - { - // SubbytePacker must be run before the rest of the pipeline since it - // modifies the layout of the entry computation inputs/outputs, which is - // passed to LayoutAssignment. - HloPassPipeline subbyte_packer_pipeline("SubbytePacker pipeline"); - subbyte_packer_pipeline.AddPass( - SubByteNormalization::SET_ELEMENT_SIZE); - TF_RETURN_IF_ERROR(subbyte_packer_pipeline.Run(module).status()); - } - HloPassPipeline pipeline("HLO passes through layout assignment"); - AddHloVerifier(&pipeline); - pipeline.AddPass(); - pipeline.AddPass(); - - // If XNNPACK is enabled, we only need to upcast dots that XnnDotThunk does - // not support. `upcaster_filter` returns false if the instruction shouldn't - // be processed. - // TODO(b/406806134): Stop calling XNNPACK from regular Dot thunks. All XNN - // Dots should be wrapped in an `__xnn_fusion` fusion region and processed in - // `XnnFusionThunk`. - bool xnnpack_enabled = module->config().debug_options().xla_cpu_use_xnnpack(); - auto call_library_for_dot = [&](const HloInstruction& instr) { - if (!xnnpack_enabled) return false; - DotImplementationStrategy strategy = GetDotImplementationStrategy( - module->config(), instr, *target_machine_features, - /*allow_runtime_calls=*/true); - return strategy == DotImplementationStrategy::kEigen; - }; - HloPredicate upcaster_filter = [&](const HloInstruction* instr) { - if (!call_library_for_dot(*instr)) return true; - return !IsXnnDotSupported(instr->dot_dimension_numbers(), - instr->operand(0)->shape(), - instr->operand(1)->shape(), instr->shape(), - target_machine_features) - .value_or(false); - }; - pipeline.AddPass(upcaster_filter); - - // For softmax, rewrite to custom calls with XNNPACK targets. 
- bool enable_xnnpack = - xla::GetDebugOptionsFromFlags().xla_cpu_enable_xnnpack(); - if (enable_xnnpack) - pipeline.AddPass(); - - // Expand random number generation. - pipeline.AddPass(); - pipeline.AddPass(RandomAlgorithm::RNG_PHILOX); - - // Remove zero-sized HLO from the input so that other passes don't have to - // handle it. - pipeline.AddPass(); - - pipeline.AddPass(); - - pipeline.AddPass(); - pipeline.AddPass(); - - // The TopkDecomposer generates a compare op with type=TOTALORDER and must - // run before the ComparisonExpander which rewrites such comparisons. - pipeline.AddPass([&](const HloInstruction* instr) { - return instr->opcode() == HloOpcode::kTopK; - }); - - pipeline.AddPass(); - pipeline.AddPass(); - pipeline.AddPass(); - pipeline.AddPass(); - pipeline.AddPass(); - pipeline.AddPass(); - pipeline.AddPass(); - - // Inline computations with a single call site. - pipeline.AddPass(/*single_call_site=*/true); - pipeline.AddPass(); - pipeline.AddPass(); - - // Rewrite to custom calls with target as oneDNN library calls. -#if defined(INTEL_MKL) - // AOT compiled code runs in single thread. - if (!is_aot_compile && !is_thunk_runtime) { - // Placing OneDnnOpsRewriter here to match the flax patterns - // TODO: Decide where would be the appropriate place for this pass to make - // it more generic - // TODO - intel: Name of the pass might seem redundant as oneDnnRewriter, - // but in future plan to rename oneDNNrewriter to specific to onednn matmul - pipeline.AddPass(); - } -#endif // INTEL_MKL - - // Promote BF16 all-reduce to F32. - const std::pair ar_promoted_types[] = { - {BF16, F32}}; - pipeline.AddPass(ar_promoted_types); - // Convert BF16 and F8 operations to F32 and F16 respectively so that the CPU - // backend can support BF16/F8 operations without directly implementing a - // BF16/F8 lowering for most ops. 
- CpuFloatSupport bf16_support(BF16, call_library_for_dot, - target_machine_features); -#if defined(INTEL_MKL) - OneDnnFloatSupport onednn_bf16_support(BF16); - if (!is_aot_compile && !is_thunk_runtime) { - pipeline.AddPass(&onednn_bf16_support); - } else { - pipeline.AddPass(&bf16_support); - } -#else - pipeline.AddPass(&bf16_support); -#endif - FloatSupport f8e5m2_support(F8E5M2, F16); - pipeline.AddPass(&f8e5m2_support); - FloatSupport f8e4m3_support(F8E4M3, F16); - pipeline.AddPass(&f8e4m3_support); - FloatSupport f8e4m3fn_support(F8E4M3FN, F16); - pipeline.AddPass(&f8e4m3fn_support); - FloatSupport f8e4m3b11fnuz_support(F8E4M3B11FNUZ, F16); - pipeline.AddPass(&f8e4m3b11fnuz_support); - FloatSupport f8e5m2fnuz_support(F8E5M2FNUZ, F16); - pipeline.AddPass(&f8e5m2fnuz_support); - FloatSupport f8e4m3fnuz_support(F8E4M3FNUZ, F16); - pipeline.AddPass(&f8e4m3fnuz_support); - FloatSupport f8e3m4_support(F8E3M4, F16); - pipeline.AddPass(&f8e3m4_support); - FloatSupport s4_support(S4, S8); - pipeline.AddPass(&s4_support); - FloatSupport u4_support(U4, U8); - pipeline.AddPass(&u4_support); - FloatSupport f4e2m1fn_support(F4E2M1FN, F16); - pipeline.AddPass(&f4e2m1fn_support); - FloatSupport f8e8m0fnu_support(F8E8M0FNU, F32); - pipeline.AddPass(&f8e8m0fnu_support); - // After canonicalization, there may be more batch dots that can be - // simplified. - pipeline.AddPass(); - auto cost_model = [](HloInstruction* conv) { - // We need a cost model for CPUs. Currently, do nothing. 
- return false; - }; - pipeline.AddPass( - /*should_expand=*/[](HloInstruction* conv) { return true; }, cost_model, - /*convert_batch_groups_only=*/true); - auto feature_group_should_expand = [](HloInstruction* conv) { - switch (conv->shape().element_type()) { - case F16: - case F32: - return false; - default: - return true; - } - }; - pipeline.AddPass( - feature_group_should_expand, cost_model, - /*convert_batch_groups_only=*/false); - pipeline.AddPass( - /*rewrite_training_op=*/true, - /*rewrite_inference_op=*/true, - /*rewrite_grad_op=*/true); - pipeline.AddPass(); - pipeline.AddPass(); - pipeline.AddPass(); - - if (module->config() - .debug_options() - .xla_reduce_window_rewrite_base_length() != 0) { - pipeline.AddPass>( - module->config() - .debug_options() - .xla_reduce_window_rewrite_base_length()); - } - auto dynamic_padder_options = DynamicPadderOptions(); - // TODO(pgavin): ShapeChecks were never implemented correctly by the dynamic - // padder. The mode defaults to kIgnore, and it was not overridden for nested - // computations (such as while bodies or conditional branches), and so cases - // that could not be proven would still be accepted even with compile-time - // checks enabled. Recent changes to the DynamicPadder correctly - // override the mode. However, some models have started to rely on the check - // being ignored, and they would be broken if it is enforced. - dynamic_padder_options.shape_check_mode = - DynamicDimensionInference::ShapeCheckMode::kIgnore; - pipeline.AddPass(dynamic_padder_options); - - pipeline.AddPass(target_machine_features); - - // Run fp16 dots/convs in fp32 and then downcast the result to fp16. - // Justification: - // - // - This is significantly faster on our CPUs today than true fp16. - // - It's numerically more accurate. (Granted, this is not always - // desirable, thus the ability to disable this functionality.) 
- // - It matches more closely the GPU's behavior on fp16 dot/conv, where - // accumulation happens in f32. - if (!module->config().debug_options().xla_cpu_strict_dot_conv_math()) { - pipeline.AddPass( - F16, F32, HloPredicateIsOp); - } - - pipeline.AddPass(CreateSimplificationPipeline("simplification", module, - is_fusion_emitters)); - - // Scatter expander is sandwiched between two simplification pipelines to - // enable constant folding with the original scatter instructions (which is - // more efficient than with the expanded version) but then to also ensure that - // the resulting while loops are simplified. - pipeline.AddPass(); - if (is_fusion_emitters) { - pipeline.AddPass( - ScatterExpander::kEliminateSimpleScatters); - pipeline.AddPass(); - } - if (!is_fusion_emitters || !kFusionEmitterScatterEnabled) { - pipeline.AddPass(ScatterExpander::kEliminateAllScatters); - } - - pipeline.AddPass(CreateSimplificationPipeline( - "post_scatter_expansion_simplification", module, is_fusion_emitters)); - - pipeline.AddPass(); - - pipeline.AddPass([](const HloSortInstruction* sort, int64_t) { - return sort->operand(0)->shape().element_type() == F32; - }); - pipeline.AddPass(); - pipeline.AddPass( - [&](const HloInstruction& dot, int64_t operand) -> absl::StatusOr { - if (DotImplementationCanHandleTranspose(dot, *target_machine_features, - /*allow_runtime_calls=*/true)) { - return TransposeFolding::IsRowColumnTransposeDotOperand(dot, operand); - } - return false; - }, - TransposeFolding::NeverFoldTranspose); - pipeline.AddPass(/*is_layout_sensitive=*/false); - - pipeline.AddPass(); - pipeline.AddPass(); - - // Annotate while loops with statically known trip counts, so that at run time - // we can avoid running the loop condition computations. - pipeline.AddPass(); - - // Layout assignment uses alias analysis, which requires the call graph to be - // flattened. 
- pipeline.AddPass(); - ChannelLayoutConstraints layout_constraints; - pipeline.AddPass( - module->mutable_entry_computation_layout(), target_machine_features, - &layout_constraints); - // Run SubByteNormalization because CpuLayoutAssignment may modify a - // Layout's element_size_in_bits field. - pipeline.AddPass( - SubByteNormalization::SET_ELEMENT_SIZE); - - // Finally canonicalize all literals larger than 1024 bytes in the module to - // reuse the same literal across multiple HLO modules. - pipeline.AddPass(LiteralPool::Default(), - /*min_size_bytes=*/1024); - - return pipeline.Run(module).status(); -} - -absl::Status CpuCompiler::RunHloPassesAfterLayoutAssn( - HloModule* module, bool is_aot_compile, - TargetMachineFeatures* target_machine_features, - const CompileOptions& compile_options) { - const auto& debug_options = module->config().debug_options(); - const bool is_thunk_runtime = debug_options.xla_cpu_use_thunk_runtime(); - const bool is_fusion_emitters = - is_thunk_runtime && debug_options.xla_cpu_use_fusion_emitters(); - HloPassPipeline pipeline("HLO passes after layout assignment"); - - { - HloPassPipeline normalization_pipeline("hlo normalization"); - normalization_pipeline.AddPass(); - normalization_pipeline.AddPass(); - normalization_pipeline.AddPass(); - TF_RETURN_IF_ERROR(normalization_pipeline.Run(module).status()); - } - - // After layout assignment, use a layout-sensitive verifier. - pipeline.AddPass("after layout assignment"); - AddHloVerifier(&pipeline, HloVerifierOpts{}.MakeLayoutSensitive(), - /*debug_only=*/true); - - pipeline.AddPass(); - - const int max_parallelism = - module->config().intra_op_parallelism_threads() > 0 - ? module->config().intra_op_parallelism_threads() - : tsl::port::NumSchedulableCPUs(); - -#if defined(INTEL_MKL) - // AOT compiled code runs in single thread. - if (!is_aot_compile && !is_thunk_runtime) { - // Run SimplifyFPConversions pass to simplify the BF16 pattern and make it - // easier to match. 
- // Remove `f32 -> bf16 -> f32` casts inserted by bf16 normalization. - if (debug_options.xla_allow_excess_precision()) { - pipeline.AddPass(); - } - pipeline.AddPass(max_parallelism, - compile_options.thread_pool); - // Run SimplifyFPConversions pass again to remove redundant Convert ops - // that may exist as a result of running OneDnnContractionRewriter pass. - if (debug_options.xla_allow_excess_precision()) { - pipeline.AddPass(); - } - } -#endif // INTEL_MKL - - if (module->config() - .debug_options() - .xla_cpu_experimental_xnn_graph_fusion_mode() != - DebugOptions::XNN_GRAPH_FUSION_MODE_DISABLED) { - pipeline.AddPass(); - } - - // Add a fusion pass now that layout assignment is done. - pipeline.AddPass(); - if (is_fusion_emitters) { - pipeline.AddPass(); - } - - // The LayoutAssignment pass may leave behind kCopy instructions which are - // duplicate or NOPs, so remove them with algebraic simplification and CSE. - // Run this to a fixed point. - [&pipeline = pipeline.AddPass>( - "simplification after layout assignment"), - &module] { - AddHloVerifier( - &pipeline, - HloVerifierOpts{}.MakeLayoutSensitive().WithInstructionCanChangeLayout( - LayoutAssignment::InstructionCanChangeLayout), - /*debug_only=*/true); - AlgebraicSimplifierOptions options; - options.set_is_layout_sensitive(true); - options.set_supports_non_canonical_dots(false); - options.set_enable_dot_strength_reduction(false); - // "slow" minmax means we propagate nan. - options.set_minmax_propagate_nan( - !module->config().debug_options().xla_cpu_enable_fast_min_max()); - options.set_executing_on_cpu(true); - pipeline.AddPass(options); - pipeline.AddPass(); - pipeline.AddPass(/*is_layout_sensitive=*/true); - }(); - - // Outline ops in the entry computation into calls to subcomputations. - if (!is_aot_compile) { - // Run ParallelTaskAssigner to assign parallel tasks to HLOs in module. 
- // Note this is not run for AOT because it would bring in thread pool - // and thread synchronization dependencies which would likely increase - // binary size (and most AOT applications are single-threaded). - // TODO(b/29630486) Support multi-threaded AOT. - pipeline.AddPass( - max_parallelism, ShapeSizeBytesFunction(), target_machine_features); - } - // Copy insertion should be performed immediately before IR emission to - // avoid inserting unnecessary copies (later pass adds an instruction which - // materializes the value) or missing a necessary copy (later pass removes - // an instruction which materializes a value). DCE must be run immediately - // before (and sometime after) copy insertion, to avoid dead code from - // interfering with the rewrites. - pipeline.AddPass(); - pipeline.AddPass(true); - - // If enabled we'll use more precise region based analysis for copy removal. - if (debug_options.xla_cpu_copy_insertion_use_region_analysis()) { - pipeline.AddPass( - /*can_share_buffer=*/nullptr, - /*use_region_based_live_range_analysis=*/-1); - } else { - pipeline.AddPass(); - } - - // The hoisting of small while loops is only useful in the context of the - // thunk runtime. 
- if (module->config().debug_options().xla_cpu_use_thunk_runtime()) { - TF_ASSIGN_OR_RETURN( - int64_t byte_threshold, - xla::cpu::options::SmallWhileLoopByteThreshold(module->config())); - pipeline.AddPass(byte_threshold); - } - - pipeline.AddPass(); - pipeline.AddPass(); - pipeline.AddPass(); - return pipeline.Run(module).status(); -} - -absl::Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile, - llvm::TargetMachine* target_machine, - const CompileOptions& compile_options) { - TargetMachineFeatures target_machine_features(target_machine); - TF_RETURN_IF_ERROR(RunHloPassesThroughLayoutAssn(module, is_aot_compile, - &target_machine_features)); - - return RunHloPassesAfterLayoutAssn(module, is_aot_compile, - &target_machine_features, compile_options); -} - -namespace { - -// Align buffers to XLA:CPU minimal alignment. -int64_t memory_alignment(LogicalBuffer::Color) { - return cpu_function_runtime::MinAlign(); -} - -llvm::TargetOptions CompilerTargetOptions( - const HloModuleConfig& module_config) { - llvm::TargetOptions target_options; - // Always allow FMA fusion. This increases precision instead of decreasing it. - target_options.AllowFPOpFusion = llvm::FPOpFusion::Fast; - return target_options; -} - -std::pair GetIRModuleHooks( - const HloModule& hlo_module, - const LLVMCompiler::ModuleHook& user_pre_optimization_hook, - const LLVMCompiler::ModuleHook& user_post_optimization_hook) { - // Create the IR hooks. If applicable, each IR hook does the following: - // - // * Calls the user supplied module hook. - // * Writes out the IR to a file in the output directory designated by - // --xla_dump_to - const HloModule* hlo_module_ptr = &hlo_module; - auto hook = [user_pre_optimization_hook, user_post_optimization_hook, - hlo_module_ptr](bool optimized, - const llvm::Module& llvm_module) { - const auto& user_hook = - !optimized ? 
user_pre_optimization_hook : user_post_optimization_hook; - if (user_hook) { - user_hook(llvm_module); - } - - // Include LLVM module identifier suffix in case `llvm_module` is just a - // part of the original LLVM module constructed by the XLA. - absl::string_view id = llvm_module.getModuleIdentifier(); - size_t pos = std::min(id.size(), 1 + kXlaModuleIdentifier.size()); - llvm_ir::DumpIrIfEnabled(*hlo_module_ptr, llvm_module, optimized, - /*filename_suffix=*/id.substr(pos)); - }; - return {[hook](const llvm::Module& llvm_module) { - return hook(/*optimized=*/false, llvm_module); - }, - [hook](const llvm::Module& llvm_module) { - return hook(/*optimized=*/true, llvm_module); - }}; -} - -absl::Status VerifyLlvmModule(const llvm::Module& llvm_module) { - XLA_SCOPED_LOGGING_TIMER("CpuCompiler - Running LLVM verifier"); - - std::string err; - llvm::raw_string_ostream err_stream(err); - - // verifyModule() returns true if the module is broken. - TF_RET_CHECK(!llvm::verifyModule(llvm_module, &err_stream)) - << "Invalid LLVM IR before optimizations:\n" - << err_stream.str() - << "\nThis probably indicates a bug in the HLO -> LLVM IR lowering. " - "Rerun with --xla_dump_to to get the IR. "; - return absl::OkStatus(); -} - -absl::Status CreateHloProfilingArtifacts( - const HloModule& module, - absl::flat_hash_map* - instruction_to_profile_idx, - absl::flat_hash_map* - computation_to_profile_idx, - std::unique_ptr* hlo_profile_index_map, - std::unique_ptr* hlo_profile_printer_data) { - *hlo_profile_index_map = std::make_unique(module); - const HloComputation& entry_computation = *module.entry_computation(); - - TF_ASSIGN_OR_RETURN( - *instruction_to_profile_idx, - CollectProfileCandidates::GetCandidatesForComputation( - entry_computation, - (*hlo_profile_index_map)->instruction_to_profile_idx())); - - auto shape_size_bytes = [](const Shape& shape) { - // On the cpu, opaques are pointers. 
- if (shape.IsOpaque()) { - return static_cast(sizeof(void*)); - } - return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); - }; - - HloCostAnalysis cost_analysis(shape_size_bytes); - TF_RETURN_IF_ERROR(entry_computation.Accept(&cost_analysis)); - *hlo_profile_printer_data = CreateHloProfilePrinterData( - **hlo_profile_index_map, cost_analysis, entry_computation.name()); - *computation_to_profile_idx = - (*hlo_profile_index_map)->computation_to_profile_idx(); - - return absl::OkStatus(); -} - -} // namespace - -absl::StatusOr> CpuCompiler::RunHloPasses( - std::unique_ptr module, se::StreamExecutor* /*stream_exec*/, - const CompileOptions& options) { - auto& config = module->config(); - - TF_ASSIGN_OR_RETURN( - std::unique_ptr jit_target_machine, - IrCompiler::InferTargetMachine( - CompilerTargetOptions(config), IrCompiler::GetCodeGenOptLevel(config), - CpuFeatureFromString(config.debug_options().xla_cpu_max_isa()))); - - TF_RETURN_IF_ERROR(RunHloPasses(module.get(), /*is_aot_compile=*/false, - jit_target_machine.get(), - /*compile_options=*/options)); - return std::move(module); -} - -namespace { - -static void DumpModuleToFile(const llvm::Module& llvm_module, - const llvm::object::ObjectFile& obj_file, - const HloModule& hlo_module) { - absl::string_view id = llvm_module.getModuleIdentifier(); - size_t pos = std::min(id.size(), 1 + kXlaModuleIdentifier.size()); - auto get_file_suffix = [&]() { - std::vector parts = {"obj-file"}; - parts.reserve(3); - absl::string_view middle_name = id.substr(pos); - if (!middle_name.empty()) { - parts.push_back(middle_name); - } - parts.push_back("o"); - return absl::StrJoin(parts, "."); - }; - DumpToFileInDir( - hlo_module, /*file_prefix=*/"", get_file_suffix(), - absl::string_view(obj_file.getData().data(), obj_file.getData().size())); -} - -// Post-compilation callback functor for use by SimpleOrcJIT. -// -// Dumps machine code if dumping is enabled for the module. 
-static std::function -CreateOrcJITPostCompilationHook(const HloModule* hlo_module, - std::vector* obj_files) { - return [=](const llvm::Module& llvm_module, - const llvm::object::ObjectFile& obj_file) { - if (obj_files) obj_files->push_back(obj_file.getData().str()); - - if (DumpingEnabledForHloModule(*hlo_module)) { - DumpModuleToFile(llvm_module, obj_file, *hlo_module); - } - }; -} - -struct ComputationToEmit { - HloComputation* computation; - - // Are we emitting this computation with fast-math reassociation enabled? - // We enable reassociation for reductions because it has a significant - // performance impact. - bool allow_reassociation; - - bool operator==(const ComputationToEmit& other) const { - return computation == other.computation && - allow_reassociation == other.allow_reassociation; - } - - template - friend H AbslHashValue(H h, const ComputationToEmit& c) { - return H::combine(std::move(h), c.computation, c.allow_reassociation); - } -}; - -std::vector SubcomputationEmissionOrder( - HloComputation* root) { - absl::flat_hash_set visited; - std::vector postorder; - - // agenda of (node, leave) pairs. 
- std::stack> agenda; - agenda.emplace(ComputationToEmit{root, false}, false); - while (!agenda.empty()) { - ComputationToEmit c; - bool leave; - std::tie(c, leave) = agenda.top(); - agenda.pop(); - - if (leave) { - postorder.push_back(c); - continue; - } - - if (visited.insert(c).second) { - agenda.emplace(c, true); - for (auto* instruction : c.computation->instructions()) { - bool allow_reassociation = - instruction->opcode() == HloOpcode::kAllReduce || - instruction->opcode() == HloOpcode::kReduce || - instruction->opcode() == HloOpcode::kReduceWindow; - auto cc = absl::MakeSpan(instruction->called_computations()); - for (auto it = cc.rbegin(); it != cc.rend(); ++it) { - HloComputation* called_computation = *it; - ComputationToEmit callee{ - called_computation, c.allow_reassociation || allow_reassociation}; - if (!visited.contains(callee)) { - agenda.emplace(callee, false); - } - } - } - } - } - DCHECK(!postorder.empty() && postorder.back().computation == root); - postorder.pop_back(); - return postorder; -} - -} // namespace - -// Removes unused globals and function declarations from the LLVM module. -// -// After splitting LLVM module into multiple parts, we end up with unused -// symbols in each part: external globals and function declarations. We don't -// support linking across modules added to SimpleOrcJIT, and we don't need it, -// because we never construct LLVM IR that might require cross-module linking, -// so we can just remove unused symbols from each part. 
-static void RemoveUnusedSymbols(llvm::Module& module) { - llvm::SmallVector unused_globals; - llvm::SmallVector unused_functions; - - for (llvm::GlobalVariable& gv : module.globals()) { - if (gv.use_empty()) unused_globals.push_back(&gv); - } - for (llvm::Function& f : module.functions()) { - if (f.isDeclaration() && f.use_empty()) unused_functions.push_back(&f); - } - - for (auto* gv : unused_globals) { - module.eraseGlobalVariable(gv); - } - for (auto* f : unused_functions) { - f->eraseFromParent(); - } -} - -// Clones a ThreadSafeModule from the given LLVM module in a new LLVM context. -// -// To enable parallel compilation, each LLVM module has to be owned by a -// separate LLVM context. We take each part of the original module after a -// split, and clone it into a new LLVM context. -static llvm::orc::ThreadSafeModule CloneAsThreadSafeModule( - int64_t part, std::unique_ptr module) { - TraceMe trace([&] { - return TraceMeEncode("CpuCompiler::CloneAsThreadSafeModule", - {{"part", part}}); - }); - - // There is no way to clone a module from one context to another, so we need - // to serialize the module to bitcode and parse it back into the new context. - llvm::SmallString<0> bc; - llvm::raw_svector_ostream bcos(bc); - llvm::WriteBitcodeToFile(*module, bcos); - - // Parse module back into its own LLVM context. - auto clone_context = std::make_unique(); - auto clone_module = llvm::parseBitcodeFile( - llvm::MemoryBufferRef( - llvm::StringRef(bc.data(), bc.size()), - absl::StrFormat("%s_part_%02d", kXlaModuleIdentifier, part)), - *clone_context); - - return llvm::orc::ThreadSafeModule(std::move(*clone_module), - std::move(clone_context)); -} - -namespace { -// Compiled symbols (kernels and comparators) from a single LLVM module part. -struct CompiledSymbolsPart { - std::vector kernels; - std::vector comparators; -}; -} // namespace - -// Collect IrEmitter2 symbols that got into the LLVM module part. 
We issue -// compilation tasks in parallel, and to maximize concurrency we don't issue -// separate compilation tasks that compile symbols from the same module. -static CompiledSymbolsPart CollectCompiledSymbolsPart( - const IrEmitter2& ir_emitter, const llvm::Module& module) { - CompiledSymbolsPart syms; - - auto find_kernel = - [&](llvm::StringRef name) -> std::optional { - for (auto& k : ir_emitter.kernels()) { - if (k.name == name) return k; - } - return std::nullopt; - }; - - auto find_comparator = - [&](llvm::StringRef name) -> std::optional { - for (auto& c : ir_emitter.comparators()) { - if (c.name == name) return c; - } - return std::nullopt; - }; - - for (auto& f : module.functions()) { - if (auto kernel = find_kernel(f.getName())) { - syms.kernels.push_back(*kernel); - } - if (auto comparator = find_comparator(f.getName())) { - syms.comparators.push_back(*comparator); - } - } - - return syms; -} - -// If LLVM module has large constants constructed from literals, we don't want -// to split it, because it will cause us to copy large constants across module -// parts. We should not be storing large constants in LLVM IR in a first place, -// but while we do that, we have to be extra-careful, or it leads to extremely -// long compilation times, OOMs and timeouts. -// -// TODO(b/361800465): Figure out how to avoid putting large constants into -// LLVM IR in the first place. -static bool HasLargeConstants(llvm::Module& module) { - static constexpr int kMaxConstantSize = 10000; - for (auto& g : module.globals()) { - if (!g.hasInitializer()) { - continue; - } - - llvm::Constant* initializer = g.getInitializer(); - if (auto* arr = llvm::dyn_cast(initializer->getType())) { - if (arr->getNumElements() > kMaxConstantSize) return true; - } - } - return false; -} - -inline void VlogMaxIsa(absl::string_view max_cpu_isa) { - if (VLOG_IS_ON(1) && !max_cpu_isa.empty()) { - if (tsl::port::IsX86CPU()) { - VLOG(1) << "`xla_cpu_max_isa` is set. 
Will not use features newer than: " - << max_cpu_isa; - } else { - VLOG(1) << "`xla_cpu_max_isa` is set to `" << max_cpu_isa - << "`. This flag is not supported on non-x86 CPUs yet."; - } - } -} - -// We keep HloProto in the CpuExecutable, but we don't need to keep literals -// payload in it as we use it only for debugging and memory analysis. -static void StripPayloadFromLiteralProto(HloProto& proto) { - auto* module = proto.mutable_hlo_module(); - for (auto& computation : *module->mutable_computations()) { - for (auto& instruction : *computation.mutable_instructions()) { - // We only keep literal shape to correctly estimate memory usage of the - // HLO module, but we don't need the actual literal data. - if (instruction.has_literal()) { - LiteralProto literal; - *literal.mutable_shape() = instruction.literal().shape(); - *instruction.mutable_literal() = std::move(literal); - } - } - } -} - -// Extracts the given set of kernels from the original module. -// Returns a new module with the extracted kernels. -static absl::StatusOr> ExtractKernelsFromModule( - llvm::Module* original_module, - absl::flat_hash_set kernels) { - // Clone into a new module, only keeping definitions of the relevant kernels. - auto should_clone_definition = [&kernels](const llvm::GlobalValue* gv) { - if (auto* func = llvm::dyn_cast(gv)) { - return kernels.contains(func->getName()); - } - return false; - }; - llvm::ValueToValueMapTy vmap; - std::unique_ptr module = - llvm::CloneModule(*original_module, vmap, should_clone_definition); - - // Erase the cloned symbols from the original module. 
- for (const auto& kernel_name : kernels) { - llvm::Function* to_be_removed = original_module->getFunction(kernel_name); - if (to_be_removed == nullptr) { - return Internal("Cannot remove kernel %s: cannot be found in module %s", - kernel_name, original_module->getName()); - } - to_be_removed->eraseFromParent(); - } - return module; -} - -static void AddXlaBackendExtraOptionsAsModuleFlag( - llvm::Module* llvm_module, llvm::StringRef backend_extra_options) { - auto* options_mdstring = - llvm::MDString::get(llvm_module->getContext(), backend_extra_options); - llvm_module->addModuleFlag(llvm::Module::Error, "xla_backend_extra_options", - options_mdstring); -} - -absl::StatusOr> -CpuCompiler::CompileCpuExecutable(std::unique_ptr module) { - TraceMe trace([&] { - return TraceMeEncode("CpuCompiler::CompileCpuExecutable", - {{"name", module->name()}}); - }); - - ModuleHook pre_optimization_ir_hook; - ModuleHook post_optimization_ir_hook; - std::tie(pre_optimization_ir_hook, post_optimization_ir_hook) = - GetIRModuleHooks(*module, user_pre_optimization_hook_, - user_post_optimization_hook_); - - // Compile must be thread-safe so create a new LLVM context for the module. - mlir::MLIRContext mlir_context; - auto llvm_context = std::make_unique(); - auto llvm_module = - std::make_unique(kXlaModuleIdentifier, *llvm_context); - - const DebugOptions& debug_options = module->config().debug_options(); - - // We collect compiled object files (machine code) so we can export - // CpuExecutable to an AOT compilation result. - std::vector obj_files; - - // We split LLVM module and distribute it across separate DyLibs to enable - // parallel compilation at run time. - size_t parallel_codegen_split_count = - debug_options.xla_cpu_parallel_codegen_split_count(); - VlogMaxIsa(debug_options.xla_cpu_max_isa()); - - const HloModuleConfig& config = module->config(); - - // Options for compiling LLVM IR to machine code. 
- IrCompiler::Options ir_compiler_options{ - /*optimization_level=*/IrCompiler::GetCodeGenOptLevel(config), - /*optimize_for_size=*/options::OptimizeForSizeRequested(config), - /*max_cpu_isa=*/CpuFeatureFromString(debug_options.xla_cpu_max_isa()), - /*fast_math_flags=*/llvm_ir::GetCpuFastMathFlags(config), - /*disable_expensive_passes=*/ - debug_options.xla_llvm_disable_expensive_passes(), - /*slp_vectorizer_disabled=*/options::SlpVectorizerDisabled(config), - /*disable_loop_unrolling=*/options::DisableLoopUnrolling(config), - }; - - // Compiler hooks to intercept compiled LLVM IR modules. - IrCompiler::CompilationHooks ir_compiler_hooks{ - pre_optimization_ir_hook, - post_optimization_ir_hook, - CreateOrcJITPostCompilationHook(module.get(), &obj_files), - }; - - // Definition generator to link with XLA:CPU host runtime symbols. - ExecutionEngine::DefinitionGenerator definition_generator = - [](const llvm::DataLayout& data_layout) { - return std::make_unique(data_layout); - }; - - // Options for orchestrating the JIT compilation process. 
- JitCompiler::Options jit_compiler_options{ - /*num_dylibs=*/parallel_codegen_split_count, - /*definition_generator=*/std::move(definition_generator), - }; - - std::unique_ptr ir_compiler = IrCompiler::Create( - CompilerTargetOptions(module->config()), std::move(ir_compiler_options), - std::move(ir_compiler_hooks)); - - TF_ASSIGN_OR_RETURN( - JitCompiler jit_compiler, - JitCompiler::Create(std::move(jit_compiler_options), - std::move(ir_compiler), GetCompilationTaskRunner())); - - HloComputation* entry_computation = module->entry_computation(); - absl::flat_hash_map - instruction_to_profile_idx; - absl::flat_hash_map - computation_to_profile_idx; - std::unique_ptr hlo_profile_index_map; - std::unique_ptr hlo_profile_printer_data; - if (module->config().hlo_profiling_enabled()) { - TF_RETURN_IF_ERROR(CreateHloProfilingArtifacts( - *module, &instruction_to_profile_idx, &computation_to_profile_idx, - &hlo_profile_index_map, &hlo_profile_printer_data)); - } - - // Cache these flags here since we'll want to access them after the module's - // ownership is std::moved. - const bool embed_ir_in_executable = - debug_options.xla_embed_ir_in_executable(); - - TF_ASSIGN_OR_RETURN(HloSchedule schedule, CreateHloSchedule(*module)); - TF_RETURN_IF_ERROR(module->set_schedule(schedule)); - - TF_ASSIGN_OR_RETURN(std::unique_ptr assignment, - CreateBufferAssignment(*module)); - DumpHloModuleIfEnabled(*module, *assignment, - absl::StrCat("cpu_", kAfterOptimizationsDumpName)); - - // Dump computation proto state and buffer assignment for - // GetCompiledMemoryStats results. 
- auto with_hlo_proto = [&](std::unique_ptr cpu_executable) { - auto hlo_proto = std::make_unique(); - *hlo_proto->mutable_hlo_module() = cpu_executable->module().ToProto(); - *hlo_proto->mutable_buffer_assignment() = - cpu_executable->buffer_assignment().ToProto(); - StripPayloadFromLiteralProto(*hlo_proto); - cpu_executable->set_hlo_proto(std::move(hlo_proto)); - return cpu_executable; - }; - - TargetMachineFeatures target_machine_features(jit_compiler.target_machine()); - - // TODO(ezhulenev): Once we fully migrate to Thunks current IrEmitter should - // be renamed to NestedIrEmitter and be used only for emitting nested (aka - // thread local or embedded) computations (reductions, maps, etc.). - - // (Nested) IrEmitter is responsible for building LLVM module with functions - // for all HLO computations. In thunk execution mode we only build LLVM - // functions for embedded computations (e.g. reduction computations) and all - // high-level operations (fusions, elementwise, etc.) are lowered to kernel - // functions (which are also LLVM functions, but use a HostKernel ABI). - IrEmitter nested_ir_emitter( - &mlir_context, *module, *assignment, llvm_module.get(), - std::move(instruction_to_profile_idx), - std::move(computation_to_profile_idx), - ModuleComputationsTransitivelyContainCustomCall(*module), - &target_machine_features, -#ifdef MEMORY_SANITIZER - /*emit_code_for_msan=*/true -#else - /*emit_code_for_msan=*/false -#endif - ); - - // If we use Thunk runtime then instead of emitting LLVM function for the - // entry computation we emit a sequence of thunks that implement the - // computation as a sequence of interpreted commands. - if (module->config().debug_options().xla_cpu_use_thunk_runtime()) { - // The thunk runtime manages large constants, therefore we only emit - // small ones. 
- TF_RETURN_IF_ERROR(nested_ir_emitter.EmitSmallConstantGlobals()); - - // IR emitter is responsible for building LLVM module with host kernels for - // corresponding HLO instructions (fusions, elemental instructions, etc.). - IrEmitter2 ir_emitter2(*module, llvm_module.get(), &nested_ir_emitter); - - // Thunk emitter is responsible for building a Thunk sequence that will - // resolved kernels in the compiled LLVM module and execute them together - // with Thunks implemented as library calls (e.g. oneDNN or Eigen). - ThunkEmitter thunk_emitter(ir_emitter2, *assignment, - target_machine_features, module->config()); - TF_ASSIGN_OR_RETURN(ThunkSequence thunks, - thunk_emitter.EmitEntryComputation(*module)); - - std::string ir_module_string; - if (embed_ir_in_executable) { - std::string emitter2_ir = llvm_ir::DumpToString(llvm_module.get()); - - auto thunk_kernel_fmt = [](std::string* out, - const ThunkEmitter::EmittedKernel& kernel) { - absl::StrAppend( - out, llvm_ir::DumpToString(kernel.module.getModuleUnlocked())); - }; - std::string thunks_ir = - absl::StrJoin(thunk_emitter.kernels(), "\n", thunk_kernel_fmt); - - ir_module_string = absl::StrCat(emitter2_ir, "\n", thunks_ir); - } - - TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module)); - for (const auto& [name, module] : thunk_emitter.kernels()) { - TF_RETURN_IF_ERROR(VerifyLlvmModule(*module.getModuleUnlocked())); - } - - // Some kernels have to be compiled separately because they have - // extra backend options. 
- int num_extra_functions = 0; - using BackendOptions = llvm::StringRef; - using Kernel = llvm::StringRef; - absl::flat_hash_map> - backend_extra_options_to_kernels; - for (const auto& k : ir_emitter2.kernels()) { - if (k.backend_extra_options.empty()) continue; - auto [_, inserted] = - backend_extra_options_to_kernels[k.backend_extra_options].insert( - k.name); - CHECK(inserted) << "Kernel " << k.name << " is not unique"; - num_extra_functions++; - } - const int num_extra_parts = backend_extra_options_to_kernels.size(); - // We assign one dylib to each set of kernels that have the same extra - // backend options. We do this because we work under the assumption that - // very few kernels will set extra options, and if they do, the options are - // likely to be identical. - if (num_extra_parts >= parallel_codegen_split_count) { - return Internal( - "Too many extra compilation parts due to non-default options (%d). " - "Consider reducing this number or increasing " - "parallel_codegen_split_count (%d)", - num_extra_parts, parallel_codegen_split_count); - } - - // We define the number of module parts based on the total number of - // compiled functions (kernels and comparators) that are called from thunks, - // and the maximum number of parts that we want to split the module into. - size_t num_compiled_functions = ir_emitter2.kernels().size() + - ir_emitter2.comparators().size() + - thunk_emitter.kernels().size(); - size_t num_default_parts = - std::min(num_compiled_functions - num_extra_functions, - parallel_codegen_split_count - num_extra_parts); - - // JIT compile the LLVM IR module to in-memory machine code. We split the - // module into `num_jit_dylibs` parts to allow parallel compilation. In - // practice, all of the kernel functions are independent and don't call each - // other, so we can compile each individual part in parallel. 
We split - // module preserving locals, which should guarantee that all thread local - // computations end up in the same module with the corresponding kernel. - - // Collect all compiled symbols grouped by LLVM module part, so that we can - // issue compile tasks in parallel without any interference. - std::vector compiled_parts; - - VLOG(2) << "Compile LLVM module with " << ir_emitter2.kernels().size() - << " kernels and " << ir_emitter2.comparators().size() - << " comparators"; - - int dylib_index = 0; - auto add_jit_module = [&](std::unique_ptr llvm_module_part) { - // Collect symbols that are compiled in this LLVM module part. - RemoveUnusedSymbols(*llvm_module_part); - compiled_parts.push_back( - CollectCompiledSymbolsPart(ir_emitter2, *llvm_module_part)); - - std::string dump = llvm_ir::DumpToString(llvm_module_part.get()); - VLOG(5) << "Adding compilation module:\n" << dump; - - // Clone LLVM module part into its own thread safe context. - auto tsm = - CloneAsThreadSafeModule(dylib_index, std::move(llvm_module_part)); - TF_CHECK_OK(jit_compiler.AddModule(std::move(tsm), dylib_index++)); - }; - - // If there are extra parts, compile them first, since we must - // remove the affected kernels from the LLVM module. 
- if (num_extra_parts > 0) { - TraceMe trace([&] { - return TraceMeEncode("CompileExtraKernels", - {{"num_extra_parts", num_extra_parts}}); - }); - for (const auto& [backend_extra_options, kernels] : - backend_extra_options_to_kernels) { - TF_ASSIGN_OR_RETURN( - std::unique_ptr new_module, - ExtractKernelsFromModule(llvm_module.get(), kernels)); - AddXlaBackendExtraOptionsAsModuleFlag(new_module.get(), - backend_extra_options); - add_jit_module(std::move(new_module)); - } - } - - if (HasLargeConstants(*llvm_module)) { - VLOG(3) << "Skip parallel compilation due to large constants"; - num_default_parts = 1; - } - - if (num_default_parts > 1) { - VLOG(3) << "Split LLVM module into " << num_default_parts - << " parts before codegen to enable parallel compilation" - << " (max split count: " << parallel_codegen_split_count << ")"; - - TraceMe trace([&] { - return TraceMeEncode("SplitModule", - {{"num_default_parts", num_default_parts}}); - }); - - llvm::SplitModule(*llvm_module, num_default_parts, add_jit_module, - /*PreserveLocals=*/true, /*RoundRobin=*/true); - // Free resources used by the original LLVM module. - llvm_module.reset(); - llvm_context.reset(); - - } else { - VLOG(3) << "Compile LLVM module without splitting (max split count: " - << parallel_codegen_split_count << ")"; - compiled_parts.push_back( - CollectCompiledSymbolsPart(ir_emitter2, *llvm_module)); - TF_CHECK_OK(jit_compiler.AddModule(llvm::orc::ThreadSafeModule( - std::move(llvm_module), std::move(llvm_context)))); - } - - // Collect compiled symbols from all LLVM module parts. - std::vector compiled_symbols; - - absl::flat_hash_map - symbol_type_id_to_function_type_id; - - VLOG(3) << "Adding " << thunk_emitter.kernels().size() - << " kernels to the JIT compiler"; - // Make sure we use all the "default" modules for maximum parallelism. - int num_default_so_far = dylib_index - num_extra_parts; - int kernel_dylib_index = - num_default_so_far < num_default_parts ? 
num_default_so_far : 0; - for (auto& [name, module] : thunk_emitter.kernels()) { - compiled_symbols.push_back( - FunctionLibrary::Sym(name)); - symbol_type_id_to_function_type_id.emplace( - compiled_symbols.back().type_id, SymbolProto::KERNEL); - TF_CHECK_OK(jit_compiler.AddModule(std::move(module), - num_extra_parts + kernel_dylib_index)); - // Simply roundrobin the default kernel dylibs - kernel_dylib_index = (kernel_dylib_index + 1) % num_default_parts; - } - - for (const CompiledSymbolsPart& part : compiled_parts) { - for (const IrEmitter2::KernelInfo& kernel : part.kernels) { - compiled_symbols.push_back( - FunctionLibrary::Sym(kernel.name)); - symbol_type_id_to_function_type_id.emplace( - compiled_symbols.back().type_id, SymbolProto::KERNEL); - } - for (const IrEmitter2::ComparatorInfo& comparator : part.comparators) { - compiled_symbols.push_back( - FunctionLibrary::Sym(comparator.name)); - symbol_type_id_to_function_type_id.emplace( - compiled_symbols.back().type_id, SymbolProto::COMPARATOR); - } - } - - VLOG(3) << "Collected " << compiled_symbols.size() << " compiled symbols"; - - TraceMe trace_codegen([&] { - return TraceMeEncode( - "Codegen", {{"num_default_parts", num_default_parts}, - {"num_extra_parts", num_extra_parts}, - {"num_compiled_functions", num_compiled_functions}}); - }); - - TF_ASSIGN_OR_RETURN(std::unique_ptr function_library, - std::move(jit_compiler).Compile(compiled_symbols)); - - // Create constant allocations from the buffer assignment. - TF_ASSIGN_OR_RETURN(std::vector constants, - CreateConstantAllocations(*assignment)); - - TF_ASSIGN_OR_RETURN( - auto cpu_executable, - CpuExecutable::Create(std::move(function_library), - std::move(assignment), std::move(module), - std::move(thunks), std::move(constants), - std::move(hlo_profile_printer_data), - std::move(hlo_profile_index_map))); - - // Save object files to be able to export them to AOT compilation - // result. 
- cpu_executable->set_obj_files(std::move(obj_files)); - - // Save compiled symbols to be able to export them to AOT compilation - // result. - cpu_executable->set_compiled_symbols(std::move(compiled_symbols)); - - // Save mapping between symbol type id and function type id to be able to - // export them to AOT compilation result. - cpu_executable->set_symbol_type_id_to_function_type_id( - symbol_type_id_to_function_type_id); - - if (embed_ir_in_executable) { - cpu_executable->set_ir_module_string(ir_module_string); - } - - return with_hlo_proto(std::move(cpu_executable)); - } - - TF_RETURN_IF_ERROR(nested_ir_emitter.EmitAllConstantGlobals()); - - // Each computation is a single function. Emit all embedded computations - // before the entry computation. The order of computations returned from - // SubcomputationEmissionOrder guarantees that a called computation occurs - // before a caller computation. - for (ComputationToEmit subcomputation : - SubcomputationEmissionOrder(entry_computation)) { - if (subcomputation.computation->IsFusionComputation()) { - continue; - } - TF_RETURN_IF_ERROR( - nested_ir_emitter - .EmitComputation( - subcomputation.computation, subcomputation.computation->name(), - /*is_top_level_computation=*/false, - schedule.sequence(subcomputation.computation).instructions(), - subcomputation.allow_reassociation) - .status()); - } - absl::string_view function_name_prefix = entry_computation->name().empty() - ? 
"__compute" - : entry_computation->name(); - TF_ASSIGN_OR_RETURN(llvm::Function * entry_function, - nested_ir_emitter.EmitComputation( - entry_computation, function_name_prefix, - /*is_top_level_computation=*/true, - schedule.sequence(entry_computation).instructions(), - /*allow_reassociation=*/false)); - - std::string ir_module_string; - if (embed_ir_in_executable) { - ir_module_string = llvm_ir::DumpToString(llvm_module.get()); - } - - TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module)); - - // Save entry function name before destroying LLVM module. - std::string entry_function_name = entry_function->getName().str(); - - // JIT compile the LLVM IR module to in-memory machine code. - llvm::orc::ThreadSafeModule thread_safe_module(std::move(llvm_module), - std::move(llvm_context)); - TF_RETURN_IF_ERROR(jit_compiler.AddModule(std::move(thread_safe_module))); - - using ComputeFn = std::remove_pointer_t; - TF_ASSIGN_OR_RETURN( - std::unique_ptr function_library, - std::move(jit_compiler) - .Compile({FunctionLibrary::Sym(entry_function_name)})); - - TF_ASSIGN_OR_RETURN( - auto cpu_executable, - CpuExecutable::Create(std::move(function_library), std::move(assignment), - std::move(module), entry_function_name, - std::move(hlo_profile_printer_data), - std::move(hlo_profile_index_map))); - - cpu_executable->set_obj_files(std::move(obj_files)); - - if (embed_ir_in_executable) { - cpu_executable->set_ir_module_string(ir_module_string); - } - - return with_hlo_proto(std::move(cpu_executable)); -} - -absl::StatusOr> CpuCompiler::RunBackend( - std::unique_ptr module, - [[maybe_unused]] se::StreamExecutor* stream_exec, - const CompileOptions& options) { - TraceMe trace([&] { - return TraceMeEncode("CpuCompiler::RunBackend", {{"name", module->name()}}); - }); - - VLOG(1) << "Compiling: " << module->name(); - RecordCpuCompilerStacktrace(); - XLA_SCOPED_LOGGING_TIMER( - absl::StrFormat("Compiling [%s] for CPU using JIT", module->name())); - std::string slow_compilation_msg = - 
absl::StrCat("Compiling module ", module->name()); - auto slow_compile_alarm = SlowCompilationAlarm(slow_compilation_msg); - auto llvm_options = llvm_ir::ExtractXlaBackendExtraOptions( - module->config().debug_options().xla_backend_extra_options()); - llvm_ir::LLVMCommandLineOptionsLock llvm_lock(llvm_options); - - std::unique_ptr cpu_executable; - TF_ASSIGN_OR_RETURN(cpu_executable, CompileCpuExecutable(std::move(module))); - - cpu_executable->set_debug_info( - cpu_executable->buffer_assignment().StatsString( - /*report_total_fragmentation=*/true)); - VLOG(1) << "Compilation finished"; - return std::unique_ptr(std::move(cpu_executable)); -} - -absl::StatusOr>> -CpuCompiler::CompileAheadOfTime(std::unique_ptr module_group, - const AotCompilationOptions& aot_options) { - TF_RET_CHECK(!module_group->empty()); - std::vector> modules = - module_group->ConsumeModules(); - - auto llvm_options = llvm_ir::ExtractXlaBackendExtraOptions( - modules[0]->config().debug_options().xla_backend_extra_options()); - VlogMaxIsa(modules[0]->config().debug_options().xla_cpu_max_isa()); - llvm_ir::LLVMCommandLineOptionsLock llvm_lock(llvm_options); - - // We can pass just one llvm::TargetOptions when we compile the LLVM module, - // so we bail if the configs have conflicting flags. At the moment, the only - // flags that need to be consistent are for fast-math. - for (const auto& fn_and_name : - {std::make_pair(&DebugOptions::xla_cpu_enable_fast_math, - "xla_cpu_enable_fast_math"), - std::make_pair(&DebugOptions::xla_cpu_fast_math_honor_infs, - "xla_cpu_fast_math_honor_infs"), - std::make_pair(&DebugOptions::xla_cpu_fast_math_honor_nans, - "xla_cpu_fast_math_honor_nans")}) { - // This only works because each of the method pointers above returns a - // bool. Otherwise we'd have to do some template magic. 
- const auto& field_method_ptr = fn_and_name.first; - const auto& field_name = fn_and_name.second; - bool first_module_val = - (modules[0]->config().debug_options().*field_method_ptr)(); - for (int64_t i = 0; i < modules.size(); ++i) { - bool cur_module_val = - (modules[i]->config().debug_options().*field_method_ptr)(); - if (first_module_val != cur_module_val) { - return InvalidArgument( - "All HLO module configs must have the same value for %s, but " - "module 0 and %d have different values (%d vs %d).", - field_name, i, first_module_val, cur_module_val); - } - } - } - - if (aot_options.PlatformId() != se::host::kHostPlatformId) { - return InvalidArgument("Incompatible AOT compilation platform"); - } - const CpuAotCompilationOptions& options = - static_cast(aot_options); - llvm::Triple triple(llvm::Triple::normalize(options.triple())); - std::string error; - const llvm::Target* target = - llvm::TargetRegistry::lookupTarget(triple.getTriple(), error); - if (target == nullptr) { - return Internal("TargetRegistry::lookupTarget failed: %s", error); - } - - llvm::Reloc::Model reloc_model = llvm::Reloc::Static; - llvm::PICLevel::Level pic_level = llvm::PICLevel::NotPIC; - llvm::PIELevel::Level pie_level = llvm::PIELevel::Default; - switch (options.relocation_model()) { - case CpuAotCompilationOptions::RelocationModel::Static: - reloc_model = llvm::Reloc::Static; - pic_level = llvm::PICLevel::NotPIC; - pie_level = llvm::PIELevel::Default; - break; - case CpuAotCompilationOptions::RelocationModel::SmallPic: - reloc_model = llvm::Reloc::PIC_; - pic_level = llvm::PICLevel::SmallPIC; - pie_level = llvm::PIELevel::Default; - break; - case CpuAotCompilationOptions::RelocationModel::BigPic: - reloc_model = llvm::Reloc::PIC_; - pic_level = llvm::PICLevel::BigPIC; - pie_level = llvm::PIELevel::Default; - break; - case CpuAotCompilationOptions::RelocationModel::SmallPie: - reloc_model = llvm::Reloc::PIC_; - pic_level = llvm::PICLevel::SmallPIC; - pie_level = 
llvm::PIELevel::Small; - break; - case CpuAotCompilationOptions::RelocationModel::BigPie: - reloc_model = llvm::Reloc::PIC_; - pic_level = llvm::PICLevel::BigPIC; - pie_level = llvm::PIELevel::Large; - break; - } - llvm::CodeGenOptLevel opt_level = - IrCompiler::GetCodeGenOptLevel(modules[0]->config()); - llvm::TargetOptions target_options = - CompilerTargetOptions(modules[0]->config()); - auto target_machine_builder = [&]() { - return absl::WrapUnique(target->createTargetMachine( - triple.getTriple(), options.cpu_name(), options.features(), - target_options, reloc_model, std::nullopt, opt_level)); - }; - - std::unique_ptr target_machine = - target_machine_builder(); - - // Compile must be thread-safe so create a new LLVM context for the module. - mlir::MLIRContext mlir_context; - llvm::LLVMContext llvm_context; - - std::vector> results; - for (auto& hlo_module : modules) { - VLOG(1) << "Compiling ahead-of-time: " << hlo_module->name(); - if (hlo_module->has_schedule()) { - continue; - } - - TF_RETURN_IF_ERROR(RunHloPasses(hlo_module.get(), /*is_aot_compile=*/true, - target_machine.get(), - /*dummy*/ CompileOptions{})); - - if (hlo_module->config().debug_options().xla_cpu_use_thunk_runtime()) { - TF_ASSIGN_OR_RETURN(results.emplace_back(), - CompileAheadOfTimeThunks( - std::move(hlo_module), target_machine_builder, - options, triple, pic_level, pie_level)); - } else { - TF_ASSIGN_OR_RETURN(results.emplace_back(), - CompileAheadOfTimeLegacy( - std::move(hlo_module), target_machine_builder, - options, triple, pic_level, pie_level)); - } - } - - VLOG(1) << "Compilation finished"; - return std::move(results); -} - -absl::StatusOr> -CpuCompiler::CompileAheadOfTimeLegacy( - std::unique_ptr module, - IrCompiler::TargetMachineBuilder target_machine_builder, - const CpuAotCompilationOptions& aot_options, const llvm::Triple& triple, - const llvm::PICLevel::Level& pic_level, - const llvm::PIELevel::Level& pie_level) { - TF_ASSIGN_OR_RETURN(HloSchedule schedule, - 
ScheduleModule(module.get(), BufferSizeBytesFunction())); - - // Run buffer analysis on the HLO graph. This analysis figures out which - // temporary buffers are required to run the computation. - TF_ASSIGN_OR_RETURN( - std::unique_ptr assignment, - BufferAssigner::Run(module.get(), - std::make_unique(schedule), - BufferSizeBytesFunction(), memory_alignment, - /*allocate_buffers_for_constants=*/true)); - // BufferAssignment::ToString() includes a header, so no need for us to - // print one ourselves. - if (DumpingEnabledForHloModule(*module)) { - DumpToFileInDirOrStdout(*module, "", "buffer_assignment", - assignment->ToString()); - } - DumpHloModuleIfEnabled(*module, *assignment, - absl::StrCat("cpu_", kAfterOptimizationsDumpName)); - - absl::flat_hash_map - instruction_to_profile_idx; - absl::flat_hash_map - computation_to_profile_idx; - std::unique_ptr hlo_profile_index_map; - std::unique_ptr hlo_profile_printer_data; - - if (module->config().hlo_profiling_enabled()) { - TF_RETURN_IF_ERROR(CreateHloProfilingArtifacts( - *module, &instruction_to_profile_idx, &computation_to_profile_idx, - &hlo_profile_index_map, &hlo_profile_printer_data)); - } - - TF_ASSIGN_OR_RETURN(std::unique_ptr target_machine, - target_machine_builder()); - TargetMachineFeatures target_machine_features(target_machine.get()); - std::vector buffer_infos = - CreateBufferInfosFromBufferAssignment(*module, *assignment); - HloComputation* computation = module->entry_computation(); - - // Compile must be thread-safe so create a new LLVM context for the module. 
- mlir::MLIRContext mlir_context; - auto llvm_context = std::make_unique(); - - // Set required information before emitting IR - auto llvm_module = - std::make_unique(kXlaModuleIdentifier, *llvm_context); - llvm_module->setDataLayout(target_machine->createDataLayout()); - llvm_module->setTargetTriple(triple); - if (pic_level != llvm::PICLevel::NotPIC) { - llvm_module->setPICLevel(pic_level); - } - if (pie_level != llvm::PIELevel::Default) { - llvm_module->setPIELevel(pie_level); - } - IrEmitter ir_emitter(&mlir_context, *module, *assignment, llvm_module.get(), - std::move(instruction_to_profile_idx), - std::move(computation_to_profile_idx), - ModuleComputationsTransitivelyContainCustomCall(*module), - &target_machine_features, - // TODO(b/66051036): Run full msan for AOT. - /*emit_code_for_msan=*/false); - - TF_RETURN_IF_ERROR(ir_emitter.EmitAllConstantGlobals()); - - for (ComputationToEmit subcomputation : - SubcomputationEmissionOrder(computation)) { - if (subcomputation.computation->IsFusionComputation()) { - continue; - } - TF_RETURN_IF_ERROR( - ir_emitter - .EmitComputation( - subcomputation.computation, subcomputation.computation->name(), - /*is_top_level_computation=*/false, - schedule.sequence(subcomputation.computation).instructions(), - subcomputation.allow_reassociation) - .status()); - } - const std::string& entry_point_name = aot_options.entry_point_name(); - TF_ASSIGN_OR_RETURN( - llvm::Function * entry_function, - ir_emitter.EmitComputation(computation, entry_point_name, - /*is_top_level_computation=*/true, - schedule.sequence(computation).instructions(), - /*allow_reassociation=*/false)); - - CHECK(entry_function->getName() == entry_point_name); - - ModuleHook pre_optimization_ir_hook; - ModuleHook post_optimization_ir_hook; - std::tie(pre_optimization_ir_hook, post_optimization_ir_hook) = - GetIRModuleHooks(*module, user_pre_optimization_hook_, - user_post_optimization_hook_); - - // Run the LLVM verifier over the unoptimized LLVM IR. 
If it fails, run - // the pre-optimization IR dump hook before returning. - { - absl::Status verify_status = VerifyLlvmModule(*llvm_module); - if (!verify_status.ok() && pre_optimization_ir_hook) { - pre_optimization_ir_hook(*llvm_module); - } - TF_RETURN_IF_ERROR(verify_status); - } - - auto post_codegen_hook = [&](const llvm::Module& llvm_module, - const llvm::object::ObjectFile& obj_file) { - if (!DumpingEnabledForHloModule(*module)) { - return; - } - DumpModuleToFile(llvm_module, obj_file, *module); - }; - - DebugOptions debug_options = module->config().debug_options(); - IrCompiler::Options ir_compiler_options = { - /*optimization_level=*/target_machine->getOptLevel(), - /*optimize_for_size=*/ - options::OptimizeForSizeRequested(module->config()), - /*max_cpu_isa=*/CpuFeatureFromString(debug_options.xla_cpu_max_isa()), - /*fast_math_flags=*/llvm_ir::GetCpuFastMathFlags(module->config()), - /*disable_expensive_passes=*/ - debug_options.xla_llvm_disable_expensive_passes(), - /*disable_slp_vectorizer=*/ - options::SlpVectorizerDisabled(module->config()), - /*disable_loop_unrolling=*/ - options::DisableLoopUnrolling(module->config()), - /*dfsan_enabled=*/aot_options.sanitize_dataflow(), - /*dfsan_abilists_enabled=*/aot_options.sanitize_abilists_dataflow()}; - - IrCompiler::CompilationHooks ir_compiler_hooks = { - pre_optimization_ir_hook, - post_optimization_ir_hook, - post_codegen_hook, - }; - - IrCompiler ir_compiler(std::move(target_machine_builder), - std::move(ir_compiler_options), - std::move(ir_compiler_hooks)); - - std::unique_ptr object_file = - cantFail(ir_compiler(*llvm_module)); - ObjectFileData object_file_data(object_file->getBufferStart(), - object_file->getBufferEnd()); - - TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice, - assignment->GetUniqueTopLevelOutputSlice()); - - return std::make_unique( - std::move(object_file_data), std::move(buffer_infos), - result_slice.index(), std::move(module), - std::move(hlo_profile_printer_data)); 
-} - -absl::StatusOr> -CpuCompiler::CompileAheadOfTimeThunks( - std::unique_ptr module, - IrCompiler::TargetMachineBuilder target_machine_builder, - const CpuAotCompilationOptions& aot_options, const llvm::Triple& triple, - const llvm::PICLevel::Level& pic_level, - const llvm::PIELevel::Level& pie_level) { - TraceMe trace([&] { - return TraceMeEncode("CpuCompiler::CompileAheadOfTimeThunks", - {{"name", module->name()}}); - }); - // Compile must be thread-safe so create a new LLVM context for the module. - mlir::MLIRContext mlir_context; - auto llvm_context = std::make_unique(); - - const DebugOptions& debug_options = module->config().debug_options(); - - TF_ASSIGN_OR_RETURN(HloSchedule schedule, CreateHloSchedule(*module)); - TF_RETURN_IF_ERROR(module->set_schedule(schedule)); - - TF_ASSIGN_OR_RETURN(std::unique_ptr assignment, - CreateBufferAssignment(*module)); - DumpHloModuleIfEnabled(*module, *assignment, - absl::StrCat("cpu_aot_", kAfterOptimizationsDumpName)); - - // TODO profiling related, probably delete this - absl::flat_hash_map - instruction_to_profile_idx; - absl::flat_hash_map - computation_to_profile_idx; - std::unique_ptr hlo_profile_index_map; - std::unique_ptr hlo_profile_printer_data; - if (module->config().hlo_profiling_enabled()) { - TF_RETURN_IF_ERROR(CreateHloProfilingArtifacts( - *module, &instruction_to_profile_idx, &computation_to_profile_idx, - &hlo_profile_index_map, &hlo_profile_printer_data)); - } - // probably delete this end - - TF_ASSIGN_OR_RETURN(std::unique_ptr target_machine, - target_machine_builder()); - TargetMachineFeatures target_machine_features(target_machine.get()); - - auto llvm_module = - std::make_unique(kXlaModuleIdentifier, *llvm_context); - - llvm_module->setDataLayout(target_machine->createDataLayout()); - llvm_module->setTargetTriple(triple); - if (pic_level != llvm::PICLevel::NotPIC) { - llvm_module->setPICLevel(pic_level); - } - if (pie_level != llvm::PIELevel::Default) { - llvm_module->setPIELevel(pie_level); - 
} - - // Emitting part - // TODO(ezhulenev): Once we fully migrate to Thunks current IrEmitter should - // be renamed to NestedIrEmitter and be used only for emitting nested (aka - // thread local or embedded) computations (reductions, maps, etc.). - - // (Nested) IrEmitter is responsible for building LLVM module with functions - // for all HLO computations. In thunk execution mode we only build LLVM - // functions for embedded computations (e.g. reduction computations) and all - // high-level operations (fusions, elementwise, etc.) are lowered to kernel - // functions (which are also LLVM functions, but use a HostKernel ABI). - IrEmitter nested_ir_emitter( - &mlir_context, *module, *assignment, llvm_module.get(), - std::move(instruction_to_profile_idx), - std::move(computation_to_profile_idx), - ModuleComputationsTransitivelyContainCustomCall(*module), - &target_machine_features, - // TODO(b/66051036): Run full msan for AOT. - /*emit_code_for_msan=*/false); - - // The thunk runtime manages large constants, therefore we only emit - // small ones. - TF_RETURN_IF_ERROR(nested_ir_emitter.EmitSmallConstantGlobals()); - - // IR emitter is responsible for building LLVM module with host kernels for - // corresponding HLO instructions (fusions, elemental instructions, etc.). - IrEmitter2 ir_emitter2(*module, llvm_module.get(), &nested_ir_emitter); - - // Thunk emitter is responsible for building a Thunk sequence that will - // resolved kernels in the compiled LLVM module and execute them together - // with Thunks implemented as library calls (e.g. oneDNN or Eigen). - ThunkEmitter thunk_emitter(ir_emitter2, *assignment, target_machine_features, - module->config()); - TF_ASSIGN_OR_RETURN(ThunkSequence thunks, - thunk_emitter.EmitEntryComputation(*module)); - - // Cache these flags here since we'll want to access them after the module's - // ownership is std::moved. 
- const bool embed_ir_in_executable = - debug_options.xla_embed_ir_in_executable(); - - std::string ir_module_string; - if (embed_ir_in_executable) { - std::string emitter2_ir = llvm_ir::DumpToString(llvm_module.get()); - - auto thunk_kernel_fmt = [](std::string* out, - const ThunkEmitter::EmittedKernel& kernel) { - absl::StrAppend(out, - llvm_ir::DumpToString(kernel.module.getModuleUnlocked())); - }; - std::string thunks_ir = - absl::StrJoin(thunk_emitter.kernels(), "\n", thunk_kernel_fmt); - - ir_module_string = absl::StrCat(emitter2_ir, "\n", thunks_ir); - } - - TF_RETURN_IF_ERROR(VerifyLlvmModule(*llvm_module)); - for (const auto& [name, module] : thunk_emitter.kernels()) { - TF_RETURN_IF_ERROR(VerifyLlvmModule(*module.getModuleUnlocked())); - } - - // Compilation part - ModuleHook pre_optimization_ir_hook; - ModuleHook post_optimization_ir_hook; - std::tie(pre_optimization_ir_hook, post_optimization_ir_hook) = - GetIRModuleHooks(*module, user_pre_optimization_hook_, - user_post_optimization_hook_); - - std::vector obj_files; - auto post_codegen_hook = [&](const llvm::Module& llvm_module, - const llvm::object::ObjectFile& obj_file) { - obj_files.push_back(obj_file.getData().str()); - if (!DumpingEnabledForHloModule(*module)) { - return; - } - absl::string_view id = llvm_module.getModuleIdentifier(); - size_t pos = std::min(id.size(), 1 + kXlaModuleIdentifier.size()); - DumpToFileInDir( - *module, /*file_prefix=*/"", - /*file_suffix=*/absl::StrCat("obj-file.", id.substr(pos), ".o"), - absl::string_view(obj_file.getData().data(), - obj_file.getData().size())); - }; - - IrCompiler::Options ir_compiler_options = { - /*optimization_level=*/target_machine->getOptLevel(), - /*optimize_for_size=*/ - options::OptimizeForSizeRequested(module->config()), - /*max_cpu_isa=*/CpuFeatureFromString(debug_options.xla_cpu_max_isa()), - /*fast_math_flags=*/llvm_ir::GetCpuFastMathFlags(module->config()), - /*disable_expensive_passes=*/ - 
module->config().debug_options().xla_llvm_disable_expensive_passes(), - /*disable_slp_vectorizer=*/ - options::SlpVectorizerDisabled(module->config()), - /*disable_loop_unrolling=*/ - options::DisableLoopUnrolling(module->config()), - /*dfsan_enabled=*/aot_options.sanitize_dataflow(), - /*dfsan_abilists_enabled=*/aot_options.sanitize_abilists_dataflow()}; - - IrCompiler::CompilationHooks ir_compiler_hooks = { - pre_optimization_ir_hook, - post_optimization_ir_hook, - post_codegen_hook, - }; - - IrCompiler ir_compiler(std::move(target_machine_builder), - std::move(ir_compiler_options), - std::move(ir_compiler_hooks)); - - // For simplicity no parallel compilation is used. - std::vector compiled_parts; - compiled_parts.push_back( - CollectCompiledSymbolsPart(ir_emitter2, *llvm_module)); - - // Collect compiled symbols from all LLVM module parts. - std::vector compiled_symbols; - - absl::flat_hash_map - symbol_type_id_to_function_type_id; - - VLOG(3) << "Compiling " << thunk_emitter.kernels().size() - << " thunk kernels."; - - // We have to clone the LLVM module into a local context to be able to link - // it with the other modules. This enables us to have one object file for all - // the kernels. - auto copy_llvm_module_to_local_context = - [&llvm_context](llvm::Module& module) { - // There is no way to clone a module from one context to another, so we - // need to serialize the module to bitcode and parse it back into the - // new context. - llvm::SmallString<0> bc; - llvm::raw_svector_ostream bcos(bc); - llvm::WriteBitcodeToFile(module, bcos); - - // Parse module back into its own LLVM context. 
- auto clone_module = llvm::parseBitcodeFile( - llvm::MemoryBufferRef(llvm::StringRef(bc.data(), bc.size()), - absl::StrFormat("%s_cloned_to_local_context", - kXlaModuleIdentifier)), - *llvm_context); - - return clone_module; - }; - - llvm::Linker linker(*llvm_module); - - for (auto& [name, module] : thunk_emitter.kernels()) { - compiled_symbols.push_back( - FunctionLibrary::Sym(name)); - symbol_type_id_to_function_type_id.emplace(compiled_symbols.back().type_id, - SymbolProto::KERNEL); - auto cloned_module = - copy_llvm_module_to_local_context(*module.getModuleUnlocked()); - if (!cloned_module) { - return Internal("Failed to clone LLVM module."); - } - // Match data layouts to avoid warning messages. - cloned_module->get()->setDataLayout(llvm_module->getDataLayout()); - linker.linkInModule(std::move(cloned_module.get())); - } - - cantFail(ir_compiler(*llvm_module)); - - for (const CompiledSymbolsPart& part : compiled_parts) { - for (const IrEmitter2::KernelInfo& kernel : part.kernels) { - compiled_symbols.push_back( - FunctionLibrary::Sym(kernel.name)); - symbol_type_id_to_function_type_id.emplace( - compiled_symbols.back().type_id, SymbolProto::KERNEL); - } - for (const IrEmitter2::ComparatorInfo& comparator : part.comparators) { - compiled_symbols.push_back( - FunctionLibrary::Sym(comparator.name)); - symbol_type_id_to_function_type_id.emplace( - compiled_symbols.back().type_id, SymbolProto::COMPARATOR); - } - } - - VLOG(3) << "Collected " << compiled_symbols.size() << " compiled symbols"; - - // Create constant allocations from the buffer assignment. - TF_ASSIGN_OR_RETURN(std::vector constants, - CreateConstantAllocations(*assignment)); - - TF_ASSIGN_OR_RETURN( - auto cpu_executable, - CpuExecutable::Create( - /*function_library=*/nullptr, // NOTE: We don't need to generate a - // function library as the only purpose - // of this executable is to get - // exported. 
- std::move(assignment), std::move(module), std::move(thunks), - std::move(constants), std::move(hlo_profile_printer_data), - std::move(hlo_profile_index_map))); - - // Save compiled symbols to be able to export them to AOT compilation - // result. - cpu_executable->set_compiled_symbols(std::move(compiled_symbols)); - - // Save mapping between symbol type id and function type id to be able to - // export them to AOT compilation result. - cpu_executable->set_symbol_type_id_to_function_type_id( - symbol_type_id_to_function_type_id); - - if (embed_ir_in_executable) { - cpu_executable->set_ir_module_string(ir_module_string); - } - - // Dump computation proto state and buffer assignment for - // GetCompiledMemoryStats results. - auto with_hlo_proto = [&](std::unique_ptr cpu_executable) { - auto hlo_proto = std::make_unique(); - *hlo_proto->mutable_hlo_module() = cpu_executable->module().ToProto(); - *hlo_proto->mutable_buffer_assignment() = - cpu_executable->buffer_assignment().ToProto(); - StripPayloadFromLiteralProto(*hlo_proto); - cpu_executable->set_hlo_proto(std::move(hlo_proto)); - return cpu_executable; - }; - - cpu_executable = with_hlo_proto(std::move(cpu_executable)); - - const ThunkSequence& thunk_sequence = - cpu_executable->thunks().thunk_sequence(); - - std::unique_ptr executable_hlo_profile_printer_data = - cpu_executable->module().config().hlo_profiling_enabled() - ? 
std::make_unique( - cpu_executable->hlo_profile_printer_data()) - : nullptr; - - return CpuAotCompilationResultThunks::Create( - &cpu_executable->module(), &cpu_executable->buffer_assignment(), - cpu_executable->module_name(), std::move(obj_files), - cpu_executable->get_compiled_symbols_proto(), thunk_sequence, - std::move(*cpu_executable).consume_function_library().release(), - std::move(executable_hlo_profile_printer_data)); -} - -se::Platform::Id CpuCompiler::PlatformId() const { - return se::host::kHostPlatformId; -} - -HloCostAnalysis::ShapeSizeFunction CpuCompiler::ShapeSizeBytesFunction() const { - return CpuExecutable::ShapeSizeBytes; -} - -namespace { - -// TODO(basioli): This should be removed once new runtime is implemented, and -// CpuAotCompilationResult will be the only implementation of -// AotCompilationResult. This is still used as it allows us to `Export` and -// subsequently load both runtimes. - -// This is a result of exporting JIT compiled -// CpuExecutable to AOT compilation result that can be saved on disk and shipped -// over the wire. 
-class CpuExecutableAotCompilationResult : public AotCompilationResult { - public: - static absl::StatusOr> - Create(const HloModule* hlo_module, const BufferAssignment* buffer_assignment, - absl::string_view function_name, std::vector obj_files, - std::vector symbols, const ThunkSequence* thunks, - CompilationResultProto::ObjFileKind obj_file_kind) { - std::optional thunk_proto; - - if (thunks != nullptr) { - ThunkSequenceSerDesProtobuf thunk_sequence_serdes( - &buffer_assignment->Allocations()); - TF_ASSIGN_OR_RETURN(thunk_proto, thunk_sequence_serdes.ToProto(*thunks)); - } - - return absl::WrapUnique(new CpuExecutableAotCompilationResult( - hlo_module, buffer_assignment, function_name, std::move(obj_files), - std::move(symbols), thunk_proto, obj_file_kind)); - } - - absl::StatusOr SerializeAsString() const override { - return proto_.SerializeAsString(); - } - - static absl::StatusOr> - FromString(const std::string& serialized) { - CompilationResultProto proto; - if (!proto.ParseFromString(serialized)) { - return Internal( - "Failed to parse serialized CpuExecutableAotCompilationResult."); - } - - TF_ASSIGN_OR_RETURN( - std::unique_ptr module, - HloModule::CreateFromProtoWithConfig(proto.hlo_module())); - - return std::unique_ptr( - new CpuExecutableAotCompilationResult(proto, std::move(module))); - } - - absl::StatusOr> LoadExecutable( - Compiler* compiler, - const se::StreamExecutor* stream_exec) const&& override; - - const HloModule* optimized_module() const override { return module_.get(); } - - std::unique_ptr consume_optimized_module() override { - return std::move(module_); - } - - private: - CpuExecutableAotCompilationResult( - const HloModule* hlo_module, const BufferAssignment* buffer_assignment, - absl::string_view function_name, std::vector obj_files, - std::vector symbols, - const std::optional& thunks, - CompilationResultProto::ObjFileKind obj_file_kind) { - *proto_.mutable_hlo_module()->mutable_hlo_module() = hlo_module->ToProto(); - 
*proto_.mutable_hlo_module()->mutable_config() = - hlo_module->config().ToProto(); - *proto_.mutable_buffer_assignment() = buffer_assignment->ToProto(); - proto_.set_entry_function_name(std::string(function_name)); - for (std::string& obj_file : obj_files) { - proto_.add_obj_files(std::move(obj_file)); - } - - for (const auto& symbol : symbols) { - auto* symbol_proto = proto_.add_compiled_symbols(); - *symbol_proto = symbol; - } - proto_.set_obj_files_kind(obj_file_kind); - module_ = hlo_module->Clone(); - - if (thunks.has_value()) { - ThunkSequenceSerDesProtobuf thunk_sequence_serdes( - &buffer_assignment->Allocations()); - *proto_.mutable_thunk_sequence() = *thunks; - } - } - - explicit CpuExecutableAotCompilationResult(CompilationResultProto proto, - std::unique_ptr module) - : proto_(std::move(proto)), module_(std::move(module)) {} - - CompilationResultProto proto_; - std::unique_ptr module_; -}; - -} // namespace - -absl::StatusOr> -CpuExecutableAotCompilationResult::LoadExecutable( - Compiler* compiler, const se::StreamExecutor* stream_exec) const&& { - // Recreate HloModule from proto. - TF_ASSIGN_OR_RETURN( - std::unique_ptr module, - HloModule::CreateFromProtoWithConfig(proto_.hlo_module())); - - VLOG(2) << "Load XLA:CPU executable for module: " << module->name(); - - // Recreate BufferAssignment from proto. - TF_ASSIGN_OR_RETURN( - std::unique_ptr buffer_assignment, - BufferAssignment::FromProto(proto_.buffer_assignment(), module.get(), - compiler->BufferSizeBytesFunction(), - /*can_share_buffer=*/nullptr)); - - const DebugOptions& debug_options = module->config().debug_options(); - VlogMaxIsa(debug_options.xla_cpu_max_isa()); - const HloModuleConfig& config = module->config(); - - // Infer target machine from the current host CPU. 
- TF_ASSIGN_OR_RETURN( - std::unique_ptr target_machine, - IrCompiler::InferTargetMachine( - std::move(CompilerTargetOptions(module->config())), - IrCompiler::GetCodeGenOptLevel(config), - CpuFeatureFromString(debug_options.xla_cpu_max_isa()))); - - // Definition generator to link with XLA:CPU host runtime symbols. - ExecutionEngine::DefinitionGenerator definition_generator = - [](const llvm::DataLayout& data_layout) { - return std::make_unique(data_layout); - }; - - ObjectLoader object_loader(/*num_dylibs=*/1, - target_machine->createDataLayout(), - definition_generator); - - for (size_t i = 0; i < object_loader.num_dylibs(); ++i) { - object_loader.dylib(i).value()->addGenerator( - std::make_unique( - target_machine->createDataLayout())); - } - - // We might have an XLA:CPU executable that has only runtime thunks and - // doesn't have any corresponding object files, and it's absolutely fine. - VLOG(2) << "Load XLA:CPU executable from " << proto_.obj_files_size() - << " object files; entry_function_name=" - << proto_.entry_function_name(); - - size_t obj_file_index = 0; - for (auto& obj_file : proto_.obj_files()) { - llvm::StringRef data(obj_file.data(), obj_file.size()); - TF_RETURN_IF_ERROR( - object_loader.AddObjFile(llvm::MemoryBuffer::getMemBuffer( - data, absl::StrCat(proto_.entry_function_name(), "_", - obj_file_index++)))); - } - - std::unique_ptr cpu_executable; - - if (proto_.obj_files_kind() == CompilationResultProto::KERNELS) { - ThunkSequenceSerDesProtobuf thunk_sequence_serdes( - &buffer_assignment->Allocations()); - TF_ASSIGN_OR_RETURN( - std::unique_ptr thunks, - thunk_sequence_serdes.FromProto(proto_.thunk_sequence())); - - VLOG(3) << "Loaded " << thunks->size() << " thunks."; - - std::vector compiled_symbols; - - for (const auto& symbol_proto : proto_.compiled_symbols()) { - switch (symbol_proto.function_type_id()) { - case SymbolProto::KERNEL: - compiled_symbols.push_back( - FunctionLibrary::Sym( - symbol_proto.name())); - break; - case 
SymbolProto::COMPARATOR: - compiled_symbols.push_back( - FunctionLibrary::Sym( - symbol_proto.name())); - break; - default: - return Internal( - "Unknown function type id %s", - SymbolProto_FunctionTypeId_Name(symbol_proto.function_type_id())); - } - } - - VLOG(3) << "Collected " << compiled_symbols.size() << " compiled symbols"; - TF_ASSIGN_OR_RETURN(std::unique_ptr function_library, - std::move(object_loader).Load(compiled_symbols)); - - // Create constant allocations from the buffer assignment. - TF_ASSIGN_OR_RETURN(std::vector constants, - CreateConstantAllocations(*buffer_assignment)); - - TF_ASSIGN_OR_RETURN( - cpu_executable, - CpuExecutable::Create(std::move(function_library), - std::move(buffer_assignment), std::move(module), - std::move(*thunks), std::move(constants), nullptr, - nullptr)); - - } else if (proto_.obj_files_kind() == CompilationResultProto::CLASSIC) { - // Create a "classic" CPU executable. - using ComputeFn = std::remove_pointer_t; - TF_ASSIGN_OR_RETURN(std::unique_ptr function_library, - std::move(object_loader) - .Load({FunctionLibrary::Sym( - proto_.entry_function_name())})); - - TF_ASSIGN_OR_RETURN( - cpu_executable, - CpuExecutable::Create(std::move(function_library), - std::move(buffer_assignment), std::move(module), - proto_.entry_function_name(), nullptr, nullptr)); - - } else { - return Internal("Unknown obj file kind"); - } - - // Dump computation proto state and buffer assignment for - // GetCompiledMemoryStats results. 
- auto hlo_proto = std::make_unique(); - *hlo_proto->mutable_hlo_module() = cpu_executable->module().ToProto(); - *hlo_proto->mutable_buffer_assignment() = - cpu_executable->buffer_assignment().ToProto(); - cpu_executable->set_hlo_proto(std::move(hlo_proto)); - - return cpu_executable; -} - -absl::StatusOr> CpuCompiler::Export( - Executable* executable) const { - auto* cpu_executable = tensorflow::down_cast(executable); - if (!cpu_executable) - return Internal("Could not downcast Executable to CpuExecutable"); - - // Export object files for all dylibs. - std::vector obj_files; - for (const auto& obj_file : cpu_executable->obj_files()) { - obj_files.push_back(std::string(obj_file)); - } - - auto kind = cpu_executable->has_thunks() ? CompilationResultProto::KERNELS - : CompilationResultProto::CLASSIC; - const ThunkSequence* thunk_sequence = - cpu_executable->has_thunks() ? &cpu_executable->thunks().thunk_sequence() - : nullptr; - - std::vector compiled_symbols = - cpu_executable->get_compiled_symbols_proto(); - - return CpuExecutableAotCompilationResult::Create( - &cpu_executable->module(), &cpu_executable->buffer_assignment(), - cpu_executable->module_name(), std::move(obj_files), - std::move(compiled_symbols), thunk_sequence, kind); -} - -absl::StatusOr> -CpuCompiler::LoadAotCompilationResult( - const std::string& serialized_aot_result) { - return CpuExecutableAotCompilationResult::FromString(serialized_aot_result); -} - -absl::StatusOr CpuCompiler::CreateHloSchedule( - const HloModule& hlo_module) const { - // Select a memory scheduler optimized for concurrency vs minimal memory. - auto scheduler = - hlo_module.config() - .debug_options() - .xla_cpu_enable_concurrency_optimized_scheduler() - ? std::unique_ptr( - std::make_unique(BufferSizeBytesFunction())) - : std::make_unique(BufferSizeBytesFunction()); - - // Select an order for emitting the HLO instructions for each - // computation. 
Using this sequence enables tighter buffer liveness analysis - // and reduced memory usage (as compared to using `DependencyHloOrdering`). - return ScheduleModule(&hlo_module, *scheduler); -} - -absl::StatusOr> -CpuCompiler::CreateBufferAssignment(const HloModule& module) const { - // Run buffer allocation on the HLO graph. - return BufferAssigner::Run( - &module, std::make_unique(module.schedule()), - BufferSizeBytesFunction(), memory_alignment, - /*allocate_buffers_for_constants=*/true); -} - -} // namespace cpu -} // namespace xla diff --git a/third_party/xla/xla/service/cpu/cpu_instruction_fusion.h b/third_party/xla/xla/service/cpu/cpu_instruction_fusion.h index 68da3fd55523df..a04432292b43f1 100644 --- a/third_party/xla/xla/service/cpu/cpu_instruction_fusion.h +++ b/third_party/xla/xla/service/cpu/cpu_instruction_fusion.h @@ -31,8 +31,8 @@ namespace cpu { class CpuInstructionFusion : public InstructionFusion { public: - CpuInstructionFusion() - : InstructionFusion(CpuInstructionFusion::IsExpensive) {} + CpuInstructionFusion(bool may_duplicate) + : InstructionFusion(CpuInstructionFusion::IsExpensive, may_duplicate) {} ~CpuInstructionFusion() override = default; using HloPassInterface::Run;