diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl
index 85edecae8c67a6..2b1aa738a475cd 100644
--- a/tensorflow/workspace2.bzl
+++ b/tensorflow/workspace2.bzl
@@ -29,6 +29,7 @@ load("@local_xla//third_party/nvshmem:workspace.bzl", nvshmem = "repo")
 load("@local_xla//third_party/pybind11_abseil:workspace.bzl", pybind11_abseil = "repo")
 load("@local_xla//third_party/pybind11_bazel:workspace.bzl", pybind11_bazel = "repo")
 load("@local_xla//third_party/robin_map:workspace.bzl", robin_map = "repo")
+load("@local_xla//third_party/openblas:workspace.bzl", openblas = "repo")
 load("@rules_jvm_external//:defs.bzl", "maven_install")
 load("@tf_runtime//:dependencies.bzl", "tfrt_dependencies")
 load("//tensorflow/tools/def_file_filter:def_file_filter_configure.bzl", "def_file_filter_configure")
@@ -100,6 +101,7 @@ def _initialize_third_party():
     tensorrt()
     nvshmem()
     triton()
+    openblas()

     # copybara: tsl vendor
diff --git a/third_party/xla/third_party/openblas/BUILD b/third_party/xla/third_party/openblas/BUILD
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/third_party/xla/third_party/openblas/openblas.BUILD b/third_party/xla/third_party/openblas/openblas.BUILD
new file mode 100644
index 00000000000000..6d36eec9e6b0d7
--- /dev/null
+++ b/third_party/xla/third_party/openblas/openblas.BUILD
@@ -0,0 +1,17 @@
+genrule(
+    name = "build_openblas",
+    srcs = glob(["**"], exclude = ["*.a"]),
+    outs = ["libopenblas.a"],
+    cmd = """
+        cd $$(dirname $(location //:README.md)) && \
+        make NO_SHARED=1 ONLY_CBLAS=1 TARGET=ARMV8 ARCH=arm64 && \
+        cd - && \
+        cp $$(dirname $(location //:README.md))/libopenblas_*.a $@
+    """,
+)
+
+cc_import(
+    name = "openblas",
+    static_library = "libopenblas.a",
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/xla/third_party/openblas/workspace.bzl b/third_party/xla/third_party/openblas/workspace.bzl
new file mode 100644
index 00000000000000..74367fa1a8801d
--- /dev/null
+++ b/third_party/xla/third_party/openblas/workspace.bzl
@@ -0,0 +1,10 @@
+load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
+
+def repo():
+    tf_http_archive(
+        name = "openblas",
+        strip_prefix = "OpenBLAS-8795fc7985635de1ecf674b87e2008a15097ffab",
+        sha256 = "f5ff825b3a82417d47c2ba97606ce8a5d868f863e555025f5d4112e6dfd62e2f",
+        urls = tf_mirror_urls("https://github.com/OpenMathLib/OpenBLAS/archive/8795fc7985635de1ecf674b87e2008a15097ffab.tar.gz"),
+        build_file = "//third_party/openblas:openblas.BUILD",
+    )
diff --git a/third_party/xla/workspace2.bzl b/third_party/xla/workspace2.bzl
index 345f1931c68e47..cc2013365b40c8 100644
--- a/third_party/xla/workspace2.bzl
+++ b/third_party/xla/workspace2.bzl
@@ -18,6 +18,7 @@ load("//third_party/shardy:workspace.bzl", shardy = "repo")
 load("//third_party/stablehlo:workspace.bzl", stablehlo = "repo")
 load("//third_party/triton:workspace.bzl", triton = "repo")
 load("//third_party/uv:workspace.bzl", uv = "repo")
+load("//third_party/openblas:workspace.bzl", openblas = "repo")

 def _initialize_third_party():
     """ Load third party repositories.  See above load() statements.
""" @@ -31,6 +32,7 @@ def _initialize_third_party(): stablehlo() triton() uv() + openblas() # Define all external repositories required by TensorFlow def _tf_repositories(): diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc index 33fa90f7e35e9e..6cdfcfa8241c7c 100644 --- a/third_party/xla/xla/debug_options_flags.cc +++ b/third_party/xla/xla/debug_options_flags.cc @@ -103,6 +103,8 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_cpu_use_fusion_emitters(true); opts.set_xla_cpu_use_thunk_runtime(true); opts.set_xla_cpu_use_xnnpack(false); + opts.set_xla_cpu_enable_xnnpack(false); // For softmax + opts.set_xla_cpu_use_kernel_selector(false); opts.set_xla_cpu_experimental_xnn_graph_fusion_mode( DebugOptions::XNN_GRAPH_FUSION_MODE_DISABLED); opts.set_xla_cpu_parallel_codegen_split_count(32); @@ -994,6 +996,16 @@ void MakeDebugOptionsFlags(std::vector* flag_list, bool_setter_for(&DebugOptions::set_xla_cpu_use_xnnpack), debug_options->xla_cpu_use_xnnpack(), "Use XNNPACK for supported operations.")); + flag_list->push_back(tsl::Flag( + "xla_cpu_enable_xnnpack", + bool_setter_for(&DebugOptions::set_xla_cpu_enable_xnnpack), + debug_options->xla_cpu_enable_xnnpack(), + "Enable XNNPACK ops rewriter.")); + flag_list->push_back(tsl::Flag( + "xla_cpu_use_kernel_selector", + bool_setter_for(&DebugOptions::set_xla_cpu_use_kernel_selector), + debug_options->xla_cpu_use_kernel_selector() , + "Replace dot with custom call to libraries.")); flag_list->push_back(tsl::Flag( "xla_cpu_experimental_xnn_graph_fusion_mode", setter_for_xla_cpu_experimental_xnn_graph_fusion_mode, diff --git a/third_party/xla/xla/service/cpu/BUILD b/third_party/xla/xla/service/cpu/BUILD index 90388079ca2fcf..bc46d88d626fa0 100644 --- a/third_party/xla/xla/service/cpu/BUILD +++ b/third_party/xla/xla/service/cpu/BUILD @@ -76,6 +76,7 @@ filegroup( "runtime_single_threaded_matmul_s32.cc", "runtime_single_threaded_matmul_u8.cc", "runtime_topk.cc", + "xnnpack_ops.cc", # Multi-threaded support. "runtime_conv2d.cc", "runtime_conv3d.cc", @@ -88,6 +89,7 @@ filegroup( "runtime_matmul_f64.cc", "runtime_matmul_s32.cc", "runtime_fork_join.cc", + "kernel_selector.cc", "//xla/backends/cpu/runtime:runtime_srcs", #"runtime_handle_ffi_call.cc", # TODO(b/338344732): Add "runtime_handle_ffi_call.cc". ], @@ -109,6 +111,7 @@ filegroup( "runtime_single_threaded_fft.h", "runtime_single_threaded_matmul.h", "runtime_topk.h", + "xnnpack_ops.h", # Multi-threaded support. 
"runtime_conv2d.h", "runtime_conv3d.h", @@ -116,6 +119,7 @@ filegroup( "runtime_fork_join.h", "runtime_lightweight_check.h", "runtime_matmul.h", + "kernel_selector.h", "//xla/backends/cpu/runtime:runtime_hdrs", #"runtime_handle_ffi_call.h", # TODO(b/338344732): Add "runtime_handle_ffi_call.h" ], @@ -193,7 +197,11 @@ cc_library( name = "cpu_compiler_pure", srcs = ["cpu_compiler.cc"], hdrs = ["cpu_compiler.h"], - copts = tsl_copts(), + copts = tsl_copts() + select({ + ":enable_blas_mlir": ["-DENABLE_BLAS_MLIR"], + ":disable_blas_mlir": [], + "//conditions:default": [], + }), deps = [ ":buffer_info_util", ":conv_canonicalization", @@ -218,6 +226,8 @@ cc_library( ":small_while_loop_hoisting_pass", ":thunk_emitter", ":xla_framework", + ":xnnpack_ops_rewriter", + ":kernel_selector_ops_rewriter", "//xla:cpu_function_runtime", "//xla:debug_options_flags", "//xla:literal", @@ -417,7 +427,21 @@ cc_library( "@llvm-project//llvm:SystemZCodeGen", # fixdeps: keep ]) + if_llvm_x86_available([ "@llvm-project//llvm:X86CodeGen", # fixdeps: keep - ]), + ]) + select({ + ":enable_blas_mlir": [":libmlir"], + ":disable_blas_mlir": [], + "//conditions:default": [], + }), +) + +config_setting( + name = "enable_blas_mlir", + define_values = {"ENABLE_BLAS_MLIR": "true"}, +) + +config_setting( + name = "disable_blas_mlir", + define_values = {"ENABLE_BLAS_MLIR": "false"}, ) cc_library( @@ -592,7 +616,11 @@ cc_library( "windows_compatibility.h", ], hdrs = ["runtime_symbol_generator.h"], - copts = if_enable_acl(["-DXLA_CPU_USE_ACL=1"]) + tsl_copts(), + copts = if_enable_acl(["-DXLA_CPU_USE_ACL=1"]) + tsl_copts() + select({ + ":enable_blas_mlir": ["-DENABLE_BLAS_MLIR"], + ":disable_blas_mlir": [], + "//conditions:default": [], + }), deps = [ ":cpu_runtime", ":onednn_convolution", @@ -617,6 +645,8 @@ cc_library( ":runtime_single_threaded_fft", ":runtime_single_threaded_matmul", ":runtime_topk", + ":xnnpack_ops", + ":kernel_selector", "//xla/service:custom_call_target_registry", "@com_google_absl//absl/functional:any_invocable", "@com_google_absl//absl/strings:string_view", @@ -1102,7 +1132,11 @@ cc_library( "cpu_runtime.h", "xfeed_manager.h", ], - copts = runtime_copts(), + copts = runtime_copts() + select({ + ":enable_blas_mlir": ["-DENABLE_BLAS_MLIR"], + ":disable_blas_mlir": [], + "//conditions:default": [], + }), deps = [ ":cpu_executable_run_options", "//xla:executable_run_options", @@ -2187,3 +2221,74 @@ xla_cc_test( "@local_tsl//tsl/platform:test", ], ) + +cc_library( + name = "xnnpack_ops_rewriter", + srcs = ["xnnpack_ops_rewriter.cc"], + hdrs = [ + "xnnpack_ops_rewriter.h", + "xnnpack_pattern_utils.h", + ], + copts = ["-O3"], + visibility = ["//visibility:public"], + deps = [ + "//xla/hlo/ir:hlo", + "//xla:literal_comparison", + "//xla:literal_util", + "//xla:status_macros", + "//xla/hlo/pass:hlo_pass", + "//xla/service:pattern_matcher", + ], +) + +cc_library( + name = "xnnpack_ops", + srcs = ["xnnpack_ops.cc"], + hdrs = ["xnnpack_ops.h"], + copts = ["-O3"], + visibility = ["//visibility:public"], + deps = [ + "@XNNPACK", + "@com_google_absl//absl/base", + ], +) + +cc_library( + name = "kernel_selector", + srcs = ["kernel_selector.cc"], + hdrs = ["kernel_selector.h"], + copts = ["-O3"] + select({ + ":enable_blas_mlir": ["-DENABLE_BLAS_MLIR"], + ":disable_blas_mlir": [], + "//conditions:default": [], + }), + visibility = ["//visibility:public"], + deps = [ + ":runtime_lightweight_check", + "//xla:executable_run_options", + "@eigen_archive//:eigen3", + "@local_tsl//tsl/platform:blocking_counter", + 
"@openblas//:openblas", + ], +) + +cc_library( + name = "kernel_selector_ops_rewriter", + srcs = ["kernel_selector_ops_rewriter.cc"], + hdrs = ["kernel_selector_ops_rewriter.h"], + copts = ["-O3"], + visibility = ["//visibility:public"], + deps = [ + ":cpu_runtime", + "//xla/hlo/ir:hlo", + "//xla:literal_util", + "//xla/hlo/pass:hlo_pass", + ], +) + +cc_import( + name = "libmlir", + visibility = ["//visibility:public"], + shared_library = "//xla/service/libs:libblas_mlir.so", + system_provided = 0 +) diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc b/third_party/xla/xla/service/cpu/cpu_compiler.cc index 9ba0085b24d372..9f2e6f5e6210d5 100644 --- a/third_party/xla/xla/service/cpu/cpu_compiler.cc +++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc @@ -183,6 +183,8 @@ limitations under the License. #include "xla/service/cpu/runtime_symbol_generator.h" #include "xla/service/cpu/small_while_loop_hoisting_pass.h" #include "xla/service/cpu/thunk_emitter.h" +#include "xla/service/cpu/xnnpack_ops_rewriter.h" +#include "xla/service/cpu/kernel_selector_ops_rewriter.h" #include "xla/service/cpu_gpu_shape_verifier.h" #include "xla/service/dump.h" #include "xla/service/dynamic_dimension_inference.h" @@ -591,6 +593,12 @@ absl::Status CpuCompiler::RunHloPassesThroughLayoutAssn( }; pipeline.AddPass(upcaster_filter); + // For softmax, rewrite to custom calls with XNNPACK targets. + bool enable_xnnpack = + xla::GetDebugOptionsFromFlags().xla_cpu_enable_xnnpack(); + if (enable_xnnpack) + pipeline.AddPass(); + // Expand random number generation. pipeline.AddPass(); pipeline.AddPass(RandomAlgorithm::RNG_PHILOX); @@ -831,6 +839,13 @@ absl::Status CpuCompiler::RunHloPassesAfterLayoutAssn( pipeline.AddPass(); + bool use_kernel_selector = + xla::GetDebugOptionsFromFlags().xla_cpu_use_kernel_selector(); + if (use_kernel_selector) { + // This pass rewrites hlo.dot into custom calls. + pipeline.AddPass(); + } + const int max_parallelism = module->config().intra_op_parallelism_threads() > 0 ? module->config().intra_op_parallelism_threads() @@ -863,7 +878,10 @@ absl::Status CpuCompiler::RunHloPassesAfterLayoutAssn( } // Add a fusion pass now that layout assignment is done. 
-  pipeline.AddPass<CpuInstructionFusion>();
+  if (getenv("SET_CPU_INS_FUSION_NOT_DUPLICATE") != nullptr) {
+    pipeline.AddPass<CpuInstructionFusion>(/*may_duplicate=*/false);
+  } else {
+    pipeline.AddPass<CpuInstructionFusion>(/*may_duplicate=*/true);
+  }
   if (is_fusion_emitters) {
     pipeline.AddPass();
   }
diff --git a/third_party/xla/xla/service/cpu/cpu_instruction_fusion.h b/third_party/xla/xla/service/cpu/cpu_instruction_fusion.h
index 68da3fd55523df..a04432292b43f1 100644
--- a/third_party/xla/xla/service/cpu/cpu_instruction_fusion.h
+++ b/third_party/xla/xla/service/cpu/cpu_instruction_fusion.h
@@ -31,8 +31,8 @@ namespace cpu {
 class CpuInstructionFusion : public InstructionFusion {
  public:
-  CpuInstructionFusion()
-      : InstructionFusion(CpuInstructionFusion::IsExpensive) {}
+  explicit CpuInstructionFusion(bool may_duplicate)
+      : InstructionFusion(CpuInstructionFusion::IsExpensive, may_duplicate) {}
   ~CpuInstructionFusion() override = default;

   using HloPassInterface::Run;
diff --git a/third_party/xla/xla/service/cpu/cpu_runtime.cc b/third_party/xla/xla/service/cpu/cpu_runtime.cc
index 7caf9c43b1119b..5b66495798d800 100644
--- a/third_party/xla/xla/service/cpu/cpu_runtime.cc
+++ b/third_party/xla/xla/service/cpu/cpu_runtime.cc
@@ -197,6 +197,54 @@ extern const char* const kOneDnnMatMulReorderSymbolName =
     "__xla_cpu_runtime_OneDnnMatMulReorder";
 extern const char* const kHandleFfiCallSymbolName =
     "__xla_cpu_runtime_HandleFfiCall";
+extern const char* const kXnnPackSoftMaxNDSymbolName =
+    "__xla_cpu_runtime_XnnPackSoftMaxND";
+extern const char* const kArgMax3DParallelSymbolName =
+    "__xla_cpu_runtime_ArgMax3DParallel";
+extern const char* const kArgMax3DSequentialSymbolName =
+    "__xla_cpu_runtime_ArgMax3DSequential";
+extern const char* const kKernelSelectorGEMVSymbolName =
+    "__xla_cpu_runtime_KernelSelectorGEMV";
+extern const char* const kKernelSelectorGEMMSequentialSymbolName =
+    "__xla_cpu_runtime_KernelSelectorGEMMSequential";
+extern const char* const kKernelSelectorGEMMParallelSymbolName =
+    "__xla_cpu_runtime_KernelSelectorGEMMParallel";
+extern const char* const kKernelSelectorBatch3DSequentialSymbolName =
+    "__xla_cpu_runtime_KernelSelectorBatch3DSequential";
+extern const char* const kKernelSelectorBatch3DParallelSymbolName =
+    "__xla_cpu_runtime_KernelSelectorBatch3DParallel";
+#ifdef ENABLE_BLAS_MLIR
+extern const char* const kKernelSelectorGEMVMLIRSymbolName =
+    "__xla_cpu_runtime_KernelSelectorGEMVMLIR";
+#endif  // ENABLE_BLAS_MLIR
+extern const char* const kKernelSelectorBatch4DSequentialSymbolName =
+    "__xla_cpu_runtime_KernelSelectorBatch4DSequential";
+extern const char* const kKernelSelectorBatch4DParallelSymbolName =
+    "__xla_cpu_runtime_KernelSelectorBatch4DParallel";
+#ifdef ENABLE_BLAS_MLIR
+extern const char* const kKernelSelectorGEMMMLIRSymbolName =
+    "__xla_cpu_runtime_KernelSelectorGEMMMLIR";
+extern const char* const kKernelSelectorBatch3DMLIRSymbolName =
+    "__xla_cpu_runtime_KernelSelectorBatch3DMLIR";
+extern const char* const kKernelSelectorBatch4DMLIRSymbolName =
+    "__xla_cpu_runtime_KernelSelectorBatch4DMLIR";
+#endif  // ENABLE_BLAS_MLIR
+extern const char* const kKernelSelectorGEMVEmptySymbolName =
+    "__xla_cpu_runtime_KernelSelectorGEMVEmpty";
+extern const char* const kKernelSelectorGEMMEmptySymbolName =
+    "__xla_cpu_runtime_KernelSelectorGEMMEmpty";
+extern const char* const kKernelSelectorBatch3DEmptySymbolName =
+    "__xla_cpu_runtime_KernelSelectorBatch3DEmpty";
+extern const char* const kKernelSelectorBatch4DEmptySymbolName =
+    "__xla_cpu_runtime_KernelSelectorBatch4DEmpty";
+extern const char* const kArgMax3DEmptySymbolName =
+    "__xla_cpu_runtime_ArgMax3DEmpty";
"__xla_cpu_runtime_ArgMax3DEmpty"; +extern const char* const kKernelSelectorOperationGEMV = "GEMV"; +extern const char* const kKernelSelectorOperationGEMM = "GEMM"; +extern const char* const kKernelSelectorOperationBATCH3D = "BATCH3D"; +extern const char* const kKernelSelectorOperationBATCH4D = "BATCH4D"; +extern const char* const kKernelSelectorOperationARGMAX = "ARGMAX"; +extern const char* const kCustomCallKernelSelector = "KernelSelector"; namespace { diff --git a/third_party/xla/xla/service/cpu/cpu_runtime.h b/third_party/xla/xla/service/cpu/cpu_runtime.h index 71e27ea600ee28..4469a468a2ff5c 100644 --- a/third_party/xla/xla/service/cpu/cpu_runtime.h +++ b/third_party/xla/xla/service/cpu/cpu_runtime.h @@ -97,6 +97,36 @@ extern const char* const kOneDnnLayerNormSymbolName; extern const char* const kOneDnnConvolutionSymbolName; extern const char* const kOneDnnMatMulReorderSymbolName; extern const char* const kHandleFfiCallSymbolName; +extern const char* const kXnnPackSoftMaxNDSymbolName; +extern const char* const kArgMax3DParallelSymbolName; +extern const char* const kArgMax3DSequentialSymbolName; +extern const char* const kKernelSelectorGEMVSymbolName; +extern const char* const kKernelSelectorGEMMSequentialSymbolName; +extern const char* const kKernelSelectorGEMMParallelSymbolName; +extern const char* const kKernelSelectorBatch3DSequentialSymbolName; +extern const char* const kKernelSelectorBatch3DParallelSymbolName; +extern const char* const kKernelSelectorBatch4DSequentialSymbolName; +extern const char* const kKernelSelectorBatch4DParallelSymbolName; +#ifdef ENABLE_BLAS_MLIR +extern const char* const kKernelSelectorGEMVMLIRSymbolName; +extern const char* const kKernelSelectorGEMMMLIRSymbolName; +extern const char* const kKernelSelectorBatch3DMLIRSymbolName; +extern const char* const kKernelSelectorBatch4DMLIRSymbolName; +#endif // ENABLE_BLAS_MLIR +extern const char* const kKernelSelectorGEMVEmptySymbolName; +extern const char* const kKernelSelectorGEMMEmptySymbolName; +extern const char* const kKernelSelectorBatch3DEmptySymbolName; +extern const char* const kKernelSelectorBatch4DEmptySymbolName; +extern const char* const kArgMax3DEmptySymbolName; + +// Kernel selector operation names. +extern const char* const kKernelSelectorOperationGEMV; +extern const char* const kKernelSelectorOperationGEMM; +extern const char* const kKernelSelectorOperationBATCH3D; +extern const char* const kKernelSelectorOperationBATCH4D; +extern const char* const kKernelSelectorOperationARGMAX; + +extern const char* const kCustomCallKernelSelector; // All symbol names for XLA CPU runtime functions need to start with this // prefix. 
diff --git a/third_party/xla/xla/service/cpu/ir_emitter.cc b/third_party/xla/xla/service/cpu/ir_emitter.cc
index feca6552d243f8..f99308bcd6104f 100644
--- a/third_party/xla/xla/service/cpu/ir_emitter.cc
+++ b/third_party/xla/xla/service/cpu/ir_emitter.cc
@@ -2463,6 +2463,217 @@ absl::Status IrEmitter::HandleTopK(HloInstruction* hlo) {
   return absl::OkStatus();
 }

+absl::Status IrEmitter::HandleXnnPackSoftMax(HloInstruction* hlo) {
+  const HloInstruction* input = hlo->operand(0);
+  Shape shape = input->shape();
+
+  TF_RETURN_IF_ERROR(EmitTargetAddressForOp(hlo));
+  TF_RET_CHECK(input->shape().element_type() == F32);
+  TF_RET_CHECK(shape.dimensions().size() >= 2);
+
+  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice input_values_slice,
+                      assignment_.GetUniqueSlice(hlo->operand(0), {}));
+  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice out_values_slice,
+                      assignment_.GetUniqueSlice(hlo, {}));
+
+  llvm::Value* values_ptr = EmitBufferPointer(input_values_slice, shape);
+  llvm::Value* out_values_ptr = EmitBufferPointer(out_values_slice, shape);
+
+  // Flatten the batches into a single dimension.
+  int channels = shape.dimensions(shape.dimensions().size() - 1);
+  int batch_size = 1;
+  for (int i = 0; i < shape.dimensions().size() - 1; i++) {
+    batch_size = batch_size * shape.dimensions(i);
+  }
+
+  EmitCallToFunc(runtime::kXnnPackSoftMaxNDSymbolName,
+                 {/*run_options=*/GetExecutableRunOptionsArgument(),
+                  /*input=*/values_ptr,
+                  /*output=*/out_values_ptr,
+                  /*batch_size=*/b()->getInt64(batch_size),
+                  /*channels=*/b()->getInt64(channels)},
+                 b()->getVoidTy());
+
+  return absl::OkStatus();
+}
+
+absl::Status IrEmitter::HandleKernelSelectorArgMax(HloInstruction* hlo) {
+  OpMetadata metadata = hlo->metadata();
+
+  const HloInstruction* in1 = hlo->operand(0);
+  const HloInstruction* in2 = hlo->operand(1);
+  const HloInstruction* in3 = hlo->operand(2);
+  const HloInstruction* in4 = hlo->operand(3);
+
+  Shape shape = in1->shape();
+  TF_RET_CHECK(shape.dimensions().size() == 3);
+
+  TF_RETURN_IF_ERROR(EmitTargetAddressForOp(hlo));
+
+  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice input1_slice,
+                      assignment_.GetUniqueSlice(in1, {}));
+  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice input2_slice,
+                      assignment_.GetUniqueSlice(in2, {}));
+  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice out_values_slice,
+                      assignment_.GetUniqueSlice(hlo, {0}));
+  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice out_indices_slice,
+                      assignment_.GetUniqueSlice(hlo, {1}));
+
+  llvm::Value* values1_ptr = EmitBufferPointer(input1_slice, in1->shape());
+  llvm::Value* values2_ptr = EmitBufferPointer(input2_slice, in2->shape());
+  llvm::Value* out_values_ptr =
+      EmitBufferPointer(out_values_slice, hlo->shape().tuple_shapes(0));
+  llvm::Value* out_indices_ptr =
+      EmitBufferPointer(out_indices_slice, hlo->shape().tuple_shapes(1));
+
+  float cst1_val = in3->literal().Get<float>({});
+  llvm::Constant* cst1 = llvm::ConstantFP::get(b()->getFloatTy(), cst1_val);
+
+  EmitCallToFunc(
+      metadata.op_name(),
+      {/*run_options=*/GetExecutableRunOptionsArgument(),
+       /*B=*/b()->getInt64(shape.dimensions(0)),
+       /*M=*/b()->getInt64(shape.dimensions(1)),
+       /*N=*/b()->getInt64(shape.dimensions(2)),
+       /*invals=*/BitCast(values1_ptr, b()->getFloatTy()->getPointerTo()),
+       /*inidxs=*/BitCast(values2_ptr, b()->getInt32Ty()->getPointerTo()),
+       /*init_value=*/cst1,
+       /*init_idx=*/b()->getInt32(in4->literal().Get<int32_t>({})),
+       /*outvals=*/BitCast(out_values_ptr, b()->getFloatTy()->getPointerTo()),
+       /*outidxs=*/BitCast(out_indices_ptr,
+                           b()->getInt32Ty()->getPointerTo())},
+      b()->getVoidTy());
+
+  llvm_ir::EmitTuple(GetIrArrayFor(hlo), {out_values_ptr, out_indices_ptr},
+                     b());
+  return absl::OkStatus();
+}
+
+absl::Status IrEmitter::HandleKernelSelectorBlas(HloInstruction* custom_call) {
+  OpMetadata metadata = custom_call->metadata();
+
+  bool isGEMV = (metadata.op_type() == runtime::kKernelSelectorOperationGEMV);
+  bool isGEMM = (metadata.op_type() == runtime::kKernelSelectorOperationGEMM);
+  bool isBATCHMATMUL3D =
+      (metadata.op_type() == runtime::kKernelSelectorOperationBATCH3D);
+  bool isBATCHMATMUL4D =
+      (metadata.op_type() == runtime::kKernelSelectorOperationBATCH4D);
+  bool isBATCHMATMUL = isBATCHMATMUL3D || isBATCHMATMUL4D;
+
+  int operand = 0;
+  std::vector<llvm::Value*> arguments;
+
+  // |              arguments              |
+  // | gemm  | batch3d | batch4d | gemv    |
+  // ---------------------------------------
+  // | trA   | trA     | trA     | trA     |
+  // | trB   | trB     | trB     |         |
+  // | A     | A       | A       | A       |
+  // | B     | B       | B       | X       |
+  // |       |         | Q       |         |
+  // |       | P       | P       |         |
+  // | M     | M       | M       | M       |
+  // | N     | N       | N       | N       |
+  // | K     | K       | K       |         |
+  // | alpha |         |         | alpha   |
+  // | beta  |         |         | beta    |
+
+  arguments.push_back(/*run_options=*/GetExecutableRunOptionsArgument());
+
+  // trA
+  HloInstruction const* trA = custom_call->operand(operand++);
+  bool tranA = trA->literal().Get<bool>({});
+  arguments.push_back(b()->getInt1(tranA));
+
+  if (isGEMM || isBATCHMATMUL) {
+    // trB
+    HloInstruction const* trB = custom_call->operand(operand++);
+    bool tranB = trB->literal().Get<bool>({});
+    arguments.push_back(b()->getInt1(tranB));
+  }
+
+  // A
+  HloInstruction const* A = custom_call->operand(operand++);
+  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice a_slice,
+                      assignment_.GetUniqueSlice(A, {}));
+  llvm::Value* A_ptr = EmitBufferPointer(a_slice, A->shape());
+  arguments.push_back(A_ptr);
+
+  // B (or X in GEMV)
+  HloInstruction const* B = custom_call->operand(operand++);
+  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice b_slice,
+                      assignment_.GetUniqueSlice(B, {}));
+  llvm::Value* B_ptr = EmitBufferPointer(b_slice, B->shape());
+  arguments.push_back(B_ptr);
+
+  if (isBATCHMATMUL) {
+    // Q
+    if (isBATCHMATMUL4D) {
+      HloInstruction const* Q = custom_call->operand(operand++);
+      int q = Q->literal().Get<int32_t>({});
+      arguments.push_back(b()->getInt32(q));
+    }
+
+    // P
+    HloInstruction const* P = custom_call->operand(operand++);
+    int p = P->literal().Get<int32_t>({});
+    arguments.push_back(b()->getInt32(p));
+  }
+
+  // M
+  HloInstruction const* M = custom_call->operand(operand++);
+  int m = M->literal().Get<int32_t>({});
+  arguments.push_back(b()->getInt32(m));
+
+  // N
+  HloInstruction const* N = custom_call->operand(operand++);
+  int n = N->literal().Get<int32_t>({});
+  arguments.push_back(b()->getInt32(n));
+
+  if (isGEMM || isBATCHMATMUL) {
+    // K
+    HloInstruction const* K = custom_call->operand(operand++);
+    int k = K->literal().Get<int32_t>({});
+    arguments.push_back(b()->getInt32(k));
+  }
+
+  float beta = 0.0;
+  if (isGEMM || isGEMV) {
+    // Alpha
+    HloInstruction const* Alpha = custom_call->operand(operand++);
+    float alpha = Alpha->literal().Get<float>({});
+    llvm::Constant* alphaConst =
+        llvm::ConstantFP::get(b()->getFloatTy(), alpha);
+    arguments.push_back(alphaConst);
+
+    // Beta
+    HloInstruction const* Beta = custom_call->operand(operand++);
+    beta = Beta->literal().Get<float>({});
+    llvm::Constant* betaConst = llvm::ConstantFP::get(b()->getFloatTy(), beta);
+    arguments.push_back(betaConst);
+  }
+
+  // C (or Y in GEMV)
+  HloInstruction const* C = custom_call;
+
+  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice c_slice,
+                      assignment_.GetUniqueSlice(C, {}));
+  llvm::Value* C_ptr = EmitBufferPointer(c_slice, C->shape());
+  arguments.push_back(C_ptr);
+
+  TF_RETURN_IF_ERROR(EmitTargetAddressForOp(custom_call));
+
+  EmitCallToFunc(metadata.op_name(), arguments, b()->getVoidTy());
+
+  return absl::OkStatus();
+}
+
+absl::Status IrEmitter::HandleKernelSelector(HloInstruction* custom_call) {
+  OpMetadata metadata = custom_call->metadata();
+
+  if (metadata.op_type() == runtime::kKernelSelectorOperationARGMAX) {
+    return HandleKernelSelectorArgMax(custom_call);
+  }
+  return HandleKernelSelectorBlas(custom_call);
+}
+
 #if defined(INTEL_MKL)

 // Emits operands alloca vector for oneDNN custom calls.
@@ -2815,6 +3026,12 @@ absl::Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) {
   if (custom_call->custom_call_target() == "TopK") {
     return HandleTopK(custom_call);
   }
+  if (custom_call->custom_call_target() == "__xnnpack$softmax") {
+    return HandleXnnPackSoftMax(custom_call);
+  }
+  if (custom_call->custom_call_target() ==
+      runtime::kCustomCallKernelSelector) {
+    return HandleKernelSelector(custom_call);
+  }
 #if defined(INTEL_MKL)
   if (custom_call->custom_call_target() == "__onednn$matmul") {
     return HandleOneDnnMatMulCalls(custom_call,
diff --git a/third_party/xla/xla/service/cpu/ir_emitter.h b/third_party/xla/xla/service/cpu/ir_emitter.h
index 40f54d2f4bff97..9d668325d1618b 100644
--- a/third_party/xla/xla/service/cpu/ir_emitter.h
+++ b/third_party/xla/xla/service/cpu/ir_emitter.h
@@ -336,6 +336,10 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   absl::Status HandleTopK(HloInstruction* hlo) override;
   absl::Status HandleAllReduceSingleReplica(HloInstruction* crs);
   absl::Status HandleAllReduceMultipleReplica(HloInstruction* crs);
+  absl::Status HandleXnnPackSoftMax(HloInstruction* hlo);
+  absl::Status HandleKernelSelector(HloInstruction* hlo);
+  absl::Status HandleKernelSelectorBlas(HloInstruction* hlo);
+  absl::Status HandleKernelSelectorArgMax(HloInstruction* hlo);
 #if defined(INTEL_MKL)
   std::vector<llvm::Value*> EmitOneDnnOperandsAlloca(HloInstruction* custom_call,
                                                      llvm::Value*& args_val,
diff --git a/third_party/xla/xla/service/cpu/kernel_selector.cc b/third_party/xla/xla/service/cpu/kernel_selector.cc
new file mode 100644
index 00000000000000..0ba46ab5989c44
--- /dev/null
+++ b/third_party/xla/xla/service/cpu/kernel_selector.cc
@@ -0,0 +1,423 @@
+/* Copyright 2025 Huawei. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "kernel_selector.h"
+
+#define EIGEN_USE_THREADS
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+
+#include "tsl/platform/blocking_counter.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+#include "xla/executable_run_options.h"
+#include "xla/service/cpu/runtime_lightweight_check.h"
+
+namespace xla {
+namespace cpu {
+
+// TODO: Need to test handling trA, trB
+void __xla_cpu_runtime_KernelSelectorGEMMSequential(
+    const void* run_options_ptr, bool trA, bool trB, const float* A,
+    const float* B, int M, int N, int K, float alpha, float beta, float* C) {
+  CBLAS_LAYOUT Order = CblasRowMajor;
+  CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans;
+  CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans;
+  int lda = trA ? M : K;
+  int ldb = trB ? K : N;
+  int ldc = N;
+
+  cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C,
+              ldc);
+}
+
+// TODO: Need to test handling trA, trB
+void __xla_cpu_runtime_KernelSelectorGEMMParallel(
+    const void* run_options_ptr, bool trA, bool trB, const float* A,
+    const float* B, int M, int N, int K, float alpha, float beta, float* C) {
+  const xla::ExecutableRunOptions* run_options =
+      static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
+  XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr);
+  const Eigen::ThreadPoolDevice* thread_pool =
+      (Eigen::ThreadPoolDevice*)(run_options->intra_op_thread_pool());
+  Eigen::ThreadPoolInterface* eigen_interface_ = thread_pool->getPool();
+
+  CBLAS_LAYOUT Order = CblasRowMajor;
+  CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans;
+  CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans;
+  int lda = trA ? M : K;
+  int ldb = trB ? K : N;
+  int ldc = N;
+
+  // With beta == 0, C may be uninitialized; zero it once up front and let
+  // the per-tile GEMM calls accumulate into it (beta == 1).
+  float beta_v = beta;
+  if (beta == 0.0) {
+    beta_v = 1.0;
+    memset(C, 0, M * N * sizeof(float));
+  }
+
+  int njobs = eigen_interface_->NumThreads();
+
+  int sqrt_jobs = (int)std::sqrt(njobs);
+
+  // The counter must match the number of scheduled tiles, otherwise Wait()
+  // never returns when njobs is not a perfect square.
+  tsl::BlockingCounter bc(sqrt_jobs * sqrt_jobs);
+
+  // TODO: Look at a more flexible way to distribute computation amongst
+  // threads.
+  for (int i = 0; i < sqrt_jobs; i++) {
+    for (int j = 0; j < sqrt_jobs; j++) {
+      int M_tile = M / sqrt_jobs;
+      int N_tile = N / sqrt_jobs;
+
+      int M_start = i * M_tile;
+      int N_start = j * N_tile;
+
+      int M_len = (i == sqrt_jobs - 1) ? (M - M_start) : M_tile;
+      int N_len = (j == sqrt_jobs - 1) ? (N - N_start) : N_tile;
+
+      eigen_interface_->Schedule([=, &bc]() {
+        cblas_sgemm(Order, TransA, TransB, M_len, N_len, K, alpha,
+                    &A[M_start * lda], lda, &B[N_start], ldb, beta_v,
+                    &C[M_start * ldc + N_start], ldc);
+        bc.DecrementCount();
+      });
+    }
+  }
+  bc.Wait();
+}
+
+void __xla_cpu_runtime_KernelSelectorBatch3DSequential(
+    const void* run_options_ptr, bool trA, bool trB, const float* A,
+    const float* B, int P, int M, int N, int K, float* C) {
+  CBLAS_LAYOUT Order = CblasRowMajor;
+  CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans;
+  CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans;
+  int lda = trA ? M : K;
+  int ldb = trB ? K : N;
+  int ldc = N;
+
+  float alpha = 1.0;
+  float beta = 0.0;
+
+  for (int i = 0; i < P; ++i) {
+    cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, &A[i * M * K], lda,
+                &B[i * K * N], ldb, beta, &C[i * M * N], ldc);
+  }
+}
+
+void __xla_cpu_runtime_KernelSelectorBatch3DParallel(
+    const void* run_options_ptr, bool trA, bool trB, const float* A,
+    const float* B, int P, int M, int N, int K, float* C) {
+  const xla::ExecutableRunOptions* run_options =
+      static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
+  XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr);
+  const Eigen::ThreadPoolDevice* thread_pool =
+      (Eigen::ThreadPoolDevice*)(run_options->intra_op_thread_pool());
+  Eigen::ThreadPoolInterface* eigen_interface_ = thread_pool->getPool();
+
+  CBLAS_LAYOUT Order = CblasRowMajor;
+  CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans;
+  CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans;
+  int lda = trA ? M : K;
+  int ldb = trB ? K : N;
+  int ldc = N;
+
+  float alpha = 1.0;
+  float beta = 0.0;
+
+  int njobs = eigen_interface_->NumThreads();
+
+  int num_batches = P;
+
+  tsl::BlockingCounter bc(num_batches < njobs ? num_batches : njobs);
+
+  // Parallelize over batches.
+  int PB = num_batches / njobs;
+  int rem = num_batches % njobs;
+
+  // TODO: Need to test handling trA
+  for (int batchIdx = 0, threadIdx = 0; batchIdx < num_batches; threadIdx++) {
+    int adjPB = PB + (threadIdx < rem ? 1 : 0);
+
+    eigen_interface_->Schedule([=, &bc]() {
+      for (int i = 0; i < adjPB; i++) {
+        const float* AA = &A[(batchIdx + i) * M * K];
+        const float* BB = &B[(batchIdx + i) * K * N];
+        float* CC = &C[(batchIdx + i) * M * N];
+        cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, AA, lda, BB, ldb,
+                    beta, CC, ldc);
+      }
+      bc.DecrementCount();
+    });
+
+    batchIdx += adjPB;
+  }
+  bc.Wait();
+}
+
+void __xla_cpu_runtime_KernelSelectorGEMV(const void* run_options_ptr,
+                                          bool trA, const float* A,
+                                          const float* X, int M, int N,
+                                          float alpha, float beta, float* Y) {
+  int lda = trA ? M : N;
+  int incX = 1;
+  int incY = 1;
+  CBLAS_LAYOUT Order = CblasRowMajor;
+  CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans;
+  cblas_sgemv(Order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
+}
+
+#ifdef ENABLE_BLAS_MLIR
+void __xla_cpu_runtime_KernelSelectorGEMMMLIR(const void* run_options_ptr,
+                                              bool trA, bool trB,
+                                              const float* A, const float* B,
+                                              int M, int N, int K, float alpha,
+                                              float beta, float* C) {
+  CBLAS_LAYOUT Order = CblasRowMajor;
+  CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans;
+  CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans;
+  int lda = trA ? M : K;
+  int ldb = trB ? K : N;
+  int ldc = N;
+
+  float beta_v = beta;
+  if (beta == 0.0) {
+    beta_v = 1.0;
+    memset(C, 0, M * N * sizeof(float));
+  }
+
+  cblas_sgemm_mlir(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb,
+                   beta_v, C, ldc);
+}
+
+void __xla_cpu_runtime_KernelSelectorBatch3DMLIR(const void* run_options_ptr,
+                                                 bool trA, bool trB,
+                                                 const float* A,
+                                                 const float* B, int P, int M,
+                                                 int N, int K, float* C) {
+  CBLAS_LAYOUT Order = CblasRowMajor;
+  CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans;
+  CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans;
+  int lda = trA ? M : K;
+  int ldb = trB ? K : N;
+  int ldc = N;
+
+  cblas_sbatch_matmul_mlir(Order, TransA, TransB, P, M, N, K, A, lda, B, ldb,
+                           C, ldc);
+}
+
+void __xla_cpu_runtime_KernelSelectorBatch4DMLIR(const void* run_options_ptr,
+                                                 bool trA, bool trB,
+                                                 const float* A,
+                                                 const float* B, int Q, int P,
+                                                 int M, int N, int K,
+                                                 float* C) {
+  CBLAS_LAYOUT Order = CblasRowMajor;
+  CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans;
+  CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans;
+  int lda = trA ? M : K;
+  int ldb = trB ? K : N;
+  int ldc = N;
+
+  cblas_sbatch_matmul_4d_mlir(Order, TransA, TransB, Q, P, M, N, K, A, lda, B,
+                              ldb, C, ldc);
+}
+#endif  // ENABLE_BLAS_MLIR
+
+void __xla_cpu_runtime_KernelSelectorBatch4DSequential(
+    const void* run_options_ptr, bool trA, bool trB, const float* A,
+    const float* B, int Q, int P, int M, int N, int K, float* C) {
+  CBLAS_LAYOUT Order = CblasRowMajor;
+  CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans;
+  CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans;
+  int lda = trA ? M : K;
+  int ldb = trB ? K : N;
+  int ldc = N;
+
+  float alpha = 1.0;
+  float beta = 0.0;
+
+  for (int i = 0; i < Q * P; ++i) {
+    cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, &A[i * M * K], lda,
+                &B[i * K * N], ldb, beta, &C[i * M * N], ldc);
+  }
+}
+
+void __xla_cpu_runtime_KernelSelectorBatch4DParallel(
+    const void* run_options_ptr, bool trA, bool trB, const float* A,
+    const float* B, int Q, int P, int M, int N, int K, float* C) {
+  CBLAS_LAYOUT Order = CblasRowMajor;
+  CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans;
+  CBLAS_TRANSPOSE TransB = (trB) ? CblasTrans : CblasNoTrans;
+  int lda = trA ? M : K;
+  int ldb = trB ? K : N;
+  int ldc = N;
+
+  float alpha = 1.0;
+  float beta = 0.0;
+
+  const xla::ExecutableRunOptions* run_options =
+      static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
+  XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr);
+  const Eigen::ThreadPoolDevice* thread_pool =
+      (Eigen::ThreadPoolDevice*)(run_options->intra_op_thread_pool());
+  Eigen::ThreadPoolInterface* eigen_interface_ = thread_pool->getPool();
+
+  int njobs = eigen_interface_->NumThreads();
+
+  int num_batches = P * Q;
+
+  tsl::BlockingCounter bc(num_batches < njobs ? num_batches : njobs);
+
+  // Parallelize over batches.
+  int PB = num_batches / njobs;
+  int rem = num_batches % njobs;
+
+  // TODO: Need to test handling trA
+  for (int batchIdx = 0, threadIdx = 0; batchIdx < num_batches; threadIdx++) {
+    int adjPB = PB + (threadIdx < rem ? 1 : 0);
+
+    eigen_interface_->Schedule([=, &bc]() {
+      for (int i = 0; i < adjPB; i++) {
+        const float* AA = &A[(batchIdx + i) * M * K];
+        const float* BB = &B[(batchIdx + i) * K * N];
+        float* CC = &C[(batchIdx + i) * M * N];
+        cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, AA, lda, BB, ldb,
+                    beta, CC, ldc);
+      }
+      bc.DecrementCount();
+    });
+
+    batchIdx += adjPB;
+  }
+  bc.Wait();
+}
+
+#ifdef ENABLE_BLAS_MLIR
+void __xla_cpu_runtime_KernelSelectorGEMVMLIR(const void* run_options_ptr,
+                                              bool trA, const float* A,
+                                              const float* X, int M, int N,
+                                              float alpha, float beta,
+                                              float* Y) {
+  int lda = trA ? M : N;
+  int incX = 1;
+  int incY = 1;
+  CBLAS_LAYOUT Order = CblasRowMajor;
+  CBLAS_TRANSPOSE TransA = (trA) ? CblasTrans : CblasNoTrans;
+
+  cblas_sgemv_mlir(Order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
+}
+#endif  // ENABLE_BLAS_MLIR
+
+void __xla_cpu_runtime_ArgMaxTask(size_t out_idx, int N, float* invals,
+                                  int32_t* inidxs, float init_value,
+                                  int32_t init_idx, float* outvals,
+                                  int32_t* outidxs) {
+  float maxval = init_value;
+  int32_t maxidx = init_idx;
+  size_t idx = out_idx * N;
+
+  for (int i = 0; i < N; i++) {
+    float val = invals[idx];
+    int32_t idx_val = inidxs[idx];
+
+    if (val >= maxval) {
+      maxval = val;
+      maxidx = idx_val;
+    }
+
+    idx++;
+  }
+
+  outvals[out_idx] = maxval;
+  outidxs[out_idx] = maxidx;
+}
+
+void __xla_cpu_runtime_ArgMax3DParallel(const void* run_options_ptr, int B,
+                                        int M, int N, float* invals,
+                                        int32_t* inidxs, float init_value,
+                                        int32_t init_idx, float* outvals,
+                                        int32_t* outidxs) {
+  const xla::ExecutableRunOptions* run_options =
+      static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
+  XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr);
+  const Eigen::ThreadPoolDevice* thread_pool =
+      (Eigen::ThreadPoolDevice*)(run_options->intra_op_thread_pool());
+  Eigen::ThreadPoolInterface* eigen_interface_ = thread_pool->getPool();
+
+  int BM = B * M;
+  int num_threads = eigen_interface_->NumThreads();
+  const int block_size = (BM + num_threads - 1) / num_threads;
+  tsl::BlockingCounter bc(num_threads);
+
+  for (int t = 0; t < num_threads; t++) {
+    size_t start = t * block_size;
+    size_t end = std::min<size_t>((t + 1) * block_size, BM);
+
+    eigen_interface_->ScheduleWithHint(
+        [=, &bc]() {
+          for (size_t bm = start; bm < end; bm++) {
+            __xla_cpu_runtime_ArgMaxTask(bm, N, invals, inidxs, init_value,
+                                         init_idx, outvals, outidxs);
+          }
+          bc.DecrementCount();
+        },
+        t, t + 1);
+  }
+
+  bc.Wait();
+}
+
+void __xla_cpu_runtime_ArgMax3DSequential(const void* run_options_ptr, int B,
+                                          int M, int N, float* invals,
+                                          int32_t* inidxs, float init_value,
+                                          int32_t init_idx, float* outvals,
+                                          int32_t* outidxs) {
+  // NB: run_options_ptr is ignored in the sequential version.
+  for (int b = 0; b < B; b++) {
+    for (int m = 0; m < M; m++) {
+      size_t out_idx = b * M + m;
+      __xla_cpu_runtime_ArgMaxTask(out_idx, N, invals, inidxs, init_value,
+                                   init_idx, outvals, outidxs);
+    }
+  }
+}
+
+void __xla_cpu_runtime_ArgMax3DEmpty(const void* run_options_ptr, int B, int M,
+                                     int N, float* invals, int32_t* inidxs,
+                                     float init_value, int32_t init_idx,
+                                     float* outvals, int32_t* outidxs) {}
+
+void __xla_cpu_runtime_KernelSelectorGEMVEmpty(const void* run_options_ptr,
+                                               bool trA, const float* A,
+                                               const float* X, int M, int N,
+                                               float alpha, float beta,
+                                               float* Y) {}
+
+void __xla_cpu_runtime_KernelSelectorGEMMEmpty(const void* run_options_ptr,
+                                               bool trA, bool trB,
+                                               const float* A, const float* B,
+                                               int m, int n, int k,
+                                               float alpha, float beta,
+                                               float* C) {}
+
+void __xla_cpu_runtime_KernelSelectorBatch3DEmpty(const void* run_options_ptr,
+                                                  bool trA, bool trB,
+                                                  const float* A,
+                                                  const float* B, int P, int M,
+                                                  int N, int K, float* C) {}
+
+void __xla_cpu_runtime_KernelSelectorBatch4DEmpty(
+    const void* run_options_ptr, bool trA, bool trB, const float* A,
+    const float* B, int Q, int P, int M, int N, int K, float* C) {}
+
+}  // namespace cpu
+}  // namespace xla
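To make the new entry points concrete, here is a small hypothetical driver for the two sequential variants (sizes and data are illustrative; run_options_ptr may be null because the sequential paths never read it, and linking against :kernel_selector plus OpenBLAS is assumed):

#include <cstdint>

#include "xla/service/cpu/kernel_selector.h"

int main() {
  // C = A(2x3) * B(3x2), row-major, no transposes.
  const float A[6] = {1, 2, 3, 4, 5, 6};
  const float B[6] = {7, 8, 9, 10, 11, 12};
  float C[4];
  xla::cpu::__xla_cpu_runtime_KernelSelectorGEMMSequential(
      /*run_options_ptr=*/nullptr, /*trA=*/false, /*trB=*/false, A, B,
      /*M=*/2, /*N=*/2, /*K=*/3, /*alpha=*/1.0f, /*beta=*/0.0f, C);
  // Expected: C == {58, 64, 139, 154}.

  // Arg-max over the last dimension of a 1x2x3 input.
  float vals[6] = {0.5f, 2.0f, 1.5f, 3.0f, -1.0f, 3.0f};
  int32_t idxs[6] = {0, 1, 2, 0, 1, 2};
  float out_vals[2];
  int32_t out_idxs[2];
  xla::cpu::__xla_cpu_runtime_ArgMax3DSequential(
      /*run_options_ptr=*/nullptr, /*B=*/1, /*M=*/2, /*N=*/3, vals, idxs,
      /*init_value=*/-1e30f, /*init_idx=*/0, out_vals, out_idxs);
  // Row 0: 2.0 at index 1.  Row 1: the scan uses ">=", so ties resolve to
  // the last candidate: 3.0 at index 2.
  return 0;
}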
diff --git a/third_party/xla/xla/service/cpu/kernel_selector.h b/third_party/xla/xla/service/cpu/kernel_selector.h
new file mode 100644
index 00000000000000..beb64d033f6b99
--- /dev/null
+++ b/third_party/xla/xla/service/cpu/kernel_selector.h
@@ -0,0 +1,191 @@
+/* Copyright 2025 Huawei. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef XLA_SERVICE_CPU_KERNEL_SELECTOR_H_
+#define XLA_SERVICE_CPU_KERNEL_SELECTOR_H_
+
+#include <cstdint>
+
+namespace xla {
+namespace cpu {
+
+#ifndef OPENBLAS_CONST
+#define OPENBLAS_CONST const
+#endif
+
+typedef enum CBLAS_ORDER {
+  CblasRowMajor = 101,
+  CblasColMajor = 102
+} CBLAS_ORDER;
+
+typedef enum CBLAS_TRANSPOSE {
+  CblasNoTrans = 111,
+  CblasTrans = 112,
+  CblasConjTrans = 113,
+  CblasConjNoTrans = 114
+} CBLAS_TRANSPOSE;
+
+typedef int blasint;
+typedef CBLAS_ORDER CBLAS_LAYOUT;
+
+extern "C" {
+
+// BLAS interface
+extern void cblas_sgemm(OPENBLAS_CONST enum CBLAS_ORDER Order,
+                        OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
+                        OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB,
+                        OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
+                        OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha,
+                        OPENBLAS_CONST float* A, OPENBLAS_CONST blasint lda,
+                        OPENBLAS_CONST float* B, OPENBLAS_CONST blasint ldb,
+                        OPENBLAS_CONST float beta, float* C,
+                        OPENBLAS_CONST blasint ldc);
+
+extern void cblas_sgemv(OPENBLAS_CONST enum CBLAS_ORDER order,
+                        OPENBLAS_CONST enum CBLAS_TRANSPOSE trans,
+                        OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n,
+                        OPENBLAS_CONST float alpha, OPENBLAS_CONST float* a,
+                        OPENBLAS_CONST blasint lda, OPENBLAS_CONST float* x,
+                        OPENBLAS_CONST blasint incx,
+                        OPENBLAS_CONST float beta, float* y,
+                        OPENBLAS_CONST blasint incy);
+
+#ifdef ENABLE_BLAS_MLIR
+// MLIR LIB
+extern void cblas_sbatch_matmul_mlir(
+    const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+    const enum CBLAS_TRANSPOSE TransB, const blasint P, const blasint M,
+    const blasint N, const blasint K, const float* A, const blasint lda,
+    const float* B, const blasint ldb, float* C, const blasint ldc);
+
+extern void cblas_sbatch_matmul_4d_mlir(
+    const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
+    const enum CBLAS_TRANSPOSE TransB, const blasint Q, const blasint P,
+    const blasint M, const blasint N, const blasint K, const float* A,
+    const blasint lda, const float* B, const blasint ldb, float* C,
+    const blasint ldc);
+
+extern void cblas_sgemm_mlir(const enum CBLAS_ORDER Order,
+                             const enum CBLAS_TRANSPOSE TransA,
+                             const enum CBLAS_TRANSPOSE TransB,
+                             const blasint M, const blasint N,
+                             const blasint K, const float alpha,
+                             const float* A, const blasint lda,
+                             const float* B, const blasint ldb,
+                             const float beta, float* C, const blasint ldc);
+
+extern void cblas_sgemv_mlir(const enum CBLAS_ORDER Order,
+                             const enum CBLAS_TRANSPOSE TransA,
+                             const blasint M, const blasint N,
+                             const float alpha, const float* A,
+                             const blasint lda, const float* X,
+                             const blasint incX, const float beta, float* Y,
+                             const blasint incY);
+#endif  // ENABLE_BLAS_MLIR
+}  // extern "C"
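// Note on conventions (informative; mirrors OpenBLAS's cblas.h rather than
// adding API surface): with CblasRowMajor the leading dimension is the row
// stride.  That is why the wrappers in kernel_selector.cc pick lda = K for
// an untransposed MxK A (and lda = M when trA, since A is then stored KxM),
// ldb = N for an untransposed KxN B, and ldc = N for the MxN result C.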
+
+void __xla_cpu_runtime_KernelSelectorGEMMSequential(
+    const void* run_options_ptr, bool trA, bool trB, const float* A,
+    const float* B, int M, int N, int K, float alpha, float beta, float* C);
+
+void __xla_cpu_runtime_KernelSelectorGEMMParallel(
+    const void* run_options_ptr, bool trA, bool trB, const float* A,
+    const float* B, int m, int n, int k, float alpha, float beta, float* C);
+
+void __xla_cpu_runtime_KernelSelectorBatch3DSequential(
+    const void* run_options_ptr, bool trA, bool trB, const float* A,
+    const float* B, int P, int M, int N, int K, float* C);
+
+void __xla_cpu_runtime_KernelSelectorBatch3DParallel(
+    const void* run_options_ptr, bool trA, bool trB, const float* A,
+    const float* B, int P, int M, int N, int K, float* C);
+
+void __xla_cpu_runtime_KernelSelectorBatch4DSequential(
+    const void* run_options_ptr, bool trA, bool trB, const float* A,
+    const float* B, int Q, int P, int M, int N, int K, float* C);
+
+void __xla_cpu_runtime_KernelSelectorBatch4DParallel(
+    const void* run_options_ptr, bool trA, bool trB, const float* A,
+    const float* B, int Q, int P, int M, int N, int K, float* C);
+
+void __xla_cpu_runtime_KernelSelectorGEMV(const void* run_options_ptr,
+                                          bool trA, const float* A,
+                                          const float* X, int M, int N,
+                                          float alpha, float beta, float* Y);
+
+#ifdef ENABLE_BLAS_MLIR
+void __xla_cpu_runtime_KernelSelectorGEMMMLIR(const void* run_options_ptr,
+                                              bool trA, bool trB,
+                                              const float* A, const float* B,
+                                              int m, int n, int k, float alpha,
+                                              float beta, float* C);
+
+void __xla_cpu_runtime_KernelSelectorBatch3DMLIR(const void* run_options_ptr,
+                                                 bool trA, bool trB,
+                                                 const float* A,
+                                                 const float* B, int P, int M,
+                                                 int N, int K, float* C);
+
+void __xla_cpu_runtime_KernelSelectorBatch4DMLIR(const void* run_options_ptr,
+                                                 bool trA, bool trB,
+                                                 const float* A,
+                                                 const float* B, int Q, int P,
+                                                 int M, int N, int K,
+                                                 float* C);
+
+void __xla_cpu_runtime_KernelSelectorGEMVMLIR(const void* run_options_ptr,
+                                              bool trA, const float* A,
+                                              const float* X, int M, int N,
+                                              float alpha, float beta,
+                                              float* Y);
+#endif  // ENABLE_BLAS_MLIR
+
+void __xla_cpu_runtime_ArgMax3DParallel(const void* run_options_ptr, int B,
+                                        int M, int N, float* invals,
+                                        int32_t* inidxs, float init_value,
+                                        int32_t init_idx, float* outvals,
+                                        int32_t* outidxs);
+
+void __xla_cpu_runtime_ArgMax3DSequential(const void* run_options_ptr, int B,
+                                          int M, int N, float* invals,
+                                          int32_t* inidxs, float init_value,
+                                          int32_t init_idx, float* outvals,
+                                          int32_t* outidxs);
+
+void __xla_cpu_runtime_ArgMax3DEmpty(const void* run_options_ptr, int B, int M,
+                                     int N, float* invals, int32_t* inidxs,
+                                     float init_value, int32_t init_idx,
+                                     float* outvals, int32_t* outidxs);
+
+void __xla_cpu_runtime_KernelSelectorGEMVEmpty(const void* run_options_ptr,
+                                               bool trA, const float* A,
+                                               const float* X, int M, int N,
+                                               float alpha, float beta,
+                                               float* Y);
+
+void __xla_cpu_runtime_KernelSelectorGEMMEmpty(const void* run_options_ptr,
+                                               bool trA, bool trB,
+                                               const float* A, const float* B,
+                                               int m, int n, int k,
+                                               float alpha, float beta,
+                                               float* C);
+
+void __xla_cpu_runtime_KernelSelectorBatch3DEmpty(const void* run_options_ptr,
+                                                  bool trA, bool trB,
+                                                  const float* A,
+                                                  const float* B, int P, int M,
+                                                  int N, int K, float* C);
+
+void __xla_cpu_runtime_KernelSelectorBatch4DEmpty(
+    const void* run_options_ptr, bool trA, bool trB, const float* A,
+    const float* B, int Q, int P, int M, int N, int K, float* C);
+
+}  // namespace cpu
+}  // namespace xla
+
+#endif  // XLA_SERVICE_CPU_KERNEL_SELECTOR_H_
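The *Parallel entry points require run options carrying an intra-op Eigen thread pool. A minimal sketch of wiring one up for direct calls (the pool size and helper are illustrative):

#define EIGEN_USE_THREADS
#include "unsupported/Eigen/CXX11/Tensor"
#include "xla/executable_run_options.h"

// Hypothetical setup for exercising the parallel kernels outside XLA.
Eigen::ThreadPool pool(/*num_threads=*/8);
Eigen::ThreadPoolDevice device(&pool, pool.NumThreads());

xla::ExecutableRunOptions MakeRunOptions() {
  xla::ExecutableRunOptions run_options;
  run_options.set_intra_op_thread_pool(&device);
  // &run_options is then a valid run_options_ptr for, e.g.,
  // __xla_cpu_runtime_KernelSelectorGEMMParallel.
  return run_options;
}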
diff --git a/third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.cc b/third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.cc
new file mode 100644
index 00000000000000..79868054c13ed1
--- /dev/null
+++ b/third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.cc
@@ -0,0 +1,658 @@
+/* Copyright 2025 Huawei. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "kernel_selector_ops_rewriter.h"
+
+#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <fstream>
+#include <map>
+#include <regex>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "xla/hlo/ir/dfs_hlo_visitor_with_default.h"
+#include "xla/hlo/ir/hlo_casting_utils.h"
+#include "xla/literal_util.h"
+#include "xla/service/cpu/cpu_runtime.h"
+
+namespace xla {
+namespace cpu {
+
+// Comment this out to silence the printed information about the sizes and
+// the call selected.
+#define PRINT_DEBUG
+
+#ifdef PRINT_DEBUG
+#include <iostream>
+#define DEBUG(x) std::cerr << x << "\n";
+#else
+#define DEBUG(x) \
+  do {           \
+  } while (0);
+#endif
+
+enum Operation { NONE, GEMV, GEMM, BATCH_MATMUL_3D, BATCH_MATMUL_4D };
+enum KernelType { kGEMV, kGEMM, kBATCH3D, kBATCH4D, kARGMAX };
+
+using Range = std::pair<int, int>;
+using RangeSet = std::vector<Range>;
+
+Range maxRange = {0, INT_MAX};
+
+class IntervalMap {
+  using TypedRange = std::pair<KernelType, RangeSet>;
+  std::map<TypedRange, std::string> m_map;
+
+ public:
+  void insert(KernelType kTy, RangeSet& ranges, std::string& value) {
+    m_map[{kTy, ranges}] = value;
+  }
+
+  bool lookup(KernelType kTy, std::vector<int>& keys, std::string& outValue,
+              bool& fallback) const {
+    fallback = false;
+    for (const auto& entry : m_map) {
+      TypedRange typedRange = entry.first;
+      std::string value = entry.second;
+      if (typedRange.first != kTy) continue;
+
+      const RangeSet& ranges = typedRange.second;
+      if (ranges.size() != keys.size()) continue;
+
+      // Reset per entry so a partial match cannot leak a stale fallback flag.
+      fallback = false;
+      bool match = true;
+      for (size_t i = 0; i < ranges.size(); ++i) {
+        if (keys[i] < ranges[i].first || keys[i] > ranges[i].second) {
+          match = false;
+          break;
+        }
+        if (ranges[i] == maxRange) {
+          fallback = true;
+        }
+      }
+
+      if (match) {
+        outValue = value;
+        return true;
+      }
+    }
+    return false;
+  }
+
+  void print() const {
+    for (const auto& entry : m_map) {
+      TypedRange typedRange = entry.first;
+      std::string value = entry.second;
+      int kTy = typedRange.first;
+      const RangeSet& ranges = typedRange.second;
+
+      DEBUG("[" << kTy << "](");
+      for (const auto& range : ranges) {
+        DEBUG("[" << range.first << ":" << range.second << "] ");
+      }
+      DEBUG(") -> " << value << "\n");
+    }
+  }
+
+  void clear() { m_map.clear(); }
+};
+
+struct ParsedData {
+  std::string kernelName;
+  RangeSet sizes;
+  std::string functionName;
+  bool isValid;
+};
+
+std::map<std::string, KernelType> kernelStringToType = {{"gemv", kGEMV},
+                                                        {"gemm", kGEMM},
+                                                        {"batch3d", kBATCH3D},
+                                                        {"batch4d", kBATCH4D},
+                                                        {"argmax", kARGMAX}};
+std::map<KernelType, std::string> kernelTypeToString;  // Filled automatically.
+
+std::map<KernelType, int> kernelTypeToSizeRank = {
+    {kGEMV, 2}, {kGEMM, 3}, {kARGMAX, 3}, {kBATCH3D, 4}, {kBATCH4D, 5}};
+
+int parseInt(const std::string& str) {
+  if (str == "*") return maxRange.second;
+
+  int size = std::stoi(str);
+  if (size < 0) {
+    LOG(ERROR) << "Found invalid size: " << size;
+    return -1;
+  }
+
+  return size;
+}
+
+Range parseRange(const std::string& str) {
+  size_t colonPos = str.find(':');
+
+  if (str == "*") {
+    return maxRange;
+  }
+
+  // For non-range strings like "1" we create a range {1,1}.
+  if (colonPos == std::string::npos) {
+    int value = parseInt(str);
+    return {value, value};
+  }
+
+  auto left = str.substr(0, colonPos);
+  auto right = str.substr(colonPos + 1);
+
+  int start = parseInt(left);
+  int end = parseInt(right);
+
+  assert(start <= end);
+
+  return {start, end};
+}
+
+// Parses a line from the mapping file; lines look like
+// [kernel](size1,size2,...) -> symbol.
+ParsedData parseLine(std::string& line) {
+  // Remove all whitespace from the line first.
+  line.erase(std::remove_if(line.begin(), line.end(), ::isspace), line.end());
+  // A range looks like 23:29 or 12:*
+  std::string range = R"(\d+:(?:\d+|\*))";
+  // An element is either a number, a *, or a range.
+  std::string element = R"((?:\d+|\*|)" + range + R"())";
+  // Sizes is a list of elements in parentheses.
+  std::string sizes = R"(\(((?:)" + element + R"(,)*)" + element + R"()\))";
+  std::regex pattern(R"(^\[(.+)\])" + sizes + R"(->(.+))");
+
+  std::smatch matches;
+
+  ParsedData data;
+  data.isValid = false;
+
+  if (std::regex_match(line, matches, pattern)) {
+    data.kernelName = matches[1];
+    std::stringstream ss(matches[2]);
+    std::string token;
+
+    while (std::getline(ss, token, ',')) {
+      auto range = parseRange(token);
+      if (range.first == -1 || range.second == -1) return data;
+      data.sizes.push_back(range);
+    }
+    data.functionName = matches[3];
+    data.isValid = true;
+  } else {
+    XLA_VLOG_LINES(3, "KernelSelectorOpsRewriter::parseLine() : No match.\n");
+  }
+
+  return data;
+}
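// Informative example only: a hypothetical KERNEL_MAP_FILE that this grammar
// accepts.  Each line is [kernel](size1,...,sizeN) -> symbol, where N must
// match kernelTypeToSizeRank above, and a lone "*" expands to N wildcard
// ranges.  The symbols are the runtime entry points from kernel_selector.h:
//
//   [gemv](512, 1:1024)      -> __xla_cpu_runtime_KernelSelectorGEMV
//   [gemm](128:*, 128:*, 64) -> __xla_cpu_runtime_KernelSelectorGEMMParallel
//   [gemm](*)                -> __xla_cpu_runtime_KernelSelectorGEMMSequential
//   [batch3d](8, 64, 64, 64) -> __xla_cpu_runtime_KernelSelectorBatch3DParallel
//   [argmax](*)              -> __xla_cpu_runtime_ArgMax3DSequential
//
// Whitespace is stripped before matching, so the alignment is cosmetic.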
+
+IntervalMap sizesToSymbol;
+
+const char* kernel_map_file = std::getenv("KERNEL_MAP_FILE");
+
+void fill_map_from_file(const char* map_file, IntervalMap& map) {
+  if (!map_file) {
+    XLA_VLOG_LINES(3, "NO MAP FILE\n");
+    return;
+  }
+
+  std::ifstream file(map_file);
+  if (!file.is_open()) {
+    std::string file_name(map_file);
+    XLA_VLOG_LINES(3,
+                   "KernelSelectorOpsRewriter::fill_map_from_file() : Cannot "
+                   "open file.\n");
+    return;
+  }
+
+  // Clear the map to prevent conflicts and unexpected
+  // behaviour due to default pre-filled values.
+  map.clear();
+
+  std::string line;
+  int lineno = 1;
+  while (std::getline(file, line)) {
+    // If the file we are reading has Windows line endings, make sure
+    // we remove the `\r` before processing the regex, otherwise it will
+    // not match.
+    if (!line.empty() && line.back() == '\r') {
+      line.pop_back();
+    }
+
+    ParsedData data = parseLine(line);
+    if (!data.isValid) {
+      LOG(ERROR) << "Regex did not match on line " << lineno;
+    } else {
+      if (kernelStringToType.find(data.kernelName) ==
+          kernelStringToType.end()) {
+        LOG(ERROR) << data.kernelName << " is not a valid kernel type";
+        return;
+      }
+
+      KernelType kTy = kernelStringToType[data.kernelName];
+      int expectedRank = kernelTypeToSizeRank[kTy];
+
+      // Fallback case (i.e. lines like [gemm](*) -> symbol): store in the
+      // map the correct number of "infinite" ranges.
+      if (data.sizes.size() == 1 && data.sizes[0] == maxRange) {
+        data.sizes.assign(expectedRank, maxRange);
+      }
+
+      if ((int)data.sizes.size() != expectedRank) {
+        LOG(ERROR) << data.kernelName
+                   << " expected to have an input size of rank "
+                   << expectedRank << ", but got " << data.sizes.size()
+                   << " (line " << lineno << ")";
+      } else {
+        map.insert(kTy, data.sizes, data.functionName);
+      }
+    }
+    lineno++;
+  }
+}
+
+class KernelSelectorOpsRewriterVisitor : public DfsHloRewriteVisitor {
+ private:
+  void printDebugMessage(KernelType kTy, std::vector<int> sizes) {
+    std::string debug_msg = "{";
+    for (size_t i = 0; i < sizes.size(); ++i) {
+      debug_msg += std::to_string(sizes[i]);
+      if (i != sizes.size() - 1) {
+        debug_msg += ", ";
+      }
+    }
+    debug_msg +=
+        "} -> Is not on the map and a fallback was not specified. The " +
+        kernelTypeToString[kTy] + " will not be replaced.";
+
+    DEBUG(debug_msg);
+  }
+
+  std::string GetKernelSelectorFunction(KernelType kTy, std::vector<int> sizes,
+                                        bool& fallback) {
+    std::string fun_name;
+    // lookup() sets `fallback`; do not reset it afterwards.
+    bool found = sizesToSymbol.lookup(kTy, sizes, fun_name, fallback);
+
+    if (!found) {
+#ifdef PRINT_DEBUG
+      printDebugMessage(kTy, sizes);
+#endif
+    }
+    return fun_name;
+  }
+
+  Operation getOperation(HloInstruction* instr) {
+    if (auto* dot = DynCast<HloDotInstruction>(instr)) {
+      auto batch_dims = dot->dot_dimension_numbers().lhs_batch_dimensions();
+      auto dims = dot->shape().dimensions();
+      if (batch_dims.size() == 1) {
+        return Operation::BATCH_MATMUL_3D;
+      }
+      if (batch_dims.size() == 2) {
+        return Operation::BATCH_MATMUL_4D;
+      }
+      if (dims.size() == 1) {
+        return Operation::GEMV;
+      }
+      if (batch_dims.empty()) {
+        return Operation::GEMM;
+      }
+    }
+    return Operation::NONE;
+  }
+
+  template <typename T>
+  HloInstruction* makeConstant(HloInstruction* op, T value) {
+    auto literal = LiteralUtil::CreateR0<T>(value);
+    return op->AddInstruction(
+        HloInstruction::CreateConstant(std::move(literal)));
+  }
+
+#ifdef PRINT_DEBUG
+  std::map<std::vector<int>, std::string> AllocatedGemmSizes;
+  std::map<std::vector<int>, std::string> AllocatedGemvSizes;
+  std::map<std::vector<int>, std::string> AllocatedBatchMatmul3DSizes;
+  std::map<std::vector<int>, std::string> AllocatedBatchMatmul4DSizes;
+  std::map<std::vector<int>, std::string> AllocatedArgMax3DSizes;
+#endif
+
+ public:
+  absl::Status HandleDot(HloInstruction* dot) override {
+    Operation operation = getOperation(dot);
+    if (operation == Operation::NONE) {
+      return absl::OkStatus();
+    }
+    bool fallbackSelected;
+
+    // Collect all the operands for the CustomCall.
+    switch (operation) {
+      case GEMM: {
+        KernelType kTy = kGEMM;
+        auto dnums = dot->dot_dimension_numbers();
+        auto lhs_contracting_dims = dnums.lhs_contracting_dimensions();
+        auto rhs_contracting_dims = dnums.rhs_contracting_dimensions();
+
+        assert(lhs_contracting_dims.size() == 1);
+        assert(rhs_contracting_dims.size() == 1);
+
+        HloInstruction* trA = makeConstant(dot, lhs_contracting_dims[0] == 0);
+        HloInstruction* trB = makeConstant(dot, rhs_contracting_dims[0] == 1);
+
+        HloInstruction* alpha = makeConstant(dot, (float)1.0);
+        HloInstruction* beta = makeConstant(dot, (float)0.0);
+
+        HloInstruction* A = dot->operands()[0];
+        HloInstruction* B = dot->operands()[1];
+
+        int m = dot->shape().dimensions(0);
+        HloInstruction* M = makeConstant(dot, m);
+
+        int n = dot->shape().dimensions(1);
+        HloInstruction* N = makeConstant(dot, n);
+
+        int k = A->shape().dimensions(lhs_contracting_dims[0]);
+        HloInstruction* K = makeConstant(dot, k);
GetKernelSelectorFunction(kTy, {m, n, k}, fallbackSelected); + if (fun_name.empty()) return absl::OkStatus(); + +#ifdef PRINT_DEBUG + if (AllocatedGemmSizes.find({m, n, k}) == AllocatedGemmSizes.end()) { + AllocatedGemmSizes[{m, n, k}] = fun_name; + DEBUG("{m: " << m << ", n: " << n << ", k: " << k << "} -> " + << fun_name << (fallbackSelected ? " (fallback)" : "")); + } +#endif + + std::vector operands = {trA, trB, A, B, M, + N, K, alpha, beta}; + + HloInstruction* kernel_selector_call = + dot->AddInstruction(HloInstruction::CreateCustomCall( + dot->shape(), operands, runtime::kCustomCallKernelSelector)); + + // Add metadata + OpMetadata metadata = dot->metadata(); + metadata.set_op_name(fun_name); + metadata.set_op_type(runtime::kKernelSelectorOperationGEMM); + kernel_selector_call->set_metadata(metadata); + TF_RETURN_IF_ERROR(ReplaceInstruction(dot, kernel_selector_call)); + + break; + } + case GEMV: { + KernelType kTy = kGEMV; + auto dnums = dot->dot_dimension_numbers(); + auto lhs_contracting_dims = dnums.lhs_contracting_dimensions(); + + assert(lhs_contracting_dims.size() == 1); + + bool is_trA = lhs_contracting_dims[0] == 0; + HloInstruction* trA = makeConstant(dot, is_trA); + + HloInstruction* alpha = makeConstant(dot, (float)1.0); + HloInstruction* beta = makeConstant(dot, (float)0.0); + + HloInstruction* A = dot->operands()[0]; + HloInstruction* X = dot->operands()[1]; + + int m = A->shape().dimensions(is_trA ? 1 : 0); + HloInstruction* M = makeConstant(dot, m); + + int n = A->shape().dimensions(is_trA ? 0 : 1); + HloInstruction* N = makeConstant(dot, n); + + std::string fun_name = + GetKernelSelectorFunction(kTy, {m, n}, fallbackSelected); + if (fun_name.empty()) return absl::OkStatus(); + +#ifdef PRINT_DEBUG + if (AllocatedGemvSizes.find({m, n}) == AllocatedGemvSizes.end()) { + AllocatedGemvSizes[{m, n}] = fun_name; + DEBUG("{m: " << m << ", n: " << n << "} -> " << fun_name + << (fallbackSelected ? 
" (fallback)" : "")); + } +#endif + + std::vector operands = {trA, A, X, M, N, alpha, beta}; + + HloInstruction* kernel_selector_call = + dot->AddInstruction(HloInstruction::CreateCustomCall( + dot->shape(), operands, runtime::kCustomCallKernelSelector)); + + // Add metadata + OpMetadata metadata = dot->metadata(); + metadata.set_op_name(fun_name); + metadata.set_op_type(runtime::kKernelSelectorOperationGEMV); + kernel_selector_call->set_metadata(metadata); + TF_RETURN_IF_ERROR(ReplaceInstruction(dot, kernel_selector_call)); + + break; + } + case BATCH_MATMUL_3D: { + KernelType kTy = kBATCH3D; + auto dnums = dot->dot_dimension_numbers(); + auto lhs_contracting_dims = dnums.lhs_contracting_dimensions(); + auto rhs_contracting_dims = dnums.rhs_contracting_dimensions(); + + assert(lhs_contracting_dims.size() == 1); + assert(rhs_contracting_dims.size() == 1); + + HloInstruction* trA = makeConstant(dot, lhs_contracting_dims[0] == 1); + HloInstruction* trB = makeConstant(dot, rhs_contracting_dims[0] == 2); + + HloInstruction* A = dot->operands()[0]; + HloInstruction* B = dot->operands()[1]; + + int p = dot->shape().dimensions(0); + HloInstruction* P = makeConstant(dot, p); + + int num_batch_dims = dnums.lhs_batch_dimensions_size(); + + int m = dot->shape().dimensions(num_batch_dims); + HloInstruction* M = makeConstant(dot, m); + + int n = dot->shape().dimensions(num_batch_dims + 1); + HloInstruction* N = makeConstant(dot, n); + + int k = A->shape().dimensions(lhs_contracting_dims[0]); + HloInstruction* K = makeConstant(dot, k); + + std::string fun_name = + GetKernelSelectorFunction(kTy, {p, m, n, k}, fallbackSelected); + if (fun_name.empty()) return absl::OkStatus(); + +#ifdef PRINT_DEBUG + if (AllocatedBatchMatmul3DSizes.find({p, m, n, k}) == + AllocatedBatchMatmul3DSizes.end()) { + AllocatedBatchMatmul3DSizes[{p, m, n, k}] = fun_name; + DEBUG("{p: " << p << ", m: " << m << ", n: " << n << ", k: " << k + << "} -> " << fun_name + << (fallbackSelected ? 
" (fallback)" : "")); + } +#endif + + std::vector operands = {trA, trB, A, B, P, M, N, K}; + + HloInstruction* kernel_selector_call = + dot->AddInstruction(HloInstruction::CreateCustomCall( + dot->shape(), operands, runtime::kCustomCallKernelSelector)); + + // Add metadata + OpMetadata metadata = dot->metadata(); + metadata.set_op_name(fun_name); + metadata.set_op_type(runtime::kKernelSelectorOperationBATCH3D); + kernel_selector_call->set_metadata(metadata); + TF_RETURN_IF_ERROR(ReplaceInstruction(dot, kernel_selector_call)); + + break; + } + case BATCH_MATMUL_4D: { + KernelType kTy = kBATCH4D; + auto dnums = dot->dot_dimension_numbers(); + auto lhs_contracting_dims = dnums.lhs_contracting_dimensions(); + auto rhs_contracting_dims = dnums.rhs_contracting_dimensions(); + + assert(lhs_contracting_dims.size() == 1); + assert(rhs_contracting_dims.size() == 1); + + HloInstruction* trA = makeConstant(dot, lhs_contracting_dims[0] == 2); + HloInstruction* trB = makeConstant(dot, rhs_contracting_dims[0] == 3); + + HloInstruction* A = dot->operands()[0]; + HloInstruction* B = dot->operands()[1]; + + int q = dot->shape().dimensions(0); + HloInstruction* Q = makeConstant(dot, q); + + int p = dot->shape().dimensions(1); + HloInstruction* P = makeConstant(dot, p); + + int num_batch_dims = dnums.lhs_batch_dimensions_size(); + + int m = dot->shape().dimensions(num_batch_dims); + HloInstruction* M = makeConstant(dot, m); + + int n = dot->shape().dimensions(num_batch_dims + 1); + HloInstruction* N = makeConstant(dot, n); + + int k = A->shape().dimensions(lhs_contracting_dims[0]); + HloInstruction* K = makeConstant(dot, k); + + std::string fun_name = GetKernelSelectorFunction(kTy, {q, p, m, n, k}, fallbackSelected); + + if (fun_name.empty()) return absl::OkStatus(); + +#ifdef PRINT_DEBUG + if (AllocatedBatchMatmul4DSizes.find({q, p, m, n, k}) == + AllocatedBatchMatmul4DSizes.end()) { + AllocatedBatchMatmul4DSizes[{q, p, m, n, k}] = fun_name; + DEBUG("{q: " << q << ", p: " << p << ", m: " << m << ", n: " << n + << ", k: " << k << "} -> " << fun_name + << (fallbackSelected ? " (fallback)" : "")); + } +#endif + + std::vector operands = {trA, trB, A, B, Q, P, M, N, K}; + + HloInstruction* kernel_selector_call = + dot->AddInstruction(HloInstruction::CreateCustomCall( + dot->shape(), operands, runtime::kCustomCallKernelSelector)); + + // Add metadata + OpMetadata metadata = dot->metadata(); + metadata.set_op_name(fun_name); + metadata.set_op_type(runtime::kKernelSelectorOperationBATCH4D); + kernel_selector_call->set_metadata(metadata); + TF_RETURN_IF_ERROR(ReplaceInstruction(dot, kernel_selector_call)); + + break; + } + default: + DEBUG("No library funcion was selected."); + return absl::OkStatus(); + } + + return absl::OkStatus(); + } + + absl::Status HandleReduce(HloInstruction* reduce) override { + bool fallbackSelected; + std::string op_type = reduce->metadata().op_type(); + // TODO: Is this reliable way to check for ArgMax? + // Works for BERT but its unclear if this is the proper way. + if (op_type != "ArgMax") { + return absl::OkStatus(); + } + + auto reduceOpr = reduce->operands(); + // The ArgMax pattern we support has exactly 4 operands. + if (reduceOpr.size() != 4) { + return absl::OkStatus(); + } + + // We currently only support 3D ArgMax. 
+ auto dims = reduceOpr[0]->shape().dimensions(); + if (dims.size() != 3) { + return absl::OkStatus(); + } + + KernelType kTy = kARGMAX; + int b = dims[0]; + int m = dims[1]; + int n = dims[2]; + + std::string fun_name = GetKernelSelectorFunction(kTy, {b, m, n}, fallbackSelected); + + if (fun_name.empty()) return absl::OkStatus(); + +#ifdef PRINT_DEBUG + if (AllocatedArgMax3DSizes.find({b, m, n}) == + AllocatedArgMax3DSizes.end()) { + AllocatedArgMax3DSizes[{b, m, n}] = fun_name; + DEBUG("{b: " << b << ", m: " << m << ", n: " << n << "} -> " << fun_name + << (fallbackSelected ? " (fallback)" : "")); + } +#endif + + std::vector operands; + for (int i = 0; i < 4; i++) operands.push_back(reduceOpr[i]); + + HloInstruction* kernel_selector_call = + reduce->AddInstruction(HloInstruction::CreateCustomCall( + reduce->shape(), operands, runtime::kCustomCallKernelSelector)); + + // Add metadata + OpMetadata metadata = reduce->metadata(); + metadata.set_op_name(fun_name); + metadata.set_op_type(runtime::kKernelSelectorOperationARGMAX); + kernel_selector_call->set_metadata(metadata); + TF_RETURN_IF_ERROR(ReplaceInstruction(reduce, kernel_selector_call)); + + return absl::OkStatus(); + } +}; // namespace cpu + +absl::StatusOr KernelSelectorOpsRewriter::Run( + HloModule* module, + const absl::flat_hash_set& execution_threads) { + XLA_VLOG_LINES( + 3, "KernelSelectorOpsRewriter::Run(), before:\n" + module->ToString()); + + if (!kernel_map_file) { + LOG(INFO) << "KERNEL_MAP_FILE is not set. The kernel selector will not " + "run.\n Check xla/service/cpu/example_kernel_map.txt for an " + "example of kernel map file"; + return absl::OkStatus(); + } + + // Build the reverse map. + for (const auto& pair : kernelStringToType) { + kernelTypeToString[pair.second] = pair.first; + } + + fill_map_from_file(kernel_map_file, sizesToSymbol); + + KernelSelectorOpsRewriterVisitor visitor; + TF_ASSIGN_OR_RETURN(auto result, + visitor.RunOnModule(module, execution_threads)); + XLA_VLOG_LINES( + 3, "KernelSelectorOpsRewriter::Run(), after:\n" + module->ToString()); + return result; +} + +} // namespace cpu +} // namespace xla diff --git a/third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.h b/third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.h new file mode 100644 index 00000000000000..36714cfdf315b3 --- /dev/null +++ b/third_party/xla/xla/service/cpu/kernel_selector_ops_rewriter.h @@ -0,0 +1,42 @@ +/* Copyright 2025 Huawei. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef XLA_SERVICE_CPU_KERNEL_SELECTOR_OPS_REWRITER_H_ +#define XLA_SERVICE_CPU_KERNEL_SELECTOR_OPS_REWRITER_H_ + +#include "xla/hlo/ir/hlo_instructions.h" +#include "xla/hlo/ir/hlo_module.h" +#include "xla/hlo/pass/hlo_pass_interface.h" + +namespace xla { +namespace cpu { + +// This pass rewrites hlo.dot into custom calls. 
+class KernelSelectorOpsRewriter : public HloModulePass { + public: + absl::string_view name() const override { + return "kernel-selector-ops-rewriter"; + } + + using HloPassInterface::Run; + absl::StatusOr Run( + HloModule* module, + const absl::flat_hash_set& execution_threads) override; +}; + +} // namespace cpu +} // namespace xla + +#endif // XLA_SERVICE_CPU_KERNEL_SELECTOR_OPS_REWRITER_H_ diff --git a/third_party/xla/xla/service/cpu/runtime_symbol_generator.cc b/third_party/xla/xla/service/cpu/runtime_symbol_generator.cc index 87aca6c386751a..fd9479f35fff82 100644 --- a/third_party/xla/xla/service/cpu/runtime_symbol_generator.cc +++ b/third_party/xla/xla/service/cpu/runtime_symbol_generator.cc @@ -23,6 +23,7 @@ limitations under the License. #include #include #include +#include #include "absl/functional/any_invocable.h" #include "absl/strings/string_view.h" @@ -56,6 +57,8 @@ limitations under the License. #include "xla/service/cpu/runtime_single_threaded_matmul.h" #include "xla/service/cpu/runtime_topk.h" #include "xla/service/cpu/windows_compatibility.h" +#include "xla/service/cpu/xnnpack_ops.h" +#include "xla/service/cpu/kernel_selector.h" #include "xla/service/custom_call_target_registry.h" #include "tsl/platform/logging.h" @@ -209,6 +212,27 @@ static bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(TracingStart); REGISTER_CPU_RUNTIME_SYMBOL(TracingEnd); REGISTER_CPU_RUNTIME_SYMBOL(HandleFfiCall); + REGISTER_CPU_RUNTIME_SYMBOL(XnnPackSoftMaxND); + REGISTER_CPU_RUNTIME_SYMBOL(ArgMax3DParallel); + REGISTER_CPU_RUNTIME_SYMBOL(ArgMax3DSequential); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMMSequential); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMMParallel); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch3DSequential); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch3DParallel); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch4DSequential); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch4DParallel); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMV); +#ifdef ENABLE_BLAS_MLIR + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMMMLIR); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch3DMLIR); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch4DMLIR); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMVMLIR); +#endif // ENABLE_BLAS_MLIR + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMVEmpty); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorGEMMEmpty); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch3DEmpty); + REGISTER_CPU_RUNTIME_SYMBOL(KernelSelectorBatch4DEmpty); + REGISTER_CPU_RUNTIME_SYMBOL(ArgMax3DEmpty); #if defined(INTEL_MKL) REGISTER_CPU_RUNTIME_SYMBOL(OneDnnMatMul); REGISTER_CPU_RUNTIME_SYMBOL(OneDnnSoftmax); diff --git a/third_party/xla/xla/service/cpu/xnnpack_ops.cc b/third_party/xla/xla/service/cpu/xnnpack_ops.cc new file mode 100644 index 00000000000000..902086924f0fdf --- /dev/null +++ b/third_party/xla/xla/service/cpu/xnnpack_ops.cc @@ -0,0 +1,76 @@ +/* Original Copyright: Copyright (c) Facebook, Inc. and its affiliates. +This source code is licensed under the BSD-style license found in the +LICENSE file in the root directory of this source tree. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#define XNN_LOG_LEVEL 4 +#include +#include "xnnpack.h" +#include "absl/base/attributes.h" + +namespace xla { +namespace cpu { + +extern "C" { +ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_XnnPackSoftMaxND( + const void* run_options_ptr, void* in, void* out, int64_t batch_size, + int64_t channels) { + // NB: run_options_ptr is ignored. + float* input = (float*)in; + float* output = (float*)out; + + xnn_status status = xnn_initialize(nullptr /* allocator */); + if (status != xnn_status_success) { + std::cout << "failed to initialize XNNPACK"; + return; + } + + xnn_operator_t softmax_op = nullptr; + status = xnn_create_softmax_nc_f32(0 /* flags */, &softmax_op); + if (status != xnn_status_success || softmax_op == nullptr) { + std::cout << "failed to create SoftMax operator\n"; + return; + } + + status = xnn_reshape_softmax_nc_f32(softmax_op, channels, /* channels */ + channels /* input stride */, + channels /* output stride */, batch_size, + /*threadpool=*/nullptr); + if (status != xnn_status_success) { + std::cout << "failed to reshape SoftMax operator"; + return; + } + + status = xnn_setup_softmax_nc_f32(softmax_op, input, output); + if (status != xnn_status_success) { + std::cout << "failed to setup SoftMax operator"; + return; + } + + status = xnn_run_operator(softmax_op, /*threadpool=*/nullptr); + if (status != xnn_status_success) { + std::cout << "failed to run SoftMax operator"; + return; + } + + xnn_delete_operator(softmax_op); + + xnn_deinitialize(); +} + +} // extern "C" + +} // namespace cpu +} // namespace xla diff --git a/third_party/xla/xla/service/cpu/xnnpack_ops.h b/third_party/xla/xla/service/cpu/xnnpack_ops.h new file mode 100644 index 00000000000000..c3811f641a9f4c --- /dev/null +++ b/third_party/xla/xla/service/cpu/xnnpack_ops.h @@ -0,0 +1,36 @@ +/* Referenced & Modified External Open Source Code: +Source URL: https://github.com/openxla/xla/pull/7540/files +Original Copyright: 2023 The TensorFlow Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef XLA_SERVICE_CPU_XNNPACK_OPS_H_ +#define XLA_SERVICE_CPU_XNNPACK_OPS_H_ + +namespace xla { +namespace cpu { + +extern "C" { + +extern void __xla_cpu_runtime_XnnPackSoftMaxND(const void* run_options_ptr, + void* in, void* out, + int64_t batch_size, + int64_t channels); + +} // extern "C" + +} // namespace cpu +} // namespace xla + +#endif // XLA_SERVICE_CPU_XNNPACK_OPS_H_ diff --git a/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.cc b/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.cc new file mode 100644 index 00000000000000..4687473caf3ac7 --- /dev/null +++ b/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.cc @@ -0,0 +1,226 @@ +/* +Referenced & Modified External Open Source Code: +Original Copyright: 2023 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "xnnpack_ops_rewriter.h" + +#include "xla/hlo/ir/dfs_hlo_visitor_with_default.h" +#include "xla/literal_comparison.h" +#include "xla/literal_util.h" +#include "xnnpack_pattern_utils.h" +#include "xla/status_macros.h" + +namespace xla { +namespace cpu { + +namespace { +namespace m = match; +namespace pu = ::xla::cpu::xnnpack_pattern_utils_internal; + +bool IsNegInfConstScalar(const HloInstruction* const_instr) { + if (const_instr->opcode() != HloOpcode::kConstant) { + return false; + } + if (!ShapeUtil::IsEffectiveScalar(const_instr->shape())) { + return false; + } + auto value = LiteralUtil::GetFirstScalarLiteral(const_instr->literal()); + return literal_comparison::Equal( + value, LiteralUtil::MinValue(const_instr->shape().element_type())) + .ok(); +} + +bool IsMaxReducerComputation(const HloComputation* comp) { + if (comp->root_instruction()->opcode() != HloOpcode::kMaximum) { + return false; + } + auto max_instr = comp->root_instruction(); + const HloInstruction* p0 = comp->parameter_instruction(0); + const HloInstruction* p1 = comp->parameter_instruction(1); + const HloInstruction* max_p0 = max_instr->operand(0); + const HloInstruction* max_p1 = max_instr->operand(1); + return (max_p0 == p0 && max_p1 == p1) || (max_p1 == p0 && max_p0 == p1); +} + +// Pattern to match any of Maximum(Reduce_max(...), -inf) or Reduce_max(...). +auto MaxReduce(HloInstruction** instr) { + auto is_valid_reduce_max = [](const HloInstruction* reduce) { + HloComputation* reducer = reduce->to_apply(); + return IsMaxReducerComputation(reducer) && + (reduce->dimensions().size() == 1) && + (reduce->operand(1)->opcode() == HloOpcode::kConstant) && + IsNegInfConstScalar(reduce->operand(1)); + }; + + return m::AnyOf( + m::Maximum().WithBinaryOperandsAnyOrder( + m::Reduce(instr).WithPredicate(is_valid_reduce_max).WithOneUse(), + pu::OptionalBroadcast( + m::Constant().WithPredicate(IsNegInfConstScalar))), + m::Reduce(instr).WithPredicate(is_valid_reduce_max).WithOneUse()); +} + +// Matches the softmax pattern with divide instruction as root node. 
+// Here we pass 'instr' as root node and return the producer HloInstruction. +// Tha axis on which softmax is applied is stored in 'axis'. +std::optional MatchSoftmax(HloInstruction* instr, int* axis) { + // + // producer + // | \ + // | reduce_max or max(reduce_max) + // | | + // | reshape + // | | + // | broadcast + // | | + // | reshape + // | | + // | broadcast + // | / + // subtract + // | + // exponential + // | \ + // | Convert(optional) + // | | + // | reduce_sum + // | | + // | Convert(optional) + // | | + // | reshape + // | | + // | Convert(optional) + // | | + // | broadcast + // | | + // | reshape + // | | + // | broadcast + // | / + // divide // (instr parameter) + // + + // This matcher covers the most common SoftMax patterns we have encountered + // in real-life models. + HloInstruction* left_exponential; + HloInstruction* right_exponential; + HloInstruction* left_producer; + HloInstruction* reduce_sum; + HloInstruction* reduce_max; + HloInstruction* reduce_instr; + + // Lower diamond + if (!Match(instr, + m::Divide( + m::Exp(&left_exponential, m::Op()), + m::Broadcast(m::Reshape(m::Broadcast( + pu::OptionalConvert(m::Reshape(pu::OptionalConvert( + m::Reduce(&reduce_sum, + pu::OptionalConvert( + m::Exp(&right_exponential, m::Op())), + m::ConstantScalar(0)) + .WithPredicate([](const HloInstruction* reduce) { + HloComputation* reducer = reduce->to_apply(); + return (reducer->root_instruction()->opcode() == + HloOpcode::kAdd && + reduce->dimensions().size() == 1); + }) + .WithOneUse()))))))))) { + return std::nullopt; + } + + if (left_exponential != right_exponential || + left_exponential->user_count() != 2) { + return std::nullopt; + } + + // Upper diamond + if (!Match(left_exponential->mutable_operand(0), + m::Subtract(m::Op(&left_producer), + m::Broadcast(m::Reshape(m::Broadcast( + m::Reshape(m::Op(&reduce_instr))))) + .WithOneUse()) + .WithOneUse())) { + return std::nullopt; + } + + // Match the reduce max. 
+ if (!Match(reduce_instr, MaxReduce(&reduce_max))) { + return std::nullopt; + } + + if (left_producer != reduce_max->operand(0) || + left_producer->user_count() != 2) { + return std::nullopt; + } + + if (reduce_sum->dimensions()[0] != reduce_max->dimensions()[0]) { + return std::nullopt; + } + + *axis = reduce_sum->dimensions()[0]; + + return left_producer; +} + +} // namespace + +class XnnPackOpsRewriterVisitor : public DfsHloRewriteVisitor { + public: + absl::Status HandleDivide(HloInstruction* divide_instr) override { + if (divide_instr->HasControlDependencies()) { + return absl::OkStatus(); + } + if (!pu::IsSupportedType(divide_instr->shape().element_type())) { + return absl::OkStatus(); + } + int axis = -1; + std::optional producer = MatchSoftmax(divide_instr, &axis); + if (producer == std::nullopt) { + return absl::OkStatus(); + } + + const Shape& output_shape = divide_instr->shape(); + int softmax_dims = output_shape.dimensions().size(); + if (softmax_dims < 2) { + XLA_VLOG_LINES(3, "Found SoftMax with " + std::to_string(softmax_dims) + + " dims, which is not supported\n"); + return absl::OkStatus(); + } + + HloInstruction* softmax_call = + divide_instr->AddInstruction(HloInstruction::CreateCustomCall( + output_shape, {producer.value()}, "__xnnpack$softmax")); + TF_RETURN_IF_ERROR(ReplaceInstruction(divide_instr, softmax_call)); + + return absl::OkStatus(); + } +}; + +absl::StatusOr XnnPackOpsRewriter::Run( + HloModule* module, + const absl::flat_hash_set& execution_threads) { + XLA_VLOG_LINES(3, + "XnnPackOpsRewriter::Run(), before:\n" + module->ToString()); + XnnPackOpsRewriterVisitor visitor; + TF_ASSIGN_OR_RETURN(auto result, + visitor.RunOnModule(module, execution_threads)); + XLA_VLOG_LINES(3, "XnnPackOpsRewriter::Run(), after:\n" + module->ToString()); + return result; +} + +} // namespace cpu +} // namespace xla diff --git a/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.h b/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.h new file mode 100644 index 00000000000000..f1cd18769d1704 --- /dev/null +++ b/third_party/xla/xla/service/cpu/xnnpack_ops_rewriter.h @@ -0,0 +1,43 @@ +/* Referenced & Modified External Open Source Code: +Original Copyright: 2023 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef XLA_SERVICE_CPU_XNNPACK_OPS_REWRITER_H_ +#define XLA_SERVICE_CPU_XNNPACK_OPS_REWRITER_H_ + +#include + +#include "absl/algorithm/container.h" +#include "xla/hlo/ir/hlo_instructions.h" +#include "xla/hlo/ir/hlo_module.h" +#include "xla/hlo/pass/hlo_pass_interface.h" + +namespace xla { +namespace cpu { + +class XnnPackOpsRewriter : public HloModulePass { + public: + absl::string_view name() const override { return "xnnpack-ops-rewriter"; } + + using HloPassInterface::Run; + absl::StatusOr Run( + HloModule* module, + const absl::flat_hash_set& execution_threads) override; +}; + +} // namespace cpu +} // namespace xla + +#endif // XLA_SERVICE_CPU_XNNPACK_OPS_REWRITER_H_ diff --git a/third_party/xla/xla/service/cpu/xnnpack_pattern_utils.h b/third_party/xla/xla/service/cpu/xnnpack_pattern_utils.h new file mode 100644 index 00000000000000..1ea52de3695def --- /dev/null +++ b/third_party/xla/xla/service/cpu/xnnpack_pattern_utils.h @@ -0,0 +1,65 @@ +/* +Referenced & Modified External Open Source Code: +Original Copyright: 2024 The OpenXLA Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef XLA_SERVICE_CPU_XNNPACK_PATTERN_UTILS_H_ +#define XLA_SERVICE_CPU_XNNPACK_PATTERN_UTILS_H_ + +#include "xla/hlo/ir/hlo_instruction.h" +#include "xla/hlo/ir/hlo_instructions.h" +#include "xla/service/pattern_matcher.h" + +namespace xla { +namespace cpu { + +namespace xnnpack_pattern_utils_internal { +namespace m = match; + +template +auto OptionalConvert(Pattern pattern) { + return m::AnyOf(m::Convert(pattern), std::move(pattern)); +} + +template +auto OptionalBroadcast(Pattern pattern) { + return m::AnyOf(m::Broadcast(pattern), std::move(pattern)); +} + +// Simplified from upstream XLA. 
+inline bool IsSupportedType(xla::PrimitiveType dtype) { return dtype == F32; } + +template +inline auto SupportedConvert(Pattern pattern) { + auto supported_convert = [](const HloInstruction* instr) -> bool { + return IsSupportedType(instr->shape().element_type()) && + IsSupportedType(instr->operand(0)->shape().element_type()); + }; + return m::Convert(pattern).WithPredicate(supported_convert); +} + +template +inline auto SupportedConvert(HloInstruction** convert, Pattern pattern) { + auto supported_convert = [](const HloInstruction* instr) -> bool { + return IsSupportedType(instr->shape().element_type()) && + IsSupportedType(instr->operand(0)->shape().element_type()); + }; + return m::Convert(convert, pattern).WithPredicate(supported_convert); +} +} // namespace xnnpack_pattern_utils_internal +} // namespace cpu +} // namespace xla + +#endif // XLA_SERVICE_CPU_XNNPACK_PATTERN_UTILS_H_ diff --git a/third_party/xla/xla/service/libs/BUILD b/third_party/xla/xla/service/libs/BUILD new file mode 100644 index 00000000000000..c9435fb4686cf4 --- /dev/null +++ b/third_party/xla/xla/service/libs/BUILD @@ -0,0 +1,17 @@ +cc_binary( + name = "libblas_mlir.so", + srcs = ["libblas_mlir/src/sgemm.cpp", + "libblas_mlir/src/sgemv.cpp", + "libblas_mlir/src/sbatch_matmul_3d.cpp", + "libblas_mlir/src/sbatch_matmul_4d.cpp", + "libblas_mlir/kernels/sbatch_matmul_3d_nn_mlir.s", + "libblas_mlir/kernels/sbatch_matmul_3d_nt_mlir.s", + "libblas_mlir/kernels/sbatch_matmul_4d_nn_mlir.s", + "libblas_mlir/kernels/sbatch_matmul_4d_nt_mlir.s", + "libblas_mlir/kernels/sgemm_nn_alpha1_beta1_mlir.s", + "libblas_mlir/kernels/sgemv_n_alpha1_beta1_mlir.s"], + linkshared = True, + linkstatic = False, + includes = ["libblas_mlir/include"], + visibility = ["//visibility:public"], +) diff --git a/third_party/xla/xla/service/libs/libblas_mlir/Makefile b/third_party/xla/xla/service/libs/libblas_mlir/Makefile new file mode 100644 index 00000000000000..941f9062f20211 --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/Makefile @@ -0,0 +1,52 @@ +# List of source files +SRCS := sgemm.cpp sgemv.cpp sbatch_matmul_3d.cpp sbatch_matmul_4d.cpp +KERNELS_DIR := kernels +KERNEL_SRCS := $(wildcard $(KERNELS_DIR)/*.s) + +# Source directory +SRC_DIR := src + +# Output directory +BUILD := build + +# Compiler and flags +CC := gcc +CFLAGS := -S -I include -O3 +ASFLAGS := -c -O3 +LDFLAGS := -shared + +# Full paths +SRC_PATHS := $(SRCS:%=$(SRC_DIR)/%) +ASM := $(SRCS:%.cpp=$(BUILD)/%.s) +OBJS := $(SRCS:%.cpp=$(BUILD)/%.o) +KERNEL_OBJS := $(KERNEL_SRCS:$(KERNELS_DIR)/%.s=$(BUILD)/%.o) + +# All object files +ALL_OBJS := $(OBJS) $(KERNEL_OBJS) + +# Default target +all: $(BUILD) $(ASM) $(ALL_OBJS) $(BUILD)/libblas_mlir.so + +# Create build directory +$(BUILD): + @mkdir -p $(BUILD) + +# Compile each .cpp file to .s in build/ +$(BUILD)/%.s: $(SRC_DIR)/%.cpp + @$(CC) $(CFLAGS) $< -o $@ + +# Assemble .s to .o +$(BUILD)/%.o: $(BUILD)/%.s + @$(CC) $(ASFLAGS) $< -o $@ + +# Assemble kernels .s to .o +$(BUILD)/%.o: $(KERNELS_DIR)/%.s | $(BUILD) + @$(CC) $(ASFLAGS) $< -o $@ + +# Link .o files into lib.so +$(BUILD)/libblas_mlir.so: $(ALL_OBJS) + @$(CC) $(LDFLAGS) -o $@ $^ + +# Clean target +clean: + @rm -rf $(BUILD) \ No newline at end of file diff --git a/third_party/xla/xla/service/libs/libblas_mlir/include/MemrefHelpers.h b/third_party/xla/xla/service/libs/libblas_mlir/include/MemrefHelpers.h new file mode 100644 index 00000000000000..6d4fab5e34f49c --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/include/MemrefHelpers.h @@ -0,0 +1,10 
@@ +#ifndef MEMREF_HELPERS_H_ +#define MEMREF_HELPERS_H_ + +#define Memref_1D_Args(NAME, M, S) NAME, NAME, 0, M, S +#define Memref_2D_Args(NAME, M, N, LD) NAME, NAME, 0, M, N, LD, 1 +#define Memref_3D_Args(NAME, B, M, N, LD) NAME, NAME, 0, B, M, N, M *LD, LD, 1 +#define Memref_4D_Args(NAME, B1, B2, M, N, LD) \ + NAME, NAME, 0, B1, B2, M, N, B2 *M *LD, M *LD, LD, 1 + +#endif \ No newline at end of file diff --git a/third_party/xla/xla/service/libs/libblas_mlir/include/cblas.h b/third_party/xla/xla/service/libs/libblas_mlir/include/cblas.h new file mode 100644 index 00000000000000..4f7c410ec9bb3b --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/include/cblas.h @@ -0,0 +1,11 @@ +typedef int BLASINT; + +typedef enum CBLAS_ORDER { + CblasRowMajor = 101, + CblasColMajor = 102 +} CBLAS_ORDER; + +typedef enum CBLAS_TRANSPOSE { + CblasNoTrans = 111, + CblasTrans = 112, +} CBLAS_TRANSPOSE; diff --git a/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nn_mlir.s b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nn_mlir.s new file mode 100644 index 00000000000000..38d54d0f69c54c --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nn_mlir.s @@ -0,0 +1,4079 @@ + .text + .file "LLVMDialectModule" + .globl sbatch_matmul_3d_nn_mlir // -- Begin function sbatch_matmul_3d_nn_mlir + .p2align 4 + .type sbatch_matmul_3d_nn_mlir,@function +sbatch_matmul_3d_nn_mlir: // @sbatch_matmul_3d_nn_mlir + .cfi_startproc +// %bb.0: + stp d15, d14, [sp, #-160]! // 16-byte Folded Spill + stp d13, d12, [sp, #16] // 16-byte Folded Spill + stp x29, x30, [sp, #64] // 16-byte Folded Spill + stp x28, x27, [sp, #80] // 16-byte Folded Spill + stp x26, x25, [sp, #96] // 16-byte Folded Spill + stp x24, x23, [sp, #112] // 16-byte Folded Spill + stp x22, x21, [sp, #128] // 16-byte Folded Spill + stp x20, x19, [sp, #144] // 16-byte Folded Spill + stp d11, d10, [sp, #32] // 16-byte Folded Spill + stp d9, d8, [sp, #48] // 16-byte Folded Spill + sub sp, sp, #1040 + .cfi_def_cfa_offset 1200 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -136 + .cfi_offset b13, -144 + .cfi_offset b14, -152 + .cfi_offset b15, -160 + cmp x4, #0 + ldr x13, [sp, #1248] + ldr x29, [sp, #1336] + lsl x23, x5, #6 + cinv x8, x4, lt + ldr x20, [sp, #1264] + ldr x26, [sp, #1216] + add x0, x23, #64 + add x9, x8, x8, lsr #63 + add x10, x8, #3 + mov x19, x7 + str x6, [sp, #760] // 8-byte Folded Spill + mov x21, x5 + stp x13, x3, [sp, #144] // 16-byte Folded Spill + mov x27, x2 + str x1, [sp, #720] // 8-byte Folded Spill + asr x9, x9, #1 + str x4, [sp, #744] // 8-byte Folded Spill + cinv x28, x9, lt + cmp x8, #0 + ldr x9, [sp, #1256] + csel x8, x10, x8, lt + cmp x4, #0 + ldr x10, [sp, #1328] + asr x8, x8, #2 + cinv x24, x8, lt + cmp x13, #0 + cinv x8, x13, lt + str x9, [sp, #752] // 8-byte Folded Spill + add x9, x8, x8, lsr #63 + str x10, [sp, #736] // 8-byte Folded Spill + add x10, x8, #15 + add x11, x8, #7 + add x12, x8, #3 + asr x9, x9, #1 + cinv x14, x9, lt + ldr x9, [sp, #1296] + cmp x8, #0 + str x14, [sp, #1000] // 8-byte Folded Spill + str x9, [sp, #696] // 8-byte Folded Spill + ldr x9, [sp, #1288] + str x9, [sp, #688] // 8-byte Folded 
Spill + csel x9, x10, x8, lt + csel x10, x11, x8, lt + csel x8, x12, x8, lt + cmp x13, #0 + asr x9, x9, #4 + asr x8, x8, #2 + asr x10, x10, #3 + cinv x11, x9, lt + ldr x9, [sp, #1224] + cinv x25, x8, lt + cinv x10, x10, lt + lsl x8, x25, #2 + str x11, [sp, #1016] // 8-byte Folded Spill + str x10, [sp, #1008] // 8-byte Folded Spill + str x8, [sp, #600] // 8-byte Folded Spill + lsl x8, x14, #1 + str x8, [sp, #648] // 8-byte Folded Spill + str x9, [sp, #712] // 8-byte Folded Spill + lsl x9, x11, #4 + str x9, [sp, #832] // 8-byte Folded Spill + lsl x9, x10, #3 + str x9, [sp, #768] // 8-byte Folded Spill + bl malloc + lsl x8, x24, #2 + negs x9, x21 + add x10, x19, x19, lsl #1 + mov w12, #1 // =0x1 + str x8, [sp, #1024] // 8-byte Folded Spill + lsl x8, x28, #1 + and x9, x9, #0x3 + str x27, [sp, #704] // 8-byte Folded Spill + str x8, [sp, #920] // 8-byte Folded Spill + add x8, x0, #63 + lsl x27, x27, #2 + lsl x5, x21, #2 + and x22, x8, #0xffffffffffffffc0 + and x8, x21, #0x3 + bfi x12, x24, #2, #62 + mul x17, x19, x12 + csneg x6, x8, x9, mi + lsl x8, x10, #2 + mul x18, x28, x19 + add x12, x5, x27 + lsl x15, x6, #2 + str x8, [sp, #1032] // 8-byte Folded Spill + mul x16, x24, x19 + lsl x2, x16, #4 + sub x8, x5, x15 + lsl x3, x17, #2 + stp x5, x8, [sp, #96] // 16-byte Folded Spill + lsl x4, x20, #2 + sub x8, x12, x15 + sub x12, x22, x6, lsl #6 + mov x13, x20 + str x8, [sp, #904] // 8-byte Folded Spill + add x10, x4, x20 + lsl x11, x20, #5 + lsl x20, x19, #2 + add x8, x12, x23 + lsl x9, x13, #4 + sub x28, x11, x4 + str x0, [sp, #16] // 8-byte Folded Spill + str x8, [sp, #552] // 8-byte Folded Spill + add x8, x27, x18, lsl #3 + lsl x10, x10, #2 + str x13, [sp, #728] // 8-byte Folded Spill + str xzr, [sp, #184] // 8-byte Folded Spill + str xzr, [sp, #776] // 8-byte Folded Spill + add x12, x8, x5 + str x8, [sp, #888] // 8-byte Folded Spill + sub x8, x12, x15 + ldr x12, [sp, #104] // 8-byte Folded Reload + str x26, [sp, #680] // 8-byte Folded Spill + str x4, [sp, #824] // 8-byte Folded Spill + str x8, [sp, #896] // 8-byte Folded Spill + add x8, x3, x27 + add x14, x8, x5 + str x8, [sp, #992] // 8-byte Folded Spill + sub x8, x14, x15 + sub x14, x21, x6 + str x8, [sp, #880] // 8-byte Folded Spill + add x8, x2, x27 + add x5, x8, x5 + str x8, [sp, #912] // 8-byte Folded Spill + sub x8, x5, x15 + add x15, x21, x20 + ldr x5, [sp, #720] // 8-byte Folded Reload + str x8, [sp, #872] // 8-byte Folded Spill + ldr x8, [sp, #1016] // 8-byte Folded Reload + sub x1, x15, x6 + add x15, x21, x16, lsl #2 + lsl x16, x25, #4 + str x9, [sp, #1016] // 8-byte Folded Spill + sub x15, x15, x6 + lsl x15, x15, #2 + lsl x7, x8, #6 + ldr x8, [sp, #1008] // 8-byte Folded Reload + str x15, [sp, #576] // 8-byte Folded Spill + add x15, x21, x17 + sub x15, x15, x6 + lsl x15, x15, #2 + str x15, [sp, #568] // 8-byte Folded Spill + add x15, x21, x18, lsl #1 + lsl x17, x8, #5 + ldr x8, [sp, #1000] // 8-byte Folded Reload + sub x15, x15, x6 + lsl x18, x15, #2 + lsl x15, x8, #3 + ldr x8, [sp, #712] // 8-byte Folded Reload + lsl x8, x8, #2 + add x23, x11, x8 + str x8, [sp, #864] // 8-byte Folded Spill + add x23, x26, x23 + str x23, [sp, #984] // 8-byte Folded Spill + add x23, x9, x8 + add x23, x26, x23 + str x23, [sp, #976] // 8-byte Folded Spill + add x23, x4, x8 + add x23, x26, x23 + str x23, [sp, #968] // 8-byte Folded Spill + lsl x23, x13, #3 + add x24, x23, x8 + add x24, x26, x24 + str x24, [sp, #960] // 8-byte Folded Spill + add x24, x13, x13, lsl #1 + lsl x25, x24, #3 + lsl x30, x24, #2 + add x24, x26, x8 + add x0, x24, x28 + str x0, [sp, 
#952] // 8-byte Folded Spill + add x0, x24, x25 + str x0, [sp, #944] // 8-byte Folded Spill + add x0, x24, x10 + str x0, [sp, #936] // 8-byte Folded Spill + add x0, x24, x30 + str x0, [sp, #928] // 8-byte Folded Spill + add x0, x12, #4 + ldr x12, [sp, #904] // 8-byte Folded Reload + str x0, [sp, #512] // 8-byte Folded Spill + madd x24, x13, x0, x8 + add x0, x12, #4 + str x0, [sp, #672] // 8-byte Folded Spill + mul x0, x13, x14 + add x24, x26, x24 + add x0, x8, x0, lsl #2 + lsl x8, x19, #4 + str x8, [sp, #1008] // 8-byte Folded Spill + add x12, x26, x0 + add x0, x8, x27 + add x0, x0, x5 + add x8, x0, #32 + add x0, x27, x1, lsl #2 + add x1, x26, x4 + str x8, [sp, #816] // 8-byte Folded Spill + add x0, x0, x5 + add x8, x0, #4 + str x8, [sp, #808] // 8-byte Folded Spill + add x8, x5, x3 + add x3, x26, x11 + add x11, x26, x23 + add x23, x24, x7 + str x8, [sp, #624] // 8-byte Folded Spill + add x8, x5, x2 + add x2, x26, x28 + add x0, x3, x7 + str x8, [sp, #616] // 8-byte Folded Spill + ldr x8, [sp, #888] // 8-byte Folded Reload + str x0, [sp, #504] // 8-byte Folded Spill + add x0, x2, x7 + str x0, [sp, #496] // 8-byte Folded Spill + add x13, x8, x5 + add x8, x13, #32 + add x13, x26, x10 + str x8, [sp, #640] // 8-byte Folded Spill + add x8, x18, #4 + add x18, x26, x25 + str x8, [sp, #560] // 8-byte Folded Spill + ldr x8, [sp, #896] // 8-byte Folded Reload + add x0, x18, x7 + str x0, [sp, #488] // 8-byte Folded Spill + add x0, x13, x7 + str x0, [sp, #480] // 8-byte Folded Spill + add x0, x26, x9 + add x9, x0, x7 + add x8, x5, x8 + str x9, [sp, #472] // 8-byte Folded Spill + add x9, x1, x7 + str x8, [sp, #632] // 8-byte Folded Spill + add x8, x26, x30 + str x9, [sp, #464] // 8-byte Folded Spill + add x9, x11, x7 + str x9, [sp, #456] // 8-byte Folded Spill + add x9, x8, x7 + str x9, [sp, #448] // 8-byte Folded Spill + add x9, x12, x7 + str x9, [sp, #440] // 8-byte Folded Spill + ldr x9, [sp, #880] // 8-byte Folded Reload + add x9, x5, x9 + str x9, [sp, #544] // 8-byte Folded Spill + ldr x9, [sp, #872] // 8-byte Folded Reload + add x9, x5, x9 + str x9, [sp, #656] // 8-byte Folded Spill + add x9, x3, x17 + str x9, [sp, #432] // 8-byte Folded Spill + add x9, x2, x17 + str x9, [sp, #424] // 8-byte Folded Spill + add x9, x18, x17 + str x9, [sp, #416] // 8-byte Folded Spill + add x9, x13, x17 + str x9, [sp, #408] // 8-byte Folded Spill + add x9, x0, x17 + str x9, [sp, #400] // 8-byte Folded Spill + add x9, x1, x17 + str x9, [sp, #392] // 8-byte Folded Spill + add x9, x11, x17 + str x9, [sp, #384] // 8-byte Folded Spill + add x9, x8, x17 + str x9, [sp, #376] // 8-byte Folded Spill + add x9, x24, x17 + str x9, [sp, #368] // 8-byte Folded Spill + add x9, x12, x17 + lsl x17, x21, #3 + str x9, [sp, #360] // 8-byte Folded Spill + add x9, x3, x16 + str x17, [sp, #72] // 8-byte Folded Spill + str x9, [sp, #352] // 8-byte Folded Spill + add x9, x2, x16 + str x9, [sp, #344] // 8-byte Folded Spill + add x9, x18, x16 + str x9, [sp, #336] // 8-byte Folded Spill + add x9, x13, x16 + str x9, [sp, #328] // 8-byte Folded Spill + add x9, x0, x16 + str x9, [sp, #320] // 8-byte Folded Spill + add x9, x1, x16 + str x9, [sp, #312] // 8-byte Folded Spill + add x9, x11, x16 + add x11, x11, x15 + str x9, [sp, #304] // 8-byte Folded Spill + add x9, x8, x16 + add x8, x8, x15 + str x9, [sp, #296] // 8-byte Folded Spill + add x9, x24, x16 + str x8, [sp, #216] // 8-byte Folded Spill + add x8, x24, x15 + str x9, [sp, #288] // 8-byte Folded Spill + add x9, x12, x16 + lsl x16, x21, #4 + str x8, [sp, #208] // 8-byte Folded Spill + str x9, 
[sp, #280] // 8-byte Folded Spill + lsl x9, x21, #5 + sub x7, x16, x6, lsl #4 + sub x10, x9, x6, lsl #5 + sub x6, x17, x6, lsl #3 + mov x17, x12 + add x12, x18, x15 + stp x16, x9, [sp, #80] // 16-byte Folded Spill + lsl x9, x19, #3 + add x8, x17, x15 + str x12, [sp, #256] // 8-byte Folded Spill + add x12, x13, x15 + ldr x13, [sp, #1032] // 8-byte Folded Reload + add x16, x5, x9 + add x9, x9, x27 + str x14, [sp, #1032] // 8-byte Folded Spill + str x8, [sp, #200] // 8-byte Folded Spill + ldr x8, [sp, #992] // 8-byte Folded Reload + str x12, [sp, #248] // 8-byte Folded Spill + str x16, [sp, #800] // 8-byte Folded Spill + add x16, x3, x15 + add x9, x5, x9 + ldr x3, [sp, #776] // 8-byte Folded Reload + stp x6, x10, [sp, #56] // 16-byte Folded Spill + str x16, [sp, #272] // 8-byte Folded Spill + add x16, x2, x15 + str x9, [sp, #592] // 8-byte Folded Spill + add x9, x20, x27 + mov x2, x23 + sub x23, x14, #4 + str x16, [sp, #264] // 8-byte Folded Spill + ldr x16, [sp, #184] // 8-byte Folded Reload + add x9, x5, x9 + add x12, x5, x13 + str x9, [sp, #584] // 8-byte Folded Spill + mov x9, x24 + add x8, x5, x8 + str x12, [sp, #792] // 8-byte Folded Spill + add x12, x5, x27 + str x8, [sp, #536] // 8-byte Folded Spill + ldr x8, [sp, #912] // 8-byte Folded Reload + add x13, x12, x13 + str x13, [sp, #608] // 8-byte Folded Spill + add x13, x0, x15 + str x13, [sp, #240] // 8-byte Folded Spill + add x13, x1, x15 + stp x11, x13, [sp, #224] // 16-byte Folded Spill + add x8, x5, x8 + str x8, [sp, #528] // 8-byte Folded Spill + sub x8, x14, #3 + str x8, [sp, #912] // 8-byte Folded Spill + sub x8, x14, #2 + str x8, [sp, #904] // 8-byte Folded Spill + sub x8, x14, #1 + str x8, [sp, #896] // 8-byte Folded Spill + ldr x8, [sp, #752] // 8-byte Folded Reload + lsl x11, x8, #2 + ldr x8, [sp, #760] // 8-byte Folded Reload + lsl x8, x8, #2 + stp x8, x11, [sp, #128] // 16-byte Folded Spill + add x8, x5, x20 + add x11, x10, #32 + str x8, [sp, #784] // 8-byte Folded Spill + add x8, x22, #128 + str x8, [sp, #664] // 8-byte Folded Spill + add x8, x22, #256 + str x8, [sp, #1000] // 8-byte Folded Spill + ldr x8, [sp, #552] // 8-byte Folded Reload + add x8, x8, #64 + str x8, [sp, #992] // 8-byte Folded Spill + add x8, x7, #16 + stp x8, x11, [sp, #40] // 16-byte Folded Spill + add x8, x6, #8 + stp x7, x8, [sp, #24] // 16-byte Folded Spill + b .LBB0_4 + .p2align 2 +.LBB0_1: // in Loop: Header=BB0_4 Depth=1 + str s0, [x24, x9, lsl #2] +.LBB0_2: // in Loop: Header=BB0_4 Depth=1 + ldr x0, [sp, #120] // 8-byte Folded Reload + bl free +.LBB0_3: // %.backedge53 + // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #800] // 8-byte Folded Reload + ldp x11, x10, [sp, #128] // 16-byte Folded Reload + add x8, x8, x11 + ldr x5, [sp, #872] // 8-byte Folded Reload + ldp x9, x16, [sp, #176] // 16-byte Folded Reload + ldp x3, x17, [sp, #160] // 16-byte Folded Reload + ldr x12, [sp, #880] // 8-byte Folded Reload + ldr x2, [sp, #192] // 8-byte Folded Reload + add x5, x5, x11 + add x16, x16, x10 + add x9, x9, x10 + add x17, x17, x10 + add x12, x12, x11 + add x2, x2, x10 + str x8, [sp, #800] // 8-byte Folded Spill + ldr x8, [sp, #784] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #784] // 8-byte Folded Spill + ldr x8, [sp, #792] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #792] // 8-byte Folded Spill + ldr x8, [sp, #816] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #816] // 8-byte Folded Spill + ldr x8, [sp, #808] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #808] // 8-byte Folded Spill + ldr x8, [sp, #624] 
// 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #624] // 8-byte Folded Spill + ldr x8, [sp, #616] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #616] // 8-byte Folded Spill + ldr x8, [sp, #640] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #640] // 8-byte Folded Spill + ldr x8, [sp, #632] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #632] // 8-byte Folded Spill + ldr x8, [sp, #504] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #504] // 8-byte Folded Spill + ldr x8, [sp, #496] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #496] // 8-byte Folded Spill + ldr x8, [sp, #488] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #488] // 8-byte Folded Spill + ldr x8, [sp, #480] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #480] // 8-byte Folded Spill + ldr x8, [sp, #472] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #472] // 8-byte Folded Spill + ldr x8, [sp, #464] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #464] // 8-byte Folded Spill + ldr x8, [sp, #456] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #456] // 8-byte Folded Spill + ldr x8, [sp, #448] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #448] // 8-byte Folded Spill + ldr x8, [sp, #440] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #440] // 8-byte Folded Spill + ldr x8, [sp, #544] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #544] // 8-byte Folded Spill + ldr x8, [sp, #656] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #656] // 8-byte Folded Spill + ldr x8, [sp, #432] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #432] // 8-byte Folded Spill + ldr x8, [sp, #424] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #424] // 8-byte Folded Spill + ldr x8, [sp, #416] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #416] // 8-byte Folded Spill + ldr x8, [sp, #408] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #408] // 8-byte Folded Spill + ldr x8, [sp, #400] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #400] // 8-byte Folded Spill + ldr x8, [sp, #392] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #392] // 8-byte Folded Spill + ldr x8, [sp, #384] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #384] // 8-byte Folded Spill + ldr x8, [sp, #376] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #376] // 8-byte Folded Spill + ldr x8, [sp, #368] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #368] // 8-byte Folded Spill + ldr x8, [sp, #360] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #360] // 8-byte Folded Spill + ldr x8, [sp, #352] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #352] // 8-byte Folded Spill + ldr x8, [sp, #344] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #344] // 8-byte Folded Spill + ldr x8, [sp, #336] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #336] // 8-byte Folded Spill + ldr x8, [sp, #328] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #328] // 8-byte Folded Spill + ldr x8, [sp, #320] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #320] // 8-byte Folded Spill + ldr x8, [sp, #312] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #312] // 8-byte Folded Spill + ldr x8, [sp, #304] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #304] // 8-byte Folded Spill + ldr x8, [sp, #296] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #296] // 8-byte Folded Spill + ldr x8, [sp, #288] // 8-byte Folded Reload 
+ add x8, x8, x10 + str x8, [sp, #288] // 8-byte Folded Spill + ldr x8, [sp, #280] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #280] // 8-byte Folded Spill + ldr x8, [sp, #592] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #592] // 8-byte Folded Spill + ldr x8, [sp, #584] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #584] // 8-byte Folded Spill + ldr x8, [sp, #272] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #272] // 8-byte Folded Spill + ldr x8, [sp, #264] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #264] // 8-byte Folded Spill + ldr x8, [sp, #256] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #256] // 8-byte Folded Spill + ldr x8, [sp, #248] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #248] // 8-byte Folded Spill + ldr x8, [sp, #608] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #608] // 8-byte Folded Spill + ldr x8, [sp, #240] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #240] // 8-byte Folded Spill + ldr x8, [sp, #232] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #232] // 8-byte Folded Spill + ldr x8, [sp, #224] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #224] // 8-byte Folded Spill + ldr x8, [sp, #216] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #216] // 8-byte Folded Spill + ldr x8, [sp, #208] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #208] // 8-byte Folded Spill + ldr x8, [sp, #200] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #200] // 8-byte Folded Spill + ldr x8, [sp, #536] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #536] // 8-byte Folded Spill + ldr x8, [sp, #528] // 8-byte Folded Reload + add x8, x8, x11 + str x8, [sp, #528] // 8-byte Folded Spill +.LBB0_4: // =>This Loop Header: Depth=1 + // Child Loop BB0_8 Depth 2 + // Child Loop BB0_10 Depth 3 + // Child Loop BB0_12 Depth 3 + // Child Loop BB0_15 Depth 3 + // Child Loop BB0_17 Depth 4 + // Child Loop BB0_19 Depth 4 + // Child Loop BB0_22 Depth 3 + // Child Loop BB0_24 Depth 3 + // Child Loop BB0_28 Depth 3 + // Child Loop BB0_30 Depth 3 + // Child Loop BB0_36 Depth 2 + // Child Loop BB0_39 Depth 2 + // Child Loop BB0_42 Depth 2 + // Child Loop BB0_44 Depth 3 + // Child Loop BB0_46 Depth 3 + // Child Loop BB0_49 Depth 2 + // Child Loop BB0_51 Depth 2 + // Child Loop BB0_55 Depth 2 + // Child Loop BB0_57 Depth 2 + // Child Loop BB0_61 Depth 2 + // Child Loop BB0_64 Depth 2 + // Child Loop BB0_67 Depth 2 + // Child Loop BB0_69 Depth 3 + // Child Loop BB0_71 Depth 3 + // Child Loop BB0_74 Depth 2 + // Child Loop BB0_76 Depth 2 + // Child Loop BB0_80 Depth 2 + // Child Loop BB0_82 Depth 2 + // Child Loop BB0_86 Depth 2 + // Child Loop BB0_89 Depth 2 + // Child Loop BB0_92 Depth 2 + // Child Loop BB0_94 Depth 3 + // Child Loop BB0_96 Depth 3 + // Child Loop BB0_99 Depth 2 + // Child Loop BB0_101 Depth 2 + // Child Loop BB0_105 Depth 2 + // Child Loop BB0_107 Depth 2 + // Child Loop BB0_111 Depth 2 + // Child Loop BB0_114 Depth 2 + // Child Loop BB0_117 Depth 2 + // Child Loop BB0_119 Depth 3 + // Child Loop BB0_121 Depth 3 + // Child Loop BB0_124 Depth 2 + // Child Loop BB0_126 Depth 2 + // Child Loop BB0_130 Depth 2 + // Child Loop BB0_132 Depth 2 + ldr x8, [sp, #152] // 8-byte Folded Reload + cmp x3, x8 + b.ge .LBB0_133 +// %bb.5: // in Loop: Header=BB0_4 Depth=1 + stp x16, x2, [sp, #184] // 16-byte Folded Spill + add x8, x3, #1 + ldr x2, [sp, #832] // 8-byte Folded Reload + mov x4, x16 + str x3, [sp, #776] // 8-byte Folded Spill + ldr x3, 
[AArch64 assembly listing: what reads as a compiler-generated single-precision matrix-multiply micro-kernel, basic blocks .LBB0_6 through .LBB0_95. Recoverable structure: an outer row-panel loop (.LBB0_4, depth 1) and a column-tile loop (.LBB0_8, depth 2) wrap K-reduction inner loops (.LBB0_10, .LBB0_17, .LBB0_22, .LBB0_28 and their depth-3/4 siblings) that accumulate register tiles of C with NEON fmla and per-lane broadcasts (v*.s[0..3]). Tile width steps down from 4x16 through 4x8 and 4x4 to 4x2 (d registers, .2s) for column remainders, with two-row and one-row variants for M remainders. Each variant software-prefetches the streamed panels with pldl1keep and packs B into 64-byte-aligned scratch buffers allocated with malloc and released with free (.LBB0_35/.LBB0_59, .LBB0_60/.LBB0_84, .LBB0_85).]
v2.2s, v17.2s, v6.s[1] + fmla v1.2s, v17.2s, v5.s[1] + fmla v3.2s, v17.2s, v4.s[1] + ldr d18, [x8, x10, lsl #3] + fmla v0.2s, v16.2s, v7.s[2] + fmla v2.2s, v16.2s, v6.s[2] + fmla v1.2s, v16.2s, v5.s[2] + fmla v3.2s, v16.2s, v4.s[2] + fmla v0.2s, v18.2s, v7.s[3] + fmla v2.2s, v18.2s, v6.s[3] + fmla v1.2s, v18.2s, v5.s[3] + fmla v3.2s, v18.2s, v4.s[3] + cmp x4, x21 + b.ge .LBB0_91 + .p2align 2 +.LBB0_96: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_92 Depth=2 + // => This Inner Loop Header: Depth=3 + add x0, x2, x20 + prfm pldl1keep, [x2] + ldur s4, [x2, #-4] + add x4, x4, #1 + prfm pldl1keep, [x0] + ldur s5, [x0, #-4] + add x0, x0, x20 + add x2, x2, #4 + prfm pldl1keep, [x0] + ldur s6, [x0, #-4] + add x0, x0, x20 + prfm pldl1keep, [x0] + ldur s7, [x0, #-4] + prfm pldl1keep, [x3] + ldur d16, [x3, #-8] + add x3, x3, #8 + fmla v0.2s, v16.2s, v4.s[0] + fmla v2.2s, v16.2s, v5.s[0] + fmla v1.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v7.s[0] + cmp x4, x21 + b.lt .LBB0_96 + b .LBB0_91 + .p2align 2 +.LBB0_97: // in Loop: Header=BB0_4 Depth=1 + ldr x12, [sp, #1024] // 8-byte Folded Reload + ldr x13, [sp, #920] // 8-byte Folded Reload + cmp x12, x13 + b.ge .LBB0_103 +// %bb.98: // in Loop: Header=BB0_4 Depth=1 + ldr x16, [sp, #1024] // 8-byte Folded Reload + ldr x10, [sp, #888] // 8-byte Folded Reload + mov x14, xzr + add x15, x16, #1 + madd x12, x16, x29, x9 + madd x16, x16, x19, x6 + ldr d4, [x8] + ldr x17, [sp, #624] // 8-byte Folded Reload + madd x13, x15, x29, x9 + madd x15, x15, x19, x6 + add x12, x12, x4 + lsl x16, x16, #2 + add x13, x13, x4 + add x12, x5, x12, lsl #2 + lsl x15, x15, #2 + add x13, x5, x13, lsl #2 + ldr q3, [x10, x16] + ldr q2, [x10, x15] + ldr x16, [sp, #616] // 8-byte Folded Reload + mov x15, x11 + ldr d0, [x12] + ldr d1, [x13] + cmp xzr, x23 + b.ge .LBB0_100 + .p2align 2 +.LBB0_99: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x3, x15, #16 + fmla v0.2s, v4.2s, v3.s[0] + fmla v1.2s, v4.2s, v2.s[0] + add x18, x17, x27 + prfm pldl1keep, [x3] + ldp d4, d5, [x15, #-16] + add x1, x16, x27 + add x0, x18, #32 + add x2, x1, #32 + add x14, x14, #4 + add x17, x17, #16 + add x16, x16, #16 + fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v5.2s, v3.s[2] + fmla v1.2s, v5.2s, v2.s[2] + ldp d5, d4, [x15], #32 + prfm pldl1keep, [x2] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x1, #16] + prfm pldl1keep, [x0] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x18, #16] + cmp x14, x23 + b.lt .LBB0_99 +.LBB0_100: // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #912] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[0] + fmla v1.2s, v4.2s, v2.s[0] + ldr x1, [sp, #544] // 8-byte Folded Reload + mov x14, xzr + mov x15, xzr + ldr d5, [x8, x10, lsl #3] + ldr x10, [sp, #904] // 8-byte Folded Reload + ldr d4, [x8, x10, lsl #3] + ldr x10, [sp, #896] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v3.s[1] + fmla v1.2s, v5.2s, v2.s[1] + ldr d5, [x8, x10, lsl #3] + ldr x10, [sp, #56] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[2] + fmla v1.2s, v4.2s, v2.s[2] + add x16, x8, x10 + fmla v0.2s, v5.2s, v3.s[3] + fmla v1.2s, v5.2s, v2.s[3] + ldr x10, [sp, #1032] // 8-byte Folded Reload + add x17, x10, xzr + cmp x17, x21 + b.ge .LBB0_102 + .p2align 2 +.LBB0_101: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + ldr x10, [sp, #656] // 8-byte Folded Reload + add x17, x16, x15, lsl #3 + add x18, x1, x14 + add x18, x18, #4 + add x17, x17, #8 + add x0, x10, x14 + add x14, x14, #4 + add x0, x0, #4 + prfm pldl1keep, [x0] + ldr s2, [x10, x15, lsl #2] + prfm 
pldl1keep, [x18] + ldr s3, [x1, x15, lsl #2] + prfm pldl1keep, [x17] + ldr d4, [x16, x15, lsl #3] + add x15, x15, #1 + fmla v0.2s, v4.2s, v2.s[0] + fmla v1.2s, v4.2s, v3.s[0] + ldr x10, [sp, #1032] // 8-byte Folded Reload + add x17, x10, x15 + cmp x17, x21 + b.lt .LBB0_101 +.LBB0_102: // in Loop: Header=BB0_4 Depth=1 + str d0, [x12] + str d1, [x13] +.LBB0_103: // in Loop: Header=BB0_4 Depth=1 + ldr x12, [sp, #744] // 8-byte Folded Reload + ldr x13, [sp, #920] // 8-byte Folded Reload + cmp x13, x12 + b.ge .LBB0_109 +// %bb.104: // in Loop: Header=BB0_4 Depth=1 + ldr x13, [sp, #920] // 8-byte Folded Reload + ldr d2, [x8] + mov x12, xzr + madd x9, x13, x29, x9 + madd x10, x13, x19, x6 + ldr x13, [sp, #888] // 8-byte Folded Reload + ldr x14, [sp, #632] // 8-byte Folded Reload + ldr x15, [sp, #880] // 8-byte Folded Reload + add x9, x9, x4 + lsl x10, x10, #2 + add x9, x5, x9, lsl #2 + ldr d0, [x9] + ldr q1, [x13, x10] + ldr x10, [sp, #640] // 8-byte Folded Reload + cmp xzr, x23 + b.ge .LBB0_106 + .p2align 2 +.LBB0_105: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x11, #16 + fmla v0.2s, v2.2s, v1.s[0] + add x12, x12, #4 + prfm pldl1keep, [x13] + ldp d2, d3, [x11, #-16] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v3.2s, v1.s[2] + ldp d3, d2, [x11], #32 + prfm pldl1keep, [x10] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x10, #-16] + add x10, x10, #16 + cmp x12, x23 + b.lt .LBB0_105 +.LBB0_106: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #912] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[0] + mov x10, xzr + ldr d3, [x8, x11, lsl #3] + ldr x11, [sp, #904] // 8-byte Folded Reload + fmla v0.2s, v3.2s, v1.s[1] + ldr d4, [x8, x11, lsl #3] + ldr x11, [sp, #896] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[2] + ldr d2, [x8, x11, lsl #3] + ldr x11, [sp, #56] // 8-byte Folded Reload + add x8, x8, x11 + ldr x11, [sp, #560] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[3] + ldr x12, [sp, #1032] // 8-byte Folded Reload + add x12, x12, xzr + cmp x12, x21 + b.ge .LBB0_108 + .p2align 2 +.LBB0_107: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x8, x10, lsl #3 + add x13, x15, x11 + add x11, x11, #4 + prfm pldl1keep, [x13] + ldr s1, [x14, x10, lsl #2] + add x12, x12, #8 + prfm pldl1keep, [x12] + ldr d2, [x8, x10, lsl #3] + add x10, x10, #1 + fmla v0.2s, v2.2s, v1.s[0] + ldr x12, [sp, #1032] // 8-byte Folded Reload + add x12, x12, x10 + cmp x12, x21 + b.lt .LBB0_107 +.LBB0_108: // in Loop: Header=BB0_4 Depth=1 + str d0, [x9] +.LBB0_109: // in Loop: Header=BB0_4 Depth=1 + ldr x0, [sp, #848] // 8-byte Folded Reload + bl free + ldr x8, [sp, #144] // 8-byte Folded Reload + ldr x9, [sp, #648] // 8-byte Folded Reload + cmp x9, x8 + b.ge .LBB0_3 +.LBB0_110: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #96] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #736] // 8-byte Folded Reload + ldr x17, [sp, #776] // 8-byte Folded Reload + add x10, x0, #63 + mov x12, xzr + ldr x15, [sp, #520] // 8-byte Folded Reload + ldr x16, [sp, #648] // 8-byte Folded Reload + mov x13, xzr + mul x9, x17, x8 + ldr x24, [sp, #856] // 8-byte Folded Reload + add x8, x15, x29 + ldp x1, x2, [sp, #248] // 16-byte Folded Reload + ldp x3, x4, [sp, #264] // 16-byte Folded Reload + str x0, [sp, #120] // 8-byte Folded Spill + add x11, x9, x16 + add x8, x11, x8 + add x14, x11, x29 + add x15, x11, x15 + ldr s2, [x24, x11, lsl #2] + ldr x11, [sp, #840] // 8-byte Folded Reload + ldr s0, [x24, x8, lsl #2] + and x8, x10, #0xffffffffffffffc0 + ldr x10, [sp, 
#752] // 8-byte Folded Reload + ldr s3, [x24, x14, lsl #2] + ldr s1, [x24, x15, lsl #2] + mul x10, x17, x10 + str x10, [sp, #520] // 8-byte Folded Spill + add x10, x10, x16 + ldp x15, x16, [sp, #216] // 16-byte Folded Reload + ldr s7, [x11, x10, lsl #2] + ldr x10, [sp, #760] // 8-byte Folded Reload + mul x11, x17, x10 + ldr x10, [sp, #888] // 8-byte Folded Reload + lsl x14, x11, #2 + ldp x17, x18, [sp, #232] // 16-byte Folded Reload + str x11, [sp, #776] // 8-byte Folded Spill + ldr q4, [x10, x14] + add x14, x11, x19 + lsl x14, x14, #2 + ldr q5, [x10, x14] + add x14, x11, x19, lsl #1 + lsl x14, x14, #2 + ldr q6, [x10, x14] + orr x10, x8, #0xc + str x10, [sp, #848] // 8-byte Folded Spill + .p2align 2 +.LBB0_111: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + ldr x10, [sp, #608] // 8-byte Folded Reload + ext v20.16b, v4.16b, v4.16b, #8 + cmp x13, x23 + ext v19.16b, v5.16b, v5.16b, #8 + add x5, x10, x12 + prfm pldl1keep, [x5, #16] + ldr q16, [x5] + ext v18.16b, v6.16b, v6.16b, #8 + ext v17.16b, v16.16b, v16.16b, #8 + b.ge .LBB0_113 +// %bb.112: // in Loop: Header=BB0_111 Depth=2 + ldr x10, [sp, #592] // 8-byte Folded Reload + ldr x14, [sp, #864] // 8-byte Folded Reload + fmla v2.2s, v7.2s, v4.2s + fmla v3.2s, v7.2s, v5.2s + fmla v1.2s, v7.2s, v6.2s + fmla v0.2s, v7.2s, v16.2s + add x13, x13, #4 + add x5, x10, x12 + ldr x10, [sp, #584] // 8-byte Folded Reload + add x0, x1, x14 + add x11, x2, x14 + add x24, x3, x14 + add x30, x4, x14 + add x6, x5, #32 + add x7, x10, x12 + ldr x10, [sp, #880] // 8-byte Folded Reload + add x25, x7, #32 + add x26, x10, x12 + ldr x10, [sp, #848] // 8-byte Folded Reload + add x28, x26, #32 + add x10, x10, x12 + add x12, x12, #16 + stur s7, [x10, #-12] + prfm pldl1keep, [x0] + ldr s7, [x17, x14] + fmla v2.2s, v7.2s, v4.s[1] + fmla v3.2s, v7.2s, v5.s[1] + fmla v1.2s, v7.2s, v6.s[1] + fmla v0.2s, v7.2s, v16.s[1] + stur s7, [x10, #-8] + prfm pldl1keep, [x11] + ldr s7, [x16, x14] + fmla v1.2s, v7.2s, v18.2s + stur s7, [x10, #-4] + prfm pldl1keep, [x24] + ldr s18, [x15, x14] + fmla v2.2s, v7.2s, v20.2s + fmla v3.2s, v7.2s, v19.2s + ldr x24, [sp, #856] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v17.2s + str s18, [x10] + prfm pldl1keep, [x30] + ldr s7, [x18, x14] + fmla v2.2s, v18.2s, v4.s[3] + fmla v3.2s, v18.2s, v5.s[3] + fmla v1.2s, v18.2s, v6.s[3] + prfm pldl1keep, [x28] + ldr q4, [x26, #16] + prfm pldl1keep, [x25] + ldr q5, [x7, #16] + prfm pldl1keep, [x6] + ldr x10, [sp, #1016] // 8-byte Folded Reload + ldr q6, [x5, #16] + fmla v0.2s, v18.2s, v16.s[3] + add x4, x4, x10 + add x3, x3, x10 + add x2, x2, x10 + add x1, x1, x10 + add x18, x18, x10 + add x17, x17, x10 + add x16, x16, x10 + add x15, x15, x10 + b .LBB0_111 + .p2align 2 +.LBB0_113: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #728] // 8-byte Folded Reload + ldr x13, [sp, #912] // 8-byte Folded Reload + fmla v2.2s, v7.2s, v4.2s + fmla v3.2s, v7.2s, v5.2s + ldr x15, [sp, #520] // 8-byte Folded Reload + ldr x4, [sp, #648] // 8-byte Folded Reload + fmla v1.2s, v7.2s, v6.2s + fmla v0.2s, v7.2s, v16.2s + ldr x14, [sp, #840] // 8-byte Folded Reload + str s7, [x8, x23, lsl #2] + mov x12, xzr + ldr x16, [sp, #880] // 8-byte Folded Reload + ldp x18, x17, [sp, #200] // 16-byte Folded Reload + madd x10, x13, x11, x15 + ldr x5, [sp, #776] // 8-byte Folded Reload + add x10, x10, x4 + ldr s7, [x14, x10, lsl #2] + str s7, [x8, x13, lsl #2] + ldr x13, [sp, #904] // 8-byte Folded Reload + fmla v2.2s, v7.2s, v4.s[1] + fmla v3.2s, v7.2s, v5.s[1] + fmla v1.2s, v7.2s, v6.s[1] + fmla v0.2s, v7.2s, 
v16.s[1] + madd x10, x13, x11, x15 + add x10, x10, x4 + ldr s7, [x14, x10, lsl #2] + fmla v2.2s, v7.2s, v20.2s + str s7, [x8, x13, lsl #2] + ldr x13, [sp, #896] // 8-byte Folded Reload + fmla v3.2s, v7.2s, v19.2s + fmla v1.2s, v7.2s, v18.2s + fmla v0.2s, v7.2s, v17.2s + madd x10, x13, x11, x15 + ldr x11, [sp, #512] // 8-byte Folded Reload + ldr x15, [sp, #824] // 8-byte Folded Reload + add x10, x10, x4 + ldr s7, [x14, x10, lsl #2] + fmla v2.2s, v7.2s, v4.s[3] + fmla v3.2s, v7.2s, v5.s[3] + fmla v1.2s, v7.2s, v6.s[3] + fmla v0.2s, v7.2s, v16.s[3] + str s7, [x8, x13, lsl #2] + ldr x13, [sp, #1032] // 8-byte Folded Reload + cmp x13, x21 + b.ge .LBB0_115 + .p2align 2 +.LBB0_114: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x14, x16, x11 + add x10, x17, x12 + add x11, x11, #4 + prfm pldl1keep, [x14] + ldur s4, [x14, #-4] + add x14, x14, x20 + prfm pldl1keep, [x14] + ldur s5, [x14, #-4] + add x14, x14, x20 + prfm pldl1keep, [x14] + ldur s6, [x14, #-4] + add x14, x14, x20 + prfm pldl1keep, [x14] + ldur s7, [x14, #-4] + prfm pldl1keep, [x10] + ldr s16, [x18, x12] + add x12, x12, x15 + fmla v2.2s, v16.2s, v4.2s + str s16, [x8, x13, lsl #2] + add x13, x13, #1 + fmla v3.2s, v16.2s, v5.2s + fmla v1.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + cmp x13, x21 + b.lt .LBB0_114 +.LBB0_115: // %.preheader49 + // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #512] // 8-byte Folded Reload + ldr x13, [sp, #808] // 8-byte Folded Reload + mov x2, xzr + add x11, x8, #12 + ldr x14, [sp, #816] // 8-byte Folded Reload + mov w17, #1 // =0x1 + mov w18, #2 // =0x2 + mov w16, #3 // =0x3 + mov w15, #4 // =0x4 + add x12, x8, x10 + b .LBB0_117 + .p2align 2 +.LBB0_116: // %.loopexit45 + // in Loop: Header=BB0_117 Depth=2 + ldr x10, [sp, #1008] // 8-byte Folded Reload + ldr x4, [sp, #648] // 8-byte Folded Reload + mov x2, x15 + mov x15, x1 + add x14, x14, x10 + add x13, x13, x10 +.LBB0_117: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_119 Depth 3 + // Child Loop BB0_121 Depth 3 + madd x10, x2, x29, x9 + add x10, x10, x4 + madd x17, x17, x29, x9 + madd x18, x18, x29, x9 + add x17, x17, x4 + str s2, [x24, x10, lsl #2] + madd x10, x16, x29, x9 + add x16, x18, x4 + str s3, [x24, x17, lsl #2] + add x10, x10, x4 + str s1, [x24, x16, lsl #2] + str s0, [x24, x10, lsl #2] + ldr x10, [sp, #1024] // 8-byte Folded Reload + cmp x15, x10 + b.ge .LBB0_122 +// %bb.118: // in Loop: Header=BB0_117 Depth=2 + add x17, x15, #1 + madd x10, x15, x29, x9 + add x18, x15, #2 + add x16, x15, #3 + madd x0, x17, x29, x9 + ldr s16, [x8] + mov x2, xzr + add x10, x10, x4 + madd x1, x18, x29, x9 + madd x3, x16, x29, x9 + add x0, x0, x4 + add x1, x1, x4 + ldr s2, [x24, x10, lsl #2] + madd x10, x15, x19, x5 + add x3, x3, x4 + mov x4, x14 + ldr s3, [x24, x0, lsl #2] + ldr x0, [sp, #888] // 8-byte Folded Reload + ldr s0, [x24, x3, lsl #2] + ldr s1, [x24, x1, lsl #2] + add x1, x15, #4 + mov x3, x11 + lsl x10, x10, #2 + ldr q7, [x0, x10] + madd x10, x17, x19, x5 + lsl x10, x10, #2 + ldr q6, [x0, x10] + madd x10, x18, x19, x5 + lsl x10, x10, #2 + ldr q5, [x0, x10] + madd x10, x16, x19, x5 + lsl x10, x10, #2 + ldr q4, [x0, x10] + ext v20.16b, v7.16b, v7.16b, #8 + cmp xzr, x23 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.ge .LBB0_120 + .p2align 2 +.LBB0_119: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_117 Depth=2 + // => This Inner Loop Header: Depth=3 + add x10, x3, #8 + fmla v2.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + add 
x2, x2, #4 + fmla v1.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + prfm pldl1keep, [x10] + add x10, x4, x20 + ldp s16, s21, [x3, #-8] + fmla v0.2s, v16.2s, v4.s[1] + fmla v2.2s, v16.2s, v7.s[1] + fmla v3.2s, v16.2s, v6.s[1] + fmla v1.2s, v16.2s, v5.s[1] + fmla v0.2s, v21.2s, v17.2s + fmla v2.2s, v21.2s, v20.2s + ldp s17, s16, [x3], #16 + fmla v3.2s, v21.2s, v19.2s + fmla v1.2s, v21.2s, v18.2s + prfm pldl1keep, [x4] + fmla v2.2s, v17.2s, v7.s[3] + ldur q7, [x4, #-16] + prfm pldl1keep, [x10] + fmla v3.2s, v17.2s, v6.s[3] + ldur q6, [x10, #-16] + add x10, x10, x20 + fmla v1.2s, v17.2s, v5.s[3] + fmla v0.2s, v17.2s, v4.s[3] + add x4, x4, #16 + prfm pldl1keep, [x10] + ldur q5, [x10, #-16] + add x10, x10, x20 + prfm pldl1keep, [x10] + ldur q4, [x10, #-16] + ext v20.16b, v7.16b, v7.16b, #8 + cmp x2, x23 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.lt .LBB0_119 +.LBB0_120: // in Loop: Header=BB0_117 Depth=2 + ldr x10, [sp, #912] // 8-byte Folded Reload + fmla v2.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + ldr x4, [sp, #1032] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + mov x2, x13 + mov x3, x12 + ldr s21, [x8, x10, lsl #2] + ldr x10, [sp, #904] // 8-byte Folded Reload + fmla v2.2s, v21.2s, v7.s[1] + ldr s16, [x8, x10, lsl #2] + ldr x10, [sp, #896] // 8-byte Folded Reload + fmla v3.2s, v21.2s, v6.s[1] + fmla v1.2s, v21.2s, v5.s[1] + fmla v0.2s, v21.2s, v4.s[1] + ldr s22, [x8, x10, lsl #2] + fmla v2.2s, v16.2s, v20.2s + fmla v3.2s, v16.2s, v19.2s + fmla v1.2s, v16.2s, v18.2s + fmla v0.2s, v16.2s, v17.2s + fmla v2.2s, v22.2s, v7.s[3] + fmla v3.2s, v22.2s, v6.s[3] + fmla v1.2s, v22.2s, v5.s[3] + fmla v0.2s, v22.2s, v4.s[3] + cmp x4, x21 + b.ge .LBB0_116 + .p2align 2 +.LBB0_121: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_117 Depth=2 + // => This Inner Loop Header: Depth=3 + add x10, x2, x20 + prfm pldl1keep, [x2] + ldur s4, [x2, #-4] + add x4, x4, #1 + prfm pldl1keep, [x10] + ldur s5, [x10, #-4] + add x10, x10, x20 + add x2, x2, #4 + prfm pldl1keep, [x10] + ldur s6, [x10, #-4] + add x10, x10, x20 + prfm pldl1keep, [x10] + ldur s7, [x10, #-4] + prfm pldl1keep, [x3] + ldur s16, [x3, #-4] + add x3, x3, #4 + fmla v2.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + fmla v1.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + cmp x4, x21 + b.lt .LBB0_121 + b .LBB0_116 + .p2align 2 +.LBB0_122: // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #1024] // 8-byte Folded Reload + ldr x12, [sp, #920] // 8-byte Folded Reload + cmp x10, x12 + ldr x2, [sp, #544] // 8-byte Folded Reload + b.ge .LBB0_128 +// %bb.123: // in Loop: Header=BB0_4 Depth=1 + ldr x13, [sp, #1024] // 8-byte Folded Reload + ldr x16, [sp, #888] // 8-byte Folded Reload + mov x14, xzr + mov x15, xzr + ldr s4, [x8] + madd x12, x13, x19, x5 + add x10, x13, #1 + lsl x12, x12, #2 + ldr q3, [x16, x12] + madd x12, x10, x19, x5 + madd x10, x10, x29, x9 + lsl x12, x12, #2 + ldr q2, [x16, x12] + madd x12, x13, x29, x9 + add x13, x10, x4 + ldr s0, [x24, x13, lsl #2] + add x12, x12, x4 + ldr s1, [x24, x12, lsl #2] + ext v6.16b, v3.16b, v3.16b, #8 + cmp xzr, x23 + ext v5.16b, v2.16b, v2.16b, #8 + b.ge .LBB0_125 + .p2align 2 +.LBB0_124: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x0, x8, x14 + ldr x10, [sp, #536] // 8-byte Folded Reload + ldr x17, [sp, #528] // 8-byte Folded Reload + fmla v1.2s, v4.2s, v3.2s + add x1, x0, #20 + fmla v0.2s, v4.2s, v2.2s + add x15, x15, #4 + prfm pldl1keep, [x1] + ldp s4, s7, [x0, #4] + add x10, 
x10, x14 + add x17, x17, x14 + add x14, x14, #16 + add x16, x10, #32 + add x18, x17, #32 + fmla v0.2s, v4.2s, v2.s[1] + fmla v1.2s, v4.2s, v3.s[1] + fmla v0.2s, v7.2s, v5.2s + ldp s5, s4, [x0, #12] + fmla v1.2s, v7.2s, v6.2s + prfm pldl1keep, [x18] + fmla v1.2s, v5.2s, v3.s[3] + ldr q3, [x17, #16] + prfm pldl1keep, [x16] + fmla v0.2s, v5.2s, v2.s[3] + ldr q2, [x10, #16] + ext v6.16b, v3.16b, v3.16b, #8 + cmp x15, x23 + ext v5.16b, v2.16b, v2.16b, #8 + b.lt .LBB0_124 +.LBB0_125: // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #912] // 8-byte Folded Reload + fmla v1.2s, v4.2s, v3.2s + fmla v0.2s, v4.2s, v2.2s + ldr x16, [sp, #1032] // 8-byte Folded Reload + mov x14, xzr + ldr s7, [x8, x10, lsl #2] + ldr x10, [sp, #904] // 8-byte Folded Reload + ldr s4, [x8, x10, lsl #2] + ldr x10, [sp, #896] // 8-byte Folded Reload + fmla v1.2s, v7.2s, v3.s[1] + fmla v0.2s, v7.2s, v2.s[1] + ldr s7, [x8, x10, lsl #2] + ldr x10, [sp, #104] // 8-byte Folded Reload + fmla v1.2s, v4.2s, v6.2s + fmla v0.2s, v4.2s, v5.2s + add x15, x8, x10 + fmla v1.2s, v7.2s, v3.s[3] + fmla v0.2s, v7.2s, v2.s[3] + cmp x16, x21 + b.ge .LBB0_127 + .p2align 2 +.LBB0_126: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + ldr x0, [sp, #656] // 8-byte Folded Reload + add x10, x15, x14 + add x17, x2, x14 + add x16, x16, #1 + add x10, x10, #4 + add x17, x17, #4 + add x18, x0, x14 + add x18, x18, #4 + prfm pldl1keep, [x18] + prfm pldl1keep, [x17] + ldr s2, [x0, x14] + prfm pldl1keep, [x10] + ldr s3, [x15, x14] + fmla v1.2s, v3.2s, v2.2s + ldr s2, [x2, x14] + add x14, x14, #4 + fmla v0.2s, v3.2s, v2.2s + cmp x16, x21 + b.lt .LBB0_126 +.LBB0_127: // in Loop: Header=BB0_4 Depth=1 + str s1, [x24, x12, lsl #2] + str s0, [x24, x13, lsl #2] +.LBB0_128: // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #744] // 8-byte Folded Reload + ldr x12, [sp, #920] // 8-byte Folded Reload + cmp x12, x10 + b.ge .LBB0_2 +// %bb.129: // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #920] // 8-byte Folded Reload + ldr x13, [sp, #888] // 8-byte Folded Reload + mov x12, xzr + madd x9, x10, x29, x9 + madd x10, x10, x19, x5 + ldr s2, [x8] + ldr x14, [sp, #632] // 8-byte Folded Reload + add x9, x9, x4 + lsl x10, x10, #2 + ldr s0, [x24, x9, lsl #2] + ldr q1, [x13, x10] + ldr x10, [sp, #640] // 8-byte Folded Reload + ext v3.16b, v1.16b, v1.16b, #8 + cmp xzr, x23 + b.ge .LBB0_131 + .p2align 2 +.LBB0_130: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x11, #8 + fmla v0.2s, v2.2s, v1.2s + add x12, x12, #4 + prfm pldl1keep, [x13] + ldp s2, s4, [x11, #-8] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v4.2s, v3.2s + ldp s3, s2, [x11], #16 + prfm pldl1keep, [x10] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x10, #-16] + add x10, x10, #16 + ext v3.16b, v1.16b, v1.16b, #8 + cmp x12, x23 + b.lt .LBB0_130 +.LBB0_131: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #912] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.2s + mov x10, xzr + ldr s4, [x8, x11, lsl #2] + ldr x11, [sp, #904] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[1] + ldr s5, [x8, x11, lsl #2] + ldr x11, [sp, #896] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v3.2s + ldr s2, [x8, x11, lsl #2] + ldr x11, [sp, #104] // 8-byte Folded Reload + add x8, x8, x11 + ldr x11, [sp, #1032] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[3] + cmp x11, x21 + b.ge .LBB0_1 + .p2align 2 +.LBB0_132: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x8, x10 + add x13, x14, x10 + add x11, x11, #1 + add x12, x12, #4 + add x13, x13, #4 + prfm pldl1keep, 
[x13] + prfm pldl1keep, [x12] + ldr s1, [x8, x10] + ldr s2, [x14, x10] + add x10, x10, #4 + fmla v0.2s, v1.2s, v2.2s + cmp x11, x21 + b.lt .LBB0_132 + b .LBB0_1 +.LBB0_133: + ldr x0, [sp, #16] // 8-byte Folded Reload + bl free + add sp, sp, #1040 + ldp d9, d8, [sp, #48] // 16-byte Folded Reload + ldp d11, d10, [sp, #32] // 16-byte Folded Reload + ldp d13, d12, [sp, #16] // 16-byte Folded Reload + ldp x20, x19, [sp, #144] // 16-byte Folded Reload + ldp x22, x21, [sp, #128] // 16-byte Folded Reload + ldp x24, x23, [sp, #112] // 16-byte Folded Reload + ldp x26, x25, [sp, #96] // 16-byte Folded Reload + ldp x28, x27, [sp, #80] // 16-byte Folded Reload + ldp x29, x30, [sp, #64] // 16-byte Folded Reload + ldp d15, d14, [sp], #160 // 16-byte Folded Reload + ret +.Lfunc_end0: + .size sbatch_matmul_3d_nn_mlir, .Lfunc_end0-sbatch_matmul_3d_nn_mlir + .cfi_endproc + // -- End function + .section ".note.GNU-stack","",@progbits
diff --git a/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nt_mlir.s b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nt_mlir.s new file mode 100644 index 00000000000000..a70650bb6207e2 --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_3d_nt_mlir.s @@ -0,0 +1,2987 @@
+ .text + .file "LLVMDialectModule" + .globl sbatch_matmul_3d_nt_mlir // -- Begin function sbatch_matmul_3d_nt_mlir + .p2align 4 + .type sbatch_matmul_3d_nt_mlir,@function +sbatch_matmul_3d_nt_mlir: // @sbatch_matmul_3d_nt_mlir + .cfi_startproc +// %bb.0: + stp d15, d14, [sp, #-160]! // 16-byte Folded Spill + stp d13, d12, [sp, #16] // 16-byte Folded Spill + stp x29, x30, [sp, #64] // 16-byte Folded Spill + stp x28, x27, [sp, #80] // 16-byte Folded Spill + stp x26, x25, [sp, #96] // 16-byte Folded Spill + stp x24, x23, [sp, #112] // 16-byte Folded Spill + stp x22, x21, [sp, #128] // 16-byte Folded Spill + stp x20, x19, [sp, #144] // 16-byte Folded Spill + stp d11, d10, [sp, #32] // 16-byte Folded Spill + stp d9, d8, [sp, #48] // 16-byte Folded Spill + sub sp, sp, #512 + .cfi_def_cfa_offset 672 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -136 + .cfi_offset b13, -144 + .cfi_offset b14, -152 + .cfi_offset b15, -160 + cmp x4, #0 + ldr x13, [sp, #712] + ldr x14, [sp, #768] + mov x19, x7 + cinv x8, x4, lt + ldr x12, [sp, #760] + ldr x28, [sp, #808] + mov x21, x5 + add x9, x8, x8, lsr #63 + add x10, x8, #3 + ldr x23, [sp, #728] + ldr x27, [sp, #736] + str x6, [sp, #448] // 8-byte Folded Spill + stp x13, x3, [sp, #136] // 16-byte Folded Spill + mov x26, x2 + mov x25, x1 + asr x9, x9, #1 + stp x12, x14, [sp, #328] // 16-byte Folded Spill + cinv x22, x9, lt + cmp x8, #0 + csel x8, x10, x8, lt + cmp x4, #0 + ldr x10, [sp, #800] + asr x8, x8, #2 + cinv x24, x8, lt + cmp x13, #0 + cinv x8, x13, lt + add x9, x8, x8, lsr #63 + stp x10, x4, [sp, #360] // 16-byte Folded Spill + add x10, x8, #15 + add x11, x8, #7 + add x12, x8, #3 + asr x9, x9, #1 + cinv x14, x9, lt + cmp x8, #0 + csel x9, x10, x8, lt + csel x10, x11, x8, lt + ldr x11, [sp, #696] + csel x8, x12, x8, lt + cmp x13, #0 + asr x9, x9, #4 + asr x10, x10, #3 + asr x8, x8, #2 + cinv x9, x9, lt + cinv x10, x10, lt + cinv x29, x8, lt + lsl x8, x14, #1 +
stp x9, x10, [sp, #456] // 16-byte Folded Spill + lsl x9, x9, #4 + str x8, [sp, #168] // 8-byte Folded Spill + lsl x8, x5, #6 + lsl x20, x29, #2 + stp x11, x14, [sp, #488] // 16-byte Folded Spill + ldr x11, [sp, #688] + str x9, [sp, #416] // 8-byte Folded Spill + lsl x9, x10, #3 + add x0, x8, #64 + stp x8, x11, [sp, #472] // 16-byte Folded Spill + str x9, [sp, #280] // 8-byte Folded Spill + bl malloc + lsl x8, x24, #2 + mov x12, x22 + lsl x10, x23, #2 + mul x11, x24, x19 + str x8, [sp, #504] // 8-byte Folded Spill + lsl x8, x22, #1 + and x9, x21, #0x3 + lsl x22, x19, #2 + str x8, [sp, #440] // 8-byte Folded Spill + negs x8, x21 + str x10, [sp, #128] // 8-byte Folded Spill + lsl x10, x27, #6 + mul x13, x12, x19 + str x0, [sp, #16] // 8-byte Folded Spill + add x12, x0, #63 + and x8, x8, #0x3 + ldp x0, x18, [sp, #480] // 16-byte Folded Reload + str x10, [sp, #320] // 8-byte Folded Spill + mov w10, #1 // =0x1 + add x14, x21, x22 + csneg x8, x9, x8, mi + lsl x2, x26, #2 + bfi x10, x24, #2, #62 + sub x14, x14, x8 + and x23, x12, #0xffffffffffffffc0 + mul x12, x19, x10 + lsl x24, x19, #4 + add x14, x2, x14, lsl #2 + lsl x9, x11, #4 + add x11, x21, x11, lsl #2 + add x18, x0, x18, lsl #2 + add x0, x24, x2 + add x0, x0, x25 + add x14, x14, x25 + sub x11, x11, x8 + add x0, x0, #32 + add x14, x14, #4 + lsl x17, x12, #2 + add x12, x21, x12 + lsl x11, x11, #2 + stp x14, x0, [sp, #384] // 16-byte Folded Spill + ldr x0, [sp, #472] // 8-byte Folded Reload + add x16, x21, x13, lsl #1 + str x11, [sp, #264] // 8-byte Folded Spill + sub x11, x12, x8 + ldr x10, [sp, #456] // 8-byte Folded Reload + lsl x11, x11, #2 + sub x14, x23, x8, lsl #6 + str x11, [sp, #256] // 8-byte Folded Spill + sub x11, x16, x8 + mov w15, #1 // =0x1 + lsl x16, x11, #2 + add x13, x2, x13, lsl #3 + bfi x15, x29, #2, #62 + add x16, x16, #4 + add x0, x14, x0 + add x14, x25, x17 + lsl x4, x21, #2 + mul x10, x10, x27 + str x14, [sp, #296] // 8-byte Folded Spill + add x14, x25, x9 + mul x12, x29, x27 + mul x15, x27, x15 + str x14, [sp, #288] // 8-byte Folded Spill + add x14, x13, x25 + str x16, [sp, #272] // 8-byte Folded Spill + add x13, x13, x4 + lsl x16, x8, #2 + add x14, x14, #32 + sub x13, x13, x16 + ldr x11, [sp, #464] // 8-byte Folded Reload + str x14, [sp, #304] // 8-byte Folded Spill + ldr x14, [sp, #496] // 8-byte Folded Reload + add x13, x25, x13 + add x3, x18, #4 + str x13, [sp, #376] // 8-byte Folded Spill + add x13, x17, x2 + add x17, x18, x15, lsl #2 + add x18, x18, x12, lsl #4 + add x12, x13, x4 + add x10, x3, x10, lsl #6 + mul x11, x11, x27 + str x10, [sp, #232] // 8-byte Folded Spill + sub x12, x12, x16 + add x9, x9, x2 + mov x15, x0 + lsl x0, x21, #3 + mul x14, x14, x27 + add x10, x25, x12 + add x12, x25, x13 + add x13, x25, x9 + str x10, [sp, #240] // 8-byte Folded Spill + add x10, x9, x4 + lsl x9, x21, #5 + str x26, [sp, #352] // 8-byte Folded Spill + sub x10, x10, x16 + lsl x26, x27, #2 + sub x27, x21, x8 + str x0, [sp, #80] // 8-byte Folded Spill + add x10, x25, x10 + sub x0, x0, x8, lsl #3 + str x2, [sp, #456] // 8-byte Folded Spill + add x2, x25, x2 + str x10, [sp, #312] // 8-byte Folded Spill + add x10, x3, x11, lsl #5 + sub x11, x4, x16 + lsl x16, x21, #4 + add x14, x3, x14, lsl #3 + stp x16, x9, [sp, #88] // 16-byte Folded Spill + sub x9, x9, x8, lsl #5 + sub x16, x16, x8, lsl #4 + ldr x8, [sp, #448] // 8-byte Folded Reload + str x14, [sp, #224] // 8-byte Folded Spill + sub x14, x27, #3 + mov x1, x20 + str x14, [sp, #496] // 8-byte Folded Spill + sub x14, x27, #2 + stp x0, x9, [sp, #64] // 16-byte Folded Spill + 
add x9, x9, #32 + str x14, [sp, #488] // 8-byte Folded Spill + sub x14, x27, #1 + str x9, [sp, #56] // 8-byte Folded Spill + add x9, x16, #16 + str x14, [sp, #480] // 8-byte Folded Spill + lsl x14, x8, #2 + str x9, [sp, #48] // 8-byte Folded Spill + add x9, x0, #8 + stp x4, x14, [sp, #112] // 16-byte Folded Spill + add x14, x2, #4 + mov x20, xzr + sub x29, x27, #4 + str x14, [sp, #400] // 8-byte Folded Spill + add x14, x23, #256 + str x9, [sp, #40] // 8-byte Folded Spill + add x9, x11, #4 + str x14, [sp, #472] // 8-byte Folded Spill + add x14, x15, #64 + str x25, [sp, #344] // 8-byte Folded Spill + str x14, [sp, #464] // 8-byte Folded Spill + str x11, [sp, #104] // 8-byte Folded Spill + stp x16, x9, [sp, #24] // 16-byte Folded Spill + str x1, [sp, #176] // 8-byte Folded Spill + str x15, [sp, #248] // 8-byte Folded Spill + b .LBB0_4 + .p2align 2 +.LBB0_1: // in Loop: Header=BB0_4 Depth=1 + str s0, [x15, x9, lsl #2] +.LBB0_2: // in Loop: Header=BB0_4 Depth=1 + bl free + ldr x8, [sp, #448] // 8-byte Folded Reload +.LBB0_3: // %.backedge28 + // in Loop: Header=BB0_4 Depth=1 + ldp x14, x9, [sp, #120] // 16-byte Folded Reload + ldp x10, x2, [sp, #400] // 16-byte Folded Reload + ldp x20, x3, [sp, #152] // 16-byte Folded Reload + ldr x17, [sp, #216] // 8-byte Folded Reload + add x10, x10, x14 + ldp x13, x12, [sp, #184] // 16-byte Folded Reload + add x3, x3, x9 + add x2, x2, x14 + add x17, x17, x9 + add x12, x12, x14 + add x13, x13, x14 + str x10, [sp, #400] // 8-byte Folded Spill + ldr x10, [sp, #392] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #392] // 8-byte Folded Spill + ldr x10, [sp, #384] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #384] // 8-byte Folded Spill + ldr x10, [sp, #296] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #296] // 8-byte Folded Spill + ldr x10, [sp, #288] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #288] // 8-byte Folded Spill + ldr x10, [sp, #304] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #304] // 8-byte Folded Spill + ldr x10, [sp, #376] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #376] // 8-byte Folded Spill + ldp x11, x10, [sp, #224] // 16-byte Folded Reload + add x10, x10, x9 + add x11, x11, x9 + stp x11, x10, [sp, #224] // 16-byte Folded Spill + ldr x10, [sp, #240] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #240] // 8-byte Folded Spill + ldr x10, [sp, #312] // 8-byte Folded Reload + add x10, x10, x14 + str x10, [sp, #312] // 8-byte Folded Spill + ldp x10, x18, [sp, #200] // 16-byte Folded Reload + add x10, x10, x9 + add x18, x18, x9 +.LBB0_4: // =>This Loop Header: Depth=1 + // Child Loop BB0_8 Depth 2 + // Child Loop BB0_10 Depth 3 + // Child Loop BB0_13 Depth 3 + // Child Loop BB0_15 Depth 4 + // Child Loop BB0_17 Depth 4 + // Child Loop BB0_20 Depth 3 + // Child Loop BB0_22 Depth 3 + // Child Loop BB0_26 Depth 3 + // Child Loop BB0_28 Depth 3 + // Child Loop BB0_34 Depth 2 + // Child Loop BB0_37 Depth 2 + // Child Loop BB0_39 Depth 3 + // Child Loop BB0_41 Depth 3 + // Child Loop BB0_44 Depth 2 + // Child Loop BB0_46 Depth 2 + // Child Loop BB0_50 Depth 2 + // Child Loop BB0_52 Depth 2 + // Child Loop BB0_56 Depth 2 + // Child Loop BB0_59 Depth 2 + // Child Loop BB0_61 Depth 3 + // Child Loop BB0_63 Depth 3 + // Child Loop BB0_66 Depth 2 + // Child Loop BB0_68 Depth 2 + // Child Loop BB0_72 Depth 2 + // Child Loop BB0_74 Depth 2 + // Child Loop BB0_78 Depth 2 + // Child Loop BB0_81 Depth 2 + // Child Loop BB0_83 Depth 3 + // Child Loop BB0_85 Depth 3 
+ // Child Loop BB0_88 Depth 2 + // Child Loop BB0_90 Depth 2 + // Child Loop BB0_94 Depth 2 + // Child Loop BB0_96 Depth 2 + // Child Loop BB0_100 Depth 2 + // Child Loop BB0_103 Depth 2 + // Child Loop BB0_105 Depth 3 + // Child Loop BB0_107 Depth 3 + // Child Loop BB0_110 Depth 2 + // Child Loop BB0_112 Depth 2 + // Child Loop BB0_116 Depth 2 + // Child Loop BB0_118 Depth 2 + ldr x9, [sp, #144] // 8-byte Folded Reload + cmp x20, x9 + b.ge .LBB0_119 +// %bb.5: // in Loop: Header=BB0_4 Depth=1 + ldr x0, [sp, #416] // 8-byte Folded Reload + ldr x30, [sp, #280] // 8-byte Folded Reload + add x9, x20, #1 + stp x10, x18, [sp, #200] // 16-byte Folded Spill + mov x10, xzr + str x2, [sp, #408] // 8-byte Folded Spill + stp x13, x12, [sp, #184] // 16-byte Folded Spill + str x17, [sp, #216] // 8-byte Folded Spill + stp x9, x3, [sp, #152] // 16-byte Folded Spill + b .LBB0_8 + .p2align 2 +.LBB0_6: // in Loop: Header=BB0_8 Depth=2 + ldr x8, [sp, #448] // 8-byte Folded Reload + stp q3, q2, [x10] + stp q1, q0, [x10, #32] +.LBB0_7: // %.backedge + // in Loop: Header=BB0_8 Depth=2 + ldr x9, [sp, #320] // 8-byte Folded Reload + ldp x10, x3, [sp, #424] // 16-byte Folded Reload + add x3, x3, x9 +.LBB0_8: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_10 Depth 3 + // Child Loop BB0_13 Depth 3 + // Child Loop BB0_15 Depth 4 + // Child Loop BB0_17 Depth 4 + // Child Loop BB0_20 Depth 3 + // Child Loop BB0_22 Depth 3 + // Child Loop BB0_26 Depth 3 + // Child Loop BB0_28 Depth 3 + ldp x11, x9, [sp, #344] // 16-byte Folded Reload + ldr x16, [sp, #400] // 8-byte Folded Reload + cmp x10, x0 + add x25, x11, x9, lsl #2 + b.ge .LBB0_29 +// %bb.9: // in Loop: Header=BB0_8 Depth=2 + ldr x9, [sp, #360] // 8-byte Folded Reload + mov x13, xzr + mul x12, x20, x9 + add x14, x12, x10 + ldp x11, x9, [sp, #328] // 16-byte Folded Reload + add x11, x11, x9, lsl #2 + add x15, x14, x28 + add x15, x11, x15, lsl #2 + add x9, x11, x14, lsl #2 + ldp q3, q1, [x15, #32] + ldp q5, q4, [x15] + lsl x15, x28, #1 + ldp q16, q6, [x9, #32] + ldp q2, q0, [x9] + add x9, x14, x15 + add x15, x15, x28 + add x14, x14, x15 + add x9, x11, x9, lsl #2 + mov x15, x3 + add x14, x11, x14, lsl #2 + ldp q17, q7, [x9, #32] + ldp q20, q18, [x9] + add x9, x10, #16 + str x9, [sp, #424] // 8-byte Folded Spill + ldp q21, q19, [x14, #32] + ldp q23, q22, [x14] + mov x14, x16 + cmp xzr, x21 + b.ge .LBB0_11 + .p2align 2 +.LBB0_10: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + add x16, x14, x22 + prfm pldl1keep, [x14] + ldur s27, [x14, #-4] + add x14, x14, #4 + add x17, x16, x22 + prfm pldl1keep, [x16] + ldur s28, [x16, #-4] + add x16, x15, x26 + add x18, x17, x22 + prfm pldl1keep, [x17] + ldur s26, [x17, #-4] + sub x17, x16, #4 + prfm pldl1keep, [x18] + ldur s25, [x18, #-4] + add x18, x16, x26 + prfm pldl1keep, [x15] + ldur s24, [x15, #-4] + add x15, x15, #4 + prfm pldl1keep, [x16] + sub x16, x18, #4 + prfm pldl1keep, [x18] + ld1 { v24.s }[1], [x17] + add x17, x18, x26 + prfm pldl1keep, [x17] + ld1 { v24.s }[2], [x16] + add x16, x17, x26 + sub x17, x17, #4 + prfm pldl1keep, [x16] + ldur s29, [x16, #-4] + add x16, x16, x26 + sub x18, x16, #4 + add x0, x16, x26 + ld1 { v24.s }[3], [x17] + prfm pldl1keep, [x16] + prfm pldl1keep, [x0] + ld1 { v29.s }[1], [x18] + sub x16, x0, #4 + add x17, x0, x26 + prfm pldl1keep, [x17] + fmla v2.4s, v24.4s, v27.s[0] + ld1 { v29.s }[2], [x16] + add x16, x17, x26 + sub x17, x17, #4 + fmla v5.4s, v24.4s, v28.s[0] + fmla v20.4s, v24.4s, v26.s[0] + fmla 
v23.4s, v24.4s, v25.s[0] + prfm pldl1keep, [x16] + ldur s30, [x16, #-4] + add x16, x16, x26 + sub x18, x16, #4 + add x0, x16, x26 + ld1 { v29.s }[3], [x17] + prfm pldl1keep, [x16] + prfm pldl1keep, [x0] + ld1 { v30.s }[1], [x18] + sub x16, x0, #4 + add x17, x0, x26 + prfm pldl1keep, [x17] + ld1 { v30.s }[2], [x16] + add x16, x17, x26 + sub x17, x17, #4 + fmla v0.4s, v29.4s, v27.s[0] + fmla v4.4s, v29.4s, v28.s[0] + fmla v18.4s, v29.4s, v26.s[0] + fmla v22.4s, v29.4s, v25.s[0] + prfm pldl1keep, [x16] + ldur s31, [x16, #-4] + add x16, x16, x26 + sub x18, x16, #4 + add x0, x16, x26 + ld1 { v30.s }[3], [x17] + prfm pldl1keep, [x16] + prfm pldl1keep, [x0] + ld1 { v31.s }[1], [x18] + sub x16, x0, #4 + add x17, x0, x26 + prfm pldl1keep, [x17] + fmla v16.4s, v30.4s, v27.s[0] + ld1 { v31.s }[2], [x16] + sub x16, x17, #4 + fmla v3.4s, v30.4s, v28.s[0] + fmla v17.4s, v30.4s, v26.s[0] + fmla v21.4s, v30.4s, v25.s[0] + ld1 { v31.s }[3], [x16] + add x16, x23, x13, lsl #6 + add x13, x13, #1 + stp q24, q29, [x16] + fmla v6.4s, v31.4s, v27.s[0] + fmla v1.4s, v31.4s, v28.s[0] + fmla v7.4s, v31.4s, v26.s[0] + fmla v19.4s, v31.4s, v25.s[0] + stp q30, q31, [x16, #32] + cmp x13, x21 + b.lt .LBB0_10 +.LBB0_11: // %.preheader + // in Loop: Header=BB0_8 Depth=2 + ldp x13, x14, [sp, #384] // 16-byte Folded Reload + str x3, [sp, #432] // 8-byte Folded Spill + mov x1, xzr + mov w17, #1 // =0x1 + mov w18, #2 // =0x2 + mov w16, #3 // =0x3 + mov w15, #4 // =0x4 + b .LBB0_13 + .p2align 2 +.LBB0_12: // %.loopexit + // in Loop: Header=BB0_13 Depth=3 + add x14, x14, x24 + add x13, x13, x24 + mov x1, x15 + mov x15, x0 +.LBB0_13: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_15 Depth 4 + // Child Loop BB0_17 Depth 4 + madd x0, x1, x28, x12 + ldr x9, [sp, #504] // 8-byte Folded Reload + add x0, x0, x10 + madd x17, x17, x28, x12 + madd x18, x18, x28, x12 + madd x16, x16, x28, x12 + add x17, x17, x10 + add x18, x18, x10 + add x16, x16, x10 + cmp x15, x9 + add x0, x11, x0, lsl #2 + add x17, x11, x17, lsl #2 + stp q2, q0, [x0] + add x18, x11, x18, lsl #2 + add x16, x11, x16, lsl #2 + stp q16, q6, [x0, #32] + stp q5, q4, [x17] + stp q3, q1, [x17, #32] + stp q20, q18, [x18] + stp q17, q7, [x18, #32] + stp q23, q22, [x16] + stp q21, q19, [x16, #32] + b.ge .LBB0_18 +// %bb.14: // in Loop: Header=BB0_13 Depth=3 + add x17, x15, #1 + add x16, x15, #3 + mul x2, x20, x8 + add x18, x15, #2 + madd x3, x17, x28, x12 + ldp q28, q29, [x23, #32] + mov x1, xzr + madd x0, x15, x28, x12 + ldp q30, q31, [x23] + add x0, x0, x10 + add x3, x3, x10 + add x0, x11, x0, lsl #2 + add x3, x11, x3, lsl #2 + ldp q16, q6, [x0, #32] + ldp q2, q0, [x0] + madd x0, x18, x28, x12 + ldp q3, q1, [x3, #32] + add x0, x0, x10 + ldp q5, q4, [x3] + madd x3, x16, x28, x12 + add x3, x3, x10 + add x0, x11, x0, lsl #2 + add x3, x11, x3, lsl #2 + ldp q17, q7, [x0, #32] + ldp q20, q18, [x0] + add x0, x15, #4 + ldp q21, q19, [x3, #32] + ldp q23, q22, [x3] + madd x3, x15, x19, x2 + lsl x3, x3, #2 + ldr q27, [x25, x3] + madd x3, x17, x19, x2 + lsl x3, x3, #2 + ldr q26, [x25, x3] + madd x3, x18, x19, x2 + madd x2, x16, x19, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q25, [x25, x3] + ldr q24, [x25, x2] + ldr x3, [sp, #472] // 8-byte Folded Reload + mov x2, x14 + fmla v6.4s, v29.4s, v27.s[0] + cmp xzr, x29 + b.ge .LBB0_16 + .p2align 2 +.LBB0_15: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // Parent Loop BB0_13 Depth=3 + // => This Inner Loop Header: Depth=4 + add x7, x3, #64 + fmla v16.4s, v28.4s, 
v27.s[0] + fmla v2.4s, v30.4s, v27.s[0] + add x6, x3, #128 + prfm pldl1keep, [x7] + ldp q9, q8, [x3, #-160] + fmla v0.4s, v31.4s, v27.s[0] + ldp q12, q15, [x3, #-192] + fmla v1.4s, v29.4s, v26.s[0] + fmla v3.4s, v28.4s, v26.s[0] + fmla v4.4s, v31.4s, v26.s[0] + fmla v5.4s, v30.4s, v26.s[0] + fmla v7.4s, v29.4s, v25.s[0] + prfm pldl1keep, [x6] + fmla v17.4s, v28.4s, v25.s[0] + fmla v18.4s, v31.4s, v25.s[0] + ldp q11, q10, [x3, #-128] + fmla v20.4s, v30.4s, v25.s[0] + fmla v19.4s, v29.4s, v24.s[0] + ldp q13, q14, [x3, #-96] + fmla v21.4s, v28.4s, v24.s[0] + fmla v22.4s, v31.4s, v24.s[0] + add x5, x3, #192 + prfm pldl1keep, [x5] + fmla v23.4s, v30.4s, v24.s[0] + fmla v0.4s, v15.4s, v27.s[1] + add x4, x3, #256 + add x1, x1, #4 + fmla v2.4s, v12.4s, v27.s[1] + fmla v16.4s, v9.4s, v27.s[1] + fmla v6.4s, v8.4s, v27.s[1] + fmla v5.4s, v12.4s, v26.s[1] + fmla v4.4s, v15.4s, v26.s[1] + fmla v3.4s, v9.4s, v26.s[1] + fmla v1.4s, v8.4s, v26.s[1] + fmla v20.4s, v12.4s, v25.s[1] + fmla v18.4s, v15.4s, v25.s[1] + fmla v17.4s, v9.4s, v25.s[1] + fmla v7.4s, v8.4s, v25.s[1] + fmla v23.4s, v12.4s, v24.s[1] + fmla v22.4s, v15.4s, v24.s[1] + ldp q15, q12, [x3, #-64] + fmla v21.4s, v9.4s, v24.s[1] + fmla v19.4s, v8.4s, v24.s[1] + ldp q9, q8, [x3, #-32] + prfm pldl1keep, [x4] + ldp q28, q29, [x3, #32] + ldp q30, q31, [x3] + add x3, x2, x22 + prfm pldl1keep, [x2] + fmla v6.4s, v14.4s, v27.s[2] + fmla v16.4s, v13.4s, v27.s[2] + fmla v2.4s, v11.4s, v27.s[2] + fmla v0.4s, v10.4s, v27.s[2] + fmla v1.4s, v14.4s, v26.s[2] + fmla v3.4s, v13.4s, v26.s[2] + fmla v4.4s, v10.4s, v26.s[2] + fmla v5.4s, v11.4s, v26.s[2] + fmla v7.4s, v14.4s, v25.s[2] + fmla v17.4s, v13.4s, v25.s[2] + fmla v18.4s, v10.4s, v25.s[2] + fmla v20.4s, v11.4s, v25.s[2] + fmla v19.4s, v14.4s, v24.s[2] + fmla v21.4s, v13.4s, v24.s[2] + fmla v22.4s, v10.4s, v24.s[2] + fmla v23.4s, v11.4s, v24.s[2] + fmla v0.4s, v12.4s, v27.s[3] + fmla v2.4s, v15.4s, v27.s[3] + fmla v16.4s, v9.4s, v27.s[3] + fmla v6.4s, v8.4s, v27.s[3] + ldur q27, [x2, #-16] + prfm pldl1keep, [x3] + add x2, x2, #16 + fmla v5.4s, v15.4s, v26.s[3] + fmla v4.4s, v12.4s, v26.s[3] + fmla v3.4s, v9.4s, v26.s[3] + fmla v1.4s, v8.4s, v26.s[3] + ldur q26, [x3, #-16] + add x3, x3, x22 + add x5, x3, x22 + prfm pldl1keep, [x3] + fmla v20.4s, v15.4s, v25.s[3] + fmla v18.4s, v12.4s, v25.s[3] + fmla v17.4s, v9.4s, v25.s[3] + fmla v7.4s, v8.4s, v25.s[3] + ldur q25, [x3, #-16] + prfm pldl1keep, [x5] + mov x3, x4 + fmla v23.4s, v15.4s, v24.s[3] + fmla v22.4s, v12.4s, v24.s[3] + fmla v21.4s, v9.4s, v24.s[3] + fmla v19.4s, v8.4s, v24.s[3] + ldur q24, [x5, #-16] + fmla v6.4s, v29.4s, v27.s[0] + cmp x1, x29 + b.lt .LBB0_15 +.LBB0_16: // in Loop: Header=BB0_13 Depth=3 + ldr x9, [sp, #496] // 8-byte Folded Reload + fmla v16.4s, v28.4s, v27.s[0] + fmla v2.4s, v30.4s, v27.s[0] + mov x2, x13 + fmla v0.4s, v31.4s, v27.s[0] + fmla v1.4s, v29.4s, v26.s[0] + mov x3, x27 + add x1, x23, x9, lsl #6 + ldr x9, [sp, #488] // 8-byte Folded Reload + fmla v3.4s, v28.4s, v26.s[0] + fmla v4.4s, v31.4s, v26.s[0] + fmla v5.4s, v30.4s, v26.s[0] + fmla v7.4s, v29.4s, v25.s[0] + fmla v17.4s, v28.4s, v25.s[0] + fmla v18.4s, v31.4s, v25.s[0] + fmla v20.4s, v30.4s, v25.s[0] + ldp q10, q9, [x1, #32] + ldp q11, q12, [x1] + fmla v19.4s, v29.4s, v24.s[0] + fmla v21.4s, v28.4s, v24.s[0] + fmla v22.4s, v31.4s, v24.s[0] + add x1, x23, x9, lsl #6 + fmla v23.4s, v30.4s, v24.s[0] + ldr x9, [sp, #480] // 8-byte Folded Reload + ldp q29, q30, [x1] + ldp q8, q13, [x1, #32] + fmla v0.4s, v12.4s, v27.s[1] + fmla v6.4s, v9.4s, v27.s[1] + fmla v4.4s, 
v12.4s, v26.s[1] + fmla v1.4s, v9.4s, v26.s[1] + fmla v18.4s, v12.4s, v25.s[1] + fmla v7.4s, v9.4s, v25.s[1] + fmla v22.4s, v12.4s, v24.s[1] + add x1, x23, x9, lsl #6 + fmla v2.4s, v11.4s, v27.s[1] + fmla v16.4s, v10.4s, v27.s[1] + fmla v5.4s, v11.4s, v26.s[1] + fmla v3.4s, v10.4s, v26.s[1] + ldp q31, q28, [x1, #32] + fmla v20.4s, v11.4s, v25.s[1] + fmla v17.4s, v10.4s, v25.s[1] + fmla v23.4s, v11.4s, v24.s[1] + fmla v21.4s, v10.4s, v24.s[1] + fmla v19.4s, v9.4s, v24.s[1] + ldp q9, q10, [x1] + ldr x1, [sp, #464] // 8-byte Folded Reload + fmla v6.4s, v13.4s, v27.s[2] + fmla v0.4s, v30.4s, v27.s[2] + fmla v1.4s, v13.4s, v26.s[2] + fmla v4.4s, v30.4s, v26.s[2] + fmla v7.4s, v13.4s, v25.s[2] + fmla v18.4s, v30.4s, v25.s[2] + fmla v19.4s, v13.4s, v24.s[2] + fmla v22.4s, v30.4s, v24.s[2] + fmla v16.4s, v8.4s, v27.s[2] + fmla v2.4s, v29.4s, v27.s[2] + fmla v3.4s, v8.4s, v26.s[2] + fmla v5.4s, v29.4s, v26.s[2] + fmla v17.4s, v8.4s, v25.s[2] + fmla v20.4s, v29.4s, v25.s[2] + fmla v21.4s, v8.4s, v24.s[2] + fmla v23.4s, v29.4s, v24.s[2] + fmla v0.4s, v10.4s, v27.s[3] + fmla v6.4s, v28.4s, v27.s[3] + fmla v4.4s, v10.4s, v26.s[3] + fmla v1.4s, v28.4s, v26.s[3] + fmla v18.4s, v10.4s, v25.s[3] + fmla v7.4s, v28.4s, v25.s[3] + fmla v22.4s, v10.4s, v24.s[3] + fmla v19.4s, v28.4s, v24.s[3] + fmla v2.4s, v9.4s, v27.s[3] + fmla v16.4s, v31.4s, v27.s[3] + fmla v5.4s, v9.4s, v26.s[3] + fmla v3.4s, v31.4s, v26.s[3] + fmla v20.4s, v9.4s, v25.s[3] + fmla v17.4s, v31.4s, v25.s[3] + fmla v23.4s, v9.4s, v24.s[3] + fmla v21.4s, v31.4s, v24.s[3] + cmp x27, x21 + b.ge .LBB0_12 + .p2align 2 +.LBB0_17: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // Parent Loop BB0_13 Depth=3 + // => This Inner Loop Header: Depth=4 + prfm pldl1keep, [x1] + ldp q24, q25, [x1, #-64] + add x4, x2, x22 + ldp q26, q27, [x1, #-32] + prfm pldl1keep, [x2] + add x3, x3, #1 + ldur s28, [x2, #-4] + prfm pldl1keep, [x4] + add x2, x2, #4 + add x1, x1, #64 + ldur s29, [x4, #-4] + add x4, x4, x22 + prfm pldl1keep, [x4] + fmla v6.4s, v27.4s, v28.s[0] + ldur s30, [x4, #-4] + add x4, x4, x22 + prfm pldl1keep, [x4] + fmla v16.4s, v26.4s, v28.s[0] + fmla v0.4s, v25.4s, v28.s[0] + fmla v2.4s, v24.4s, v28.s[0] + ldur s28, [x4, #-4] + fmla v4.4s, v25.4s, v29.s[0] + fmla v5.4s, v24.4s, v29.s[0] + fmla v3.4s, v26.4s, v29.s[0] + fmla v1.4s, v27.4s, v29.s[0] + fmla v20.4s, v24.4s, v30.s[0] + fmla v18.4s, v25.4s, v30.s[0] + fmla v17.4s, v26.4s, v30.s[0] + fmla v7.4s, v27.4s, v30.s[0] + fmla v23.4s, v24.4s, v28.s[0] + fmla v22.4s, v25.4s, v28.s[0] + fmla v21.4s, v26.4s, v28.s[0] + fmla v19.4s, v27.4s, v28.s[0] + cmp x3, x21 + b.lt .LBB0_17 + b .LBB0_12 + .p2align 2 +.LBB0_18: // in Loop: Header=BB0_8 Depth=2 + ldr x9, [sp, #504] // 8-byte Folded Reload + ldr x13, [sp, #440] // 8-byte Folded Reload + cmp x9, x13 + ldr x9, [sp, #496] // 8-byte Folded Reload + add x15, x23, x9, lsl #6 + ldr x9, [sp, #488] // 8-byte Folded Reload + add x14, x23, x9, lsl #6 + ldr x9, [sp, #480] // 8-byte Folded Reload + add x13, x23, x9, lsl #6 + b.ge .LBB0_24 +// %bb.19: // in Loop: Header=BB0_8 Depth=2 + ldr x9, [sp, #504] // 8-byte Folded Reload + add x17, x12, x10 + ldp q18, q19, [x23, #32] + ldp q20, q21, [x23] + mov x18, xzr + add x0, x9, #1 + mul x16, x9, x28 + mul x2, x9, x19 + madd x1, x0, x28, x12 + add x16, x17, x16 + add x17, x11, x16, lsl #2 + add x16, x1, x10 + mul x1, x20, x8 + madd x0, x0, x19, x1 + add x16, x11, x16, lsl #2 + ldp q2, q0, [x17, #32] + add x2, x1, x2 + ldp q6, q4, [x17] + ldp q3, q1, [x16, #32] + ldp q7, q5, [x16] + lsl x2, x2, #2 + lsl x0, 
x0, #2 + ldr q17, [x25, x2] + ldr q16, [x25, x0] + ldp x0, x1, [sp, #288] // 16-byte Folded Reload + ldr x2, [sp, #472] // 8-byte Folded Reload + cmp xzr, x29 + b.ge .LBB0_21 + .p2align 2 +.LBB0_20: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x8, [sp, #456] // 8-byte Folded Reload + fmla v0.4s, v19.4s, v17.s[0] + fmla v2.4s, v18.4s, v17.s[0] + add x9, x2, #128 + fmla v6.4s, v20.4s, v17.s[0] + fmla v4.4s, v21.4s, v17.s[0] + add x30, x2, #192 + add x3, x2, #256 + fmla v1.4s, v19.4s, v16.s[0] + fmla v3.4s, v18.4s, v16.s[0] + add x18, x18, #4 + add x4, x1, x8 + add x6, x0, x8 + add x8, x2, #64 + fmla v5.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + add x1, x1, #16 + add x0, x0, #16 + prfm pldl1keep, [x8] + add x5, x4, #32 + ldp q23, q22, [x2, #-160] + ldp q24, q25, [x2, #-192] + prfm pldl1keep, [x9] + ldp q19, q18, [x2, #-128] + add x7, x6, #32 + ldp q20, q21, [x2, #-96] + prfm pldl1keep, [x30] + fmla v4.4s, v25.4s, v17.s[1] + fmla v0.4s, v22.4s, v17.s[1] + fmla v5.4s, v25.4s, v16.s[1] + fmla v1.4s, v22.4s, v16.s[1] + fmla v6.4s, v24.4s, v17.s[1] + fmla v2.4s, v23.4s, v17.s[1] + fmla v7.4s, v24.4s, v16.s[1] + fmla v3.4s, v23.4s, v16.s[1] + fmla v0.4s, v21.4s, v17.s[2] + ldp q23, q22, [x2, #-32] + ldp q24, q25, [x2, #-64] + fmla v4.4s, v18.4s, v17.s[2] + fmla v1.4s, v21.4s, v16.s[2] + fmla v5.4s, v18.4s, v16.s[2] + prfm pldl1keep, [x7] + fmla v2.4s, v20.4s, v17.s[2] + fmla v6.4s, v19.4s, v17.s[2] + fmla v3.4s, v20.4s, v16.s[2] + fmla v7.4s, v19.4s, v16.s[2] + fmla v4.4s, v25.4s, v17.s[3] + fmla v0.4s, v22.4s, v17.s[3] + fmla v5.4s, v25.4s, v16.s[3] + fmla v1.4s, v22.4s, v16.s[3] + fmla v6.4s, v24.4s, v17.s[3] + fmla v2.4s, v23.4s, v17.s[3] + ldr q17, [x6, #16] + prfm pldl1keep, [x5] + fmla v7.4s, v24.4s, v16.s[3] + fmla v3.4s, v23.4s, v16.s[3] + ldr q16, [x4, #16] + prfm pldl1keep, [x3] + ldp q18, q19, [x2, #32] + ldp q20, q21, [x2] + mov x2, x3 + cmp x18, x29 + b.lt .LBB0_20 +.LBB0_21: // in Loop: Header=BB0_8 Depth=2 + ldp q23, q22, [x15, #32] + ldp q24, q25, [x15] + fmla v0.4s, v19.4s, v17.s[0] + fmla v2.4s, v18.4s, v17.s[0] + fmla v6.4s, v20.4s, v17.s[0] + fmla v4.4s, v21.4s, v17.s[0] + fmla v1.4s, v19.4s, v16.s[0] + fmla v3.4s, v18.4s, v16.s[0] + ldp q19, q18, [x14] + fmla v5.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q20, q21, [x14, #32] + fmla v4.4s, v25.4s, v17.s[1] + fmla v0.4s, v22.4s, v17.s[1] + ldr x18, [sp, #464] // 8-byte Folded Reload + ldr x0, [sp, #408] // 8-byte Folded Reload + fmla v6.4s, v24.4s, v17.s[1] + fmla v2.4s, v23.4s, v17.s[1] + ldr x30, [sp, #280] // 8-byte Folded Reload + mov x1, x27 + fmla v7.4s, v24.4s, v16.s[1] + fmla v5.4s, v25.4s, v16.s[1] + ldp q24, q25, [x13] + fmla v3.4s, v23.4s, v16.s[1] + fmla v1.4s, v22.4s, v16.s[1] + ldp q23, q22, [x13, #32] + fmla v0.4s, v21.4s, v17.s[2] + fmla v4.4s, v18.4s, v17.s[2] + ldp x3, x2, [sp, #256] // 16-byte Folded Reload + fmla v2.4s, v20.4s, v17.s[2] + fmla v6.4s, v19.4s, v17.s[2] + fmla v1.4s, v21.4s, v16.s[2] + fmla v3.4s, v20.4s, v16.s[2] + fmla v5.4s, v18.4s, v16.s[2] + fmla v7.4s, v19.4s, v16.s[2] + fmla v4.4s, v25.4s, v17.s[3] + fmla v0.4s, v22.4s, v17.s[3] + fmla v5.4s, v25.4s, v16.s[3] + fmla v1.4s, v22.4s, v16.s[3] + fmla v6.4s, v24.4s, v17.s[3] + fmla v2.4s, v23.4s, v17.s[3] + fmla v7.4s, v24.4s, v16.s[3] + fmla v3.4s, v23.4s, v16.s[3] + cmp x27, x21 + b.ge .LBB0_23 + .p2align 2 +.LBB0_22: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + add x8, x0, x3 + add x9, x0, x2 + prfm 
pldl1keep, [x18] + add x1, x1, #1 + add x8, x8, #4 + add x9, x9, #4 + ldp q16, q17, [x18, #-64] + ldp q18, q19, [x18, #-32] + prfm pldl1keep, [x9] + add x18, x18, #64 + ldr s20, [x0, x2] + prfm pldl1keep, [x8] + fmla v0.4s, v19.4s, v20.s[0] + ldr s21, [x0, x3] + fmla v2.4s, v18.4s, v20.s[0] + fmla v4.4s, v17.4s, v20.s[0] + fmla v6.4s, v16.4s, v20.s[0] + fmla v5.4s, v17.4s, v21.s[0] + fmla v7.4s, v16.4s, v21.s[0] + fmla v3.4s, v18.4s, v21.s[0] + fmla v1.4s, v19.4s, v21.s[0] + add x0, x0, #4 + cmp x1, x21 + b.lt .LBB0_22 +.LBB0_23: // in Loop: Header=BB0_8 Depth=2 + ldr x8, [sp, #448] // 8-byte Folded Reload + stp q6, q4, [x17] + stp q2, q0, [x17, #32] + stp q7, q5, [x16] + stp q3, q1, [x16, #32] +.LBB0_24: // in Loop: Header=BB0_8 Depth=2 + ldp x9, x1, [sp, #368] // 16-byte Folded Reload + ldr x16, [sp, #440] // 8-byte Folded Reload + cmp x16, x9 + ldr x0, [sp, #416] // 8-byte Folded Reload + b.ge .LBB0_7 +// %bb.25: // in Loop: Header=BB0_8 Depth=2 + mov x17, x8 + add x8, x12, x10 + ldr x12, [sp, #440] // 8-byte Folded Reload + ldr x2, [sp, #408] // 8-byte Folded Reload + ldp q7, q16, [x23, #32] + ldp q6, q5, [x23] + mov x16, xzr + mul x9, x12, x28 + add x8, x8, x9 + add x10, x11, x8, lsl #2 + mul x8, x12, x19 + ldr x11, [sp, #304] // 8-byte Folded Reload + ldr x12, [sp, #472] // 8-byte Folded Reload + madd x8, x20, x17, x8 + ldp q1, q0, [x10, #32] + ldp q3, q2, [x10] + lsl x8, x8, #2 + ldr q4, [x25, x8] + cmp xzr, x29 + b.ge .LBB0_27 + .p2align 2 +.LBB0_26: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + add x18, x12, #64 + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + add x9, x12, #128 + prfm pldl1keep, [x18] + ldp q18, q17, [x12, #-160] + fmla v3.4s, v6.4s, v4.s[0] + ldp q19, q20, [x12, #-192] + fmla v2.4s, v5.4s, v4.s[0] + prfm pldl1keep, [x9] + ldp q6, q5, [x12, #-128] + ldp q7, q16, [x12, #-96] + add x8, x12, #192 + prfm pldl1keep, [x8] + add x17, x12, #256 + add x16, x16, #4 + fmla v2.4s, v20.4s, v4.s[1] + fmla v0.4s, v17.4s, v4.s[1] + fmla v3.4s, v19.4s, v4.s[1] + fmla v1.4s, v18.4s, v4.s[1] + ldp q18, q17, [x12, #-32] + ldp q19, q20, [x12, #-64] + prfm pldl1keep, [x11] + fmla v0.4s, v16.4s, v4.s[2] + fmla v2.4s, v5.4s, v4.s[2] + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v2.4s, v20.4s, v4.s[3] + fmla v0.4s, v17.4s, v4.s[3] + fmla v3.4s, v19.4s, v4.s[3] + fmla v1.4s, v18.4s, v4.s[3] + ldur q4, [x11, #-16] + prfm pldl1keep, [x17] + add x11, x11, #16 + ldp q7, q16, [x12, #32] + ldp q6, q5, [x12] + mov x12, x17 + cmp x16, x29 + b.lt .LBB0_26 +.LBB0_27: // in Loop: Header=BB0_8 Depth=2 + ldp q18, q17, [x15, #32] + ldp q19, q20, [x15] + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + fmla v2.4s, v5.4s, v4.s[0] + ldp q6, q5, [x14] + ldp q7, q16, [x14, #32] + ldr x15, [sp, #248] // 8-byte Folded Reload + mov x11, xzr + mov w12, #64 // =0x40 + fmla v2.4s, v20.4s, v4.s[1] + fmla v0.4s, v17.4s, v4.s[1] + fmla v3.4s, v19.4s, v4.s[1] + fmla v1.4s, v18.4s, v4.s[1] + fmla v0.4s, v16.4s, v4.s[2] + ldp q18, q17, [x13, #32] + ldp q19, q20, [x13] + fmla v2.4s, v5.4s, v4.s[2] + ldr x13, [sp, #272] // 8-byte Folded Reload + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v2.4s, v20.4s, v4.s[3] + fmla v0.4s, v17.4s, v4.s[3] + fmla v3.4s, v19.4s, v4.s[3] + fmla v1.4s, v18.4s, v4.s[3] + add x8, x27, xzr + cmp x8, x21 + b.ge .LBB0_6 + .p2align 2 +.LBB0_28: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_8 Depth=2 + // => This Inner Loop Header: Depth=3 + add x14, 
x15, x11, lsl #6 + add x8, x2, x13 + add x9, x15, x12 + add x13, x13, #4 + prfm pldl1keep, [x9] + add x12, x12, #64 + ldp q4, q5, [x14] + ldp q6, q7, [x14, #32] + prfm pldl1keep, [x8] + ldr s16, [x1, x11, lsl #2] + add x11, x11, #1 + fmla v0.4s, v7.4s, v16.s[0] + fmla v1.4s, v6.4s, v16.s[0] + fmla v2.4s, v5.4s, v16.s[0] + fmla v3.4s, v4.4s, v16.s[0] + add x8, x27, x11 + cmp x8, x21 + b.lt .LBB0_28 + b .LBB0_6 + .p2align 2 +.LBB0_29: // in Loop: Header=BB0_4 Depth=1 + ldp x10, x9, [sp, #328] // 16-byte Folded Reload + cmp x0, x30 + add x11, x10, x9, lsl #2 + lsl x9, x28, #1 + stp x9, x11, [sp, #424] // 16-byte Folded Spill + b.lt .LBB0_33 +// %bb.30: // in Loop: Header=BB0_4 Depth=1 + ldr x1, [sp, #176] // 8-byte Folded Reload + cmp x30, x1 + b.lt .LBB0_55 +.LBB0_31: // in Loop: Header=BB0_4 Depth=1 + ldr x10, [sp, #168] // 8-byte Folded Reload + cmp x1, x10 + b.lt .LBB0_77 +.LBB0_32: // in Loop: Header=BB0_4 Depth=1 + ldr x9, [sp, #136] // 8-byte Folded Reload + cmp x10, x9 + b.ge .LBB0_3 + b .LBB0_99 + .p2align 2 +.LBB0_33: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #96] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #360] // 8-byte Folded Reload + ldr x6, [sp, #432] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldr x15, [sp, #232] // 8-byte Folded Reload + ldr x16, [sp, #400] // 8-byte Folded Reload + mul x9, x20, x8 + ldp x8, x13, [sp, #416] // 16-byte Folded Reload + add x8, x9, x8 + add x12, x6, x8, lsl #2 + ldp q3, q2, [x12] + add x12, x8, x28 + add x12, x6, x12, lsl #2 + ldp q1, q0, [x12] + add x12, x8, x13 + add x12, x6, x12, lsl #2 + ldp q5, q4, [x12] + add x12, x13, x28 + add x8, x8, x12 + add x8, x6, x8, lsl #2 + ldp q7, q6, [x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + cmp xzr, x21 + b.ge .LBB0_35 + .p2align 2 +.LBB0_34: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x16, x10 + add x12, x15, x10 + add x10, x10, #4 + prfm pldl1keep, [x13] + ldur s16, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s17, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s18, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s20, [x13, #-4] + prfm pldl1keep, [x12] + ldur s19, [x12, #-4] + add x12, x12, x26 + sub x13, x12, #4 + prfm pldl1keep, [x12] + add x12, x12, x26 + prfm pldl1keep, [x12] + sub x14, x12, #4 + add x12, x12, x26 + ld1 { v19.s }[1], [x13] + prfm pldl1keep, [x12] + sub x13, x12, #4 + add x12, x12, x26 + ld1 { v19.s }[2], [x14] + prfm pldl1keep, [x12] + ldur s21, [x12, #-4] + add x12, x12, x26 + ld1 { v19.s }[3], [x13] + prfm pldl1keep, [x12] + sub x13, x12, #4 + add x12, x12, x26 + prfm pldl1keep, [x12] + ld1 { v21.s }[1], [x13] + sub x14, x12, #4 + add x12, x12, x26 + prfm pldl1keep, [x12] + sub x12, x12, #4 + fmla v3.4s, v19.4s, v16.s[0] + fmla v1.4s, v19.4s, v17.s[0] + fmla v5.4s, v19.4s, v18.s[0] + fmla v7.4s, v19.4s, v20.s[0] + ld1 { v21.s }[2], [x14] + ld1 { v21.s }[3], [x12] + add x12, x8, x11, lsl #5 + add x11, x11, #1 + fmla v2.4s, v21.4s, v16.s[0] + fmla v0.4s, v21.4s, v17.s[0] + fmla v4.4s, v21.4s, v18.s[0] + fmla v6.4s, v21.4s, v20.s[0] + stp q19, q21, [x12] + cmp x11, x21 + b.lt .LBB0_34 +.LBB0_35: // %.preheader27 + // in Loop: Header=BB0_4 Depth=1 + ldr x12, [sp, #56] // 8-byte Folded Reload + ldp x15, x16, [sp, #384] // 16-byte Folded Reload + mov x11, xzr + add x10, x8, #128 + mov w18, #1 // =0x1 + mov w2, #2 // =0x2 + mov w1, #3 // =0x3 + mov w17, #4 // =0x4 + add x14, x8, x12 + b .LBB0_37 + .p2align 2 +.LBB0_36: // %.loopexit23 + // in Loop: 
Header=BB0_37 Depth=2 + add x16, x16, x24 + add x15, x15, x24 + mov x11, x17 + mov x17, x3 +.LBB0_37: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_39 Depth 3 + // Child Loop BB0_41 Depth 3 + madd x11, x11, x28, x9 + ldr x7, [sp, #416] // 8-byte Folded Reload + add x11, x11, x7 + madd x12, x18, x28, x9 + madd x13, x2, x28, x9 + add x12, x12, x7 + add x13, x13, x7 + add x11, x6, x11, lsl #2 + add x12, x6, x12, lsl #2 + stp q3, q2, [x11] + madd x11, x1, x28, x9 + stp q1, q0, [x12] + add x12, x6, x13, lsl #2 + add x11, x11, x7 + stp q5, q4, [x12] + add x11, x6, x11, lsl #2 + stp q7, q6, [x11] + ldr x11, [sp, #504] // 8-byte Folded Reload + cmp x17, x11 + ldr x11, [sp, #496] // 8-byte Folded Reload + add x13, x8, x11, lsl #5 + ldr x11, [sp, #488] // 8-byte Folded Reload + add x12, x8, x11, lsl #5 + ldr x11, [sp, #480] // 8-byte Folded Reload + add x11, x8, x11, lsl #5 + b.ge .LBB0_42 +// %bb.38: // in Loop: Header=BB0_37 Depth=2 + madd x5, x17, x28, x9 + add x18, x17, #1 + mov x30, x6 + add x2, x17, #2 + madd x6, x18, x28, x9 + add x1, x17, #3 + ldp q20, q21, [x8] + mov x4, xzr + add x3, x17, #4 + add x5, x5, x7 + add x5, x30, x5, lsl #2 + add x6, x6, x7 + add x6, x30, x6, lsl #2 + ldp q3, q2, [x5] + madd x5, x2, x28, x9 + ldp q1, q0, [x6] + madd x6, x1, x28, x9 + add x5, x5, x7 + add x5, x30, x5, lsl #2 + add x6, x6, x7 + add x6, x30, x6, lsl #2 + ldp q5, q4, [x5] + ldr x5, [sp, #448] // 8-byte Folded Reload + mul x5, x20, x5 + ldp q7, q6, [x6] + madd x6, x17, x19, x5 + lsl x6, x6, #2 + ldr q19, [x25, x6] + madd x6, x18, x19, x5 + lsl x6, x6, #2 + ldr q18, [x25, x6] + madd x6, x2, x19, x5 + madd x5, x1, x19, x5 + lsl x6, x6, #2 + lsl x5, x5, #2 + ldr q17, [x25, x6] + ldr q16, [x25, x5] + mov x5, x10 + mov x6, x16 + cmp xzr, x29 + b.ge .LBB0_40 + .p2align 2 +.LBB0_39: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_37 Depth=2 + // => This Inner Loop Header: Depth=3 + add x7, x5, #32 + fmla v3.4s, v20.4s, v19.s[0] + fmla v2.4s, v21.4s, v19.s[0] + add x4, x4, #4 + prfm pldl1keep, [x7] + ldp q22, q23, [x5, #-96] + fmla v0.4s, v21.4s, v18.s[0] + fmla v1.4s, v20.4s, v18.s[0] + fmla v4.4s, v21.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + add x7, x5, #96 + fmla v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q21, q20, [x5, #-64] + prfm pldl1keep, [x7] + add x7, x6, x22 + add x30, x7, x22 + fmla v2.4s, v23.4s, v19.s[1] + fmla v0.4s, v23.4s, v18.s[1] + fmla v4.4s, v23.4s, v17.s[1] + fmla v6.4s, v23.4s, v16.s[1] + fmla v3.4s, v22.4s, v19.s[1] + fmla v1.4s, v22.4s, v18.s[1] + fmla v5.4s, v22.4s, v17.s[1] + fmla v7.4s, v22.4s, v16.s[1] + fmla v2.4s, v20.4s, v19.s[2] + ldp q22, q23, [x5, #-32] + fmla v0.4s, v20.4s, v18.s[2] + fmla v4.4s, v20.4s, v17.s[2] + fmla v6.4s, v20.4s, v16.s[2] + fmla v3.4s, v21.4s, v19.s[2] + fmla v1.4s, v21.4s, v18.s[2] + fmla v5.4s, v21.4s, v17.s[2] + fmla v7.4s, v21.4s, v16.s[2] + ldp q20, q21, [x5], #128 + prfm pldl1keep, [x6] + fmla v2.4s, v23.4s, v19.s[3] + fmla v0.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v17.s[3] + fmla v6.4s, v23.4s, v16.s[3] + fmla v3.4s, v22.4s, v19.s[3] + ldur q19, [x6, #-16] + prfm pldl1keep, [x7] + fmla v1.4s, v22.4s, v18.s[3] + ldur q18, [x7, #-16] + add x7, x30, x22 + prfm pldl1keep, [x30] + add x6, x6, #16 + fmla v5.4s, v22.4s, v17.s[3] + ldur q17, [x30, #-16] + prfm pldl1keep, [x7] + fmla v7.4s, v22.4s, v16.s[3] + ldur q16, [x7, #-16] + cmp x4, x29 + b.lt .LBB0_39 +.LBB0_40: // in Loop: Header=BB0_37 Depth=2 + ldp q22, q23, [x13] + fmla v3.4s, v20.4s, v19.s[0] + fmla v2.4s, v21.4s, v19.s[0] + 
fmla v0.4s, v21.4s, v18.s[0] + fmla v1.4s, v20.4s, v18.s[0] + ldr x6, [sp, #432] // 8-byte Folded Reload + mov x13, x27 + fmla v4.4s, v21.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + fmla v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q21, q20, [x12] + mov x12, x15 + fmla v2.4s, v23.4s, v19.s[1] + fmla v0.4s, v23.4s, v18.s[1] + fmla v4.4s, v23.4s, v17.s[1] + fmla v6.4s, v23.4s, v16.s[1] + fmla v3.4s, v22.4s, v19.s[1] + fmla v1.4s, v22.4s, v18.s[1] + fmla v5.4s, v22.4s, v17.s[1] + fmla v7.4s, v22.4s, v16.s[1] + fmla v2.4s, v20.4s, v19.s[2] + ldp q22, q23, [x11] + fmla v0.4s, v20.4s, v18.s[2] + mov x11, x14 + fmla v4.4s, v20.4s, v17.s[2] + fmla v6.4s, v20.4s, v16.s[2] + fmla v3.4s, v21.4s, v19.s[2] + fmla v1.4s, v21.4s, v18.s[2] + fmla v5.4s, v21.4s, v17.s[2] + fmla v7.4s, v21.4s, v16.s[2] + fmla v2.4s, v23.4s, v19.s[3] + fmla v0.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v17.s[3] + fmla v6.4s, v23.4s, v16.s[3] + fmla v3.4s, v22.4s, v19.s[3] + fmla v1.4s, v22.4s, v18.s[3] + fmla v5.4s, v22.4s, v17.s[3] + fmla v7.4s, v22.4s, v16.s[3] + cmp x27, x21 + b.ge .LBB0_36 + .p2align 2 +.LBB0_41: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_37 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x12, x22 + prfm pldl1keep, [x11] + ldp q16, q17, [x11, #-32] + prfm pldl1keep, [x12] + ldur s18, [x12, #-4] + add x13, x13, #1 + add x12, x12, #4 + prfm pldl1keep, [x4] + ldur s19, [x4, #-4] + add x4, x4, x22 + add x11, x11, #32 + prfm pldl1keep, [x4] + ldur s20, [x4, #-4] + add x4, x4, x22 + fmla v2.4s, v17.4s, v18.s[0] + prfm pldl1keep, [x4] + ldur s21, [x4, #-4] + fmla v3.4s, v16.4s, v18.s[0] + fmla v0.4s, v17.4s, v19.s[0] + fmla v1.4s, v16.4s, v19.s[0] + fmla v4.4s, v17.4s, v20.s[0] + fmla v5.4s, v16.4s, v20.s[0] + fmla v6.4s, v17.4s, v21.s[0] + fmla v7.4s, v16.4s, v21.s[0] + cmp x13, x21 + b.lt .LBB0_41 + b .LBB0_36 + .p2align 2 +.LBB0_42: // in Loop: Header=BB0_4 Depth=1 + ldr x14, [sp, #504] // 8-byte Folded Reload + ldr x15, [sp, #440] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_48 +// %bb.43: // in Loop: Header=BB0_4 Depth=1 + ldr x1, [sp, #504] // 8-byte Folded Reload + ldr x18, [sp, #416] // 8-byte Folded Reload + mov x16, xzr + mul x14, x1, x28 + add x17, x1, #1 + mul x1, x1, x19 + ldp q6, q7, [x8] + madd x15, x17, x28, x9 + add x14, x9, x14 + add x14, x14, x18 + add x15, x15, x18 + ldr x18, [sp, #448] // 8-byte Folded Reload + add x14, x6, x14, lsl #2 + add x15, x6, x15, lsl #2 + mul x18, x20, x18 + ldp q1, q0, [x14] + ldp q3, q2, [x15] + madd x17, x17, x19, x18 + add x1, x18, x1 + lsl x1, x1, #2 + lsl x17, x17, #2 + ldr q5, [x25, x1] + ldr q4, [x25, x17] + ldp x18, x1, [sp, #288] // 16-byte Folded Reload + mov x17, x10 + cmp xzr, x29 + b.ge .LBB0_45 + .p2align 2 +.LBB0_44: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x7, x17, #32 + ldr x4, [sp, #456] // 8-byte Folded Reload + fmla v1.4s, v6.4s, v5.s[0] + fmla v0.4s, v7.4s, v5.s[0] + prfm pldl1keep, [x7] + ldp q16, q17, [x17, #-96] + fmla v2.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q7, q6, [x17, #-64] + add x6, x17, #96 + prfm pldl1keep, [x6] + add x16, x16, #4 + add x2, x1, x4 + add x4, x18, x4 + add x1, x1, #16 + add x18, x18, #16 + fmla v0.4s, v17.4s, v5.s[1] + fmla v2.4s, v17.4s, v4.s[1] + add x3, x2, #32 + add x5, x4, #32 + fmla v1.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + ldp q16, q17, [x17, #-32] + fmla v0.4s, v6.4s, v5.s[2] + fmla v2.4s, v6.4s, v4.s[2] + fmla v1.4s, v7.4s, v5.s[2] + fmla v3.4s, v7.4s, v4.s[2] + fmla v0.4s, v17.4s, v5.s[3] + fmla v2.4s, 
v17.4s, v4.s[3] + ldp q6, q7, [x17], #128 + prfm pldl1keep, [x5] + fmla v1.4s, v16.4s, v5.s[3] + ldr q5, [x4, #16] + prfm pldl1keep, [x3] + fmla v3.4s, v16.4s, v4.s[3] + ldr q4, [x2, #16] + cmp x16, x29 + b.lt .LBB0_44 +.LBB0_45: // in Loop: Header=BB0_4 Depth=1 + ldp q16, q17, [x13] + fmla v1.4s, v6.4s, v5.s[0] + fmla v0.4s, v7.4s, v5.s[0] + fmla v2.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q7, q6, [x12] + ldr x18, [sp, #72] // 8-byte Folded Reload + mov x16, xzr + mov x17, xzr + mov x1, x27 + fmla v0.4s, v17.4s, v5.s[1] + fmla v2.4s, v17.4s, v4.s[1] + add x18, x8, x18 + fmla v1.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + ldp q16, q17, [x11] + fmla v0.4s, v6.4s, v5.s[2] + fmla v2.4s, v6.4s, v4.s[2] + fmla v1.4s, v7.4s, v5.s[2] + fmla v3.4s, v7.4s, v4.s[2] + fmla v0.4s, v17.4s, v5.s[3] + fmla v2.4s, v17.4s, v4.s[3] + fmla v1.4s, v16.4s, v5.s[3] + fmla v3.4s, v16.4s, v4.s[3] + cmp x27, x21 + b.ge .LBB0_47 + .p2align 2 +.LBB0_46: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + ldr x6, [sp, #240] // 8-byte Folded Reload + ldr x7, [sp, #312] // 8-byte Folded Reload + add x4, x18, x17, lsl #3 + add x5, x18, x16 + add x1, x1, #1 + add x16, x16, #32 + add x4, x4, #32 + prfm pldl1keep, [x4] + ldp q4, q5, [x5] + add x2, x6, x17 + add x3, x7, x17 + add x2, x2, #4 + add x3, x3, #4 + prfm pldl1keep, [x3] + ldr s6, [x7, x17] + prfm pldl1keep, [x2] + fmla v0.4s, v5.4s, v6.s[0] + ldr s7, [x6, x17] + fmla v1.4s, v4.4s, v6.s[0] + fmla v2.4s, v5.4s, v7.s[0] + fmla v3.4s, v4.4s, v7.s[0] + add x17, x17, #4 + cmp x1, x21 + b.lt .LBB0_46 +.LBB0_47: // in Loop: Header=BB0_4 Depth=1 + ldr x6, [sp, #432] // 8-byte Folded Reload + stp q1, q0, [x14] + stp q3, q2, [x15] +.LBB0_48: // in Loop: Header=BB0_4 Depth=1 + ldr x14, [sp, #368] // 8-byte Folded Reload + ldr x15, [sp, #440] // 8-byte Folded Reload + cmp x15, x14 + b.ge .LBB0_54 +// %bb.49: // in Loop: Header=BB0_4 Depth=1 + ldp x17, x16, [sp, #440] // 16-byte Folded Reload + ldp q4, q3, [x8] + ldr x18, [sp, #376] // 8-byte Folded Reload + mov x14, xzr + mul x15, x17, x28 + add x9, x9, x15 + ldr x15, [sp, #416] // 8-byte Folded Reload + add x9, x9, x15 + mul x15, x17, x19 + madd x15, x20, x16, x15 + add x9, x6, x9, lsl #2 + ldp q1, q0, [x9] + lsl x15, x15, #2 + ldr q2, [x25, x15] + ldr x15, [sp, #304] // 8-byte Folded Reload + cmp xzr, x29 + b.ge .LBB0_51 + .p2align 2 +.LBB0_50: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x17, x10, #32 + fmla v1.4s, v4.4s, v2.s[0] + fmla v0.4s, v3.4s, v2.s[0] + add x16, x10, #96 + prfm pldl1keep, [x17] + ldp q5, q6, [x10, #-96] + add x14, x14, #4 + ldp q4, q3, [x10, #-64] + prfm pldl1keep, [x16] + fmla v0.4s, v6.4s, v2.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldp q5, q6, [x10, #-32] + prfm pldl1keep, [x15] + fmla v0.4s, v3.4s, v2.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v6.4s, v2.s[3] + fmla v1.4s, v5.4s, v2.s[3] + ldur q2, [x15, #-16] + ldp q4, q3, [x10], #128 + add x15, x15, #16 + cmp x14, x29 + b.lt .LBB0_50 +.LBB0_51: // in Loop: Header=BB0_4 Depth=1 + ldp q5, q6, [x13] + fmla v1.4s, v4.4s, v2.s[0] + fmla v0.4s, v3.4s, v2.s[0] + ldp q4, q3, [x12] + mov x10, xzr + mov x14, xzr + fmla v0.4s, v6.4s, v2.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldp q5, q6, [x11] + ldr x11, [sp, #72] // 8-byte Folded Reload + fmla v0.4s, v3.4s, v2.s[2] + fmla v1.4s, v4.4s, v2.s[2] + add x8, x8, x11 + mov x11, x27 + fmla v0.4s, v6.4s, v2.s[3] + fmla v1.4s, v5.4s, v2.s[3] + cmp x27, x21 + b.ge .LBB0_53 + .p2align 2 +.LBB0_52: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop 
Header: Depth=2 + add x13, x8, x14, lsl #3 + add x12, x18, x14 + add x15, x8, x10 + add x11, x11, #1 + add x12, x12, #4 + add x10, x10, #32 + add x13, x13, #32 + prfm pldl1keep, [x13] + ldp q2, q3, [x15] + prfm pldl1keep, [x12] + ldr s4, [x18, x14] + add x14, x14, #4 + fmla v0.4s, v3.4s, v4.s[0] + fmla v1.4s, v2.4s, v4.s[0] + cmp x11, x21 + b.lt .LBB0_52 +.LBB0_53: // in Loop: Header=BB0_4 Depth=1 + stp q1, q0, [x9] +.LBB0_54: // in Loop: Header=BB0_4 Depth=1 + bl free + ldr x8, [sp, #448] // 8-byte Folded Reload + ldr x30, [sp, #280] // 8-byte Folded Reload + ldr x1, [sp, #176] // 8-byte Folded Reload + cmp x30, x1 + b.ge .LBB0_31 +.LBB0_55: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #88] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #360] // 8-byte Folded Reload + ldr x5, [sp, #280] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldp x13, x6, [sp, #424] // 16-byte Folded Reload + ldr x15, [sp, #200] // 8-byte Folded Reload + mul x9, x20, x8 + ldr x16, [sp, #400] // 8-byte Folded Reload + add x8, x9, x5 + lsl x12, x8, #2 + ldr q0, [x6, x12] + add x12, x8, x28 + lsl x12, x12, #2 + ldr q1, [x6, x12] + add x12, x8, x13 + lsl x12, x12, #2 + ldr q2, [x6, x12] + add x12, x13, x28 + add x8, x8, x12 + lsl x8, x8, #2 + ldr q3, [x6, x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + cmp xzr, x21 + b.ge .LBB0_57 + .p2align 2 +.LBB0_56: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x16, x10 + add x12, x15, x10 + add x10, x10, #4 + prfm pldl1keep, [x13] + ldur s4, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s5, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s6, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s7, [x13, #-4] + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x12, x12, x26 + sub x13, x12, #4 + prfm pldl1keep, [x12] + add x12, x12, x26 + prfm pldl1keep, [x12] + sub x14, x12, #4 + add x12, x12, x26 + ld1 { v16.s }[1], [x13] + prfm pldl1keep, [x12] + sub x12, x12, #4 + ld1 { v16.s }[2], [x14] + ld1 { v16.s }[3], [x12] + str q16, [x8, x11, lsl #4] + add x11, x11, #1 + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v16.4s, v5.s[0] + fmla v2.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v7.s[0] + cmp x11, x21 + b.lt .LBB0_56 +.LBB0_57: // %.preheader26 + // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #48] // 8-byte Folded Reload + ldp x12, x13, [sp, #384] // 16-byte Folded Reload + mov x1, xzr + add x10, x8, #48 + mov w15, #1 // =0x1 + mov w16, #2 // =0x2 + mov w17, #3 // =0x3 + mov w14, #4 // =0x4 + add x11, x8, x11 + b .LBB0_59 + .p2align 2 +.LBB0_58: // %.loopexit22 + // in Loop: Header=BB0_59 Depth=2 + add x13, x13, x24 + add x12, x12, x24 + mov x1, x14 + mov x14, x18 +.LBB0_59: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_61 Depth 3 + // Child Loop BB0_63 Depth 3 + madd x18, x1, x28, x9 + add x18, x18, x5 + madd x15, x15, x28, x9 + madd x16, x16, x28, x9 + madd x17, x17, x28, x9 + add x15, x15, x5 + add x16, x16, x5 + lsl x18, x18, #2 + lsl x15, x15, #2 + lsl x16, x16, #2 + str q0, [x6, x18] + str q1, [x6, x15] + add x15, x17, x5 + lsl x15, x15, #2 + str q2, [x6, x16] + str q3, [x6, x15] + ldr x15, [sp, #504] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_64 +// %bb.60: // in Loop: Header=BB0_59 Depth=2 + madd x2, x14, x28, x9 + add x15, x14, #1 + add x16, x14, #2 + add x17, x14, #3 + madd x3, x15, x28, x9 + ldr q16, [x8] + mov x1, xzr + add x18, x14, #4 + add x2, x2, x5 + lsl x2, x2, #2 + add x3, x3, x5 + lsl x3, x3, #2 + ldr 
q0, [x6, x2] + madd x2, x16, x28, x9 + add x2, x2, x5 + ldr q1, [x6, x3] + madd x3, x17, x28, x9 + lsl x2, x2, #2 + ldr q2, [x6, x2] + add x2, x3, x5 + lsl x2, x2, #2 + ldr q3, [x6, x2] + ldr x2, [sp, #448] // 8-byte Folded Reload + mul x2, x20, x2 + madd x3, x14, x19, x2 + lsl x3, x3, #2 + ldr q7, [x25, x3] + madd x3, x15, x19, x2 + lsl x3, x3, #2 + ldr q6, [x25, x3] + madd x3, x16, x19, x2 + madd x2, x17, x19, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q5, [x25, x3] + ldr q4, [x25, x2] + mov x2, x10 + mov x3, x13 + cmp xzr, x29 + b.ge .LBB0_62 + .p2align 2 +.LBB0_61: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_59 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x2, #32 + fmla v0.4s, v16.4s, v7.s[0] + fmla v1.4s, v16.4s, v6.s[0] + add x1, x1, #4 + fmla v2.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, v4.s[0] + prfm pldl1keep, [x4] + add x4, x3, x22 + ldp q16, q17, [x2, #-32] + fmla v0.4s, v16.4s, v7.s[1] + fmla v1.4s, v16.4s, v6.s[1] + fmla v2.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + fmla v0.4s, v17.4s, v7.s[2] + fmla v1.4s, v17.4s, v6.s[2] + fmla v2.4s, v17.4s, v5.s[2] + fmla v3.4s, v17.4s, v4.s[2] + ldp q17, q16, [x2], #64 + prfm pldl1keep, [x3] + fmla v0.4s, v17.4s, v7.s[3] + ldur q7, [x3, #-16] + prfm pldl1keep, [x4] + fmla v1.4s, v17.4s, v6.s[3] + ldur q6, [x4, #-16] + add x4, x4, x22 + fmla v2.4s, v17.4s, v5.s[3] + fmla v3.4s, v17.4s, v4.s[3] + add x3, x3, #16 + prfm pldl1keep, [x4] + ldur q5, [x4, #-16] + add x4, x4, x22 + prfm pldl1keep, [x4] + ldur q4, [x4, #-16] + cmp x1, x29 + b.lt .LBB0_61 +.LBB0_62: // in Loop: Header=BB0_59 Depth=2 + ldp x1, x2, [sp, #488] // 16-byte Folded Reload + fmla v0.4s, v16.4s, v7.s[0] + fmla v1.4s, v16.4s, v6.s[0] + fmla v2.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, v4.s[0] + mov x3, x27 + ldr q17, [x8, x2, lsl #4] + ldr q16, [x8, x1, lsl #4] + ldr x1, [sp, #480] // 8-byte Folded Reload + mov x2, x12 + ldr q18, [x8, x1, lsl #4] + mov x1, x11 + fmla v0.4s, v17.4s, v7.s[1] + fmla v1.4s, v17.4s, v6.s[1] + fmla v2.4s, v17.4s, v5.s[1] + fmla v3.4s, v17.4s, v4.s[1] + fmla v0.4s, v16.4s, v7.s[2] + fmla v1.4s, v16.4s, v6.s[2] + fmla v2.4s, v16.4s, v5.s[2] + fmla v3.4s, v16.4s, v4.s[2] + fmla v0.4s, v18.4s, v7.s[3] + fmla v1.4s, v18.4s, v6.s[3] + fmla v2.4s, v18.4s, v5.s[3] + fmla v3.4s, v18.4s, v4.s[3] + cmp x27, x21 + b.ge .LBB0_58 + .p2align 2 +.LBB0_63: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_59 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x2, x22 + prfm pldl1keep, [x1] + ldur q4, [x1, #-16] + add x3, x3, #1 + prfm pldl1keep, [x2] + ldur s5, [x2, #-4] + add x2, x2, #4 + add x1, x1, #16 + prfm pldl1keep, [x4] + ldur s6, [x4, #-4] + add x4, x4, x22 + fmla v0.4s, v4.4s, v5.s[0] + prfm pldl1keep, [x4] + ldur s7, [x4, #-4] + add x4, x4, x22 + prfm pldl1keep, [x4] + ldur s16, [x4, #-4] + fmla v1.4s, v4.4s, v6.s[0] + fmla v2.4s, v4.4s, v7.s[0] + fmla v3.4s, v4.4s, v16.s[0] + cmp x3, x21 + b.lt .LBB0_63 + b .LBB0_58 + .p2align 2 +.LBB0_64: // in Loop: Header=BB0_4 Depth=1 + ldr x12, [sp, #504] // 8-byte Folded Reload + ldr x13, [sp, #440] // 8-byte Folded Reload + cmp x12, x13 + b.ge .LBB0_70 +// %bb.65: // in Loop: Header=BB0_4 Depth=1 + ldr x17, [sp, #504] // 8-byte Folded Reload + ldr x16, [sp, #448] // 8-byte Folded Reload + mov x14, xzr + mul x12, x17, x28 + add x15, x17, #1 + mul x16, x20, x16 + mul x17, x17, x19 + ldr q4, [x8] + madd x13, x15, x28, x9 + madd x15, x15, x19, x16 + add x12, x9, x12 + add x17, x16, x17 + add x12, x12, x5 + add x13, x13, x5 + lsl x17, x17, #2 + lsl x15, x15, #2 + add x12, x6, x12, 
lsl #2 + add x13, x6, x13, lsl #2 + ldr q3, [x25, x17] + ldr q2, [x25, x15] + ldp x16, x17, [sp, #288] // 16-byte Folded Reload + mov x15, x10 + ldr q0, [x12] + ldr q1, [x13] + cmp xzr, x29 + b.ge .LBB0_67 + .p2align 2 +.LBB0_66: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x4, x15, #32 + ldr x2, [sp, #456] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v3.s[0] + fmla v1.4s, v4.4s, v2.s[0] + prfm pldl1keep, [x4] + ldp q4, q5, [x15, #-32] + add x14, x14, #4 + add x18, x17, x2 + add x2, x16, x2 + add x17, x17, #16 + add x16, x16, #16 + add x1, x18, #32 + add x3, x2, #32 + fmla v0.4s, v4.4s, v3.s[1] + fmla v1.4s, v4.4s, v2.s[1] + fmla v0.4s, v5.4s, v3.s[2] + fmla v1.4s, v5.4s, v2.s[2] + ldp q5, q4, [x15], #64 + prfm pldl1keep, [x3] + fmla v0.4s, v5.4s, v3.s[3] + ldr q3, [x2, #16] + prfm pldl1keep, [x1] + fmla v1.4s, v5.4s, v2.s[3] + ldr q2, [x18, #16] + cmp x14, x29 + b.lt .LBB0_66 +.LBB0_67: // in Loop: Header=BB0_4 Depth=1 + ldp x14, x15, [sp, #488] // 16-byte Folded Reload + fmla v0.4s, v4.4s, v3.s[0] + fmla v1.4s, v4.4s, v2.s[0] + ldp x1, x18, [sp, #256] // 16-byte Folded Reload + ldr q5, [x8, x15, lsl #4] + ldr q4, [x8, x14, lsl #4] + ldr x14, [sp, #480] // 8-byte Folded Reload + mov x15, x27 + fmla v0.4s, v5.4s, v3.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldr q5, [x8, x14, lsl #4] + ldr x14, [sp, #408] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v3.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v5.4s, v3.s[3] + fmla v1.4s, v5.4s, v2.s[3] + cmp x27, x21 + b.ge .LBB0_69 + .p2align 2 +.LBB0_68: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x16, x14, x1 + add x17, x14, x18 + prfm pldl1keep, [x11] + ldur q2, [x11, #-16] + add x16, x16, #4 + add x17, x17, #4 + add x15, x15, #1 + add x11, x11, #16 + prfm pldl1keep, [x17] + ldr s3, [x14, x18] + prfm pldl1keep, [x16] + ldr s4, [x14, x1] + add x14, x14, #4 + fmla v0.4s, v2.4s, v3.s[0] + fmla v1.4s, v2.4s, v4.s[0] + cmp x15, x21 + b.lt .LBB0_68 +.LBB0_69: // in Loop: Header=BB0_4 Depth=1 + str q0, [x12] + str q1, [x13] +.LBB0_70: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #368] // 8-byte Folded Reload + ldr x12, [sp, #440] // 8-byte Folded Reload + cmp x12, x11 + b.ge .LBB0_76 +// %bb.71: // in Loop: Header=BB0_4 Depth=1 + ldp x14, x13, [sp, #440] // 16-byte Folded Reload + ldr q2, [x8] + mov x11, xzr + mul x12, x14, x28 + add x9, x9, x12 + mul x12, x14, x19 + ldr x14, [sp, #376] // 8-byte Folded Reload + madd x12, x20, x13, x12 + add x9, x9, x5 + add x9, x6, x9, lsl #2 + ldr q0, [x9] + lsl x12, x12, #2 + ldr q1, [x25, x12] + ldr x12, [sp, #304] // 8-byte Folded Reload + cmp xzr, x29 + b.ge .LBB0_73 + .p2align 2 +.LBB0_72: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x10, #32 + fmla v0.4s, v2.4s, v1.s[0] + add x11, x11, #4 + prfm pldl1keep, [x13] + ldp q2, q3, [x10, #-32] + fmla v0.4s, v2.4s, v1.s[1] + fmla v0.4s, v3.4s, v1.s[2] + ldp q3, q2, [x10], #64 + prfm pldl1keep, [x12] + fmla v0.4s, v3.4s, v1.s[3] + ldur q1, [x12, #-16] + add x12, x12, #16 + cmp x11, x29 + b.lt .LBB0_72 +.LBB0_73: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #496] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[0] + ldr x12, [sp, #272] // 8-byte Folded Reload + mov x10, xzr + ldr q3, [x8, x11, lsl #4] + ldr x11, [sp, #488] // 8-byte Folded Reload + fmla v0.4s, v3.4s, v1.s[1] + ldr q2, [x8, x11, lsl #4] + ldr x11, [sp, #480] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[2] + ldr q3, [x8, x11, lsl #4] + ldr x11, [sp, #24] // 8-byte Folded Reload + add x8, x8, x11 + mov 
w11, #16 // =0x10 + fmla v0.4s, v3.4s, v1.s[3] + add x13, x27, xzr + cmp x13, x21 + b.ge .LBB0_75 + .p2align 2 +.LBB0_74: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x8, x11 + add x11, x11, #16 + prfm pldl1keep, [x13] + ldr x13, [sp, #408] // 8-byte Folded Reload + ldr q1, [x8, x10, lsl #4] + add x13, x13, x12 + add x12, x12, #4 + prfm pldl1keep, [x13] + ldr s2, [x14, x10, lsl #2] + add x10, x10, #1 + fmla v0.4s, v1.4s, v2.s[0] + add x13, x27, x10 + cmp x13, x21 + b.lt .LBB0_74 +.LBB0_75: // in Loop: Header=BB0_4 Depth=1 + str q0, [x9] +.LBB0_76: // in Loop: Header=BB0_4 Depth=1 + bl free + ldr x8, [sp, #448] // 8-byte Folded Reload + ldp x10, x1, [sp, #168] // 16-byte Folded Reload + cmp x1, x10 + b.ge .LBB0_32 +.LBB0_77: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #80] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #360] // 8-byte Folded Reload + ldr x5, [sp, #176] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldp x13, x6, [sp, #424] // 16-byte Folded Reload + ldp x17, x16, [sp, #208] // 16-byte Folded Reload + ldr x18, [sp, #400] // 8-byte Folded Reload + mul x9, x20, x8 + add x8, x9, x5 + lsl x12, x8, #2 + ldr d0, [x6, x12] + add x12, x8, x28 + lsl x12, x12, #2 + ldr d1, [x6, x12] + add x12, x8, x13 + lsl x12, x12, #2 + ldr d2, [x6, x12] + add x12, x13, x28 + add x8, x8, x12 + lsl x8, x8, #2 + ldr d3, [x6, x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + cmp xzr, x21 + b.ge .LBB0_79 + .p2align 2 +.LBB0_78: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x15, x18, x10 + add x12, x16, x10 + add x14, x17, x10 + prfm pldl1keep, [x15] + ldur s4, [x15, #-4] + add x15, x15, x22 + add x13, x12, #4 + add x14, x14, #4 + prfm pldl1keep, [x15] + ldur s5, [x15, #-4] + add x15, x15, x22 + prfm pldl1keep, [x15] + ldur s6, [x15, #-4] + add x15, x15, x22 + prfm pldl1keep, [x15] + ldur s7, [x15, #-4] + prfm pldl1keep, [x14] + prfm pldl1keep, [x13] + ldr s16, [x17, x10] + add x10, x10, #4 + ld1 { v16.s }[1], [x12] + fmla v0.2s, v16.2s, v4.s[0] + str d16, [x8, x11, lsl #3] + add x11, x11, #1 + fmla v1.2s, v16.2s, v5.s[0] + fmla v2.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v7.s[0] + cmp x11, x21 + b.lt .LBB0_78 +.LBB0_79: // %.preheader25 + // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #40] // 8-byte Folded Reload + ldp x12, x13, [sp, #384] // 16-byte Folded Reload + mov x1, xzr + add x10, x8, #24 + mov w15, #1 // =0x1 + mov w16, #2 // =0x2 + mov w17, #3 // =0x3 + mov w14, #4 // =0x4 + add x11, x8, x11 + b .LBB0_81 + .p2align 2 +.LBB0_80: // %.loopexit21 + // in Loop: Header=BB0_81 Depth=2 + add x13, x13, x24 + add x12, x12, x24 + mov x1, x14 + mov x14, x18 +.LBB0_81: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_83 Depth 3 + // Child Loop BB0_85 Depth 3 + madd x18, x1, x28, x9 + add x18, x18, x5 + madd x15, x15, x28, x9 + madd x16, x16, x28, x9 + madd x17, x17, x28, x9 + add x15, x15, x5 + add x16, x16, x5 + lsl x18, x18, #2 + lsl x15, x15, #2 + lsl x16, x16, #2 + str d0, [x6, x18] + str d1, [x6, x15] + add x15, x17, x5 + lsl x15, x15, #2 + str d2, [x6, x16] + str d3, [x6, x15] + ldr x15, [sp, #504] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_86 +// %bb.82: // in Loop: Header=BB0_81 Depth=2 + madd x2, x14, x28, x9 + add x15, x14, #1 + add x16, x14, #2 + add x17, x14, #3 + madd x3, x15, x28, x9 + ldr d16, [x8] + mov x1, xzr + add x18, x14, #4 + add x2, x2, x5 + lsl x2, x2, #2 + add x3, x3, x5 + lsl x3, x3, #2 + ldr d0, [x6, x2] + madd x2, x16, x28, x9 + 
add x2, x2, x5 + ldr d1, [x6, x3] + madd x3, x17, x28, x9 + lsl x2, x2, #2 + ldr d2, [x6, x2] + add x2, x3, x5 + lsl x2, x2, #2 + ldr d3, [x6, x2] + ldr x2, [sp, #448] // 8-byte Folded Reload + mul x2, x20, x2 + madd x3, x14, x19, x2 + lsl x3, x3, #2 + ldr q7, [x25, x3] + madd x3, x15, x19, x2 + lsl x3, x3, #2 + ldr q6, [x25, x3] + madd x3, x16, x19, x2 + madd x2, x17, x19, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q5, [x25, x3] + ldr q4, [x25, x2] + mov x2, x10 + mov x3, x13 + cmp xzr, x29 + b.ge .LBB0_84 + .p2align 2 +.LBB0_83: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_81 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x2, #16 + fmla v0.2s, v16.2s, v7.s[0] + fmla v1.2s, v16.2s, v6.s[0] + add x1, x1, #4 + fmla v2.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + prfm pldl1keep, [x4] + add x4, x3, x22 + ldp d16, d17, [x2, #-16] + fmla v0.2s, v16.2s, v7.s[1] + fmla v1.2s, v16.2s, v6.s[1] + fmla v2.2s, v16.2s, v5.s[1] + fmla v3.2s, v16.2s, v4.s[1] + fmla v0.2s, v17.2s, v7.s[2] + fmla v1.2s, v17.2s, v6.s[2] + fmla v2.2s, v17.2s, v5.s[2] + fmla v3.2s, v17.2s, v4.s[2] + ldp d17, d16, [x2], #32 + prfm pldl1keep, [x3] + fmla v0.2s, v17.2s, v7.s[3] + ldur q7, [x3, #-16] + prfm pldl1keep, [x4] + fmla v1.2s, v17.2s, v6.s[3] + ldur q6, [x4, #-16] + add x4, x4, x22 + fmla v2.2s, v17.2s, v5.s[3] + fmla v3.2s, v17.2s, v4.s[3] + add x3, x3, #16 + prfm pldl1keep, [x4] + ldur q5, [x4, #-16] + add x4, x4, x22 + prfm pldl1keep, [x4] + ldur q4, [x4, #-16] + cmp x1, x29 + b.lt .LBB0_83 +.LBB0_84: // in Loop: Header=BB0_81 Depth=2 + ldp x1, x2, [sp, #488] // 16-byte Folded Reload + fmla v0.2s, v16.2s, v7.s[0] + fmla v1.2s, v16.2s, v6.s[0] + fmla v2.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + mov x3, x27 + ldr d17, [x8, x2, lsl #3] + ldr d16, [x8, x1, lsl #3] + ldr x1, [sp, #480] // 8-byte Folded Reload + mov x2, x12 + ldr d18, [x8, x1, lsl #3] + mov x1, x11 + fmla v0.2s, v17.2s, v7.s[1] + fmla v1.2s, v17.2s, v6.s[1] + fmla v2.2s, v17.2s, v5.s[1] + fmla v3.2s, v17.2s, v4.s[1] + fmla v0.2s, v16.2s, v7.s[2] + fmla v1.2s, v16.2s, v6.s[2] + fmla v2.2s, v16.2s, v5.s[2] + fmla v3.2s, v16.2s, v4.s[2] + fmla v0.2s, v18.2s, v7.s[3] + fmla v1.2s, v18.2s, v6.s[3] + fmla v2.2s, v18.2s, v5.s[3] + fmla v3.2s, v18.2s, v4.s[3] + cmp x27, x21 + b.ge .LBB0_80 + .p2align 2 +.LBB0_85: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_81 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x2, x22 + prfm pldl1keep, [x1] + ldur d4, [x1, #-8] + add x3, x3, #1 + prfm pldl1keep, [x2] + ldur s5, [x2, #-4] + add x2, x2, #4 + add x1, x1, #8 + prfm pldl1keep, [x4] + ldur s6, [x4, #-4] + add x4, x4, x22 + fmla v0.2s, v4.2s, v5.s[0] + prfm pldl1keep, [x4] + ldur s7, [x4, #-4] + add x4, x4, x22 + prfm pldl1keep, [x4] + ldur s16, [x4, #-4] + fmla v1.2s, v4.2s, v6.s[0] + fmla v2.2s, v4.2s, v7.s[0] + fmla v3.2s, v4.2s, v16.s[0] + cmp x3, x21 + b.lt .LBB0_85 + b .LBB0_80 + .p2align 2 +.LBB0_86: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #504] // 8-byte Folded Reload + ldr x12, [sp, #440] // 8-byte Folded Reload + cmp x11, x12 + b.ge .LBB0_92 +// %bb.87: // in Loop: Header=BB0_4 Depth=1 + ldr x16, [sp, #504] // 8-byte Folded Reload + ldr x15, [sp, #448] // 8-byte Folded Reload + mov x13, xzr + mul x11, x16, x28 + add x14, x16, #1 + mul x15, x20, x15 + mul x16, x16, x19 + ldr d4, [x8] + madd x12, x14, x28, x9 + madd x14, x14, x19, x15 + add x11, x9, x11 + add x16, x15, x16 + add x11, x11, x5 + add x12, x12, x5 + lsl x16, x16, #2 + lsl x14, x14, #2 + add x11, x6, x11, lsl #2 + add x12, x6, x12, lsl #2 + ldr q3, 
[x25, x16] + ldr q2, [x25, x14] + ldp x15, x16, [sp, #288] // 16-byte Folded Reload + mov x14, x10 + ldr d0, [x11] + ldr d1, [x12] + cmp xzr, x29 + b.ge .LBB0_89 + .p2align 2 +.LBB0_88: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x3, x14, #16 + ldr x1, [sp, #456] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[0] + fmla v1.2s, v4.2s, v2.s[0] + prfm pldl1keep, [x3] + ldp d4, d5, [x14, #-16] + add x13, x13, #4 + add x17, x16, x1 + add x1, x15, x1 + add x16, x16, #16 + add x15, x15, #16 + add x18, x17, #32 + add x2, x1, #32 + fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v5.2s, v3.s[2] + fmla v1.2s, v5.2s, v2.s[2] + ldp d5, d4, [x14], #32 + prfm pldl1keep, [x2] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x1, #16] + prfm pldl1keep, [x18] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x17, #16] + cmp x13, x29 + b.lt .LBB0_88 +.LBB0_89: // in Loop: Header=BB0_4 Depth=1 + ldr x15, [sp, #496] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[0] + fmla v1.2s, v4.2s, v2.s[0] + ldr x1, [sp, #240] // 8-byte Folded Reload + mov x13, xzr + mov x14, xzr + ldr d5, [x8, x15, lsl #3] + ldr x15, [sp, #488] // 8-byte Folded Reload + ldr d4, [x8, x15, lsl #3] + ldr x15, [sp, #480] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v3.s[1] + fmla v1.2s, v5.2s, v2.s[1] + ldr d5, [x8, x15, lsl #3] + ldr x15, [sp, #64] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[2] + fmla v1.2s, v4.2s, v2.s[2] + add x15, x8, x15 + fmla v0.2s, v5.2s, v3.s[3] + fmla v1.2s, v5.2s, v2.s[3] + add x16, x27, xzr + cmp x16, x21 + b.ge .LBB0_91 + .p2align 2 +.LBB0_90: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + ldr x2, [sp, #312] // 8-byte Folded Reload + add x18, x15, x14, lsl #3 + add x16, x1, x13 + add x16, x16, #4 + add x18, x18, #8 + prfm pldl1keep, [x18] + ldr d2, [x15, x14, lsl #3] + add x17, x2, x13 + add x13, x13, #4 + add x17, x17, #4 + prfm pldl1keep, [x17] + ldr s3, [x2, x14, lsl #2] + prfm pldl1keep, [x16] + fmla v0.2s, v2.2s, v3.s[0] + ldr s4, [x1, x14, lsl #2] + fmla v1.2s, v2.2s, v4.s[0] + add x14, x14, #1 + add x16, x27, x14 + cmp x16, x21 + b.lt .LBB0_90 +.LBB0_91: // in Loop: Header=BB0_4 Depth=1 + str d0, [x11] + str d1, [x12] +.LBB0_92: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #368] // 8-byte Folded Reload + ldr x12, [sp, #440] // 8-byte Folded Reload + cmp x12, x11 + b.ge .LBB0_98 +// %bb.93: // in Loop: Header=BB0_4 Depth=1 + ldp x14, x13, [sp, #440] // 16-byte Folded Reload + ldr d2, [x8] + mov x11, xzr + mul x12, x14, x28 + add x9, x9, x12 + mul x12, x14, x19 + ldr x14, [sp, #376] // 8-byte Folded Reload + madd x12, x20, x13, x12 + add x9, x9, x5 + add x9, x6, x9, lsl #2 + ldr d0, [x9] + lsl x12, x12, #2 + ldr q1, [x25, x12] + ldr x12, [sp, #304] // 8-byte Folded Reload + cmp xzr, x29 + b.ge .LBB0_95 + .p2align 2 +.LBB0_94: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x10, #16 + fmla v0.2s, v2.2s, v1.s[0] + add x11, x11, #4 + prfm pldl1keep, [x13] + ldp d2, d3, [x10, #-16] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v3.2s, v1.s[2] + ldp d3, d2, [x10], #32 + prfm pldl1keep, [x12] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x12, #-16] + add x12, x12, #16 + cmp x11, x29 + b.lt .LBB0_94 +.LBB0_95: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #496] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[0] + mov x10, xzr + ldr d3, [x8, x11, lsl #3] + ldr x11, [sp, #488] // 8-byte Folded Reload + fmla v0.2s, v3.2s, v1.s[1] + ldr d4, [x8, x11, lsl #3] + ldr x11, [sp, #480] // 8-byte Folded Reload + fmla v0.2s, 
v4.2s, v1.s[2] + ldr d2, [x8, x11, lsl #3] + ldr x11, [sp, #64] // 8-byte Folded Reload + add x8, x8, x11 + ldr x11, [sp, #272] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[3] + add x12, x27, xzr + cmp x12, x21 + b.ge .LBB0_97 + .p2align 2 +.LBB0_96: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x8, x10, lsl #3 + add x12, x12, #8 + prfm pldl1keep, [x12] + ldr x12, [sp, #408] // 8-byte Folded Reload + ldr d1, [x8, x10, lsl #3] + add x12, x12, x11 + add x11, x11, #4 + prfm pldl1keep, [x12] + ldr s2, [x14, x10, lsl #2] + add x10, x10, #1 + fmla v0.2s, v1.2s, v2.s[0] + add x12, x27, x10 + cmp x12, x21 + b.lt .LBB0_96 +.LBB0_97: // in Loop: Header=BB0_4 Depth=1 + str d0, [x9] +.LBB0_98: // in Loop: Header=BB0_4 Depth=1 + bl free + ldr x8, [sp, #448] // 8-byte Folded Reload + ldr x10, [sp, #168] // 8-byte Folded Reload + ldr x9, [sp, #136] // 8-byte Folded Reload + cmp x10, x9 + b.ge .LBB0_3 +.LBB0_99: // in Loop: Header=BB0_4 Depth=1 + ldr x8, [sp, #112] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #360] // 8-byte Folded Reload + ldr x5, [sp, #168] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldp x13, x14, [sp, #424] // 16-byte Folded Reload + mul x9, x20, x8 + add x12, x9, x5 + add x8, x12, x13 + add x13, x13, x28 + ldr s2, [x14, x12, lsl #2] + add x13, x12, x13 + ldr s1, [x14, x8, lsl #2] + add x8, x0, #63 + ldr s0, [x14, x13, lsl #2] + add x13, x12, x28 + ldr x12, [sp, #224] // 8-byte Folded Reload + and x8, x8, #0xffffffffffffffc0 + ldr s3, [x14, x13, lsl #2] + ldr x14, [sp, #400] // 8-byte Folded Reload + cmp xzr, x21 + b.ge .LBB0_101 + .p2align 2 +.LBB0_100: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x14, x10 + add x11, x11, #1 + prfm pldl1keep, [x13] + ldur s4, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s5, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s6, [x13, #-4] + add x13, x13, x22 + prfm pldl1keep, [x13] + ldur s7, [x13, #-4] + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x12, x12, #4 + fmla v2.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + fmla v1.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + str s16, [x8, x10] + add x10, x10, #4 + cmp x11, x21 + b.lt .LBB0_100 +.LBB0_101: // %.preheader24 + // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #32] // 8-byte Folded Reload + ldp x12, x13, [sp, #384] // 16-byte Folded Reload + mov x1, xzr + ldp x7, x6, [sp, #184] // 16-byte Folded Reload + add x10, x8, #12 + mov w16, #1 // =0x1 + mov w17, #2 // =0x2 + mov w15, #3 // =0x3 + mov w14, #4 // =0x4 + add x11, x8, x11 + b .LBB0_103 + .p2align 2 +.LBB0_102: // %.loopexit20 + // in Loop: Header=BB0_103 Depth=2 + add x13, x13, x24 + add x12, x12, x24 + mov x1, x14 + mov x14, x18 +.LBB0_103: // Parent Loop BB0_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_105 Depth 3 + // Child Loop BB0_107 Depth 3 + madd x18, x1, x28, x9 + ldr x30, [sp, #432] // 8-byte Folded Reload + add x18, x18, x5 + madd x16, x16, x28, x9 + madd x17, x17, x28, x9 + madd x15, x15, x28, x9 + add x16, x16, x5 + add x15, x15, x5 + str s2, [x30, x18, lsl #2] + str s3, [x30, x16, lsl #2] + add x16, x17, x5 + str s1, [x30, x16, lsl #2] + str s0, [x30, x15, lsl #2] + ldr x15, [sp, #504] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_108 +// %bb.104: // in Loop: Header=BB0_103 Depth=2 + madd x2, x14, x28, x9 + add x15, x14, #3 + add x16, x14, #1 + add x17, x14, #2 + madd x3, x16, x28, x9 + ldr s16, [x8] + mov x1, xzr + add x18, x14, #4 + madd x4, x17, x28, x9 
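+ // Annotation (hand-written; not emitted by the MLIR pipeline): this
+ // remainder path appears to mirror the wider panels above -- it reloads
+ // scalar C accumulators, broadcasts lanes of one packed operand against
+ // the other with fmla, and issues prfm pldl1keep ahead of the loads.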
+ add x2, x2, x5 + ldr s2, [x30, x2, lsl #2] + madd x2, x15, x28, x9 + add x3, x3, x5 + add x4, x4, x5 + add x2, x2, x5 + ldr s3, [x30, x3, lsl #2] + ldr s1, [x30, x4, lsl #2] + ldr s0, [x30, x2, lsl #2] + ldr x2, [sp, #448] // 8-byte Folded Reload + mul x2, x20, x2 + madd x3, x14, x19, x2 + lsl x3, x3, #2 + ldr q7, [x25, x3] + madd x3, x16, x19, x2 + lsl x3, x3, #2 + ldr q6, [x25, x3] + madd x3, x17, x19, x2 + madd x2, x15, x19, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q5, [x25, x3] + ldr q4, [x25, x2] + mov x2, x10 + mov x3, x13 + ext v20.16b, v7.16b, v7.16b, #8 + cmp xzr, x29 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.ge .LBB0_106 + .p2align 2 +.LBB0_105: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_103 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x2, #8 + fmla v2.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + add x1, x1, #4 + fmla v1.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + prfm pldl1keep, [x4] + add x4, x3, x22 + ldp s16, s21, [x2, #-8] + fmla v0.2s, v16.2s, v4.s[1] + fmla v2.2s, v16.2s, v7.s[1] + fmla v3.2s, v16.2s, v6.s[1] + fmla v1.2s, v16.2s, v5.s[1] + fmla v0.2s, v21.2s, v17.2s + fmla v2.2s, v21.2s, v20.2s + ldp s17, s16, [x2], #16 + fmla v3.2s, v21.2s, v19.2s + fmla v1.2s, v21.2s, v18.2s + prfm pldl1keep, [x3] + fmla v2.2s, v17.2s, v7.s[3] + ldur q7, [x3, #-16] + prfm pldl1keep, [x4] + fmla v3.2s, v17.2s, v6.s[3] + ldur q6, [x4, #-16] + add x4, x4, x22 + fmla v1.2s, v17.2s, v5.s[3] + fmla v0.2s, v17.2s, v4.s[3] + add x3, x3, #16 + prfm pldl1keep, [x4] + ldur q5, [x4, #-16] + add x4, x4, x22 + prfm pldl1keep, [x4] + ldur q4, [x4, #-16] + ext v20.16b, v7.16b, v7.16b, #8 + cmp x1, x29 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.lt .LBB0_105 +.LBB0_106: // in Loop: Header=BB0_103 Depth=2 + ldp x1, x2, [sp, #488] // 16-byte Folded Reload + fmla v2.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + fmla v1.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + mov x3, x27 + ldr s21, [x8, x2, lsl #2] + ldr s16, [x8, x1, lsl #2] + ldr x1, [sp, #480] // 8-byte Folded Reload + mov x2, x12 + ldr s22, [x8, x1, lsl #2] + mov x1, x11 + fmla v2.2s, v21.2s, v7.s[1] + fmla v3.2s, v21.2s, v6.s[1] + fmla v1.2s, v21.2s, v5.s[1] + fmla v0.2s, v21.2s, v4.s[1] + fmla v2.2s, v16.2s, v20.2s + fmla v3.2s, v16.2s, v19.2s + fmla v1.2s, v16.2s, v18.2s + fmla v0.2s, v16.2s, v17.2s + fmla v2.2s, v22.2s, v7.s[3] + fmla v3.2s, v22.2s, v6.s[3] + fmla v1.2s, v22.2s, v5.s[3] + fmla v0.2s, v22.2s, v4.s[3] + cmp x27, x21 + b.ge .LBB0_102 + .p2align 2 +.LBB0_107: // Parent Loop BB0_4 Depth=1 + // Parent Loop BB0_103 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x2, x22 + prfm pldl1keep, [x1] + ldur s4, [x1, #-4] + add x3, x3, #1 + prfm pldl1keep, [x2] + ldur s5, [x2, #-4] + add x2, x2, #4 + add x1, x1, #4 + prfm pldl1keep, [x4] + ldur s6, [x4, #-4] + add x4, x4, x22 + fmla v2.2s, v4.2s, v5.2s + prfm pldl1keep, [x4] + ldur s7, [x4, #-4] + add x4, x4, x22 + prfm pldl1keep, [x4] + ldur s16, [x4, #-4] + fmla v3.2s, v4.2s, v6.2s + fmla v1.2s, v4.2s, v7.2s + fmla v0.2s, v4.2s, v16.2s + cmp x3, x21 + b.lt .LBB0_107 + b .LBB0_102 + .p2align 2 +.LBB0_108: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #504] // 8-byte Folded Reload + ldr x12, [sp, #440] // 8-byte Folded Reload + cmp x11, x12 + b.ge .LBB0_114 +// %bb.109: // in Loop: Header=BB0_4 Depth=1 + ldr x16, [sp, #504] // 8-byte Folded Reload + ldr x12, [sp, #448] // 8-byte Folded Reload + mov x13, xzr + mov x14, xzr 
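+ // Annotation (hand-written; not emitted by the MLIR pipeline): the
+ // ext v*.16b, v*.16b, v*.16b, #8 instructions rotate a 128-bit coefficient
+ // register by eight bytes so its upper two f32 lanes land in the lower
+ // half, where the 64-bit fmla v*.2s vector form can consume them directly.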
+ ldr s4, [x8] + mul x12, x20, x12 + mul x15, x16, x19 + mul x11, x16, x28 + add x15, x12, x15 + add x11, x9, x11 + lsl x15, x15, #2 + add x11, x11, x5 + ldr q2, [x25, x15] + add x15, x16, #1 + madd x16, x15, x19, x12 + madd x12, x15, x28, x9 + ldr x15, [sp, #432] // 8-byte Folded Reload + add x12, x12, x5 + ldr s1, [x15, x11, lsl #2] + ldr s0, [x15, x12, lsl #2] + lsl x15, x16, #2 + ldr q3, [x25, x15] + ext v6.16b, v2.16b, v2.16b, #8 + cmp xzr, x29 + ext v5.16b, v3.16b, v3.16b, #8 + b.ge .LBB0_111 + .p2align 2 +.LBB0_110: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x1, x8, x13 + fmla v1.2s, v4.2s, v2.2s + fmla v0.2s, v4.2s, v3.2s + add x15, x6, x13 + add x2, x1, #20 + add x17, x7, x13 + add x16, x15, #32 + add x18, x17, #32 + prfm pldl1keep, [x2] + ldp s4, s7, [x1, #4] + add x14, x14, #4 + add x13, x13, #16 + fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v7.2s, v5.2s + ldp s5, s4, [x1, #12] + fmla v1.2s, v7.2s, v6.2s + prfm pldl1keep, [x18] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x17, #16] + prfm pldl1keep, [x16] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x15, #16] + ext v6.16b, v2.16b, v2.16b, #8 + cmp x14, x29 + ext v5.16b, v3.16b, v3.16b, #8 + b.lt .LBB0_110 +.LBB0_111: // in Loop: Header=BB0_4 Depth=1 + ldp x14, x15, [sp, #488] // 16-byte Folded Reload + fmla v1.2s, v4.2s, v2.2s + fmla v0.2s, v4.2s, v3.2s + ldr x1, [sp, #240] // 8-byte Folded Reload + ldr x3, [sp, #432] // 8-byte Folded Reload + mov x13, xzr + ldr s7, [x8, x15, lsl #2] + ldr s4, [x8, x14, lsl #2] + ldr x14, [sp, #480] // 8-byte Folded Reload + mov x15, x27 + fmla v1.2s, v7.2s, v2.s[1] + fmla v0.2s, v7.2s, v3.s[1] + ldr s7, [x8, x14, lsl #2] + ldr x14, [sp, #104] // 8-byte Folded Reload + fmla v1.2s, v4.2s, v6.2s + fmla v0.2s, v4.2s, v5.2s + add x14, x8, x14 + fmla v1.2s, v7.2s, v2.s[3] + fmla v0.2s, v7.2s, v3.s[3] + cmp x27, x21 + b.ge .LBB0_113 + .p2align 2 +.LBB0_112: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + ldr x2, [sp, #312] // 8-byte Folded Reload + add x16, x1, x13 + add x18, x14, x13 + add x15, x15, #1 + add x16, x16, #4 + add x18, x18, #4 + prfm pldl1keep, [x18] + add x17, x2, x13 + ldr s2, [x14, x13] + add x17, x17, #4 + prfm pldl1keep, [x17] + prfm pldl1keep, [x16] + ldr s3, [x2, x13] + fmla v1.2s, v2.2s, v3.2s + ldr s3, [x1, x13] + add x13, x13, #4 + fmla v0.2s, v2.2s, v3.2s + cmp x15, x21 + b.lt .LBB0_112 +.LBB0_113: // in Loop: Header=BB0_4 Depth=1 + str s1, [x3, x11, lsl #2] + str s0, [x3, x12, lsl #2] +.LBB0_114: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #368] // 8-byte Folded Reload + ldr x12, [sp, #440] // 8-byte Folded Reload + cmp x12, x11 + b.ge .LBB0_2 +// %bb.115: // in Loop: Header=BB0_4 Depth=1 + ldp x15, x13, [sp, #432] // 16-byte Folded Reload + ldr s2, [x8] + mov x11, xzr + ldr x14, [sp, #376] // 8-byte Folded Reload + mul x12, x13, x28 + add x9, x9, x12 + mul x12, x13, x19 + ldr x13, [sp, #448] // 8-byte Folded Reload + add x9, x9, x5 + ldr s0, [x15, x9, lsl #2] + madd x12, x20, x13, x12 + lsl x12, x12, #2 + ldr q1, [x25, x12] + ldr x12, [sp, #304] // 8-byte Folded Reload + ext v3.16b, v1.16b, v1.16b, #8 + cmp xzr, x29 + b.ge .LBB0_117 + .p2align 2 +.LBB0_116: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x13, x10, #8 + fmla v0.2s, v2.2s, v1.2s + add x11, x11, #4 + prfm pldl1keep, [x13] + ldp s2, s4, [x10, #-8] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v4.2s, v3.2s + ldp s3, s2, [x10], #16 + prfm pldl1keep, [x12] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x12, #-16] + add 
x12, x12, #16 + ext v3.16b, v1.16b, v1.16b, #8 + cmp x11, x29 + b.lt .LBB0_116 +.LBB0_117: // in Loop: Header=BB0_4 Depth=1 + ldr x11, [sp, #496] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.2s + mov x10, xzr + ldr s4, [x8, x11, lsl #2] + ldr x11, [sp, #488] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[1] + ldr s5, [x8, x11, lsl #2] + ldr x11, [sp, #480] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v3.2s + ldr s2, [x8, x11, lsl #2] + ldr x11, [sp, #104] // 8-byte Folded Reload + add x8, x8, x11 + mov x11, x27 + fmla v0.2s, v2.2s, v1.s[3] + cmp x27, x21 + b.ge .LBB0_1 + .p2align 2 +.LBB0_118: // Parent Loop BB0_4 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x14, x10 + add x13, x8, x10 + add x11, x11, #1 + add x12, x12, #4 + add x13, x13, #4 + prfm pldl1keep, [x13] + ldr s1, [x8, x10] + prfm pldl1keep, [x12] + ldr s2, [x14, x10] + add x10, x10, #4 + fmla v0.2s, v1.2s, v2.2s + cmp x11, x21 + b.lt .LBB0_118 + b .LBB0_1 +.LBB0_119: + ldr x0, [sp, #16] // 8-byte Folded Reload + bl free + add sp, sp, #512 + ldp d9, d8, [sp, #48] // 16-byte Folded Reload + ldp d11, d10, [sp, #32] // 16-byte Folded Reload + ldp d13, d12, [sp, #16] // 16-byte Folded Reload + ldp x20, x19, [sp, #144] // 16-byte Folded Reload + ldp x22, x21, [sp, #128] // 16-byte Folded Reload + ldp x24, x23, [sp, #112] // 16-byte Folded Reload + ldp x26, x25, [sp, #96] // 16-byte Folded Reload + ldp x28, x27, [sp, #80] // 16-byte Folded Reload + ldp x29, x30, [sp, #64] // 16-byte Folded Reload + ldp d15, d14, [sp], #160 // 16-byte Folded Reload + ret +.Lfunc_end0: + .size sbatch_matmul_3d_nt_mlir, .Lfunc_end0-sbatch_matmul_3d_nt_mlir + .cfi_endproc + // -- End function + .section ".note.GNU-stack","",@progbits diff --git a/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nn_mlir.s b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nn_mlir.s new file mode 100644 index 00000000000000..96e02991c200d9 --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nn_mlir.s @@ -0,0 +1,4171 @@ + .text + .file "LLVMDialectModule" + .globl sbatch_matmul_4d_nn_mlir // -- Begin function sbatch_matmul_4d_nn_mlir + .p2align 4 + .type sbatch_matmul_4d_nn_mlir,@function +sbatch_matmul_4d_nn_mlir: // @sbatch_matmul_4d_nn_mlir + .cfi_startproc +// %bb.0: + stp d15, d14, [sp, #-160]! 
// 16-byte Folded Spill + stp d13, d12, [sp, #16] // 16-byte Folded Spill + stp x29, x30, [sp, #64] // 16-byte Folded Spill + stp x28, x27, [sp, #80] // 16-byte Folded Spill + stp x26, x25, [sp, #96] // 16-byte Folded Spill + stp x24, x23, [sp, #112] // 16-byte Folded Spill + stp x22, x21, [sp, #128] // 16-byte Folded Spill + stp x20, x19, [sp, #144] // 16-byte Folded Spill + stp d11, d10, [sp, #32] // 16-byte Folded Spill + stp d9, d8, [sp, #48] // 16-byte Folded Spill + sub sp, sp, #1312 + .cfi_def_cfa_offset 1472 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -136 + .cfi_offset b13, -144 + .cfi_offset b14, -152 + .cfi_offset b15, -160 + cmp x5, #0 + ldr x13, [sp, #1544] + ldr x29, [sp, #1656] + mov x20, x6 + cinv x8, x5, lt + ldr x23, [sp, #1568] + ldr x27, [sp, #1512] + mov x21, x1 + add x9, x8, x8, lsr #63 + add x10, x8, #3 + ldr x24, [sp, #1504] + ldr x28, [sp, #1480] + str x7, [sp, #1056] // 8-byte Folded Spill + str x4, [sp, #520] // 8-byte Folded Spill + asr x9, x9, #1 + str x3, [sp, #40] // 8-byte Folded Spill + str x2, [sp, #960] // 8-byte Folded Spill + cinv x9, x9, lt + cmp x8, #0 + str x5, [sp, #1024] // 8-byte Folded Spill + str x13, [sp, #504] // 8-byte Folded Spill + csel x8, x10, x8, lt + str x9, [sp, #1280] // 8-byte Folded Spill + ldr x9, [sp, #1552] + cmp x5, #0 + ldr x10, [sp, #1600] + asr x8, x8, #2 + cinv x22, x8, lt + cmp x13, #0 + cinv x8, x13, lt + str x9, [sp, #1048] // 8-byte Folded Spill + ldr x9, [sp, #1648] + str x10, [sp, #944] // 8-byte Folded Spill + ldr x10, [sp, #1592] + add x11, x8, #7 + add x12, x8, #3 + str x9, [sp, #1016] // 8-byte Folded Spill + ldr x9, [sp, #1640] + str x10, [sp, #936] // 8-byte Folded Spill + add x10, x8, #15 + str x9, [sp, #1008] // 8-byte Folded Spill + add x9, x8, x8, lsr #63 + asr x9, x9, #1 + cinv x14, x9, lt + ldr x9, [sp, #1560] + cmp x8, #0 + str x14, [sp, #1272] // 8-byte Folded Spill + str x9, [sp, #1040] // 8-byte Folded Spill + csel x9, x10, x8, lt + csel x10, x11, x8, lt + csel x8, x12, x8, lt + cmp x13, #0 + asr x9, x9, #4 + asr x8, x8, #2 + asr x10, x10, #3 + cinv x19, x9, lt + cinv x9, x8, lt + cinv x25, x10, lt + lsl x8, x19, #4 + str x9, [sp, #1264] // 8-byte Folded Spill + lsl x26, x25, #3 + str x8, [sp, #1104] // 8-byte Folded Spill + ldr x8, [sp, #1472] + str x8, [sp, #1032] // 8-byte Folded Spill + lsl x8, x9, #2 + str x8, [sp, #592] // 8-byte Folded Spill + lsl x8, x14, #1 + str x8, [sp, #584] // 8-byte Folded Spill + lsl x8, x6, #6 + add x0, x8, #64 + str x8, [sp, #1288] // 8-byte Folded Spill + bl malloc + ldr x16, [sp, #1280] // 8-byte Folded Reload + lsl x8, x22, #2 + mov x30, x26 + str x27, [sp, #928] // 8-byte Folded Spill + str x8, [sp, #1296] // 8-byte Folded Spill + add x8, x0, #63 + lsl x12, x27, #2 + lsl x27, x28, #2 + and x26, x8, #0xffffffffffffffc0 + lsl x6, x20, #2 + str x0, [sp, #16] // 8-byte Folded Spill + mov w14, #20 // =0x14 + madd x14, x23, x14, x12 + mov w11, #12 // =0xc + str x6, [sp, #480] // 8-byte Folded Spill + mov w17, #28 // =0x1c + lsl x9, x16, #1 + mul x3, x16, x28 + madd x17, x23, x17, x12 + mov w15, #24 // =0x18 + str x9, [sp, #1128] // 8-byte Folded Spill + negs x9, x20 + madd x15, x23, x15, x12 + str x23, [sp, #1000] // 8-byte Folded 
Spill + and x8, x9, #0x3 + and x9, x20, #0x3 + str x24, [sp, #920] // 8-byte Folded Spill + lsl x25, x25, #5 + csneg x7, x9, x8, mi + ldr x8, [sp, #960] // 8-byte Folded Reload + add x9, x20, x27 + lsl x19, x19, #6 + lsl x18, x7, #2 + sub x4, x26, x7, lsl #6 + stp xzr, xzr, [sp, #264] // 16-byte Folded Spill + mov x13, xzr + str x21, [sp, #952] // 8-byte Folded Spill + stp x25, x19, [sp, #464] // 16-byte Folded Spill + lsl x10, x8, #2 + sub x8, x6, x18 + str x30, [sp, #1080] // 8-byte Folded Spill + str x8, [sp, #512] // 8-byte Folded Spill + sub x8, x9, x7 + mov w9, #1 // =0x1 + add x2, x10, x28, lsl #3 + bfi x9, x22, #2, #62 + str x8, [sp, #1304] // 8-byte Folded Spill + mul x8, x22, x28 + mul x1, x28, x9 + add x9, x20, x3, lsl #1 + sub x9, x9, x7 + str x9, [sp, #1280] // 8-byte Folded Spill + ldr x9, [sp, #1264] // 8-byte Folded Reload + add x22, x10, x8, lsl #4 + add x5, x22, x6 + lsl x0, x9, #4 + ldr x9, [sp, #1272] // 8-byte Folded Reload + str x0, [sp, #456] // 8-byte Folded Spill + lsl x16, x9, #3 + add x9, x21, x2 + add x2, x6, x10 + str x9, [sp, #1224] // 8-byte Folded Spill + sub x9, x2, x18 + add x2, x24, x14 + madd x14, x28, x11, x10 + madd x11, x23, x11, x12 + str x9, [sp, #1272] // 8-byte Folded Spill + ldr x9, [sp, #1288] // 8-byte Folded Reload + add x9, x4, x9 + str x9, [sp, #848] // 8-byte Folded Spill + add x9, x10, x1, lsl #2 + add x1, x10, x3, lsl #3 + add x3, x24, x15 + stp x2, x3, [sp, #232] // 16-byte Folded Spill + add x4, x9, x6 + add x6, x1, x6 + str x9, [sp, #1256] // 8-byte Folded Spill + sub x9, x20, x7 + sub x8, x4, x18 + add x4, x24, x17 + str x8, [sp, #1264] // 8-byte Folded Spill + sub x8, x5, x18 + str x8, [sp, #1248] // 8-byte Folded Spill + sub x8, x6, x18 + add x6, x12, x23, lsl #5 + str x8, [sp, #1216] // 8-byte Folded Spill + add x8, x21, x14 + lsl x14, x23, #4 + str x14, [sp, #1240] // 8-byte Folded Spill + add x14, x14, x12 + str x8, [sp, #1288] // 8-byte Folded Spill + ldr x8, [sp, #512] // 8-byte Folded Reload + add x6, x24, x6 + add x18, x24, x14 + lsl x14, x23, #2 + stp x4, x6, [sp, #248] // 16-byte Folded Spill + str x14, [sp, #992] // 8-byte Folded Spill + add x14, x14, x12 + add x17, x24, x14 + add x14, x12, x23, lsl #3 + add x8, x8, #4 + stp x17, x18, [sp, #216] // 16-byte Folded Spill + add x15, x24, x14 + add x14, x24, x11 + mul x11, x23, x9 + str x8, [sp, #448] // 8-byte Folded Spill + stp x14, x15, [sp, #200] // 16-byte Folded Spill + add x11, x12, x11, lsl #2 + madd x12, x23, x8, x12 + ldr x8, [sp, #1272] // 8-byte Folded Reload + add x23, x24, x11 + add x5, x24, x12 + add x12, x8, x21 + ldr x8, [sp, #1304] // 8-byte Folded Reload + add x24, x21, x10 + str x9, [sp, #1304] // 8-byte Folded Spill + add x12, x12, #4 + str x5, [sp, #192] // 8-byte Folded Spill + add x5, x5, x16 + stp x23, x12, [sp, #176] // 16-byte Folded Spill + add x11, x10, x8, lsl #2 + ldr x8, [sp, #1280] // 8-byte Folded Reload + add x12, x10, x8, lsl #2 + add x10, x10, x21 + lsl x8, x28, #4 + add x10, x8, x10 + str x8, [sp, #1232] // 8-byte Folded Spill + add x8, x10, #32 + add x10, x11, x21 + ldr x11, [sp, #848] // 8-byte Folded Reload + str x8, [sp, #168] // 8-byte Folded Spill + add x8, x10, #4 + ldr x10, [sp, #1264] // 8-byte Folded Reload + str x8, [sp, #160] // 8-byte Folded Spill + ldr x8, [sp, #1256] // 8-byte Folded Reload + add x10, x21, x10 + str x10, [sp, #1168] // 8-byte Folded Spill + ldr x10, [sp, #1248] // 8-byte Folded Reload + add x8, x21, x8 + str x8, [sp, #1184] // 8-byte Folded Spill + add x8, x21, x22 + sub x22, x9, #4 + str x8, [sp, #1176] // 
8-byte Folded Spill
+	add	x10, x21, x10
+	str	x10, [sp, #1160]	// 8-byte Folded Spill
+	add	x10, x1, x21
+	lsl	x1, x20, #3
+	add	x10, x10, #32
+	str	x1, [sp, #424]	// 8-byte Folded Spill
+	sub	x1, x1, x7, lsl #3
+	str	x10, [sp, #152]	// 8-byte Folded Spill
+	[Several thousand further lines of compiler-generated AArch64 assembly
+	elided. The remainder of the listing repeats the same register-tiled
+	pattern through a deep loop nest (labels .LBB0_1 through .LBB0_137, up
+	to five loops deep): 4x16 blocks of f32 accumulated with lane-broadcast
+	fused multiply-adds (fmla v*.4s, v*.4s, v*.s[lane]), paired q-register
+	loads and stores (ldp/stp), prfm pldl1keep software prefetching, long
+	runs of stack spill/reload bookkeeping ("8-byte Folded Spill/Reload"),
+	and malloc/free'd packing buffers around the reduction loop.]
x17, x29, x9 + madd x18, x18, x29, x9 + add x16, x16, x19 + add x17, x17, x19 + lsl x1, x1, #2 + lsl x16, x16, #2 + lsl x17, x17, #2 + str q0, [x7, x1] + str q2, [x7, x16] + add x16, x18, x19 + lsl x16, x16, #2 + str q1, [x7, x17] + str q3, [x7, x16] + ldr x16, [sp, #1296] // 8-byte Folded Reload + cmp x15, x16 + b.ge .LBB0_76 +// %bb.72: // in Loop: Header=BB0_71 Depth=3 + add x16, x15, #1 + add x17, x15, #2 + madd x1, x15, x29, x9 + add x18, x15, #3 + madd x3, x16, x29, x9 + ldr q16, [x8] + mov x2, xzr + add x1, x1, x19 + madd x4, x17, x29, x9 + add x3, x3, x19 + add x4, x4, x19 + lsl x1, x1, #2 + lsl x3, x3, #2 + lsl x4, x4, #2 + ldr q0, [x7, x1] + madd x1, x18, x29, x9 + ldr q2, [x7, x3] + madd x3, x15, x28, x10 + ldr q1, [x7, x4] + ldr x4, [sp, #1280] // 8-byte Folded Reload + add x1, x1, x19 + lsl x3, x3, #2 + lsl x1, x1, #2 + ldr q7, [x4, x3] + madd x3, x16, x28, x10 + ldr q3, [x7, x1] + add x1, x15, #4 + lsl x3, x3, #2 + ldr q6, [x4, x3] + madd x3, x17, x28, x10 + lsl x3, x3, #2 + ldr q5, [x4, x3] + madd x3, x18, x28, x10 + lsl x3, x3, #2 + ldr q4, [x4, x3] + mov x3, x11 + mov x4, x14 + cmp xzr, x22 + b.ge .LBB0_74 + .p2align 2 +.LBB0_73: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_71 Depth=3 + // => This Inner Loop Header: Depth=4 + add x5, x3, #32 + fmla v0.4s, v16.4s, v7.s[0] + fmla v2.4s, v16.4s, v6.s[0] + add x2, x2, #4 + fmla v1.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, v4.s[0] + prfm pldl1keep, [x5] + add x5, x4, x27 + ldp q16, q17, [x3, #-32] + fmla v0.4s, v16.4s, v7.s[1] + fmla v2.4s, v16.4s, v6.s[1] + fmla v1.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + fmla v0.4s, v17.4s, v7.s[2] + fmla v2.4s, v17.4s, v6.s[2] + fmla v1.4s, v17.4s, v5.s[2] + fmla v3.4s, v17.4s, v4.s[2] + ldp q17, q16, [x3], #64 + prfm pldl1keep, [x4] + fmla v0.4s, v17.4s, v7.s[3] + ldur q7, [x4, #-16] + prfm pldl1keep, [x5] + fmla v2.4s, v17.4s, v6.s[3] + ldur q6, [x5, #-16] + add x5, x5, x27 + fmla v1.4s, v17.4s, v5.s[3] + fmla v3.4s, v17.4s, v4.s[3] + add x4, x4, #16 + prfm pldl1keep, [x5] + ldur q5, [x5, #-16] + add x5, x5, x27 + prfm pldl1keep, [x5] + ldur q4, [x5, #-16] + cmp x2, x22 + b.lt .LBB0_73 +.LBB0_74: // in Loop: Header=BB0_71 Depth=3 + ldr q17, [x8, x6, lsl #4] + fmla v0.4s, v16.4s, v7.s[0] + fmla v2.4s, v16.4s, v6.s[0] + fmla v1.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, v4.s[0] + ldr q16, [x8, x23, lsl #4] + ldr q18, [x8, x24, lsl #4] + ldr x4, [sp, #1304] // 8-byte Folded Reload + mov x2, x13 + mov x3, x12 + fmla v0.4s, v17.4s, v7.s[1] + fmla v2.4s, v17.4s, v6.s[1] + fmla v1.4s, v17.4s, v5.s[1] + fmla v3.4s, v17.4s, v4.s[1] + fmla v0.4s, v16.4s, v7.s[2] + fmla v2.4s, v16.4s, v6.s[2] + fmla v1.4s, v16.4s, v5.s[2] + fmla v3.4s, v16.4s, v4.s[2] + fmla v0.4s, v18.4s, v7.s[3] + fmla v2.4s, v18.4s, v6.s[3] + fmla v1.4s, v18.4s, v5.s[3] + fmla v3.4s, v18.4s, v4.s[3] + cmp x4, x20 + b.ge .LBB0_70 + .p2align 2 +.LBB0_75: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_71 Depth=3 + // => This Inner Loop Header: Depth=4 + add x5, x2, x27 + prfm pldl1keep, [x2] + ldur s4, [x2, #-4] + add x4, x4, #1 + prfm pldl1keep, [x5] + ldur s5, [x5, #-4] + add x5, x5, x27 + add x2, x2, #4 + prfm pldl1keep, [x5] + ldur s6, [x5, #-4] + add x5, x5, x27 + prfm pldl1keep, [x5] + ldur s7, [x5, #-4] + prfm pldl1keep, [x3] + ldur q16, [x3, #-16] + add x3, x3, #16 + fmla v0.4s, v16.4s, v4.s[0] + fmla v2.4s, v16.4s, v5.s[0] + fmla v1.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v7.s[0] + cmp x4, x20 + b.lt .LBB0_75 + b .LBB0_70 + .p2align 2 
+.LBB0_76: // in Loop: Header=BB0_7 Depth=2 + ldr x13, [sp, #1296] // 8-byte Folded Reload + ldr x14, [sp, #1128] // 8-byte Folded Reload + cmp x13, x14 + b.ge .LBB0_82 +// %bb.77: // in Loop: Header=BB0_7 Depth=2 + ldr x17, [sp, #1296] // 8-byte Folded Reload + ldr x18, [sp, #1280] // 8-byte Folded Reload + mov x15, xzr + add x16, x17, #1 + madd x13, x17, x29, x9 + madd x17, x17, x28, x10 + ldr q4, [x8] + madd x14, x16, x29, x9 + madd x16, x16, x28, x10 + add x13, x13, x19 + lsl x17, x17, #2 + add x14, x14, x19 + add x13, x7, x13, lsl #2 + lsl x16, x16, #2 + ldr q3, [x18, x17] + ldr x17, [sp, #1120] // 8-byte Folded Reload + add x14, x7, x14, lsl #2 + ldr q2, [x18, x16] + mov x16, x11 + ldr q0, [x13] + ldr q1, [x14] + cmp xzr, x22 + b.ge .LBB0_79 + .p2align 2 +.LBB0_78: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x4, x16, #32 + ldr x18, [sp, #1184] // 8-byte Folded Reload + ldr x2, [sp, #1176] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v3.s[0] + prfm pldl1keep, [x4] + fmla v1.4s, v4.4s, v2.s[0] + ldp q4, q5, [x16, #-32] + add x15, x15, #4 + add x18, x18, x17 + add x2, x2, x17 + add x17, x17, #16 + add x1, x18, #32 + add x3, x2, #32 + fmla v0.4s, v4.4s, v3.s[1] + fmla v1.4s, v4.4s, v2.s[1] + fmla v0.4s, v5.4s, v3.s[2] + fmla v1.4s, v5.4s, v2.s[2] + ldp q5, q4, [x16], #64 + prfm pldl1keep, [x3] + fmla v0.4s, v5.4s, v3.s[3] + ldr q3, [x2, #16] + prfm pldl1keep, [x1] + fmla v1.4s, v5.4s, v2.s[3] + ldr q2, [x18, #16] + cmp x15, x22 + b.lt .LBB0_78 +.LBB0_79: // in Loop: Header=BB0_7 Depth=2 + ldr q5, [x8, x6, lsl #4] + fmla v0.4s, v4.4s, v3.s[0] + fmla v1.4s, v4.4s, v2.s[0] + ldr q4, [x8, x23, lsl #4] + ldr x15, [sp, #1120] // 8-byte Folded Reload + ldr x16, [sp, #1304] // 8-byte Folded Reload + fmla v0.4s, v5.4s, v3.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldr q5, [x8, x24, lsl #4] + fmla v0.4s, v4.4s, v3.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v5.4s, v3.s[3] + fmla v1.4s, v5.4s, v2.s[3] + cmp x16, x20 + b.ge .LBB0_81 + .p2align 2 +.LBB0_80: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x1, [sp, #1168] // 8-byte Folded Reload + ldr x2, [sp, #1160] // 8-byte Folded Reload + add x16, x16, #1 + add x17, x1, x15 + add x18, x2, x15 + add x17, x17, #4 + add x18, x18, #4 + prfm pldl1keep, [x18] + ldr s2, [x2, x15] + prfm pldl1keep, [x17] + ldr s3, [x1, x15] + prfm pldl1keep, [x12] + ldur q4, [x12, #-16] + add x12, x12, #16 + add x15, x15, #4 + fmla v0.4s, v4.4s, v2.s[0] + fmla v1.4s, v4.4s, v3.s[0] + cmp x16, x20 + b.lt .LBB0_80 +.LBB0_81: // in Loop: Header=BB0_7 Depth=2 + str q0, [x13] + str q1, [x14] +.LBB0_82: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #1024] // 8-byte Folded Reload + ldr x13, [sp, #1128] // 8-byte Folded Reload + cmp x13, x12 + b.ge .LBB0_88 +// %bb.83: // in Loop: Header=BB0_7 Depth=2 + ldr x13, [sp, #1128] // 8-byte Folded Reload + ldr q2, [x8] + mov x12, xzr + madd x9, x13, x29, x9 + madd x10, x13, x28, x10 + ldr x13, [sp, #1280] // 8-byte Folded Reload + ldr x14, [sp, #896] // 8-byte Folded Reload + add x9, x9, x19 + lsl x10, x10, #2 + add x9, x7, x9, lsl #2 + ldr q1, [x13, x10] + ldr x10, [sp, #904] // 8-byte Folded Reload + ldr q0, [x9] + cmp xzr, x22 + b.ge .LBB0_85 + .p2align 2 +.LBB0_84: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x11, #32 + fmla v0.4s, v2.4s, v1.s[0] + add x12, x12, #4 + prfm pldl1keep, [x13] + ldp q2, q3, [x11, #-32] + fmla v0.4s, v2.4s, v1.s[1] + fmla 
v0.4s, v3.4s, v1.s[2] + ldp q3, q2, [x11], #64 + prfm pldl1keep, [x10] + fmla v0.4s, v3.4s, v1.s[3] + ldur q1, [x10, #-16] + add x10, x10, #16 + cmp x12, x22 + b.lt .LBB0_84 +.LBB0_85: // in Loop: Header=BB0_7 Depth=2 + ldr q3, [x8, x6, lsl #4] + fmla v0.4s, v2.4s, v1.s[0] + ldr x11, [sp, #280] // 8-byte Folded Reload + ldr q2, [x8, x23, lsl #4] + mov x10, xzr + mov w12, #16 // =0x10 + fmla v0.4s, v3.4s, v1.s[1] + ldr q3, [x8, x24, lsl #4] + add x8, x8, x11 + ldr x11, [sp, #880] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[2] + fmla v0.4s, v3.4s, v1.s[3] + ldr x13, [sp, #1304] // 8-byte Folded Reload + add x13, x13, xzr + cmp x13, x20 + b.ge .LBB0_87 + .p2align 2 +.LBB0_86: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x8, x12 + prfm pldl1keep, [x11] + ldr s1, [x14, x10, lsl #2] + prfm pldl1keep, [x13] + ldr q2, [x8, x10, lsl #4] + add x10, x10, #1 + add x12, x12, #16 + add x11, x11, #4 + fmla v0.4s, v2.4s, v1.s[0] + ldr x13, [sp, #1304] // 8-byte Folded Reload + add x13, x13, x10 + cmp x13, x20 + b.lt .LBB0_86 +.LBB0_87: // in Loop: Header=BB0_7 Depth=2 + str q0, [x9] +.LBB0_88: // in Loop: Header=BB0_7 Depth=2 + bl free + ldr x30, [sp, #1080] // 8-byte Folded Reload + ldr x19, [sp, #1072] // 8-byte Folded Reload + ldr x8, [sp, #592] // 8-byte Folded Reload + ldr x9, [sp, #584] // 8-byte Folded Reload + cmp x8, x9 + b.ge .LBB0_38 +.LBB0_89: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #424] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #1016] // 8-byte Folded Reload + ldr x9, [sp, #1008] // 8-byte Folded Reload + mov x16, x19 + mov x12, xzr + ldr x15, [sp, #1064] // 8-byte Folded Reload + ldr x14, [sp, #1256] // 8-byte Folded Reload + mul x8, x19, x8 + ldr x19, [sp, #592] // 8-byte Folded Reload + ldr x17, [sp, #1272] // 8-byte Folded Reload + add x10, x14, x29 + ldr x21, [sp, #992] // 8-byte Folded Reload + ldr x23, [sp, #976] // 8-byte Folded Reload + ldr x24, [sp, #968] // 8-byte Folded Reload + ldp x30, x25, [sp, #296] // 16-byte Folded Reload + madd x9, x15, x9, x8 + add x8, x9, x19 + add x13, x8, x29 + lsl x11, x8, #2 + add x14, x8, x14 + add x8, x8, x10 + lsl x10, x13, #2 + ldr d0, [x17, x11] + lsl x11, x14, #2 + ldr x13, [sp, #1264] // 8-byte Folded Reload + ldr x14, [sp, #1280] // 8-byte Folded Reload + lsl x8, x8, #2 + ldr d2, [x17, x10] + ldr x10, [sp, #1040] // 8-byte Folded Reload + mul x10, x16, x10 + ldr d1, [x17, x11] + ldr x11, [sp, #1048] // 8-byte Folded Reload + madd x11, x15, x11, x10 + ldr d3, [x17, x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + add x10, x11, x19 + lsl x10, x10, #2 + ldr d7, [x13, x10] + ldr x10, [sp, #1032] // 8-byte Folded Reload + ldr x13, [sp, #1056] // 8-byte Folded Reload + mul x10, x16, x10 + madd x10, x15, x13, x10 + ldr x15, [sp, #1120] // 8-byte Folded Reload + lsl x13, x10, #2 + ldr q4, [x14, x13] + add x13, x10, x28 + lsl x13, x13, #2 + ldr q5, [x14, x13] + add x13, x10, x28, lsl #1 + lsl x13, x13, #2 + ldr q6, [x14, x13] + ldr x14, [sp, #656] // 8-byte Folded Reload + orr x13, x8, #0x10 + .p2align 2 +.LBB0_90: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x16, [sp, #1288] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v4.s[0] + fmla v2.2s, v7.2s, v5.s[0] + cmp x12, x22 + add x16, x16, x15 + prfm pldl1keep, [x16, #16] + ldr q16, [x16] + b.ge .LBB0_92 +// %bb.91: // in Loop: Header=BB0_90 Depth=3 + ldr x7, [sp, #720] // 8-byte Folded Reload + ldr x16, [sp, #1224] // 8-byte 
Folded Reload + fmla v1.2s, v7.2s, v6.s[0] + fmla v3.2s, v7.2s, v16.s[0] + ldr x18, [sp, #1208] // 8-byte Folded Reload + ldr x2, [sp, #1216] // 8-byte Folded Reload + add x5, x30, x14 + add x4, x25, x14 + ldr x6, [sp, #728] // 8-byte Folded Reload + stur d7, [x13, #-16] + add x12, x12, #4 + add x7, x7, x14 + add x6, x6, x14 + add x16, x16, x15 + add x18, x18, x15 + add x2, x2, x15 + add x15, x15, #16 + prfm pldl1keep, [x7] + ldr x7, [sp, #704] // 8-byte Folded Reload + add x17, x16, #32 + add x1, x18, #32 + add x3, x2, #32 + ldr d7, [x7, x14] + stur d7, [x13, #-8] + prfm pldl1keep, [x6] + ldr x6, [sp, #696] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v4.s[1] + fmla v2.2s, v7.2s, v5.s[1] + fmla v1.2s, v7.2s, v6.s[1] + fmla v3.2s, v7.2s, v16.s[1] + ldr d7, [x6, x14] + str d7, [x13] + prfm pldl1keep, [x5] + ldr x5, [sp, #688] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v4.s[2] + fmla v2.2s, v7.2s, v5.s[2] + fmla v1.2s, v7.2s, v6.s[2] + fmla v3.2s, v7.2s, v16.s[2] + ldr d7, [x5, x14] + str d7, [x13, #8] + prfm pldl1keep, [x4] + ldr x4, [sp, #712] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v4.s[3] + fmla v2.2s, v7.2s, v5.s[3] + fmla v1.2s, v7.2s, v6.s[3] + fmla v3.2s, v7.2s, v16.s[3] + add x13, x13, #32 + ldr d7, [x4, x14] + prfm pldl1keep, [x3] + ldr q4, [x2, #16] + prfm pldl1keep, [x1] + ldr q5, [x18, #16] + prfm pldl1keep, [x17] + ldr q6, [x16, #16] + ldr x16, [sp, #1240] // 8-byte Folded Reload + add x14, x14, x16 + b .LBB0_90 + .p2align 2 +.LBB0_92: // in Loop: Header=BB0_7 Depth=2 + ldr x15, [sp, #1000] // 8-byte Folded Reload + ldr x6, [sp, #984] // 8-byte Folded Reload + fmla v1.2s, v7.2s, v6.s[0] + fmla v3.2s, v7.2s, v16.s[0] + ldr x16, [sp, #1264] // 8-byte Folded Reload + str d7, [x8, x22, lsl #3] + mov x12, xzr + ldr x7, [sp, #648] // 8-byte Folded Reload + ldr x25, [sp, #680] // 8-byte Folded Reload + mov x13, xzr + madd x14, x6, x15, x11 + ldr x30, [sp, #1272] // 8-byte Folded Reload + add x14, x14, x19 + lsl x14, x14, #2 + ldr d17, [x16, x14] + madd x14, x23, x15, x11 + madd x11, x24, x15, x11 + ldr x15, [sp, #672] // 8-byte Folded Reload + add x14, x14, x19 + add x11, x11, x19 + lsl x14, x14, #2 + lsl x11, x11, #2 + str d17, [x8, x6, lsl #3] + fmla v0.2s, v17.2s, v4.s[1] + fmla v2.2s, v17.2s, v5.s[1] + fmla v1.2s, v17.2s, v6.s[1] + fmla v3.2s, v17.2s, v16.s[1] + ldr d7, [x16, x14] + ldr x14, [sp, #456] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v4.s[2] + str d7, [x8, x23, lsl #3] + fmla v2.2s, v7.2s, v5.s[2] + fmla v1.2s, v7.2s, v6.s[2] + fmla v3.2s, v7.2s, v16.s[2] + ldr d7, [x16, x11] + ldr x11, [sp, #664] // 8-byte Folded Reload + add x11, x11, x14 + add x14, x15, x14 + ldr x15, [sp, #1304] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v4.s[3] + fmla v2.2s, v7.2s, v5.s[3] + fmla v1.2s, v7.2s, v6.s[3] + fmla v3.2s, v7.2s, v16.s[3] + str d7, [x8, x24, lsl #3] + cmp x15, x20 + b.ge .LBB0_94 + .p2align 2 +.LBB0_93: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x17, [sp, #1152] // 8-byte Folded Reload + add x16, x11, x13 + add x17, x17, x12 + add x12, x12, #4 + prfm pldl1keep, [x17] + ldur s4, [x17, #-4] + add x17, x17, x27 + prfm pldl1keep, [x17] + ldur s5, [x17, #-4] + add x17, x17, x27 + prfm pldl1keep, [x17] + ldur s6, [x17, #-4] + add x17, x17, x27 + prfm pldl1keep, [x17] + ldur s7, [x17, #-4] + prfm pldl1keep, [x16] + ldr d16, [x14, x13] + add x13, x13, x21 + fmla v0.2s, v16.2s, v4.s[0] + str d16, [x8, x15, lsl #3] + add x15, x15, #1 + fmla v2.2s, v16.2s, v5.s[0] + fmla v1.2s, v16.2s, v6.s[0] + fmla v3.2s, 
v16.2s, v7.s[0] + cmp x15, x20 + b.lt .LBB0_93 +.LBB0_94: // %.preheader65 + // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #288] // 8-byte Folded Reload + ldr x13, [sp, #1096] // 8-byte Folded Reload + mov x2, xzr + add x11, x8, #24 + ldr x14, [sp, #1088] // 8-byte Folded Reload + mov w16, #1 // =0x1 + mov w17, #2 // =0x2 + mov w18, #3 // =0x3 + mov w15, #4 // =0x4 + add x12, x8, x12 + b .LBB0_96 + .p2align 2 +.LBB0_95: // %.loopexit61 + // in Loop: Header=BB0_96 Depth=3 + ldr x2, [sp, #1232] // 8-byte Folded Reload + add x14, x14, x2 + add x13, x13, x2 + mov x2, x15 + mov x15, x1 +.LBB0_96: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_98 Depth 4 + // Child Loop BB0_100 Depth 4 + madd x1, x2, x29, x9 + add x1, x1, x19 + madd x16, x16, x29, x9 + madd x17, x17, x29, x9 + madd x18, x18, x29, x9 + add x16, x16, x19 + add x17, x17, x19 + lsl x1, x1, #2 + lsl x16, x16, #2 + lsl x17, x17, #2 + str d0, [x30, x1] + str d2, [x30, x16] + add x16, x18, x19 + lsl x16, x16, #2 + str d1, [x30, x17] + str d3, [x30, x16] + ldr x16, [sp, #1296] // 8-byte Folded Reload + cmp x15, x16 + b.ge .LBB0_101 +// %bb.97: // in Loop: Header=BB0_96 Depth=3 + add x16, x15, #1 + add x17, x15, #2 + madd x1, x15, x29, x9 + add x18, x15, #3 + madd x3, x16, x29, x9 + ldr d16, [x8] + mov x2, xzr + add x1, x1, x19 + madd x4, x17, x29, x9 + add x3, x3, x19 + add x4, x4, x19 + lsl x1, x1, #2 + lsl x3, x3, #2 + lsl x4, x4, #2 + ldr d0, [x30, x1] + madd x1, x18, x29, x9 + ldr d2, [x30, x3] + madd x3, x15, x28, x10 + ldr d1, [x30, x4] + ldr x4, [sp, #1280] // 8-byte Folded Reload + add x1, x1, x19 + lsl x3, x3, #2 + lsl x1, x1, #2 + ldr q7, [x4, x3] + madd x3, x16, x28, x10 + ldr d3, [x30, x1] + add x1, x15, #4 + lsl x3, x3, #2 + ldr q6, [x4, x3] + madd x3, x17, x28, x10 + lsl x3, x3, #2 + ldr q5, [x4, x3] + madd x3, x18, x28, x10 + lsl x3, x3, #2 + ldr q4, [x4, x3] + mov x3, x11 + mov x4, x14 + cmp xzr, x22 + b.ge .LBB0_99 + .p2align 2 +.LBB0_98: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_96 Depth=3 + // => This Inner Loop Header: Depth=4 + add x5, x3, #16 + fmla v0.2s, v16.2s, v7.s[0] + fmla v2.2s, v16.2s, v6.s[0] + add x2, x2, #4 + fmla v1.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + prfm pldl1keep, [x5] + add x5, x4, x27 + ldp d16, d17, [x3, #-16] + fmla v0.2s, v16.2s, v7.s[1] + fmla v2.2s, v16.2s, v6.s[1] + fmla v1.2s, v16.2s, v5.s[1] + fmla v3.2s, v16.2s, v4.s[1] + fmla v0.2s, v17.2s, v7.s[2] + fmla v2.2s, v17.2s, v6.s[2] + fmla v1.2s, v17.2s, v5.s[2] + fmla v3.2s, v17.2s, v4.s[2] + ldp d17, d16, [x3], #32 + prfm pldl1keep, [x4] + fmla v0.2s, v17.2s, v7.s[3] + ldur q7, [x4, #-16] + prfm pldl1keep, [x5] + fmla v2.2s, v17.2s, v6.s[3] + ldur q6, [x5, #-16] + add x5, x5, x27 + fmla v1.2s, v17.2s, v5.s[3] + fmla v3.2s, v17.2s, v4.s[3] + add x4, x4, #16 + prfm pldl1keep, [x5] + ldur q5, [x5, #-16] + add x5, x5, x27 + prfm pldl1keep, [x5] + ldur q4, [x5, #-16] + cmp x2, x22 + b.lt .LBB0_98 +.LBB0_99: // in Loop: Header=BB0_96 Depth=3 + ldr d17, [x8, x6, lsl #3] + fmla v0.2s, v16.2s, v7.s[0] + fmla v2.2s, v16.2s, v6.s[0] + fmla v1.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + ldr d16, [x8, x23, lsl #3] + ldr d18, [x8, x24, lsl #3] + ldr x4, [sp, #1304] // 8-byte Folded Reload + mov x2, x13 + mov x3, x12 + fmla v0.2s, v17.2s, v7.s[1] + fmla v2.2s, v17.2s, v6.s[1] + fmla v1.2s, v17.2s, v5.s[1] + fmla v3.2s, v17.2s, v4.s[1] + fmla v0.2s, v16.2s, v7.s[2] + fmla v2.2s, v16.2s, v6.s[2] + fmla v1.2s, v16.2s, v5.s[2] + 
fmla v3.2s, v16.2s, v4.s[2] + fmla v0.2s, v18.2s, v7.s[3] + fmla v2.2s, v18.2s, v6.s[3] + fmla v1.2s, v18.2s, v5.s[3] + fmla v3.2s, v18.2s, v4.s[3] + cmp x4, x20 + b.ge .LBB0_95 + .p2align 2 +.LBB0_100: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_96 Depth=3 + // => This Inner Loop Header: Depth=4 + add x5, x2, x27 + prfm pldl1keep, [x2] + ldur s4, [x2, #-4] + add x4, x4, #1 + prfm pldl1keep, [x5] + ldur s5, [x5, #-4] + add x5, x5, x27 + add x2, x2, #4 + prfm pldl1keep, [x5] + ldur s6, [x5, #-4] + add x5, x5, x27 + prfm pldl1keep, [x5] + ldur s7, [x5, #-4] + prfm pldl1keep, [x3] + ldur d16, [x3, #-8] + add x3, x3, #8 + fmla v0.2s, v16.2s, v4.s[0] + fmla v2.2s, v16.2s, v5.s[0] + fmla v1.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v7.s[0] + cmp x4, x20 + b.lt .LBB0_100 + b .LBB0_95 + .p2align 2 +.LBB0_101: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #1296] // 8-byte Folded Reload + ldr x13, [sp, #1128] // 8-byte Folded Reload + cmp x12, x13 + b.ge .LBB0_107 +// %bb.102: // in Loop: Header=BB0_7 Depth=2 + ldr x16, [sp, #1296] // 8-byte Folded Reload + ldr x17, [sp, #1280] // 8-byte Folded Reload + mov x14, xzr + add x15, x16, #1 + madd x12, x16, x29, x9 + madd x16, x16, x28, x10 + ldr d4, [x8] + madd x13, x15, x29, x9 + madd x15, x15, x28, x10 + add x12, x12, x19 + lsl x16, x16, #2 + add x13, x13, x19 + add x12, x30, x12, lsl #2 + lsl x15, x15, #2 + ldr q3, [x17, x16] + ldr x16, [sp, #1120] // 8-byte Folded Reload + add x13, x30, x13, lsl #2 + ldr q2, [x17, x15] + mov x15, x11 + ldr d0, [x12] + ldr d1, [x13] + cmp xzr, x22 + b.ge .LBB0_104 + .p2align 2 +.LBB0_103: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x3, x15, #16 + ldr x17, [sp, #1184] // 8-byte Folded Reload + ldr x1, [sp, #1176] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[0] + prfm pldl1keep, [x3] + fmla v1.2s, v4.2s, v2.s[0] + ldp d4, d5, [x15, #-16] + add x14, x14, #4 + add x17, x17, x16 + add x1, x1, x16 + add x16, x16, #16 + add x18, x17, #32 + add x2, x1, #32 + fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v5.2s, v3.s[2] + fmla v1.2s, v5.2s, v2.s[2] + ldp d5, d4, [x15], #32 + prfm pldl1keep, [x2] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x1, #16] + prfm pldl1keep, [x18] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x17, #16] + cmp x14, x22 + b.lt .LBB0_103 +.LBB0_104: // in Loop: Header=BB0_7 Depth=2 + ldr d5, [x8, x6, lsl #3] + fmla v0.2s, v4.2s, v3.s[0] + fmla v1.2s, v4.2s, v2.s[0] + ldr d4, [x8, x23, lsl #3] + ldr x16, [sp, #408] // 8-byte Folded Reload + mov x14, xzr + mov x15, xzr + add x16, x8, x16 + fmla v0.2s, v5.2s, v3.s[1] + fmla v1.2s, v5.2s, v2.s[1] + ldr d5, [x8, x24, lsl #3] + fmla v0.2s, v4.2s, v3.s[2] + fmla v1.2s, v4.2s, v2.s[2] + fmla v0.2s, v5.2s, v3.s[3] + fmla v1.2s, v5.2s, v2.s[3] + ldr x17, [sp, #1304] // 8-byte Folded Reload + add x17, x17, xzr + cmp x17, x20 + b.ge .LBB0_106 + .p2align 2 +.LBB0_105: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x17, x16, x15, lsl #3 + add x18, x25, x14 + add x1, x7, x14 + add x14, x14, #4 + add x1, x1, #4 + add x18, x18, #4 + add x17, x17, #8 + prfm pldl1keep, [x1] + ldr s2, [x7, x15, lsl #2] + prfm pldl1keep, [x18] + ldr s3, [x25, x15, lsl #2] + prfm pldl1keep, [x17] + ldr d4, [x16, x15, lsl #3] + add x15, x15, #1 + fmla v0.2s, v4.2s, v2.s[0] + fmla v1.2s, v4.2s, v3.s[0] + ldr x17, [sp, #1304] // 8-byte Folded Reload + add x17, x17, x15 + cmp x17, x20 + b.lt .LBB0_105 +.LBB0_106: // in 
Loop: Header=BB0_7 Depth=2 + str d0, [x12] + str d1, [x13] +.LBB0_107: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #1024] // 8-byte Folded Reload + ldr x13, [sp, #1128] // 8-byte Folded Reload + cmp x13, x12 + b.ge .LBB0_113 +// %bb.108: // in Loop: Header=BB0_7 Depth=2 + ldr x13, [sp, #1128] // 8-byte Folded Reload + ldr d2, [x8] + mov x12, xzr + madd x9, x13, x29, x9 + madd x10, x13, x28, x10 + ldr x13, [sp, #1280] // 8-byte Folded Reload + ldr x14, [sp, #896] // 8-byte Folded Reload + add x9, x9, x19 + lsl x10, x10, #2 + add x9, x30, x9, lsl #2 + ldr q1, [x13, x10] + ldr x10, [sp, #904] // 8-byte Folded Reload + ldr d0, [x9] + cmp xzr, x22 + b.ge .LBB0_110 + .p2align 2 +.LBB0_109: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x11, #16 + fmla v0.2s, v2.2s, v1.s[0] + add x12, x12, #4 + prfm pldl1keep, [x13] + ldp d2, d3, [x11, #-16] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v3.2s, v1.s[2] + ldp d3, d2, [x11], #32 + prfm pldl1keep, [x10] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x10, #-16] + add x10, x10, #16 + cmp x12, x22 + b.lt .LBB0_109 +.LBB0_110: // in Loop: Header=BB0_7 Depth=2 + ldr d3, [x8, x6, lsl #3] + fmla v0.2s, v2.2s, v1.s[0] + ldr x11, [sp, #408] // 8-byte Folded Reload + ldr d4, [x8, x23, lsl #3] + ldr d2, [x8, x24, lsl #3] + mov x10, xzr + add x8, x8, x11 + ldr x11, [sp, #880] // 8-byte Folded Reload + fmla v0.2s, v3.2s, v1.s[1] + fmla v0.2s, v4.2s, v1.s[2] + fmla v0.2s, v2.2s, v1.s[3] + ldr x12, [sp, #1304] // 8-byte Folded Reload + add x12, x12, xzr + cmp x12, x20 + b.ge .LBB0_112 + .p2align 2 +.LBB0_111: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x12, x8, x10, lsl #3 + prfm pldl1keep, [x11] + ldr s1, [x14, x10, lsl #2] + add x11, x11, #4 + add x12, x12, #8 + prfm pldl1keep, [x12] + ldr d2, [x8, x10, lsl #3] + add x10, x10, #1 + fmla v0.2s, v2.2s, v1.s[0] + ldr x12, [sp, #1304] // 8-byte Folded Reload + add x12, x12, x10 + cmp x12, x20 + b.lt .LBB0_111 +.LBB0_112: // in Loop: Header=BB0_7 Depth=2 + str d0, [x9] +.LBB0_113: // in Loop: Header=BB0_7 Depth=2 + bl free + ldr x30, [sp, #1080] // 8-byte Folded Reload + ldr x19, [sp, #1072] // 8-byte Folded Reload + ldr x8, [sp, #504] // 8-byte Folded Reload + ldr x9, [sp, #584] // 8-byte Folded Reload + cmp x9, x8 + b.ge .LBB0_6 +.LBB0_114: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #480] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #1016] // 8-byte Folded Reload + ldr x9, [sp, #1008] // 8-byte Folded Reload + add x10, x0, #63 + mov x12, xzr + ldr x15, [sp, #1064] // 8-byte Folded Reload + ldr x21, [sp, #584] // 8-byte Folded Reload + mov x13, xzr + mul x8, x19, x8 + ldr x14, [sp, #1256] // 8-byte Folded Reload + ldr x16, [sp, #1272] // 8-byte Folded Reload + ldr x23, [sp, #992] // 8-byte Folded Reload + ldr x30, [sp, #384] // 8-byte Folded Reload + ldp x25, x24, [sp, #392] // 16-byte Folded Reload + madd x9, x15, x9, x8 + add x8, x9, x21 + add x11, x8, x14 + add x14, x14, x29 + ldr s1, [x16, x8, lsl #2] + add x14, x8, x14 + add x8, x8, x29 + ldr s2, [x16, x11, lsl #2] + ldr x11, [sp, #1048] // 8-byte Folded Reload + ldr s3, [x16, x8, lsl #2] + ldr x8, [sp, #1040] // 8-byte Folded Reload + ldr s0, [x16, x14, lsl #2] + ldr x14, [sp, #1264] // 8-byte Folded Reload + mul x8, x19, x8 + madd x11, x15, x11, x8 + add x8, x11, x21 + ldr s16, [x14, x8, lsl #2] + and x8, x10, #0xffffffffffffffc0 + ldr x10, [sp, #1032] // 8-byte Folded Reload + ldr x14, [sp, #1056] // 8-byte 
Folded Reload + mul x10, x19, x10 + madd x10, x15, x14, x10 + ldr x15, [sp, #1280] // 8-byte Folded Reload + lsl x14, x10, #2 + ldr q4, [x15, x14] + add x14, x10, x28 + lsl x14, x14, #2 + ldr q5, [x15, x14] + add x14, x10, x28, lsl #1 + lsl x14, x14, #2 + ldr q6, [x15, x14] + ldr x15, [sp, #656] // 8-byte Folded Reload + orr x14, x8, #0xc + .p2align 2 +.LBB0_115: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x16, [sp, #888] // 8-byte Folded Reload + ext v20.16b, v4.16b, v4.16b, #8 + cmp x13, x22 + ext v19.16b, v5.16b, v5.16b, #8 + add x16, x16, x12 + prfm pldl1keep, [x16, #16] + ldr q7, [x16] + ext v18.16b, v6.16b, v6.16b, #8 + ext v17.16b, v7.16b, v7.16b, #8 + b.ge .LBB0_117 +// %bb.116: // in Loop: Header=BB0_115 Depth=3 + ldr x7, [sp, #840] // 8-byte Folded Reload + add x19, x14, x12 + ldr x16, [sp, #856] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v4.2s + ldr x18, [sp, #864] // 8-byte Folded Reload + ldr x2, [sp, #872] // 8-byte Folded Reload + fmla v3.2s, v16.2s, v5.2s + fmla v2.2s, v16.2s, v6.2s + stur s16, [x19, #-12] + fmla v0.2s, v16.2s, v7.2s + add x6, x30, x15 + add x5, x25, x15 + add x4, x24, x15 + add x13, x13, #4 + add x7, x7, x15 + add x16, x16, x12 + add x18, x18, x12 + add x2, x2, x12 + add x12, x12, #16 + prfm pldl1keep, [x7] + ldr x7, [sp, #824] // 8-byte Folded Reload + add x17, x16, #32 + add x1, x18, #32 + add x3, x2, #32 + ldr s16, [x7, x15] + stur s16, [x19, #-8] + prfm pldl1keep, [x6] + ldr x6, [sp, #816] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v4.s[1] + fmla v3.2s, v16.2s, v5.s[1] + fmla v2.2s, v16.2s, v6.s[1] + fmla v0.2s, v16.2s, v7.s[1] + ldr s16, [x6, x15] + stur s16, [x19, #-4] + prfm pldl1keep, [x5] + ldr x5, [sp, #808] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v20.2s + fmla v3.2s, v16.2s, v19.2s + fmla v2.2s, v16.2s, v18.2s + fmla v0.2s, v16.2s, v17.2s + ldr s16, [x5, x15] + str s16, [x19] + prfm pldl1keep, [x4] + ldr x4, [sp, #832] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v4.s[3] + fmla v3.2s, v16.2s, v5.s[3] + fmla v2.2s, v16.2s, v6.s[3] + fmla v0.2s, v16.2s, v7.s[3] + ldr s16, [x4, x15] + prfm pldl1keep, [x3] + ldr q4, [x2, #16] + prfm pldl1keep, [x1] + ldr q5, [x18, #16] + prfm pldl1keep, [x17] + ldr q6, [x16, #16] + ldr x16, [sp, #1240] // 8-byte Folded Reload + add x15, x15, x16 + b .LBB0_115 + .p2align 2 +.LBB0_117: // in Loop: Header=BB0_7 Depth=2 + ldr x15, [sp, #1000] // 8-byte Folded Reload + ldr x6, [sp, #984] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + ldr x16, [sp, #1264] // 8-byte Folded Reload + ldr x7, [sp, #976] // 8-byte Folded Reload + fmla v2.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + str s16, [x8, x22, lsl #2] + ldr x19, [sp, #968] // 8-byte Folded Reload + mov x12, xzr + ldr x24, [sp, #616] // 8-byte Folded Reload + ldr x25, [sp, #608] // 8-byte Folded Reload + mov x13, xzr + madd x14, x6, x15, x11 + ldr x17, [sp, #576] // 8-byte Folded Reload + ldr x18, [sp, #600] // 8-byte Folded Reload + add x14, x14, x21 + ldr x30, [sp, #648] // 8-byte Folded Reload + ldr s16, [x16, x14, lsl #2] + madd x14, x7, x15, x11 + madd x11, x19, x15, x11 + add x14, x14, x21 + add x11, x11, x21 + str s16, [x8, x6, lsl #2] + fmla v1.2s, v16.2s, v4.s[1] + fmla v3.2s, v16.2s, v5.s[1] + fmla v2.2s, v16.2s, v6.s[1] + fmla v0.2s, v16.2s, v7.s[1] + ldr s16, [x16, x14, lsl #2] + ldr x14, [sp, #1304] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v20.2s + str s16, [x8, x7, lsl #2] + fmla v3.2s, v16.2s, v19.2s + fmla v2.2s, v16.2s, v18.2s + fmla 
v0.2s, v16.2s, v17.2s + ldr s16, [x16, x11, lsl #2] + ldr x11, [sp, #512] // 8-byte Folded Reload + add x11, x8, x11 + fmla v1.2s, v16.2s, v4.s[3] + fmla v3.2s, v16.2s, v5.s[3] + fmla v2.2s, v16.2s, v6.s[3] + fmla v0.2s, v16.2s, v7.s[3] + str s16, [x8, x19, lsl #2] + cmp x14, x20 + b.ge .LBB0_119 + .p2align 2 +.LBB0_118: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x16, [sp, #1152] // 8-byte Folded Reload + add x15, x18, x13 + add x14, x14, #1 + add x16, x16, x12 + prfm pldl1keep, [x16] + ldur s4, [x16, #-4] + add x16, x16, x27 + prfm pldl1keep, [x16] + ldur s5, [x16, #-4] + add x16, x16, x27 + prfm pldl1keep, [x16] + ldur s6, [x16, #-4] + add x16, x16, x27 + prfm pldl1keep, [x16] + ldur s7, [x16, #-4] + prfm pldl1keep, [x15] + ldr s16, [x17, x13] + add x13, x13, x23 + fmla v1.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + fmla v2.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + str s16, [x11, x12] + add x12, x12, #4 + cmp x14, x20 + b.lt .LBB0_118 +.LBB0_119: // %.preheader64 + // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #448] // 8-byte Folded Reload + ldr x13, [sp, #1096] // 8-byte Folded Reload + mov x2, xzr + add x11, x8, #12 + ldr x14, [sp, #1088] // 8-byte Folded Reload + mov w17, #1 // =0x1 + mov w18, #2 // =0x2 + mov w16, #3 // =0x3 + mov w15, #4 // =0x4 + add x12, x8, x12 + b .LBB0_121 + .p2align 2 +.LBB0_120: // %.loopexit60 + // in Loop: Header=BB0_121 Depth=3 + ldr x2, [sp, #1232] // 8-byte Folded Reload + add x14, x14, x2 + add x13, x13, x2 + mov x2, x15 + mov x15, x1 +.LBB0_121: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_123 Depth 4 + // Child Loop BB0_125 Depth 4 + madd x1, x2, x29, x9 + ldr x23, [sp, #1272] // 8-byte Folded Reload + add x1, x1, x21 + madd x17, x17, x29, x9 + madd x18, x18, x29, x9 + madd x16, x16, x29, x9 + add x17, x17, x21 + add x16, x16, x21 + str s1, [x23, x1, lsl #2] + str s3, [x23, x17, lsl #2] + add x17, x18, x21 + str s2, [x23, x17, lsl #2] + str s0, [x23, x16, lsl #2] + ldr x16, [sp, #1296] // 8-byte Folded Reload + cmp x15, x16 + b.ge .LBB0_126 +// %bb.122: // in Loop: Header=BB0_121 Depth=3 + add x17, x15, #1 + add x18, x15, #2 + add x16, x15, #3 + madd x1, x15, x29, x9 + madd x3, x17, x29, x9 + ldr s16, [x8] + mov x2, xzr + add x1, x1, x21 + madd x4, x18, x29, x9 + madd x5, x16, x29, x9 + add x3, x3, x21 + add x4, x4, x21 + add x5, x5, x21 + ldr s1, [x23, x1, lsl #2] + add x1, x15, #4 + ldr s3, [x23, x3, lsl #2] + madd x3, x15, x28, x10 + lsl x3, x3, #2 + ldr s2, [x23, x4, lsl #2] + ldr x4, [sp, #1280] // 8-byte Folded Reload + ldr s0, [x23, x5, lsl #2] + ldr q7, [x4, x3] + madd x3, x17, x28, x10 + lsl x3, x3, #2 + ldr q6, [x4, x3] + madd x3, x18, x28, x10 + lsl x3, x3, #2 + ldr q5, [x4, x3] + madd x3, x16, x28, x10 + lsl x3, x3, #2 + ldr q4, [x4, x3] + mov x3, x11 + mov x4, x14 + ext v20.16b, v7.16b, v7.16b, #8 + cmp xzr, x22 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.ge .LBB0_124 + .p2align 2 +.LBB0_123: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_121 Depth=3 + // => This Inner Loop Header: Depth=4 + add x5, x3, #8 + fmla v1.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + add x2, x2, #4 + fmla v2.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + prfm pldl1keep, [x5] + add x5, x4, x27 + ldp s16, s21, [x3, #-8] + fmla v0.2s, v16.2s, v4.s[1] + fmla v1.2s, v16.2s, v7.s[1] + fmla v3.2s, v16.2s, v6.s[1] + fmla v2.2s, 
v16.2s, v5.s[1] + fmla v0.2s, v21.2s, v17.2s + fmla v1.2s, v21.2s, v20.2s + ldp s17, s16, [x3], #16 + fmla v3.2s, v21.2s, v19.2s + fmla v2.2s, v21.2s, v18.2s + prfm pldl1keep, [x4] + fmla v1.2s, v17.2s, v7.s[3] + ldur q7, [x4, #-16] + prfm pldl1keep, [x5] + fmla v3.2s, v17.2s, v6.s[3] + ldur q6, [x5, #-16] + add x5, x5, x27 + fmla v2.2s, v17.2s, v5.s[3] + fmla v0.2s, v17.2s, v4.s[3] + add x4, x4, #16 + prfm pldl1keep, [x5] + ldur q5, [x5, #-16] + add x5, x5, x27 + prfm pldl1keep, [x5] + ldur q4, [x5, #-16] + ext v20.16b, v7.16b, v7.16b, #8 + cmp x2, x22 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.lt .LBB0_123 +.LBB0_124: // in Loop: Header=BB0_121 Depth=3 + ldr s21, [x8, x6, lsl #2] + fmla v1.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + fmla v2.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + ldr s16, [x8, x7, lsl #2] + ldr s22, [x8, x19, lsl #2] + ldr x4, [sp, #1304] // 8-byte Folded Reload + mov x2, x13 + mov x3, x12 + fmla v1.2s, v21.2s, v7.s[1] + fmla v3.2s, v21.2s, v6.s[1] + fmla v2.2s, v21.2s, v5.s[1] + fmla v0.2s, v21.2s, v4.s[1] + fmla v1.2s, v16.2s, v20.2s + fmla v3.2s, v16.2s, v19.2s + fmla v2.2s, v16.2s, v18.2s + fmla v0.2s, v16.2s, v17.2s + fmla v1.2s, v22.2s, v7.s[3] + fmla v3.2s, v22.2s, v6.s[3] + fmla v2.2s, v22.2s, v5.s[3] + fmla v0.2s, v22.2s, v4.s[3] + cmp x4, x20 + b.ge .LBB0_120 + .p2align 2 +.LBB0_125: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_121 Depth=3 + // => This Inner Loop Header: Depth=4 + add x5, x2, x27 + prfm pldl1keep, [x2] + ldur s4, [x2, #-4] + add x4, x4, #1 + prfm pldl1keep, [x5] + ldur s5, [x5, #-4] + add x5, x5, x27 + add x2, x2, #4 + prfm pldl1keep, [x5] + ldur s6, [x5, #-4] + add x5, x5, x27 + prfm pldl1keep, [x5] + ldur s7, [x5, #-4] + prfm pldl1keep, [x3] + ldur s16, [x3, #-4] + add x3, x3, #4 + fmla v1.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + fmla v2.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + cmp x4, x20 + b.lt .LBB0_125 + b .LBB0_120 + .p2align 2 +.LBB0_126: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #1296] // 8-byte Folded Reload + ldr x13, [sp, #1128] // 8-byte Folded Reload + cmp x12, x13 + b.ge .LBB0_132 +// %bb.127: // in Loop: Header=BB0_7 Depth=2 + ldr x16, [sp, #1296] // 8-byte Folded Reload + ldr x17, [sp, #1280] // 8-byte Folded Reload + mov x14, xzr + mov x15, xzr + ldr s4, [x8] + madd x12, x16, x28, x10 + add x13, x16, #1 + lsl x12, x12, #2 + ldr q3, [x17, x12] + madd x12, x13, x28, x10 + madd x13, x13, x29, x9 + lsl x12, x12, #2 + add x13, x13, x21 + ldr q2, [x17, x12] + madd x12, x16, x29, x9 + ldr x16, [sp, #1272] // 8-byte Folded Reload + add x12, x12, x21 + ldr s0, [x16, x13, lsl #2] + ldr s1, [x16, x12, lsl #2] + ext v6.16b, v3.16b, v3.16b, #8 + cmp xzr, x22 + ext v5.16b, v2.16b, v2.16b, #8 + b.ge .LBB0_129 + .p2align 2 +.LBB0_128: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x2, x8, x14 + fmla v1.2s, v4.2s, v3.2s + fmla v0.2s, v4.2s, v2.2s + add x16, x25, x14 + add x3, x2, #20 + add x18, x24, x14 + add x17, x16, #32 + add x1, x18, #32 + prfm pldl1keep, [x3] + ldp s4, s7, [x2, #4] + add x15, x15, #4 + add x14, x14, #16 + fmla v0.2s, v4.2s, v2.s[1] + fmla v1.2s, v4.2s, v3.s[1] + fmla v0.2s, v7.2s, v5.2s + ldp s5, s4, [x2, #12] + fmla v1.2s, v7.2s, v6.2s + prfm pldl1keep, [x1] + fmla v1.2s, v5.2s, v3.s[3] + ldr q3, [x18, #16] + prfm pldl1keep, [x17] + fmla v0.2s, v5.2s, v2.s[3] + ldr q2, [x16, #16] + ext v6.16b, v3.16b, v3.16b, #8 + cmp x15, x22 + ext 
v5.16b, v2.16b, v2.16b, #8 + b.lt .LBB0_128 +.LBB0_129: // in Loop: Header=BB0_7 Depth=2 + ldr s7, [x8, x6, lsl #2] + fmla v1.2s, v4.2s, v3.2s + fmla v0.2s, v4.2s, v2.2s + ldr s4, [x8, x7, lsl #2] + ldr x15, [sp, #512] // 8-byte Folded Reload + ldr x16, [sp, #1304] // 8-byte Folded Reload + mov x14, xzr + add x15, x8, x15 + fmla v1.2s, v7.2s, v3.s[1] + fmla v0.2s, v7.2s, v2.s[1] + ldr s7, [x8, x19, lsl #2] + fmla v1.2s, v4.2s, v6.2s + fmla v0.2s, v4.2s, v5.2s + fmla v1.2s, v7.2s, v3.s[3] + fmla v0.2s, v7.2s, v2.s[3] + cmp x16, x20 + b.ge .LBB0_131 + .p2align 2 +.LBB0_130: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x2, [sp, #680] // 8-byte Folded Reload + add x17, x15, x14 + add x1, x30, x14 + add x16, x16, #1 + add x17, x17, #4 + add x1, x1, #4 + prfm pldl1keep, [x1] + add x18, x2, x14 + add x18, x18, #4 + prfm pldl1keep, [x18] + ldr s2, [x30, x14] + prfm pldl1keep, [x17] + ldr s3, [x15, x14] + fmla v1.2s, v3.2s, v2.2s + ldr s2, [x2, x14] + add x14, x14, #4 + fmla v0.2s, v3.2s, v2.2s + cmp x16, x20 + b.lt .LBB0_130 +.LBB0_131: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #1272] // 8-byte Folded Reload + str s1, [x14, x12, lsl #2] + str s0, [x14, x13, lsl #2] +.LBB0_132: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #1024] // 8-byte Folded Reload + ldr x13, [sp, #1128] // 8-byte Folded Reload + cmp x13, x12 + b.ge .LBB0_5 +// %bb.133: // in Loop: Header=BB0_7 Depth=2 + ldr x13, [sp, #1128] // 8-byte Folded Reload + ldr x15, [sp, #1272] // 8-byte Folded Reload + mov x12, xzr + madd x9, x13, x29, x9 + madd x10, x13, x28, x10 + ldr x13, [sp, #1280] // 8-byte Folded Reload + ldr s2, [x8] + ldr x14, [sp, #896] // 8-byte Folded Reload + add x9, x9, x21 + lsl x10, x10, #2 + ldr s0, [x15, x9, lsl #2] + ldr q1, [x13, x10] + ldr x10, [sp, #904] // 8-byte Folded Reload + ext v3.16b, v1.16b, v1.16b, #8 + cmp xzr, x22 + b.ge .LBB0_135 + .p2align 2 +.LBB0_134: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x11, #8 + fmla v0.2s, v2.2s, v1.2s + add x12, x12, #4 + prfm pldl1keep, [x13] + ldp s2, s4, [x11, #-8] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v4.2s, v3.2s + ldp s3, s2, [x11], #16 + prfm pldl1keep, [x10] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x10, #-16] + add x10, x10, #16 + ext v3.16b, v1.16b, v1.16b, #8 + cmp x12, x22 + b.lt .LBB0_134 +.LBB0_135: // in Loop: Header=BB0_7 Depth=2 + ldr s4, [x8, x6, lsl #2] + fmla v0.2s, v2.2s, v1.2s + ldr x11, [sp, #512] // 8-byte Folded Reload + ldr s5, [x8, x7, lsl #2] + ldr s2, [x8, x19, lsl #2] + mov x10, xzr + add x8, x8, x11 + ldr x11, [sp, #1304] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[1] + fmla v0.2s, v5.2s, v3.2s + fmla v0.2s, v2.2s, v1.s[3] + cmp x11, x20 + b.ge .LBB0_4 + .p2align 2 +.LBB0_136: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x12, x8, x10 + add x13, x14, x10 + add x11, x11, #1 + add x12, x12, #4 + add x13, x13, #4 + prfm pldl1keep, [x13] + prfm pldl1keep, [x12] + ldr s1, [x8, x10] + ldr s2, [x14, x10] + add x10, x10, #4 + fmla v0.2s, v1.2s, v2.2s + cmp x11, x20 + b.lt .LBB0_136 + b .LBB0_4 +.LBB0_137: + ldr x0, [sp, #16] // 8-byte Folded Reload + bl free + add sp, sp, #1312 + ldp d9, d8, [sp, #48] // 16-byte Folded Reload + ldp d11, d10, [sp, #32] // 16-byte Folded Reload + ldp d13, d12, [sp, #16] // 16-byte Folded Reload + ldp x20, x19, [sp, #144] // 16-byte Folded Reload + ldp x22, x21, [sp, #128] // 16-byte Folded Reload + ldp x24, 
x23, [sp, #112] // 16-byte Folded Reload + ldp x26, x25, [sp, #96] // 16-byte Folded Reload + ldp x28, x27, [sp, #80] // 16-byte Folded Reload + ldp x29, x30, [sp, #64] // 16-byte Folded Reload + ldp d15, d14, [sp], #160 // 16-byte Folded Reload + ret +.Lfunc_end0: + .size sbatch_matmul_4d_nn_mlir, .Lfunc_end0-sbatch_matmul_4d_nn_mlir + .cfi_endproc + // -- End function + .section ".note.GNU-stack","",@progbits diff --git a/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nt_mlir.s b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nt_mlir.s new file mode 100644 index 00000000000000..89f885cbd35df1 --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sbatch_matmul_4d_nt_mlir.s @@ -0,0 +1,3208 @@ + .text + .file "LLVMDialectModule" + .globl sbatch_matmul_4d_nt_mlir // -- Begin function sbatch_matmul_4d_nt_mlir + .p2align 4 + .type sbatch_matmul_4d_nt_mlir,@function +sbatch_matmul_4d_nt_mlir: // @sbatch_matmul_4d_nt_mlir + .cfi_startproc +// %bb.0: + stp d15, d14, [sp, #-160]! // 16-byte Folded Spill + stp d13, d12, [sp, #16] // 16-byte Folded Spill + stp x29, x30, [sp, #64] // 16-byte Folded Spill + stp x28, x27, [sp, #80] // 16-byte Folded Spill + stp x26, x25, [sp, #96] // 16-byte Folded Spill + stp x24, x23, [sp, #112] // 16-byte Folded Spill + stp x22, x21, [sp, #128] // 16-byte Folded Spill + stp x20, x19, [sp, #144] // 16-byte Folded Spill + stp d11, d10, [sp, #32] // 16-byte Folded Spill + stp d9, d8, [sp, #48] // 16-byte Folded Spill + sub sp, sp, #688 + .cfi_def_cfa_offset 848 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -136 + .cfi_offset b13, -144 + .cfi_offset b14, -152 + .cfi_offset b15, -160 + cmp x5, #0 + ldr x13, [sp, #912] + ldr x14, [sp, #848] + mov x20, x6 + cinv x8, x5, lt + ldr x28, [sp, #1032] + ldr x22, [sp, #856] + mov x27, x2 + add x9, x8, x8, lsr #63 + add x10, x8, #3 + ldr x25, [sp, #944] + str x7, [sp, #664] // 8-byte Folded Spill + stp x13, x4, [sp, #296] // 16-byte Folded Spill + str x3, [sp, #32] // 8-byte Folded Spill + mov x19, x1 + asr x9, x9, #1 + str x14, [sp, #656] // 8-byte Folded Spill + str x5, [sp, #528] // 8-byte Folded Spill + cinv x21, x9, lt + ldr x9, [sp, #1024] + cmp x8, #0 + csel x8, x10, x8, lt + ldr x10, [sp, #976] + cmp x5, #0 + asr x8, x8, #2 + cinv x29, x8, lt + cmp x13, #0 + str x9, [sp, #520] // 8-byte Folded Spill + ldr x9, [sp, #1016] + cinv x8, x13, lt + add x11, x8, #7 + add x12, x8, #3 + str x9, [sp, #512] // 8-byte Folded Spill + ldr x9, [sp, #968] + stp x9, x10, [sp, #480] // 16-byte Folded Spill + add x9, x8, x8, lsr #63 + add x10, x8, #15 + asr x9, x9, #1 + cinv x14, x9, lt + cmp x8, #0 + csel x9, x10, x8, lt + csel x10, x11, x8, lt + ldr x11, [sp, #888] + csel x8, x12, x8, lt + cmp x13, #0 + str x14, [sp, #616] // 8-byte Folded Spill + asr x9, x9, #4 + asr x10, x10, #3 + asr x8, x8, #2 + cinv x24, x9, lt + cinv x26, x10, lt + cinv x23, x8, lt + lsl x8, x24, #4 + str x11, [sp, #672] // 8-byte Folded Spill + ldr x11, [sp, #880] + str x8, [sp, #568] // 8-byte Folded Spill + lsl x8, x23, #2 + str x11, [sp, #648] // 8-byte Folded Spill + ldr x11, [sp, #936] + str x11, [sp, #632] // 8-byte Folded Spill + ldr x11, [sp, #928] + str 
x11, [sp, #624] // 8-byte Folded Spill + lsl x11, x26, #3 + stp x8, x11, [sp, #440] // 16-byte Folded Spill + lsl x8, x14, #1 + str x8, [sp, #432] // 8-byte Folded Spill + lsl x8, x6, #6 + add x0, x8, #64 + str x8, [sp, #640] // 8-byte Folded Spill + bl malloc + add x12, x0, #63 + mul x9, x24, x25 + ldr x1, [sp, #672] // 8-byte Folded Reload + ldr x2, [sp, #648] // 8-byte Folded Reload + and x24, x12, #0xffffffffffffffc0 + ldr x12, [sp, #624] // 8-byte Folded Reload + mul x15, x21, x22 + lsl x8, x29, #2 + str x8, [sp, #680] // 8-byte Folded Spill + lsl x8, x21, #1 + mov w11, #1 // =0x1 + str x8, [sp, #592] // 8-byte Folded Spill + negs x8, x20 + bfi x11, x29, #2, #62 + and x10, x20, #0x3 + lsl x21, x22, #2 + str x0, [sp, #8] // 8-byte Folded Spill + mul x18, x22, x11 + and x8, x8, #0x3 + add x11, x20, x15, lsl #1 + lsl x12, x12, #2 + lsl x0, x27, #2 + mov w14, #1 // =0x1 + add x1, x2, x1, lsl #2 + str x12, [sp, #24] // 8-byte Folded Spill + ldr x12, [sp, #632] // 8-byte Folded Reload + csneg x8, x10, x8, mi + mul x10, x26, x25 + bfi x14, x23, #2, #62 + add x2, x0, x19 + mul x16, x25, x14 + add x4, x1, #4 + add x5, x2, #4 + add x9, x4, x9, lsl #6 + mul x17, x29, x22 + sub x29, x20, x8 + mul x13, x23, x25 + lsl x23, x22, #4 + add x2, x2, x23 + add x2, x2, #32 + lsl x12, x12, #2 + str x2, [sp, #152] // 8-byte Folded Spill + sub x2, x24, x8, lsl #6 + ldr x6, [sp, #640] // 8-byte Folded Reload + str x12, [sp, #288] // 8-byte Folded Spill + lsl x12, x25, #6 + add x18, x0, x18, lsl #2 + str x27, [sp, #504] // 8-byte Folded Spill + str x12, [sp, #472] // 8-byte Folded Spill + add x12, x20, x21 + add x17, x0, x17, lsl #4 + lsl x27, x25, #2 + sub x14, x12, x8 + sub x12, x11, x8 + add x15, x0, x15, lsl #3 + lsl x0, x8, #2 + add x12, x5, x12, lsl #2 + ldr x11, [sp, #616] // 8-byte Folded Reload + add x2, x2, x6 + add x6, x19, x18 + add x16, x1, x16, lsl #2 + add x7, x19, x17 + str xzr, [sp, #176] // 8-byte Folded Spill + mov x3, xzr + stp x9, x12, [sp, #96] // 16-byte Folded Spill + add x9, x4, x10, lsl #5 + lsl x10, x20, #4 + lsl x12, x20, #3 + add x13, x1, x13, lsl #4 + stp x13, x16, [sp, #136] // 16-byte Folded Spill + add x13, x15, x19 + add x13, x13, #32 + str x9, [sp, #88] // 8-byte Folded Spill + lsl x9, x20, #5 + mul x11, x11, x25 + lsl x25, x20, #2 + stp x10, x9, [sp, #248] // 16-byte Folded Spill + sub x9, x9, x8, lsl #5 + str x13, [sp, #128] // 8-byte Folded Spill + add x13, x15, x25 + sub x10, x10, x8, lsl #4 + sub x13, x13, x0 + add x18, x18, x25 + add x17, x17, x25 + stp x9, x12, [sp, #232] // 16-byte Folded Spill + sub x8, x12, x8, lsl #3 + sub x12, x29, #3 + add x13, x19, x13 + str x12, [sp, #648] // 8-byte Folded Spill + sub x12, x29, #2 + add x14, x5, x14, lsl #2 + sub x18, x18, x0 + str x12, [sp, #640] // 8-byte Folded Spill + sub x12, x29, #1 + sub x17, x17, x0 + stp x13, x14, [sp, #112] // 16-byte Folded Spill + str x12, [sp, #632] // 8-byte Folded Spill + ldr x12, [sp, #664] // 8-byte Folded Reload + sub x13, x25, x0 + ldr x0, [sp, #568] // 8-byte Folded Reload + add x9, x9, #32 + add x11, x4, x11, lsl #3 + stp x9, x8, [sp, #216] // 16-byte Folded Spill + str x10, [sp, #184] // 8-byte Folded Spill + add x10, x10, #16 + add x8, x8, #8 + add x18, x19, x18 + add x17, x19, x17 + stp x8, x10, [sp, #200] // 16-byte Folded Spill + add x8, x13, #4 + str x19, [sp, #496] // 8-byte Folded Spill + stp x13, x25, [sp, #264] // 16-byte Folded Spill + lsl x12, x12, #2 + stp x5, x4, [sp, #160] // 16-byte Folded Spill + str x18, [sp, #376] // 8-byte Folded Spill + sub x19, x29, #4 + str x12, [sp, 
#16] // 8-byte Folded Spill + ldr x12, [sp, #656] // 8-byte Folded Reload + mov x9, x11 + str x8, [sp, #192] // 8-byte Folded Spill + str x7, [sp, #600] // 8-byte Folded Spill + str x6, [sp, #608] // 8-byte Folded Spill + lsl x12, x12, #2 + stp x17, x6, [sp, #72] // 16-byte Folded Spill + stp x2, x17, [sp, #360] // 16-byte Folded Spill + str x18, [sp, #64] // 8-byte Folded Spill + str x12, [sp, #280] // 8-byte Folded Spill + add x12, x24, #256 + str x12, [sp, #624] // 8-byte Folded Spill + add x12, x2, #64 + str x12, [sp, #616] // 8-byte Folded Spill + b .LBB0_2 + .p2align 2 +.LBB0_1: // %.loopexit40 + // in Loop: Header=BB0_2 Depth=1 + ldp x10, x9, [sp, #16] // 16-byte Folded Reload + ldr x8, [sp, #168] // 8-byte Folded Reload + add x8, x8, x9 + ldr x3, [sp, #40] // 8-byte Folded Reload + str x8, [sp, #168] // 8-byte Folded Spill + ldr x8, [sp, #160] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #160] // 8-byte Folded Spill + ldr x8, [sp, #152] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #152] // 8-byte Folded Spill + ldr x8, [sp, #120] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #120] // 8-byte Folded Spill + ldr x8, [sp, #176] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #176] // 8-byte Folded Spill + ldr x8, [sp, #128] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #128] // 8-byte Folded Spill + ldr x8, [sp, #104] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #104] // 8-byte Folded Spill + ldr x8, [sp, #112] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #112] // 8-byte Folded Spill + ldr x8, [sp, #96] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #96] // 8-byte Folded Spill + ldr x8, [sp, #64] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #64] // 8-byte Folded Spill + ldr x8, [sp, #72] // 8-byte Folded Reload + add x8, x8, x10 + str x8, [sp, #72] // 8-byte Folded Spill + ldr x8, [sp, #88] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #88] // 8-byte Folded Spill + ldr x8, [sp, #144] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #144] // 8-byte Folded Spill + ldr x8, [sp, #136] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #136] // 8-byte Folded Spill + ldp x7, x8, [sp, #48] // 16-byte Folded Reload + add x9, x8, x9 + ldr x8, [sp, #80] // 8-byte Folded Reload + add x7, x7, x10 + add x8, x8, x10 + str x8, [sp, #80] // 8-byte Folded Spill +.LBB0_2: // =>This Loop Header: Depth=1 + // Child Loop BB0_7 Depth 2 + // Child Loop BB0_11 Depth 3 + // Child Loop BB0_13 Depth 4 + // Child Loop BB0_16 Depth 4 + // Child Loop BB0_18 Depth 5 + // Child Loop BB0_20 Depth 5 + // Child Loop BB0_23 Depth 4 + // Child Loop BB0_25 Depth 4 + // Child Loop BB0_29 Depth 4 + // Child Loop BB0_31 Depth 4 + // Child Loop BB0_37 Depth 3 + // Child Loop BB0_40 Depth 3 + // Child Loop BB0_42 Depth 4 + // Child Loop BB0_44 Depth 4 + // Child Loop BB0_47 Depth 3 + // Child Loop BB0_49 Depth 3 + // Child Loop BB0_53 Depth 3 + // Child Loop BB0_55 Depth 3 + // Child Loop BB0_59 Depth 3 + // Child Loop BB0_62 Depth 3 + // Child Loop BB0_64 Depth 4 + // Child Loop BB0_66 Depth 4 + // Child Loop BB0_69 Depth 3 + // Child Loop BB0_71 Depth 3 + // Child Loop BB0_75 Depth 3 + // Child Loop BB0_77 Depth 3 + // Child Loop BB0_81 Depth 3 + // Child Loop BB0_84 Depth 3 + // Child Loop BB0_86 Depth 4 + // Child Loop BB0_88 Depth 4 + // Child Loop BB0_91 Depth 3 + // Child Loop BB0_93 Depth 3 + // Child Loop BB0_97 Depth 3 + // Child Loop BB0_99 Depth 3 + // Child Loop BB0_103 Depth 3 + // Child Loop 
BB0_106 Depth 3 + // Child Loop BB0_108 Depth 4 + // Child Loop BB0_110 Depth 4 + // Child Loop BB0_113 Depth 3 + // Child Loop BB0_115 Depth 3 + // Child Loop BB0_119 Depth 3 + // Child Loop BB0_121 Depth 3 + ldr x8, [sp, #32] // 8-byte Folded Reload + cmp x3, x8 + b.ge .LBB0_122 +// %bb.3: // in Loop: Header=BB0_2 Depth=1 + add x8, x3, #1 + str x9, [sp, #56] // 8-byte Folded Spill + mov x25, xzr + str x9, [sp, #328] // 8-byte Folded Spill + stp x8, x7, [sp, #40] // 16-byte Folded Spill + ldr x8, [sp, #80] // 8-byte Folded Reload + str x3, [sp, #672] // 8-byte Folded Spill + stp x8, x7, [sp, #336] // 16-byte Folded Spill + ldp x9, x8, [sp, #136] // 16-byte Folded Reload + stp x8, x9, [sp, #400] // 16-byte Folded Spill + ldr x9, [sp, #88] // 8-byte Folded Reload + ldp x11, x10, [sp, #64] // 16-byte Folded Reload + str x10, [sp, #352] // 8-byte Folded Spill + ldp x10, x8, [sp, #96] // 16-byte Folded Reload + stp x8, x11, [sp, #416] // 16-byte Folded Spill + ldr x8, [sp, #128] // 8-byte Folded Reload + stp x10, x9, [sp, #384] // 16-byte Folded Spill + str x8, [sp, #456] // 8-byte Folded Spill + ldp x12, x8, [sp, #168] // 16-byte Folded Reload + str x8, [sp, #536] // 8-byte Folded Spill + ldp x11, x8, [sp, #112] // 16-byte Folded Reload + str x8, [sp, #552] // 8-byte Folded Spill + ldp x8, x16, [sp, #152] // 16-byte Folded Reload + str x8, [sp, #544] // 8-byte Folded Spill + b .LBB0_7 + .p2align 2 +.LBB0_4: // in Loop: Header=BB0_7 Depth=2 + str s0, [x6, x9, lsl #2] +.LBB0_5: // in Loop: Header=BB0_7 Depth=2 + bl free + ldr x16, [sp, #560] // 8-byte Folded Reload +.LBB0_6: // %.backedge41 + // in Loop: Header=BB0_7 Depth=2 + ldp x9, x8, [sp, #280] // 16-byte Folded Reload + ldr x10, [sp, #544] // 8-byte Folded Reload + add x10, x10, x9 + ldp x25, x12, [sp, #312] // 16-byte Folded Reload + add x12, x12, x8 + ldr x0, [sp, #568] // 8-byte Folded Reload + add x16, x16, x9 + str x10, [sp, #544] // 8-byte Folded Spill + ldr x10, [sp, #552] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #552] // 8-byte Folded Spill + ldr x10, [sp, #536] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #536] // 8-byte Folded Spill + ldp x10, x11, [sp, #456] // 16-byte Folded Reload + add x10, x10, x9 + add x11, x11, x9 + str x10, [sp, #456] // 8-byte Folded Spill + ldr x10, [sp, #416] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #416] // 8-byte Folded Spill + ldr x10, [sp, #384] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #384] // 8-byte Folded Spill + ldr x10, [sp, #424] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #424] // 8-byte Folded Spill + ldr x10, [sp, #352] // 8-byte Folded Reload + add x10, x10, x9 + str x10, [sp, #352] // 8-byte Folded Spill + ldr x10, [sp, #392] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #392] // 8-byte Folded Spill + ldr x10, [sp, #400] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #400] // 8-byte Folded Spill + ldr x10, [sp, #408] // 8-byte Folded Reload + add x10, x10, x8 + str x10, [sp, #408] // 8-byte Folded Spill + ldr x10, [sp, #328] // 8-byte Folded Reload + add x10, x10, x8 + ldr x8, [sp, #336] // 8-byte Folded Reload + add x8, x8, x9 + stp x10, x8, [sp, #328] // 16-byte Folded Spill + ldr x8, [sp, #344] // 8-byte Folded Reload + add x8, x8, x9 + str x8, [sp, #344] // 8-byte Folded Spill +.LBB0_7: // Parent Loop BB0_2 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_11 Depth 3 + // Child Loop BB0_13 Depth 4 + // Child Loop BB0_16 Depth 4 + // Child Loop BB0_18 Depth 5 + // 
Child Loop BB0_20 Depth 5 + // Child Loop BB0_23 Depth 4 + // Child Loop BB0_25 Depth 4 + // Child Loop BB0_29 Depth 4 + // Child Loop BB0_31 Depth 4 + // Child Loop BB0_37 Depth 3 + // Child Loop BB0_40 Depth 3 + // Child Loop BB0_42 Depth 4 + // Child Loop BB0_44 Depth 4 + // Child Loop BB0_47 Depth 3 + // Child Loop BB0_49 Depth 3 + // Child Loop BB0_53 Depth 3 + // Child Loop BB0_55 Depth 3 + // Child Loop BB0_59 Depth 3 + // Child Loop BB0_62 Depth 3 + // Child Loop BB0_64 Depth 4 + // Child Loop BB0_66 Depth 4 + // Child Loop BB0_69 Depth 3 + // Child Loop BB0_71 Depth 3 + // Child Loop BB0_75 Depth 3 + // Child Loop BB0_77 Depth 3 + // Child Loop BB0_81 Depth 3 + // Child Loop BB0_84 Depth 3 + // Child Loop BB0_86 Depth 4 + // Child Loop BB0_88 Depth 4 + // Child Loop BB0_91 Depth 3 + // Child Loop BB0_93 Depth 3 + // Child Loop BB0_97 Depth 3 + // Child Loop BB0_99 Depth 3 + // Child Loop BB0_103 Depth 3 + // Child Loop BB0_106 Depth 3 + // Child Loop BB0_108 Depth 4 + // Child Loop BB0_110 Depth 4 + // Child Loop BB0_113 Depth 3 + // Child Loop BB0_115 Depth 3 + // Child Loop BB0_119 Depth 3 + // Child Loop BB0_121 Depth 3 + ldr x8, [sp, #304] // 8-byte Folded Reload + cmp x25, x8 + b.ge .LBB0_1 +// %bb.8: // in Loop: Header=BB0_7 Depth=2 + mov x10, xzr + add x8, x25, #1 + mov x1, x12 + str x11, [sp, #464] // 8-byte Folded Spill + stp x8, x12, [sp, #312] // 16-byte Folded Spill + str x16, [sp, #560] // 8-byte Folded Spill + b .LBB0_11 + .p2align 2 +.LBB0_9: // in Loop: Header=BB0_11 Depth=3 + stp q3, q2, [x10] + stp q1, q0, [x10, #32] +.LBB0_10: // %.backedge + // in Loop: Header=BB0_11 Depth=3 + ldr x8, [sp, #472] // 8-byte Folded Reload + ldr x1, [sp, #584] // 8-byte Folded Reload + add x1, x1, x8 + ldr x10, [sp, #576] // 8-byte Folded Reload + ldr x16, [sp, #560] // 8-byte Folded Reload +.LBB0_11: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_13 Depth 4 + // Child Loop BB0_16 Depth 4 + // Child Loop BB0_18 Depth 5 + // Child Loop BB0_20 Depth 5 + // Child Loop BB0_23 Depth 4 + // Child Loop BB0_25 Depth 4 + // Child Loop BB0_29 Depth 4 + // Child Loop BB0_31 Depth 4 + ldp x9, x8, [sp, #496] // 16-byte Folded Reload + cmp x10, x0 + add x26, x9, x8, lsl #2 + b.ge .LBB0_32 +// %bb.12: // in Loop: Header=BB0_11 Depth=3 + ldr x8, [sp, #520] // 8-byte Folded Reload + ldr x11, [sp, #672] // 8-byte Folded Reload + mov x13, xzr + mul x9, x25, x8 + ldr x8, [sp, #512] // 8-byte Folded Reload + madd x12, x11, x8, x9 + ldp x9, x8, [sp, #480] // 16-byte Folded Reload + add x11, x9, x8, lsl #2 + add x14, x12, x10 + add x8, x10, #16 + add x15, x14, x28 + str x8, [sp, #576] // 8-byte Folded Spill + add x15, x11, x15, lsl #2 + add x9, x11, x14, lsl #2 + ldp q3, q1, [x15, #32] + ldp q5, q4, [x15] + lsl x15, x28, #1 + ldp q17, q6, [x9, #32] + ldp q2, q0, [x9] + add x9, x14, x15 + add x15, x15, x28 + add x14, x14, x15 + add x9, x11, x9, lsl #2 + mov x15, x1 + add x14, x11, x14, lsl #2 + ldp q18, q7, [x9, #32] + ldp q21, q20, [x9] + ldp q19, q16, [x14, #32] + ldp q23, q22, [x14] + mov x14, x16 + cmp xzr, x20 + b.ge .LBB0_14 + .p2align 2 +.LBB0_13: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + add x16, x14, x21 + prfm pldl1keep, [x14] + ldur s27, [x14, #-4] + add x14, x14, #4 + add x17, x16, x21 + prfm pldl1keep, [x16] + ldur s28, [x16, #-4] + add x16, x15, x27 + add x18, x17, x21 + prfm pldl1keep, [x17] + ldur s26, [x17, #-4] + sub x17, x16, #4 + 
prfm pldl1keep, [x18] + ldur s25, [x18, #-4] + add x18, x16, x27 + prfm pldl1keep, [x15] + ldur s24, [x15, #-4] + add x15, x15, #4 + prfm pldl1keep, [x16] + sub x16, x18, #4 + prfm pldl1keep, [x18] + ld1 { v24.s }[1], [x17] + add x17, x18, x27 + prfm pldl1keep, [x17] + ld1 { v24.s }[2], [x16] + add x16, x17, x27 + sub x17, x17, #4 + prfm pldl1keep, [x16] + ldur s29, [x16, #-4] + add x16, x16, x27 + sub x18, x16, #4 + add x0, x16, x27 + ld1 { v24.s }[3], [x17] + prfm pldl1keep, [x16] + prfm pldl1keep, [x0] + ld1 { v29.s }[1], [x18] + sub x16, x0, #4 + add x17, x0, x27 + prfm pldl1keep, [x17] + fmla v2.4s, v24.4s, v27.s[0] + ld1 { v29.s }[2], [x16] + add x16, x17, x27 + sub x17, x17, #4 + fmla v5.4s, v24.4s, v28.s[0] + fmla v21.4s, v24.4s, v26.s[0] + fmla v23.4s, v24.4s, v25.s[0] + prfm pldl1keep, [x16] + ldur s30, [x16, #-4] + add x16, x16, x27 + sub x18, x16, #4 + add x0, x16, x27 + ld1 { v29.s }[3], [x17] + prfm pldl1keep, [x16] + prfm pldl1keep, [x0] + ld1 { v30.s }[1], [x18] + sub x16, x0, #4 + add x17, x0, x27 + prfm pldl1keep, [x17] + ld1 { v30.s }[2], [x16] + add x16, x17, x27 + sub x17, x17, #4 + fmla v0.4s, v29.4s, v27.s[0] + fmla v4.4s, v29.4s, v28.s[0] + fmla v20.4s, v29.4s, v26.s[0] + fmla v22.4s, v29.4s, v25.s[0] + prfm pldl1keep, [x16] + ldur s31, [x16, #-4] + add x16, x16, x27 + sub x18, x16, #4 + add x0, x16, x27 + ld1 { v30.s }[3], [x17] + prfm pldl1keep, [x16] + prfm pldl1keep, [x0] + ld1 { v31.s }[1], [x18] + sub x16, x0, #4 + add x17, x0, x27 + prfm pldl1keep, [x17] + fmla v17.4s, v30.4s, v27.s[0] + ld1 { v31.s }[2], [x16] + sub x16, x17, #4 + fmla v3.4s, v30.4s, v28.s[0] + fmla v18.4s, v30.4s, v26.s[0] + fmla v19.4s, v30.4s, v25.s[0] + ld1 { v31.s }[3], [x16] + add x16, x24, x13, lsl #6 + add x13, x13, #1 + stp q24, q29, [x16] + fmla v6.4s, v31.4s, v27.s[0] + fmla v1.4s, v31.4s, v28.s[0] + fmla v7.4s, v31.4s, v26.s[0] + fmla v16.4s, v31.4s, v25.s[0] + stp q30, q31, [x16, #32] + cmp x13, x20 + b.lt .LBB0_13 +.LBB0_14: // %.preheader + // in Loop: Header=BB0_11 Depth=3 + ldr x16, [sp, #552] // 8-byte Folded Reload + ldr x17, [sp, #544] // 8-byte Folded Reload + mov x13, xzr + mov w2, #2 // =0x2 + str x1, [sp, #584] // 8-byte Folded Spill + mov w1, #1 // =0x1 + mov w0, #3 // =0x3 + mov w18, #4 // =0x4 + b .LBB0_16 + .p2align 2 +.LBB0_15: // %.loopexit + // in Loop: Header=BB0_16 Depth=4 + add x17, x17, x23 + add x16, x16, x23 + mov x13, x18 + mov x18, x3 +.LBB0_16: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Loop Header: Depth=4 + // Child Loop BB0_18 Depth 5 + // Child Loop BB0_20 Depth 5 + ldr x8, [sp, #680] // 8-byte Folded Reload + madd x13, x13, x28, x12 + cmp x18, x8 + madd x14, x1, x28, x12 + madd x15, x2, x28, x12 + ldr x8, [sp, #648] // 8-byte Folded Reload + add x13, x13, x10 + add x14, x14, x10 + add x15, x15, x10 + add x13, x11, x13, lsl #2 + stp q2, q0, [x13] + stp q17, q6, [x13, #32] + add x13, x11, x14, lsl #2 + add x14, x11, x15, lsl #2 + add x15, x24, x8, lsl #6 + ldr x8, [sp, #640] // 8-byte Folded Reload + stp q5, q4, [x13] + stp q3, q1, [x13, #32] + madd x13, x0, x28, x12 + add x13, x13, x10 + stp q21, q20, [x14] + stp q18, q7, [x14, #32] + add x14, x24, x8, lsl #6 + ldr x8, [sp, #632] // 8-byte Folded Reload + add x0, x11, x13, lsl #2 + add x13, x24, x8, lsl #6 + stp q23, q22, [x0] + stp q19, q16, [x0, #32] + b.ge .LBB0_21 +// %bb.17: // in Loop: Header=BB0_16 Depth=4 + madd x5, x18, x28, x12 + add x1, x18, #1 + add x2, x18, #2 + add x0, x18, #3 + ldr x8, [sp, #656] // 8-byte Folded Reload + 
ldr x9, [sp, #672] // 8-byte Folded Reload + ldp q28, q29, [x24, #32] + ldp q30, q31, [x24] + mov x4, xzr + add x3, x18, #4 + add x5, x5, x10 + mul x6, x25, x8 + ldr x8, [sp, #664] // 8-byte Folded Reload + add x5, x11, x5, lsl #2 + ldp q17, q6, [x5, #32] + ldp q2, q0, [x5] + madd x5, x1, x28, x12 + add x5, x5, x10 + add x5, x11, x5, lsl #2 + ldp q3, q1, [x5, #32] + ldp q5, q4, [x5] + madd x5, x2, x28, x12 + add x5, x5, x10 + add x5, x11, x5, lsl #2 + ldp q18, q7, [x5, #32] + ldp q21, q20, [x5] + madd x5, x0, x28, x12 + add x5, x5, x10 + add x5, x11, x5, lsl #2 + ldp q19, q16, [x5, #32] + ldp q23, q22, [x5] + madd x5, x9, x8, x6 + madd x6, x18, x22, x5 + lsl x6, x6, #2 + ldr q27, [x26, x6] + madd x6, x1, x22, x5 + lsl x6, x6, #2 + ldr q26, [x26, x6] + madd x6, x2, x22, x5 + madd x5, x0, x22, x5 + lsl x6, x6, #2 + lsl x5, x5, #2 + ldr q25, [x26, x6] + ldr q24, [x26, x5] + ldr x6, [sp, #624] // 8-byte Folded Reload + mov x5, x17 + fmla v6.4s, v29.4s, v27.s[0] + cmp xzr, x19 + b.ge .LBB0_19 + .p2align 2 +.LBB0_18: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // Parent Loop BB0_16 Depth=4 + // => This Inner Loop Header: Depth=5 + add x8, x6, #64 + fmla v17.4s, v28.4s, v27.s[0] + fmla v2.4s, v30.4s, v27.s[0] + add x9, x6, #128 + prfm pldl1keep, [x8] + ldp q9, q8, [x6, #-160] + fmla v0.4s, v31.4s, v27.s[0] + ldp q12, q15, [x6, #-192] + fmla v1.4s, v29.4s, v26.s[0] + fmla v3.4s, v28.4s, v26.s[0] + fmla v4.4s, v31.4s, v26.s[0] + fmla v5.4s, v30.4s, v26.s[0] + fmla v7.4s, v29.4s, v25.s[0] + prfm pldl1keep, [x9] + fmla v18.4s, v28.4s, v25.s[0] + fmla v20.4s, v31.4s, v25.s[0] + ldp q11, q10, [x6, #-128] + fmla v21.4s, v30.4s, v25.s[0] + fmla v16.4s, v29.4s, v24.s[0] + ldp q13, q14, [x6, #-96] + fmla v19.4s, v28.4s, v24.s[0] + fmla v22.4s, v31.4s, v24.s[0] + add x30, x6, #192 + prfm pldl1keep, [x30] + fmla v23.4s, v30.4s, v24.s[0] + fmla v0.4s, v15.4s, v27.s[1] + add x7, x6, #256 + add x8, x5, x21 + fmla v2.4s, v12.4s, v27.s[1] + fmla v17.4s, v9.4s, v27.s[1] + add x4, x4, #4 + fmla v6.4s, v8.4s, v27.s[1] + fmla v5.4s, v12.4s, v26.s[1] + fmla v4.4s, v15.4s, v26.s[1] + fmla v3.4s, v9.4s, v26.s[1] + fmla v1.4s, v8.4s, v26.s[1] + fmla v21.4s, v12.4s, v25.s[1] + fmla v20.4s, v15.4s, v25.s[1] + fmla v18.4s, v9.4s, v25.s[1] + fmla v7.4s, v8.4s, v25.s[1] + fmla v23.4s, v12.4s, v24.s[1] + fmla v22.4s, v15.4s, v24.s[1] + ldp q15, q12, [x6, #-64] + fmla v19.4s, v9.4s, v24.s[1] + fmla v16.4s, v8.4s, v24.s[1] + ldp q9, q8, [x6, #-32] + prfm pldl1keep, [x7] + ldp q28, q29, [x6, #32] + fmla v6.4s, v14.4s, v27.s[2] + ldp q30, q31, [x6] + prfm pldl1keep, [x5] + mov x6, x7 + fmla v17.4s, v13.4s, v27.s[2] + fmla v2.4s, v11.4s, v27.s[2] + fmla v0.4s, v10.4s, v27.s[2] + fmla v1.4s, v14.4s, v26.s[2] + fmla v3.4s, v13.4s, v26.s[2] + fmla v4.4s, v10.4s, v26.s[2] + fmla v5.4s, v11.4s, v26.s[2] + fmla v7.4s, v14.4s, v25.s[2] + fmla v18.4s, v13.4s, v25.s[2] + fmla v20.4s, v10.4s, v25.s[2] + fmla v21.4s, v11.4s, v25.s[2] + fmla v16.4s, v14.4s, v24.s[2] + fmla v19.4s, v13.4s, v24.s[2] + fmla v22.4s, v10.4s, v24.s[2] + fmla v23.4s, v11.4s, v24.s[2] + fmla v0.4s, v12.4s, v27.s[3] + fmla v2.4s, v15.4s, v27.s[3] + fmla v17.4s, v9.4s, v27.s[3] + fmla v6.4s, v8.4s, v27.s[3] + ldur q27, [x5, #-16] + prfm pldl1keep, [x8] + add x5, x5, #16 + fmla v5.4s, v15.4s, v26.s[3] + fmla v4.4s, v12.4s, v26.s[3] + fmla v3.4s, v9.4s, v26.s[3] + fmla v1.4s, v8.4s, v26.s[3] + ldur q26, [x8, #-16] + add x8, x8, x21 + prfm pldl1keep, [x8] + fmla v21.4s, v15.4s, v25.s[3] + fmla v20.4s, v12.4s, v25.s[3] + 
fmla v18.4s, v9.4s, v25.s[3] + fmla v7.4s, v8.4s, v25.s[3] + ldur q25, [x8, #-16] + add x8, x8, x21 + prfm pldl1keep, [x8] + fmla v23.4s, v15.4s, v24.s[3] + fmla v22.4s, v12.4s, v24.s[3] + fmla v19.4s, v9.4s, v24.s[3] + fmla v16.4s, v8.4s, v24.s[3] + ldur q24, [x8, #-16] + fmla v6.4s, v29.4s, v27.s[0] + cmp x4, x19 + b.lt .LBB0_18 +.LBB0_19: // in Loop: Header=BB0_16 Depth=4 + ldp q10, q8, [x15, #32] + ldp q11, q12, [x15] + fmla v17.4s, v28.4s, v27.s[0] + fmla v2.4s, v30.4s, v27.s[0] + fmla v0.4s, v31.4s, v27.s[0] + fmla v1.4s, v29.4s, v26.s[0] + fmla v3.4s, v28.4s, v26.s[0] + fmla v4.4s, v31.4s, v26.s[0] + ldp q9, q13, [x14, #32] + fmla v5.4s, v30.4s, v26.s[0] + fmla v7.4s, v29.4s, v25.s[0] + mov x15, x29 + fmla v18.4s, v28.4s, v25.s[0] + fmla v20.4s, v31.4s, v25.s[0] + fmla v21.4s, v30.4s, v25.s[0] + fmla v16.4s, v29.4s, v24.s[0] + fmla v19.4s, v28.4s, v24.s[0] + fmla v22.4s, v31.4s, v24.s[0] + ldp q31, q28, [x13, #32] + fmla v23.4s, v30.4s, v24.s[0] + ldp q29, q30, [x14] + mov x14, x16 + fmla v0.4s, v12.4s, v27.s[1] + fmla v2.4s, v11.4s, v27.s[1] + fmla v17.4s, v10.4s, v27.s[1] + fmla v6.4s, v8.4s, v27.s[1] + fmla v5.4s, v11.4s, v26.s[1] + fmla v4.4s, v12.4s, v26.s[1] + fmla v3.4s, v10.4s, v26.s[1] + fmla v1.4s, v8.4s, v26.s[1] + fmla v21.4s, v11.4s, v25.s[1] + fmla v20.4s, v12.4s, v25.s[1] + fmla v18.4s, v10.4s, v25.s[1] + fmla v7.4s, v8.4s, v25.s[1] + fmla v23.4s, v11.4s, v24.s[1] + fmla v22.4s, v12.4s, v24.s[1] + fmla v19.4s, v10.4s, v24.s[1] + fmla v16.4s, v8.4s, v24.s[1] + fmla v6.4s, v13.4s, v27.s[2] + ldp q8, q10, [x13] + ldr x13, [sp, #616] // 8-byte Folded Reload + fmla v17.4s, v9.4s, v27.s[2] + fmla v2.4s, v29.4s, v27.s[2] + fmla v0.4s, v30.4s, v27.s[2] + fmla v1.4s, v13.4s, v26.s[2] + fmla v3.4s, v9.4s, v26.s[2] + fmla v4.4s, v30.4s, v26.s[2] + fmla v5.4s, v29.4s, v26.s[2] + fmla v7.4s, v13.4s, v25.s[2] + fmla v18.4s, v9.4s, v25.s[2] + fmla v20.4s, v30.4s, v25.s[2] + fmla v21.4s, v29.4s, v25.s[2] + fmla v16.4s, v13.4s, v24.s[2] + fmla v19.4s, v9.4s, v24.s[2] + fmla v22.4s, v30.4s, v24.s[2] + fmla v23.4s, v29.4s, v24.s[2] + fmla v0.4s, v10.4s, v27.s[3] + fmla v2.4s, v8.4s, v27.s[3] + fmla v17.4s, v31.4s, v27.s[3] + fmla v6.4s, v28.4s, v27.s[3] + fmla v5.4s, v8.4s, v26.s[3] + fmla v4.4s, v10.4s, v26.s[3] + fmla v3.4s, v31.4s, v26.s[3] + fmla v1.4s, v28.4s, v26.s[3] + fmla v21.4s, v8.4s, v25.s[3] + fmla v20.4s, v10.4s, v25.s[3] + fmla v18.4s, v31.4s, v25.s[3] + fmla v7.4s, v28.4s, v25.s[3] + fmla v23.4s, v8.4s, v24.s[3] + fmla v22.4s, v10.4s, v24.s[3] + fmla v19.4s, v31.4s, v24.s[3] + fmla v16.4s, v28.4s, v24.s[3] + cmp x29, x20 + b.ge .LBB0_15 + .p2align 2 +.LBB0_20: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // Parent Loop BB0_16 Depth=4 + // => This Inner Loop Header: Depth=5 + prfm pldl1keep, [x13] + ldp q24, q25, [x13, #-64] + add x8, x14, x21 + ldp q26, q27, [x13, #-32] + prfm pldl1keep, [x14] + add x15, x15, #1 + ldur s28, [x14, #-4] + prfm pldl1keep, [x8] + add x14, x14, #4 + add x13, x13, #64 + ldur s29, [x8, #-4] + add x8, x8, x21 + prfm pldl1keep, [x8] + fmla v6.4s, v27.4s, v28.s[0] + ldur s30, [x8, #-4] + add x8, x8, x21 + prfm pldl1keep, [x8] + fmla v17.4s, v26.4s, v28.s[0] + fmla v0.4s, v25.4s, v28.s[0] + fmla v2.4s, v24.4s, v28.s[0] + ldur s28, [x8, #-4] + fmla v4.4s, v25.4s, v29.s[0] + fmla v5.4s, v24.4s, v29.s[0] + fmla v3.4s, v26.4s, v29.s[0] + fmla v1.4s, v27.4s, v29.s[0] + fmla v21.4s, v24.4s, v30.s[0] + fmla v20.4s, v25.4s, v30.s[0] + fmla v18.4s, v26.4s, v30.s[0] + fmla v7.4s, v27.4s, v30.s[0] + fmla 
v23.4s, v24.4s, v28.s[0] + fmla v22.4s, v25.4s, v28.s[0] + fmla v19.4s, v26.4s, v28.s[0] + fmla v16.4s, v27.4s, v28.s[0] + cmp x15, x20 + b.lt .LBB0_20 + b .LBB0_15 + .p2align 2 +.LBB0_21: // in Loop: Header=BB0_11 Depth=3 + ldr x8, [sp, #680] // 8-byte Folded Reload + ldr x9, [sp, #592] // 8-byte Folded Reload + cmp x8, x9 + b.ge .LBB0_27 +// %bb.22: // in Loop: Header=BB0_11 Depth=3 + ldr x9, [sp, #656] // 8-byte Folded Reload + ldr x0, [sp, #680] // 8-byte Folded Reload + mov x18, xzr + mul x9, x25, x9 + ldr x17, [sp, #664] // 8-byte Folded Reload + ldr x1, [sp, #672] // 8-byte Folded Reload + madd x8, x0, x28, x12 + madd x9, x1, x17, x9 + ldp q20, q21, [x24, #32] + ldp q18, q19, [x24] + ldr x1, [sp, #624] // 8-byte Folded Reload + madd x17, x0, x22, x9 + add x8, x8, x10 + add x16, x11, x8, lsl #2 + add x8, x0, #1 + ldr x0, [sp, #536] // 8-byte Folded Reload + lsl x17, x17, #2 + ldr q17, [x26, x17] + madd x17, x8, x28, x12 + madd x8, x8, x22, x9 + ldp q1, q0, [x16, #32] + ldp q3, q2, [x16] + add x17, x17, x10 + lsl x8, x8, #2 + add x17, x11, x17, lsl #2 + ldr q16, [x26, x8] + ldp q5, q4, [x17, #32] + ldp q7, q6, [x17] + cmp xzr, x19 + b.ge .LBB0_24 + .p2align 2 +.LBB0_23: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + ldr x8, [sp, #608] // 8-byte Folded Reload + add x7, x1, #64 + fmla v0.4s, v21.4s, v17.s[0] + fmla v1.4s, v20.4s, v17.s[0] + fmla v3.4s, v18.4s, v17.s[0] + fmla v2.4s, v19.4s, v17.s[0] + add x9, x1, #128 + add x2, x1, #256 + fmla v4.4s, v21.4s, v16.s[0] + fmla v5.4s, v20.4s, v16.s[0] + add x18, x18, #4 + add x3, x8, x0 + ldr x8, [sp, #600] // 8-byte Folded Reload + prfm pldl1keep, [x7] + ldp q23, q22, [x1, #-160] + ldp q24, q25, [x1, #-192] + fmla v6.4s, v19.4s, v16.s[0] + fmla v7.4s, v18.4s, v16.s[0] + prfm pldl1keep, [x9] + ldp q19, q18, [x1, #-128] + add x4, x3, #32 + ldp q20, q21, [x1, #-96] + fmla v2.4s, v25.4s, v17.s[1] + fmla v0.4s, v22.4s, v17.s[1] + fmla v6.4s, v25.4s, v16.s[1] + fmla v4.4s, v22.4s, v16.s[1] + add x5, x8, x0 + add x8, x1, #192 + add x0, x0, #16 + fmla v3.4s, v24.4s, v17.s[1] + fmla v1.4s, v23.4s, v17.s[1] + fmla v7.4s, v24.4s, v16.s[1] + fmla v5.4s, v23.4s, v16.s[1] + prfm pldl1keep, [x8] + ldp q23, q22, [x1, #-32] + ldp q24, q25, [x1, #-64] + add x6, x5, #32 + prfm pldl1keep, [x6] + fmla v0.4s, v21.4s, v17.s[2] + fmla v2.4s, v18.4s, v17.s[2] + fmla v4.4s, v21.4s, v16.s[2] + fmla v6.4s, v18.4s, v16.s[2] + fmla v1.4s, v20.4s, v17.s[2] + fmla v3.4s, v19.4s, v17.s[2] + fmla v5.4s, v20.4s, v16.s[2] + fmla v7.4s, v19.4s, v16.s[2] + fmla v2.4s, v25.4s, v17.s[3] + fmla v0.4s, v22.4s, v17.s[3] + fmla v6.4s, v25.4s, v16.s[3] + fmla v4.4s, v22.4s, v16.s[3] + fmla v3.4s, v24.4s, v17.s[3] + fmla v1.4s, v23.4s, v17.s[3] + ldr q17, [x5, #16] + prfm pldl1keep, [x4] + fmla v7.4s, v24.4s, v16.s[3] + fmla v5.4s, v23.4s, v16.s[3] + ldr q16, [x3, #16] + prfm pldl1keep, [x2] + ldp q20, q21, [x1, #32] + ldp q18, q19, [x1] + mov x1, x2 + cmp x18, x19 + b.lt .LBB0_23 +.LBB0_24: // in Loop: Header=BB0_11 Depth=3 + ldp q23, q22, [x15, #32] + ldp q24, q25, [x15] + fmla v0.4s, v21.4s, v17.s[0] + fmla v1.4s, v20.4s, v17.s[0] + fmla v3.4s, v18.4s, v17.s[0] + fmla v2.4s, v19.4s, v17.s[0] + fmla v4.4s, v21.4s, v16.s[0] + fmla v5.4s, v20.4s, v16.s[0] + ldp q20, q21, [x14, #32] + fmla v6.4s, v19.4s, v16.s[0] + fmla v7.4s, v18.4s, v16.s[0] + ldp q19, q18, [x14] + fmla v2.4s, v25.4s, v17.s[1] + fmla v0.4s, v22.4s, v17.s[1] + ldr x18, [sp, #616] // 8-byte Folded Reload + ldr x0, [sp, #536] // 
8-byte Folded Reload + fmla v3.4s, v24.4s, v17.s[1] + fmla v1.4s, v23.4s, v17.s[1] + ldp x3, x2, [sp, #368] // 16-byte Folded Reload + fmla v7.4s, v24.4s, v16.s[1] + fmla v6.4s, v25.4s, v16.s[1] + ldp q24, q25, [x13] + fmla v5.4s, v23.4s, v16.s[1] + fmla v4.4s, v22.4s, v16.s[1] + ldp q23, q22, [x13, #32] + fmla v0.4s, v21.4s, v17.s[2] + fmla v2.4s, v18.4s, v17.s[2] + mov x1, x29 + fmla v4.4s, v21.4s, v16.s[2] + fmla v1.4s, v20.4s, v17.s[2] + fmla v3.4s, v19.4s, v17.s[2] + fmla v5.4s, v20.4s, v16.s[2] + fmla v6.4s, v18.4s, v16.s[2] + fmla v7.4s, v19.4s, v16.s[2] + fmla v2.4s, v25.4s, v17.s[3] + fmla v0.4s, v22.4s, v17.s[3] + fmla v6.4s, v25.4s, v16.s[3] + fmla v4.4s, v22.4s, v16.s[3] + fmla v3.4s, v24.4s, v17.s[3] + fmla v1.4s, v23.4s, v17.s[3] + fmla v7.4s, v24.4s, v16.s[3] + fmla v5.4s, v23.4s, v16.s[3] + cmp x29, x20 + b.ge .LBB0_26 + .p2align 2 +.LBB0_25: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + add x8, x2, x0 + add x9, x3, x0 + prfm pldl1keep, [x18] + add x1, x1, #1 + add x8, x8, #4 + add x9, x9, #4 + ldp q16, q17, [x18, #-64] + ldp q18, q19, [x18, #-32] + prfm pldl1keep, [x9] + add x18, x18, #64 + ldr s20, [x3, x0] + prfm pldl1keep, [x8] + fmla v0.4s, v19.4s, v20.s[0] + ldr s21, [x2, x0] + fmla v1.4s, v18.4s, v20.s[0] + fmla v2.4s, v17.4s, v20.s[0] + fmla v3.4s, v16.4s, v20.s[0] + fmla v6.4s, v17.4s, v21.s[0] + fmla v7.4s, v16.4s, v21.s[0] + fmla v5.4s, v18.4s, v21.s[0] + fmla v4.4s, v19.4s, v21.s[0] + add x0, x0, #4 + cmp x1, x20 + b.lt .LBB0_25 +.LBB0_26: // in Loop: Header=BB0_11 Depth=3 + stp q3, q2, [x16] + stp q1, q0, [x16, #32] + stp q7, q6, [x17] + stp q5, q4, [x17, #32] +.LBB0_27: // in Loop: Header=BB0_11 Depth=3 + ldr x8, [sp, #528] // 8-byte Folded Reload + ldr x9, [sp, #592] // 8-byte Folded Reload + cmp x9, x8 + ldr x0, [sp, #568] // 8-byte Folded Reload + b.ge .LBB0_10 +// %bb.28: // in Loop: Header=BB0_11 Depth=3 + ldr x17, [sp, #592] // 8-byte Folded Reload + ldr x9, [sp, #664] // 8-byte Folded Reload + mov x16, xzr + madd x8, x17, x28, x12 + ldp q7, q16, [x24, #32] + ldp q6, q5, [x24] + ldr x12, [sp, #624] // 8-byte Folded Reload + ldr x1, [sp, #360] // 8-byte Folded Reload + add x8, x8, x10 + add x10, x11, x8, lsl #2 + ldr x8, [sp, #656] // 8-byte Folded Reload + ldr x11, [sp, #672] // 8-byte Folded Reload + ldp q1, q0, [x10, #32] + ldp q3, q2, [x10] + mul x8, x25, x8 + madd x8, x11, x9, x8 + ldr x11, [sp, #456] // 8-byte Folded Reload + madd x8, x17, x22, x8 + lsl x8, x8, #2 + ldr q4, [x26, x8] + cmp xzr, x19 + b.ge .LBB0_30 + .p2align 2 +.LBB0_29: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + add x18, x12, #64 + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + add x9, x12, #128 + prfm pldl1keep, [x18] + ldp q18, q17, [x12, #-160] + fmla v3.4s, v6.4s, v4.s[0] + ldp q19, q20, [x12, #-192] + fmla v2.4s, v5.4s, v4.s[0] + prfm pldl1keep, [x9] + ldp q6, q5, [x12, #-128] + ldp q7, q16, [x12, #-96] + add x8, x12, #192 + prfm pldl1keep, [x8] + add x17, x12, #256 + add x16, x16, #4 + fmla v2.4s, v20.4s, v4.s[1] + fmla v0.4s, v17.4s, v4.s[1] + fmla v3.4s, v19.4s, v4.s[1] + fmla v1.4s, v18.4s, v4.s[1] + ldp q18, q17, [x12, #-32] + ldp q19, q20, [x12, #-64] + prfm pldl1keep, [x11] + fmla v0.4s, v16.4s, v4.s[2] + fmla v2.4s, v5.4s, v4.s[2] + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v2.4s, v20.4s, v4.s[3] + fmla v0.4s, v17.4s, v4.s[3] + fmla v3.4s, v19.4s, v4.s[3] + 
fmla v1.4s, v18.4s, v4.s[3] + ldur q4, [x11, #-16] + prfm pldl1keep, [x17] + add x11, x11, #16 + ldp q7, q16, [x12, #32] + ldp q6, q5, [x12] + mov x12, x17 + cmp x16, x19 + b.lt .LBB0_29 +.LBB0_30: // in Loop: Header=BB0_11 Depth=3 + ldp q18, q17, [x15, #32] + ldp q19, q20, [x15] + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + fmla v2.4s, v5.4s, v4.s[0] + ldp q6, q5, [x14] + ldp q7, q16, [x14, #32] + ldr x14, [sp, #464] // 8-byte Folded Reload + mov x11, xzr + mov w12, #64 // =0x40 + fmla v2.4s, v20.4s, v4.s[1] + fmla v0.4s, v17.4s, v4.s[1] + fmla v3.4s, v19.4s, v4.s[1] + fmla v1.4s, v18.4s, v4.s[1] + fmla v0.4s, v16.4s, v4.s[2] + ldp q18, q17, [x13, #32] + ldp q19, q20, [x13] + fmla v2.4s, v5.4s, v4.s[2] + ldr x13, [sp, #416] // 8-byte Folded Reload + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v2.4s, v20.4s, v4.s[3] + fmla v0.4s, v17.4s, v4.s[3] + fmla v3.4s, v19.4s, v4.s[3] + fmla v1.4s, v18.4s, v4.s[3] + add x8, x29, xzr + cmp x8, x20 + b.ge .LBB0_9 + .p2align 2 +.LBB0_31: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_11 Depth=3 + // => This Inner Loop Header: Depth=4 + add x9, x1, x11, lsl #6 + add x8, x1, x12 + add x12, x12, #64 + prfm pldl1keep, [x8] + ldp q4, q5, [x9] + ldp q6, q7, [x9, #32] + prfm pldl1keep, [x13] + ldr s16, [x14, x11, lsl #2] + add x11, x11, #1 + add x13, x13, #4 + fmla v0.4s, v7.4s, v16.s[0] + fmla v1.4s, v6.4s, v16.s[0] + fmla v2.4s, v5.4s, v16.s[0] + fmla v3.4s, v4.4s, v16.s[0] + add x8, x29, x11 + cmp x8, x20 + b.lt .LBB0_31 + b .LBB0_9 + .p2align 2 +.LBB0_32: // in Loop: Header=BB0_7 Depth=2 + ldp x9, x8, [sp, #480] // 16-byte Folded Reload + ldr x10, [sp, #448] // 8-byte Folded Reload + add x8, x9, x8, lsl #2 + cmp x0, x10 + str x8, [sp, #584] // 8-byte Folded Spill + lsl x8, x28, #1 + str x8, [sp, #576] // 8-byte Folded Spill + b.lt .LBB0_36 +// %bb.33: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #440] // 8-byte Folded Reload + cmp x10, x8 + b.lt .LBB0_58 +.LBB0_34: // in Loop: Header=BB0_7 Depth=2 + ldr x9, [sp, #432] // 8-byte Folded Reload + cmp x8, x9 + b.lt .LBB0_80 +.LBB0_35: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #296] // 8-byte Folded Reload + cmp x9, x8 + b.ge .LBB0_6 + b .LBB0_102 + .p2align 2 +.LBB0_36: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #256] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #520] // 8-byte Folded Reload + ldr x9, [sp, #512] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldr x12, [sp, #672] // 8-byte Folded Reload + ldr x13, [sp, #584] // 8-byte Folded Reload + mul x8, x25, x8 + ldr x14, [sp, #576] // 8-byte Folded Reload + ldr x15, [sp, #560] // 8-byte Folded Reload + madd x9, x12, x9, x8 + ldr x8, [sp, #568] // 8-byte Folded Reload + add x8, x9, x8 + add x12, x13, x8, lsl #2 + ldp q3, q2, [x12] + add x12, x8, x28 + add x12, x13, x12, lsl #2 + ldp q1, q0, [x12] + add x12, x8, x14 + add x12, x13, x12, lsl #2 + ldp q5, q4, [x12] + add x12, x14, x28 + add x8, x8, x12 + add x8, x13, x8, lsl #2 + ldp q7, q6, [x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + cmp xzr, x20 + b.ge .LBB0_38 + .p2align 2 +.LBB0_37: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x12, [sp, #384] // 8-byte Folded Reload + add x13, x15, x10 + prfm pldl1keep, [x13] + ldur s16, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s17, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s18, [x13, #-4] + add x13, x13, x21 + add 
x12, x12, x10 + prfm pldl1keep, [x13] + ldur s20, [x13, #-4] + add x10, x10, #4 + prfm pldl1keep, [x12] + ldur s19, [x12, #-4] + add x12, x12, x27 + prfm pldl1keep, [x12] + sub x13, x12, #4 + add x12, x12, x27 + prfm pldl1keep, [x12] + sub x14, x12, #4 + add x12, x12, x27 + ld1 { v19.s }[1], [x13] + prfm pldl1keep, [x12] + sub x13, x12, #4 + add x12, x12, x27 + ld1 { v19.s }[2], [x14] + prfm pldl1keep, [x12] + ldur s21, [x12, #-4] + add x12, x12, x27 + ld1 { v19.s }[3], [x13] + prfm pldl1keep, [x12] + sub x13, x12, #4 + add x12, x12, x27 + prfm pldl1keep, [x12] + ld1 { v21.s }[1], [x13] + sub x14, x12, #4 + add x12, x12, x27 + prfm pldl1keep, [x12] + sub x12, x12, #4 + fmla v3.4s, v19.4s, v16.s[0] + fmla v1.4s, v19.4s, v17.s[0] + fmla v5.4s, v19.4s, v18.s[0] + fmla v7.4s, v19.4s, v20.s[0] + ld1 { v21.s }[2], [x14] + ld1 { v21.s }[3], [x12] + add x12, x8, x11, lsl #5 + add x11, x11, #1 + fmla v2.4s, v21.4s, v16.s[0] + fmla v0.4s, v21.4s, v17.s[0] + fmla v4.4s, v21.4s, v18.s[0] + fmla v6.4s, v21.4s, v20.s[0] + stp q19, q21, [x12] + cmp x11, x20 + b.lt .LBB0_37 +.LBB0_38: // %.preheader39 + // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #216] // 8-byte Folded Reload + ldr x15, [sp, #552] // 8-byte Folded Reload + mov x11, xzr + add x10, x8, #128 + ldr x16, [sp, #544] // 8-byte Folded Reload + mov w18, #1 // =0x1 + mov w2, #2 // =0x2 + mov w1, #3 // =0x3 + mov w17, #4 // =0x4 + add x14, x8, x12 + b .LBB0_40 + .p2align 2 +.LBB0_39: // %.loopexit35 + // in Loop: Header=BB0_40 Depth=3 + add x16, x16, x23 + add x15, x15, x23 + mov x11, x17 + mov x17, x3 +.LBB0_40: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_42 Depth 4 + // Child Loop BB0_44 Depth 4 + madd x11, x11, x28, x9 + ldr x7, [sp, #568] // 8-byte Folded Reload + ldr x30, [sp, #584] // 8-byte Folded Reload + add x11, x11, x7 + madd x12, x18, x28, x9 + madd x13, x2, x28, x9 + add x12, x12, x7 + add x13, x13, x7 + add x11, x30, x11, lsl #2 + add x12, x30, x12, lsl #2 + stp q3, q2, [x11] + madd x11, x1, x28, x9 + stp q1, q0, [x12] + add x12, x30, x13, lsl #2 + stp q5, q4, [x12] + add x11, x11, x7 + add x11, x30, x11, lsl #2 + stp q7, q6, [x11] + ldr x11, [sp, #680] // 8-byte Folded Reload + cmp x17, x11 + ldr x11, [sp, #648] // 8-byte Folded Reload + add x13, x8, x11, lsl #5 + ldr x11, [sp, #640] // 8-byte Folded Reload + add x12, x8, x11, lsl #5 + ldr x11, [sp, #632] // 8-byte Folded Reload + add x11, x8, x11, lsl #5 + b.ge .LBB0_45 +// %bb.41: // in Loop: Header=BB0_40 Depth=3 + madd x5, x17, x28, x9 + add x18, x17, #1 + add x2, x17, #2 + add x1, x17, #3 + madd x6, x18, x28, x9 + ldp q20, q21, [x8] + mov x4, xzr + add x3, x17, #4 + add x5, x5, x7 + add x5, x30, x5, lsl #2 + add x6, x6, x7 + add x6, x30, x6, lsl #2 + ldp q3, q2, [x5] + madd x5, x2, x28, x9 + ldp q1, q0, [x6] + madd x6, x1, x28, x9 + add x5, x5, x7 + add x6, x6, x7 + add x5, x30, x5, lsl #2 + ldr x7, [sp, #672] // 8-byte Folded Reload + add x6, x30, x6, lsl #2 + ldp q5, q4, [x5] + ldr x5, [sp, #656] // 8-byte Folded Reload + mul x5, x25, x5 + ldp q7, q6, [x6] + ldr x6, [sp, #664] // 8-byte Folded Reload + madd x5, x7, x6, x5 + madd x6, x17, x22, x5 + lsl x6, x6, #2 + ldr q19, [x26, x6] + madd x6, x18, x22, x5 + lsl x6, x6, #2 + ldr q18, [x26, x6] + madd x6, x2, x22, x5 + madd x5, x1, x22, x5 + lsl x6, x6, #2 + lsl x5, x5, #2 + ldr q17, [x26, x6] + ldr q16, [x26, x5] + mov x5, x10 + mov x6, x16 + cmp xzr, x19 + b.ge .LBB0_43 + .p2align 2 +.LBB0_42: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 
+ // Parent Loop BB0_40 Depth=3 + // => This Inner Loop Header: Depth=4 + add x7, x5, #32 + fmla v3.4s, v20.4s, v19.s[0] + fmla v2.4s, v21.4s, v19.s[0] + add x4, x4, #4 + prfm pldl1keep, [x7] + ldp q22, q23, [x5, #-96] + fmla v0.4s, v21.4s, v18.s[0] + fmla v1.4s, v20.4s, v18.s[0] + fmla v4.4s, v21.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + add x7, x5, #96 + fmla v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q21, q20, [x5, #-64] + prfm pldl1keep, [x7] + add x7, x6, x21 + add x30, x7, x21 + fmla v2.4s, v23.4s, v19.s[1] + fmla v0.4s, v23.4s, v18.s[1] + fmla v4.4s, v23.4s, v17.s[1] + fmla v6.4s, v23.4s, v16.s[1] + fmla v3.4s, v22.4s, v19.s[1] + fmla v1.4s, v22.4s, v18.s[1] + fmla v5.4s, v22.4s, v17.s[1] + fmla v7.4s, v22.4s, v16.s[1] + fmla v2.4s, v20.4s, v19.s[2] + ldp q22, q23, [x5, #-32] + fmla v0.4s, v20.4s, v18.s[2] + fmla v4.4s, v20.4s, v17.s[2] + fmla v6.4s, v20.4s, v16.s[2] + fmla v3.4s, v21.4s, v19.s[2] + fmla v1.4s, v21.4s, v18.s[2] + fmla v5.4s, v21.4s, v17.s[2] + fmla v7.4s, v21.4s, v16.s[2] + ldp q20, q21, [x5], #128 + prfm pldl1keep, [x6] + fmla v2.4s, v23.4s, v19.s[3] + fmla v0.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v17.s[3] + fmla v6.4s, v23.4s, v16.s[3] + fmla v3.4s, v22.4s, v19.s[3] + ldur q19, [x6, #-16] + prfm pldl1keep, [x7] + fmla v1.4s, v22.4s, v18.s[3] + ldur q18, [x7, #-16] + add x7, x30, x21 + prfm pldl1keep, [x30] + add x6, x6, #16 + fmla v5.4s, v22.4s, v17.s[3] + ldur q17, [x30, #-16] + prfm pldl1keep, [x7] + fmla v7.4s, v22.4s, v16.s[3] + ldur q16, [x7, #-16] + cmp x4, x19 + b.lt .LBB0_42 +.LBB0_43: // in Loop: Header=BB0_40 Depth=3 + ldp q22, q23, [x13] + fmla v3.4s, v20.4s, v19.s[0] + fmla v2.4s, v21.4s, v19.s[0] + fmla v0.4s, v21.4s, v18.s[0] + fmla v1.4s, v20.4s, v18.s[0] + mov x13, x29 + fmla v4.4s, v21.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + fmla v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + fmla v2.4s, v23.4s, v19.s[1] + ldp q21, q20, [x12] + fmla v0.4s, v23.4s, v18.s[1] + mov x12, x15 + fmla v4.4s, v23.4s, v17.s[1] + fmla v6.4s, v23.4s, v16.s[1] + fmla v3.4s, v22.4s, v19.s[1] + fmla v1.4s, v22.4s, v18.s[1] + fmla v5.4s, v22.4s, v17.s[1] + fmla v7.4s, v22.4s, v16.s[1] + ldp q22, q23, [x11] + mov x11, x14 + fmla v2.4s, v20.4s, v19.s[2] + fmla v0.4s, v20.4s, v18.s[2] + fmla v4.4s, v20.4s, v17.s[2] + fmla v6.4s, v20.4s, v16.s[2] + fmla v3.4s, v21.4s, v19.s[2] + fmla v1.4s, v21.4s, v18.s[2] + fmla v5.4s, v21.4s, v17.s[2] + fmla v7.4s, v21.4s, v16.s[2] + fmla v2.4s, v23.4s, v19.s[3] + fmla v0.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v17.s[3] + fmla v6.4s, v23.4s, v16.s[3] + fmla v3.4s, v22.4s, v19.s[3] + fmla v1.4s, v22.4s, v18.s[3] + fmla v5.4s, v22.4s, v17.s[3] + fmla v7.4s, v22.4s, v16.s[3] + cmp x29, x20 + b.ge .LBB0_39 + .p2align 2 +.LBB0_44: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_40 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x12, x21 + prfm pldl1keep, [x11] + ldp q16, q17, [x11, #-32] + prfm pldl1keep, [x12] + ldur s18, [x12, #-4] + add x13, x13, #1 + add x12, x12, #4 + prfm pldl1keep, [x4] + ldur s19, [x4, #-4] + add x4, x4, x21 + add x11, x11, #32 + prfm pldl1keep, [x4] + ldur s20, [x4, #-4] + add x4, x4, x21 + fmla v2.4s, v17.4s, v18.s[0] + prfm pldl1keep, [x4] + ldur s21, [x4, #-4] + fmla v3.4s, v16.4s, v18.s[0] + fmla v0.4s, v17.4s, v19.s[0] + fmla v1.4s, v16.4s, v19.s[0] + fmla v4.4s, v17.4s, v20.s[0] + fmla v5.4s, v16.4s, v20.s[0] + fmla v6.4s, v17.4s, v21.s[0] + fmla v7.4s, v16.4s, v21.s[0] + cmp x13, x20 + b.lt .LBB0_44 + b .LBB0_39 + .p2align 2 
+.LBB0_45: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #680] // 8-byte Folded Reload + ldr x15, [sp, #592] // 8-byte Folded Reload + mov x7, x30 + cmp x14, x15 + b.ge .LBB0_51 +// %bb.46: // in Loop: Header=BB0_7 Depth=2 + ldr x15, [sp, #656] // 8-byte Folded Reload + ldr x18, [sp, #664] // 8-byte Folded Reload + mov x16, xzr + mul x15, x25, x15 + ldr x3, [sp, #672] // 8-byte Folded Reload + ldr x2, [sp, #680] // 8-byte Folded Reload + add x17, x2, #1 + madd x14, x2, x28, x9 + ldr x1, [sp, #568] // 8-byte Folded Reload + ldp q6, q7, [x8] + madd x18, x3, x18, x15 + madd x15, x2, x22, x18 + add x14, x14, x1 + add x14, x7, x14, lsl #2 + lsl x15, x15, #2 + ldr q4, [x26, x15] + madd x15, x17, x28, x9 + madd x17, x17, x22, x18 + ldp q1, q0, [x14] + ldr x18, [sp, #536] // 8-byte Folded Reload + add x15, x15, x1 + lsl x17, x17, #2 + add x15, x7, x15, lsl #2 + ldr q5, [x26, x17] + mov x17, x10 + ldp q3, q2, [x15] + cmp xzr, x19 + b.ge .LBB0_48 + .p2align 2 +.LBB0_47: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x6, x17, #32 + ldr x1, [sp, #608] // 8-byte Folded Reload + ldr x3, [sp, #600] // 8-byte Folded Reload + fmla v1.4s, v6.4s, v4.s[0] + prfm pldl1keep, [x6] + ldp q16, q17, [x17, #-96] + fmla v0.4s, v7.4s, v4.s[0] + fmla v2.4s, v7.4s, v5.s[0] + fmla v3.4s, v6.4s, v5.s[0] + ldp q7, q6, [x17, #-64] + add x5, x17, #96 + prfm pldl1keep, [x5] + add x16, x16, #4 + add x1, x1, x18 + add x3, x3, x18 + add x18, x18, #16 + fmla v0.4s, v17.4s, v4.s[1] + fmla v2.4s, v17.4s, v5.s[1] + add x2, x1, #32 + add x4, x3, #32 + fmla v1.4s, v16.4s, v4.s[1] + fmla v3.4s, v16.4s, v5.s[1] + ldp q16, q17, [x17, #-32] + fmla v0.4s, v6.4s, v4.s[2] + fmla v2.4s, v6.4s, v5.s[2] + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v7.4s, v5.s[2] + fmla v0.4s, v17.4s, v4.s[3] + fmla v2.4s, v17.4s, v5.s[3] + ldp q6, q7, [x17], #128 + prfm pldl1keep, [x4] + fmla v1.4s, v16.4s, v4.s[3] + ldr q4, [x3, #16] + prfm pldl1keep, [x2] + fmla v3.4s, v16.4s, v5.s[3] + ldr q5, [x1, #16] + cmp x16, x19 + b.lt .LBB0_47 +.LBB0_48: // in Loop: Header=BB0_7 Depth=2 + ldp q16, q17, [x13] + fmla v1.4s, v6.4s, v4.s[0] + fmla v0.4s, v7.4s, v4.s[0] + fmla v2.4s, v7.4s, v5.s[0] + fmla v3.4s, v6.4s, v5.s[0] + ldp q7, q6, [x12] + ldr x18, [sp, #232] // 8-byte Folded Reload + mov x16, xzr + mov x17, xzr + mov x1, x29 + fmla v0.4s, v17.4s, v4.s[1] + fmla v2.4s, v17.4s, v5.s[1] + add x18, x8, x18 + fmla v1.4s, v16.4s, v4.s[1] + fmla v3.4s, v16.4s, v5.s[1] + ldp q16, q17, [x11] + fmla v0.4s, v6.4s, v4.s[2] + fmla v2.4s, v6.4s, v5.s[2] + fmla v1.4s, v7.4s, v4.s[2] + fmla v3.4s, v7.4s, v5.s[2] + fmla v0.4s, v17.4s, v4.s[3] + fmla v2.4s, v17.4s, v5.s[3] + fmla v1.4s, v16.4s, v4.s[3] + fmla v3.4s, v16.4s, v5.s[3] + cmp x29, x20 + b.ge .LBB0_50 + .p2align 2 +.LBB0_49: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x2, [sp, #424] // 8-byte Folded Reload + ldr x6, [sp, #352] // 8-byte Folded Reload + add x4, x18, x17, lsl #3 + add x5, x18, x16 + add x1, x1, #1 + add x16, x16, #32 + add x4, x4, #32 + prfm pldl1keep, [x4] + ldp q4, q5, [x5] + add x2, x2, x17 + add x3, x6, x17 + add x2, x2, #4 + add x3, x3, #4 + prfm pldl1keep, [x3] + ldr s6, [x6, x17] + prfm pldl1keep, [x2] + ldr x2, [sp, #424] // 8-byte Folded Reload + ldr s7, [x2, x17] + add x17, x17, #4 + fmla v0.4s, v5.4s, v6.s[0] + fmla v1.4s, v4.4s, v6.s[0] + fmla v2.4s, v5.4s, v7.s[0] + fmla v3.4s, v4.4s, v7.s[0] + cmp x1, x20 + b.lt .LBB0_49 +.LBB0_50: // in Loop: Header=BB0_7 Depth=2 + stp 
q1, q0, [x14] + stp q3, q2, [x15] +.LBB0_51: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #528] // 8-byte Folded Reload + ldr x15, [sp, #592] // 8-byte Folded Reload + cmp x15, x14 + b.ge .LBB0_57 +// %bb.52: // in Loop: Header=BB0_7 Depth=2 + ldr x17, [sp, #592] // 8-byte Folded Reload + ldr x15, [sp, #568] // 8-byte Folded Reload + mov x14, xzr + madd x9, x17, x28, x9 + ldr x16, [sp, #664] // 8-byte Folded Reload + ldr x18, [sp, #672] // 8-byte Folded Reload + ldp q4, q3, [x8] + add x9, x9, x15 + ldr x15, [sp, #656] // 8-byte Folded Reload + add x9, x7, x9, lsl #2 + mul x15, x25, x15 + ldp q1, q0, [x9] + madd x15, x18, x16, x15 + madd x15, x17, x22, x15 + lsl x15, x15, #2 + ldr q2, [x26, x15] + ldr x15, [sp, #456] // 8-byte Folded Reload + cmp xzr, x19 + b.ge .LBB0_54 + .p2align 2 +.LBB0_53: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x17, x10, #32 + fmla v1.4s, v4.4s, v2.s[0] + fmla v0.4s, v3.4s, v2.s[0] + add x16, x10, #96 + prfm pldl1keep, [x17] + ldp q5, q6, [x10, #-96] + add x14, x14, #4 + ldp q4, q3, [x10, #-64] + prfm pldl1keep, [x16] + fmla v0.4s, v6.4s, v2.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldp q5, q6, [x10, #-32] + prfm pldl1keep, [x15] + fmla v0.4s, v3.4s, v2.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v6.4s, v2.s[3] + fmla v1.4s, v5.4s, v2.s[3] + ldur q2, [x15, #-16] + ldp q4, q3, [x10], #128 + add x15, x15, #16 + cmp x14, x19 + b.lt .LBB0_53 +.LBB0_54: // in Loop: Header=BB0_7 Depth=2 + ldp q5, q6, [x13] + fmla v1.4s, v4.4s, v2.s[0] + fmla v0.4s, v3.4s, v2.s[0] + ldp q4, q3, [x12] + mov x10, xzr + mov x14, xzr + fmla v0.4s, v6.4s, v2.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldp q5, q6, [x11] + ldr x11, [sp, #232] // 8-byte Folded Reload + fmla v0.4s, v3.4s, v2.s[2] + fmla v1.4s, v4.4s, v2.s[2] + add x8, x8, x11 + mov x11, x29 + fmla v0.4s, v6.4s, v2.s[3] + fmla v1.4s, v5.4s, v2.s[3] + cmp x29, x20 + b.ge .LBB0_56 + .p2align 2 +.LBB0_55: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x16, [sp, #464] // 8-byte Folded Reload + add x13, x8, x14, lsl #3 + add x15, x8, x10 + add x11, x11, #1 + add x10, x10, #32 + add x13, x13, #32 + prfm pldl1keep, [x13] + add x12, x16, x14 + ldp q2, q3, [x15] + add x12, x12, #4 + prfm pldl1keep, [x12] + ldr s4, [x16, x14] + add x14, x14, #4 + fmla v0.4s, v3.4s, v4.s[0] + fmla v1.4s, v2.4s, v4.s[0] + cmp x11, x20 + b.lt .LBB0_55 +.LBB0_56: // in Loop: Header=BB0_7 Depth=2 + stp q1, q0, [x9] +.LBB0_57: // in Loop: Header=BB0_7 Depth=2 + bl free + ldp x8, x10, [sp, #440] // 16-byte Folded Reload + ldr x16, [sp, #560] // 8-byte Folded Reload + cmp x10, x8 + b.ge .LBB0_34 +.LBB0_58: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #248] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #520] // 8-byte Folded Reload + ldr x9, [sp, #512] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldr x12, [sp, #672] // 8-byte Folded Reload + ldr x7, [sp, #584] // 8-byte Folded Reload + mul x8, x25, x8 + ldr x13, [sp, #576] // 8-byte Folded Reload + ldr x15, [sp, #560] // 8-byte Folded Reload + ldp x6, x5, [sp, #368] // 16-byte Folded Reload + madd x9, x12, x9, x8 + ldr x8, [sp, #448] // 8-byte Folded Reload + add x8, x9, x8 + lsl x12, x8, #2 + ldr q0, [x7, x12] + add x12, x8, x28 + lsl x12, x12, #2 + ldr q1, [x7, x12] + add x12, x8, x13 + lsl x12, x12, #2 + ldr q2, [x7, x12] + add x12, x13, x28 + add x8, x8, x12 + lsl x8, x8, #2 + ldr q3, [x7, x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + cmp xzr, x20 + 
b.ge .LBB0_60 + .p2align 2 +.LBB0_59: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldr x12, [sp, #392] // 8-byte Folded Reload + add x13, x15, x10 + prfm pldl1keep, [x13] + ldur s4, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s5, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s6, [x13, #-4] + add x13, x13, x21 + add x12, x12, x10 + prfm pldl1keep, [x13] + ldur s7, [x13, #-4] + add x10, x10, #4 + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x12, x12, x27 + prfm pldl1keep, [x12] + sub x13, x12, #4 + add x12, x12, x27 + prfm pldl1keep, [x12] + sub x14, x12, #4 + add x12, x12, x27 + ld1 { v16.s }[1], [x13] + prfm pldl1keep, [x12] + sub x12, x12, #4 + ld1 { v16.s }[2], [x14] + ld1 { v16.s }[3], [x12] + str q16, [x8, x11, lsl #4] + add x11, x11, #1 + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v16.4s, v5.s[0] + fmla v2.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v7.s[0] + cmp x11, x20 + b.lt .LBB0_59 +.LBB0_60: // %.preheader38 + // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #208] // 8-byte Folded Reload + ldr x12, [sp, #552] // 8-byte Folded Reload + mov x1, xzr + add x10, x8, #48 + ldr x13, [sp, #544] // 8-byte Folded Reload + mov w15, #1 // =0x1 + mov w16, #2 // =0x2 + mov w17, #3 // =0x3 + mov w14, #4 // =0x4 + add x11, x8, x11 + b .LBB0_62 + .p2align 2 +.LBB0_61: // %.loopexit34 + // in Loop: Header=BB0_62 Depth=3 + add x13, x13, x23 + add x12, x12, x23 + mov x1, x14 + mov x14, x18 +.LBB0_62: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_64 Depth 4 + // Child Loop BB0_66 Depth 4 + madd x18, x1, x28, x9 + ldr x4, [sp, #448] // 8-byte Folded Reload + add x18, x18, x4 + madd x15, x15, x28, x9 + madd x16, x16, x28, x9 + madd x17, x17, x28, x9 + add x15, x15, x4 + add x16, x16, x4 + lsl x18, x18, #2 + lsl x15, x15, #2 + lsl x16, x16, #2 + str q0, [x7, x18] + str q1, [x7, x15] + add x15, x17, x4 + str q2, [x7, x16] + lsl x15, x15, #2 + str q3, [x7, x15] + ldr x15, [sp, #680] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_67 +// %bb.63: // in Loop: Header=BB0_62 Depth=3 + madd x2, x14, x28, x9 + add x15, x14, #1 + add x17, x14, #3 + add x16, x14, #2 + madd x3, x16, x28, x9 + ldr q16, [x8] + mov x1, xzr + add x18, x14, #4 + add x2, x2, x4 + lsl x2, x2, #2 + add x3, x3, x4 + lsl x3, x3, #2 + ldr q0, [x7, x2] + madd x2, x15, x28, x9 + add x2, x2, x4 + ldr q2, [x7, x3] + ldr x3, [sp, #664] // 8-byte Folded Reload + lsl x2, x2, #2 + ldr q1, [x7, x2] + madd x2, x17, x28, x9 + add x2, x2, x4 + ldr x4, [sp, #672] // 8-byte Folded Reload + lsl x2, x2, #2 + ldr q3, [x7, x2] + ldr x2, [sp, #656] // 8-byte Folded Reload + mul x2, x25, x2 + madd x2, x4, x3, x2 + madd x3, x14, x22, x2 + lsl x3, x3, #2 + ldr q7, [x26, x3] + madd x3, x15, x22, x2 + lsl x3, x3, #2 + ldr q6, [x26, x3] + madd x3, x16, x22, x2 + madd x2, x17, x22, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q5, [x26, x3] + ldr q4, [x26, x2] + mov x2, x10 + mov x3, x13 + cmp xzr, x19 + b.ge .LBB0_65 + .p2align 2 +.LBB0_64: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_62 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x2, #32 + fmla v0.4s, v16.4s, v7.s[0] + fmla v1.4s, v16.4s, v6.s[0] + add x1, x1, #4 + fmla v2.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, v4.s[0] + prfm pldl1keep, [x4] + add x4, x3, x21 + ldp q16, q17, [x2, #-32] + fmla v0.4s, v16.4s, v7.s[1] + fmla v1.4s, v16.4s, v6.s[1] + fmla v2.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, 
v4.s[1] + fmla v0.4s, v17.4s, v7.s[2] + fmla v1.4s, v17.4s, v6.s[2] + fmla v2.4s, v17.4s, v5.s[2] + fmla v3.4s, v17.4s, v4.s[2] + ldp q17, q16, [x2], #64 + prfm pldl1keep, [x3] + fmla v0.4s, v17.4s, v7.s[3] + ldur q7, [x3, #-16] + prfm pldl1keep, [x4] + fmla v1.4s, v17.4s, v6.s[3] + ldur q6, [x4, #-16] + add x4, x4, x21 + fmla v2.4s, v17.4s, v5.s[3] + fmla v3.4s, v17.4s, v4.s[3] + add x3, x3, #16 + prfm pldl1keep, [x4] + ldur q5, [x4, #-16] + add x4, x4, x21 + prfm pldl1keep, [x4] + ldur q4, [x4, #-16] + cmp x1, x19 + b.lt .LBB0_64 +.LBB0_65: // in Loop: Header=BB0_62 Depth=3 + ldr x1, [sp, #648] // 8-byte Folded Reload + fmla v0.4s, v16.4s, v7.s[0] + fmla v1.4s, v16.4s, v6.s[0] + mov x2, x12 + fmla v2.4s, v16.4s, v5.s[0] + fmla v3.4s, v16.4s, v4.s[0] + mov x3, x29 + ldr q17, [x8, x1, lsl #4] + ldr x1, [sp, #640] // 8-byte Folded Reload + fmla v0.4s, v17.4s, v7.s[1] + ldr q16, [x8, x1, lsl #4] + ldr x1, [sp, #632] // 8-byte Folded Reload + fmla v1.4s, v17.4s, v6.s[1] + fmla v2.4s, v17.4s, v5.s[1] + fmla v3.4s, v17.4s, v4.s[1] + ldr q18, [x8, x1, lsl #4] + mov x1, x11 + fmla v0.4s, v16.4s, v7.s[2] + fmla v1.4s, v16.4s, v6.s[2] + fmla v2.4s, v16.4s, v5.s[2] + fmla v3.4s, v16.4s, v4.s[2] + fmla v0.4s, v18.4s, v7.s[3] + fmla v1.4s, v18.4s, v6.s[3] + fmla v2.4s, v18.4s, v5.s[3] + fmla v3.4s, v18.4s, v4.s[3] + cmp x29, x20 + b.ge .LBB0_61 + .p2align 2 +.LBB0_66: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_62 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x2, x21 + prfm pldl1keep, [x1] + ldur q4, [x1, #-16] + add x3, x3, #1 + prfm pldl1keep, [x2] + ldur s5, [x2, #-4] + add x2, x2, #4 + add x1, x1, #16 + prfm pldl1keep, [x4] + ldur s6, [x4, #-4] + add x4, x4, x21 + fmla v0.4s, v4.4s, v5.s[0] + prfm pldl1keep, [x4] + ldur s7, [x4, #-4] + add x4, x4, x21 + prfm pldl1keep, [x4] + ldur s16, [x4, #-4] + fmla v1.4s, v4.4s, v6.s[0] + fmla v2.4s, v4.4s, v7.s[0] + fmla v3.4s, v4.4s, v16.s[0] + cmp x3, x20 + b.lt .LBB0_66 + b .LBB0_61 + .p2align 2 +.LBB0_67: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #680] // 8-byte Folded Reload + ldr x13, [sp, #592] // 8-byte Folded Reload + cmp x12, x13 + b.ge .LBB0_73 +// %bb.68: // in Loop: Header=BB0_7 Depth=2 + ldr x13, [sp, #656] // 8-byte Folded Reload + ldr x15, [sp, #664] // 8-byte Folded Reload + mov x14, xzr + mul x13, x25, x13 + ldr x18, [sp, #672] // 8-byte Folded Reload + ldr x16, [sp, #680] // 8-byte Folded Reload + madd x12, x16, x28, x9 + ldr x17, [sp, #448] // 8-byte Folded Reload + ldr q4, [x8] + madd x15, x18, x15, x13 + madd x13, x16, x22, x15 + add x16, x16, #1 + madd x15, x16, x22, x15 + add x12, x12, x17 + add x12, x7, x12, lsl #2 + lsl x13, x13, #2 + ldr q2, [x26, x13] + madd x13, x16, x28, x9 + lsl x15, x15, #2 + ldr q0, [x12] + ldr x16, [sp, #536] // 8-byte Folded Reload + ldr q3, [x26, x15] + mov x15, x10 + add x13, x13, x17 + add x13, x7, x13, lsl #2 + ldr q1, [x13] + cmp xzr, x19 + b.ge .LBB0_70 + .p2align 2 +.LBB0_69: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x3, x15, #32 + ldr x17, [sp, #608] // 8-byte Folded Reload + ldr x1, [sp, #600] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v2.s[0] + prfm pldl1keep, [x3] + fmla v1.4s, v4.4s, v3.s[0] + ldp q4, q5, [x15, #-32] + add x14, x14, #4 + add x17, x17, x16 + add x1, x1, x16 + add x16, x16, #16 + add x18, x17, #32 + add x2, x1, #32 + fmla v0.4s, v4.4s, v2.s[1] + fmla v1.4s, v4.4s, v3.s[1] + fmla v0.4s, v5.4s, v2.s[2] + fmla v1.4s, v5.4s, v3.s[2] + ldp q5, q4, [x15], #64 + prfm 
pldl1keep, [x2] + fmla v0.4s, v5.4s, v2.s[3] + ldr q2, [x1, #16] + prfm pldl1keep, [x18] + fmla v1.4s, v5.4s, v3.s[3] + ldr q3, [x17, #16] + cmp x14, x19 + b.lt .LBB0_69 +.LBB0_70: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #648] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v2.s[0] + fmla v1.4s, v4.4s, v3.s[0] + mov x15, x29 + ldr q5, [x8, x14, lsl #4] + ldr x14, [sp, #640] // 8-byte Folded Reload + fmla v0.4s, v5.4s, v2.s[1] + ldr q4, [x8, x14, lsl #4] + ldr x14, [sp, #632] // 8-byte Folded Reload + fmla v1.4s, v5.4s, v3.s[1] + ldr q5, [x8, x14, lsl #4] + ldr x14, [sp, #536] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v2.s[2] + fmla v1.4s, v4.4s, v3.s[2] + fmla v0.4s, v5.4s, v2.s[3] + fmla v1.4s, v5.4s, v3.s[3] + cmp x29, x20 + b.ge .LBB0_72 + .p2align 2 +.LBB0_71: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x16, x5, x14 + add x17, x6, x14 + prfm pldl1keep, [x11] + ldur q2, [x11, #-16] + add x16, x16, #4 + add x17, x17, #4 + add x15, x15, #1 + add x11, x11, #16 + prfm pldl1keep, [x17] + ldr s3, [x6, x14] + prfm pldl1keep, [x16] + ldr s4, [x5, x14] + add x14, x14, #4 + fmla v0.4s, v2.4s, v3.s[0] + fmla v1.4s, v2.4s, v4.s[0] + cmp x15, x20 + b.lt .LBB0_71 +.LBB0_72: // in Loop: Header=BB0_7 Depth=2 + str q0, [x12] + str q1, [x13] +.LBB0_73: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #528] // 8-byte Folded Reload + ldr x12, [sp, #592] // 8-byte Folded Reload + cmp x12, x11 + b.ge .LBB0_79 +// %bb.74: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #592] // 8-byte Folded Reload + ldr x12, [sp, #448] // 8-byte Folded Reload + mov x11, xzr + madd x9, x14, x28, x9 + ldr x13, [sp, #664] // 8-byte Folded Reload + ldr x15, [sp, #672] // 8-byte Folded Reload + ldr q2, [x8] + add x9, x9, x12 + ldr x12, [sp, #656] // 8-byte Folded Reload + add x9, x7, x9, lsl #2 + ldr q0, [x9] + mul x12, x25, x12 + madd x12, x15, x13, x12 + madd x12, x14, x22, x12 + lsl x12, x12, #2 + ldr q1, [x26, x12] + ldp x12, x14, [sp, #456] // 16-byte Folded Reload + cmp xzr, x19 + b.ge .LBB0_76 + .p2align 2 +.LBB0_75: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x10, #32 + fmla v0.4s, v2.4s, v1.s[0] + add x11, x11, #4 + prfm pldl1keep, [x13] + ldp q2, q3, [x10, #-32] + fmla v0.4s, v2.4s, v1.s[1] + fmla v0.4s, v3.4s, v1.s[2] + ldp q3, q2, [x10], #64 + prfm pldl1keep, [x12] + fmla v0.4s, v3.4s, v1.s[3] + ldur q1, [x12, #-16] + add x12, x12, #16 + cmp x11, x19 + b.lt .LBB0_75 +.LBB0_76: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #648] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[0] + ldr x12, [sp, #416] // 8-byte Folded Reload + mov x10, xzr + ldr q3, [x8, x11, lsl #4] + ldr x11, [sp, #640] // 8-byte Folded Reload + fmla v0.4s, v3.4s, v1.s[1] + ldr q2, [x8, x11, lsl #4] + ldr x11, [sp, #632] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[2] + ldr q3, [x8, x11, lsl #4] + ldr x11, [sp, #184] // 8-byte Folded Reload + add x8, x8, x11 + mov w11, #16 // =0x10 + fmla v0.4s, v3.4s, v1.s[3] + add x13, x29, xzr + cmp x13, x20 + b.ge .LBB0_78 + .p2align 2 +.LBB0_77: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x8, x11 + add x11, x11, #16 + prfm pldl1keep, [x13] + ldr q1, [x8, x10, lsl #4] + prfm pldl1keep, [x12] + add x12, x12, #4 + ldr s2, [x14, x10, lsl #2] + add x10, x10, #1 + fmla v0.4s, v1.4s, v2.s[0] + add x13, x29, x10 + cmp x13, x20 + b.lt .LBB0_77 +.LBB0_78: // in Loop: Header=BB0_7 Depth=2 + str q0, [x9] 
+.LBB0_79: // in Loop: Header=BB0_7 Depth=2 + bl free + ldp x9, x8, [sp, #432] // 16-byte Folded Reload + ldr x16, [sp, #560] // 8-byte Folded Reload + cmp x8, x9 + b.ge .LBB0_35 +.LBB0_80: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #240] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #520] // 8-byte Folded Reload + ldr x9, [sp, #512] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldr x12, [sp, #672] // 8-byte Folded Reload + ldr x5, [sp, #584] // 8-byte Folded Reload + mul x8, x25, x8 + ldr x13, [sp, #576] // 8-byte Folded Reload + ldr x17, [sp, #560] // 8-byte Folded Reload + madd x9, x12, x9, x8 + ldr x8, [sp, #440] // 8-byte Folded Reload + add x8, x9, x8 + lsl x12, x8, #2 + ldr d0, [x5, x12] + add x12, x8, x28 + lsl x12, x12, #2 + ldr d1, [x5, x12] + add x12, x8, x13 + lsl x12, x12, #2 + ldr d2, [x5, x12] + add x12, x13, x28 + add x8, x8, x12 + lsl x8, x8, #2 + ldr d3, [x5, x8] + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + cmp xzr, x20 + b.ge .LBB0_82 + .p2align 2 +.LBB0_81: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ldp x12, x16, [sp, #400] // 16-byte Folded Reload + add x15, x17, x10 + prfm pldl1keep, [x15] + ldur s4, [x15, #-4] + add x15, x15, x21 + prfm pldl1keep, [x15] + ldur s5, [x15, #-4] + add x15, x15, x21 + prfm pldl1keep, [x15] + ldur s6, [x15, #-4] + add x15, x15, x21 + add x12, x12, x10 + add x14, x16, x10 + prfm pldl1keep, [x15] + ldur s7, [x15, #-4] + add x13, x12, #4 + add x14, x14, #4 + prfm pldl1keep, [x14] + prfm pldl1keep, [x13] + ldr s16, [x16, x10] + add x10, x10, #4 + ld1 { v16.s }[1], [x12] + str d16, [x8, x11, lsl #3] + add x11, x11, #1 + fmla v0.2s, v16.2s, v4.s[0] + fmla v1.2s, v16.2s, v5.s[0] + fmla v2.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v7.s[0] + cmp x11, x20 + b.lt .LBB0_81 +.LBB0_82: // %.preheader37 + // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #200] // 8-byte Folded Reload + ldr x12, [sp, #552] // 8-byte Folded Reload + mov x1, xzr + add x10, x8, #24 + ldr x13, [sp, #544] // 8-byte Folded Reload + mov w15, #1 // =0x1 + mov w16, #2 // =0x2 + mov w17, #3 // =0x3 + mov w14, #4 // =0x4 + add x11, x8, x11 + b .LBB0_84 + .p2align 2 +.LBB0_83: // %.loopexit33 + // in Loop: Header=BB0_84 Depth=3 + add x13, x13, x23 + add x12, x12, x23 + mov x1, x14 + mov x14, x18 +.LBB0_84: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_86 Depth 4 + // Child Loop BB0_88 Depth 4 + madd x18, x1, x28, x9 + ldr x4, [sp, #440] // 8-byte Folded Reload + add x18, x18, x4 + madd x15, x15, x28, x9 + madd x16, x16, x28, x9 + madd x17, x17, x28, x9 + add x15, x15, x4 + add x16, x16, x4 + lsl x18, x18, #2 + lsl x15, x15, #2 + lsl x16, x16, #2 + str d0, [x5, x18] + str d1, [x5, x15] + add x15, x17, x4 + str d2, [x5, x16] + lsl x15, x15, #2 + str d3, [x5, x15] + ldr x15, [sp, #680] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_89 +// %bb.85: // in Loop: Header=BB0_84 Depth=3 + madd x2, x14, x28, x9 + add x15, x14, #1 + add x17, x14, #3 + add x16, x14, #2 + madd x3, x16, x28, x9 + ldr d16, [x8] + mov x1, xzr + add x18, x14, #4 + add x2, x2, x4 + lsl x2, x2, #2 + add x3, x3, x4 + lsl x3, x3, #2 + ldr d0, [x5, x2] + madd x2, x15, x28, x9 + add x2, x2, x4 + ldr d2, [x5, x3] + ldr x3, [sp, #664] // 8-byte Folded Reload + lsl x2, x2, #2 + ldr d1, [x5, x2] + madd x2, x17, x28, x9 + add x2, x2, x4 + ldr x4, [sp, #672] // 8-byte Folded Reload + lsl x2, x2, #2 + ldr d3, [x5, x2] + ldr x2, [sp, #656] // 8-byte Folded 
Reload + mul x2, x25, x2 + madd x2, x4, x3, x2 + madd x3, x14, x22, x2 + lsl x3, x3, #2 + ldr q7, [x26, x3] + madd x3, x15, x22, x2 + lsl x3, x3, #2 + ldr q6, [x26, x3] + madd x3, x16, x22, x2 + madd x2, x17, x22, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q5, [x26, x3] + ldr q4, [x26, x2] + mov x2, x10 + mov x3, x13 + cmp xzr, x19 + b.ge .LBB0_87 + .p2align 2 +.LBB0_86: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_84 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x2, #16 + fmla v0.2s, v16.2s, v7.s[0] + fmla v1.2s, v16.2s, v6.s[0] + add x1, x1, #4 + fmla v2.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + prfm pldl1keep, [x4] + add x4, x3, x21 + ldp d16, d17, [x2, #-16] + fmla v0.2s, v16.2s, v7.s[1] + fmla v1.2s, v16.2s, v6.s[1] + fmla v2.2s, v16.2s, v5.s[1] + fmla v3.2s, v16.2s, v4.s[1] + fmla v0.2s, v17.2s, v7.s[2] + fmla v1.2s, v17.2s, v6.s[2] + fmla v2.2s, v17.2s, v5.s[2] + fmla v3.2s, v17.2s, v4.s[2] + ldp d17, d16, [x2], #32 + prfm pldl1keep, [x3] + fmla v0.2s, v17.2s, v7.s[3] + ldur q7, [x3, #-16] + prfm pldl1keep, [x4] + fmla v1.2s, v17.2s, v6.s[3] + ldur q6, [x4, #-16] + add x4, x4, x21 + fmla v2.2s, v17.2s, v5.s[3] + fmla v3.2s, v17.2s, v4.s[3] + add x3, x3, #16 + prfm pldl1keep, [x4] + ldur q5, [x4, #-16] + add x4, x4, x21 + prfm pldl1keep, [x4] + ldur q4, [x4, #-16] + cmp x1, x19 + b.lt .LBB0_86 +.LBB0_87: // in Loop: Header=BB0_84 Depth=3 + ldr x1, [sp, #648] // 8-byte Folded Reload + fmla v0.2s, v16.2s, v7.s[0] + fmla v1.2s, v16.2s, v6.s[0] + mov x2, x12 + fmla v2.2s, v16.2s, v5.s[0] + fmla v3.2s, v16.2s, v4.s[0] + mov x3, x29 + ldr d17, [x8, x1, lsl #3] + ldr x1, [sp, #640] // 8-byte Folded Reload + fmla v0.2s, v17.2s, v7.s[1] + ldr d16, [x8, x1, lsl #3] + ldr x1, [sp, #632] // 8-byte Folded Reload + fmla v1.2s, v17.2s, v6.s[1] + fmla v2.2s, v17.2s, v5.s[1] + fmla v3.2s, v17.2s, v4.s[1] + ldr d18, [x8, x1, lsl #3] + mov x1, x11 + fmla v0.2s, v16.2s, v7.s[2] + fmla v1.2s, v16.2s, v6.s[2] + fmla v2.2s, v16.2s, v5.s[2] + fmla v3.2s, v16.2s, v4.s[2] + fmla v0.2s, v18.2s, v7.s[3] + fmla v1.2s, v18.2s, v6.s[3] + fmla v2.2s, v18.2s, v5.s[3] + fmla v3.2s, v18.2s, v4.s[3] + cmp x29, x20 + b.ge .LBB0_83 + .p2align 2 +.LBB0_88: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_84 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x2, x21 + prfm pldl1keep, [x1] + ldur d4, [x1, #-8] + add x3, x3, #1 + prfm pldl1keep, [x2] + ldur s5, [x2, #-4] + add x2, x2, #4 + add x1, x1, #8 + prfm pldl1keep, [x4] + ldur s6, [x4, #-4] + add x4, x4, x21 + fmla v0.2s, v4.2s, v5.s[0] + prfm pldl1keep, [x4] + ldur s7, [x4, #-4] + add x4, x4, x21 + prfm pldl1keep, [x4] + ldur s16, [x4, #-4] + fmla v1.2s, v4.2s, v6.s[0] + fmla v2.2s, v4.2s, v7.s[0] + fmla v3.2s, v4.2s, v16.s[0] + cmp x3, x20 + b.lt .LBB0_88 + b .LBB0_83 + .p2align 2 +.LBB0_89: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #680] // 8-byte Folded Reload + ldr x12, [sp, #592] // 8-byte Folded Reload + cmp x11, x12 + b.ge .LBB0_95 +// %bb.90: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #656] // 8-byte Folded Reload + ldr x14, [sp, #664] // 8-byte Folded Reload + mov x13, xzr + mul x12, x25, x12 + ldr x17, [sp, #672] // 8-byte Folded Reload + ldr x15, [sp, #680] // 8-byte Folded Reload + madd x11, x15, x28, x9 + ldr x16, [sp, #440] // 8-byte Folded Reload + ldr d4, [x8] + madd x14, x17, x14, x12 + madd x12, x15, x22, x14 + add x15, x15, #1 + madd x14, x15, x22, x14 + add x11, x11, x16 + add x11, x5, x11, lsl #2 + lsl x12, x12, #2 + ldr q2, [x26, x12] + 
madd x12, x15, x28, x9 + lsl x14, x14, #2 + ldr d0, [x11] + ldr x15, [sp, #536] // 8-byte Folded Reload + ldr q3, [x26, x14] + mov x14, x10 + add x12, x12, x16 + add x12, x5, x12, lsl #2 + ldr d1, [x12] + cmp xzr, x19 + b.ge .LBB0_92 + .p2align 2 +.LBB0_91: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x2, x14, #16 + ldr x16, [sp, #608] // 8-byte Folded Reload + ldr x18, [sp, #600] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v2.s[0] + prfm pldl1keep, [x2] + fmla v1.2s, v4.2s, v3.s[0] + ldp d4, d5, [x14, #-16] + add x13, x13, #4 + add x16, x16, x15 + add x18, x18, x15 + add x15, x15, #16 + add x17, x16, #32 + add x1, x18, #32 + fmla v0.2s, v4.2s, v2.s[1] + fmla v1.2s, v4.2s, v3.s[1] + fmla v0.2s, v5.2s, v2.s[2] + fmla v1.2s, v5.2s, v3.s[2] + ldp d5, d4, [x14], #32 + prfm pldl1keep, [x1] + fmla v0.2s, v5.2s, v2.s[3] + ldr q2, [x18, #16] + prfm pldl1keep, [x17] + fmla v1.2s, v5.2s, v3.s[3] + ldr q3, [x16, #16] + cmp x13, x19 + b.lt .LBB0_91 +.LBB0_92: // in Loop: Header=BB0_7 Depth=2 + ldr x15, [sp, #648] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v2.s[0] + fmla v1.2s, v4.2s, v3.s[0] + ldr x1, [sp, #352] // 8-byte Folded Reload + ldr x2, [sp, #424] // 8-byte Folded Reload + mov x13, xzr + mov x14, xzr + ldr d5, [x8, x15, lsl #3] + ldr x15, [sp, #640] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v2.s[1] + ldr d4, [x8, x15, lsl #3] + ldr x15, [sp, #632] // 8-byte Folded Reload + fmla v1.2s, v5.2s, v3.s[1] + ldr d5, [x8, x15, lsl #3] + ldr x15, [sp, #224] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v2.s[2] + fmla v1.2s, v4.2s, v3.s[2] + add x15, x8, x15 + fmla v0.2s, v5.2s, v2.s[3] + fmla v1.2s, v5.2s, v3.s[3] + add x16, x29, xzr + cmp x16, x20 + b.ge .LBB0_94 + .p2align 2 +.LBB0_93: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x18, x15, x14, lsl #3 + add x16, x2, x13 + add x17, x1, x13 + add x13, x13, #4 + add x17, x17, #4 + add x16, x16, #4 + add x18, x18, #8 + prfm pldl1keep, [x18] + ldr d2, [x15, x14, lsl #3] + prfm pldl1keep, [x17] + ldr s3, [x1, x14, lsl #2] + prfm pldl1keep, [x16] + fmla v0.2s, v2.2s, v3.s[0] + ldr s4, [x2, x14, lsl #2] + fmla v1.2s, v2.2s, v4.s[0] + add x14, x14, #1 + add x16, x29, x14 + cmp x16, x20 + b.lt .LBB0_93 +.LBB0_94: // in Loop: Header=BB0_7 Depth=2 + str d0, [x11] + str d1, [x12] +.LBB0_95: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #528] // 8-byte Folded Reload + ldr x12, [sp, #592] // 8-byte Folded Reload + cmp x12, x11 + b.ge .LBB0_101 +// %bb.96: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #592] // 8-byte Folded Reload + ldr x12, [sp, #440] // 8-byte Folded Reload + mov x11, xzr + madd x9, x14, x28, x9 + ldr x13, [sp, #664] // 8-byte Folded Reload + ldr x15, [sp, #672] // 8-byte Folded Reload + ldr d1, [x8] + add x9, x9, x12 + ldr x12, [sp, #656] // 8-byte Folded Reload + add x9, x5, x9, lsl #2 + ldr d0, [x9] + mul x12, x25, x12 + madd x12, x15, x13, x12 + madd x12, x14, x22, x12 + lsl x12, x12, #2 + ldr q2, [x26, x12] + ldp x12, x14, [sp, #456] // 16-byte Folded Reload + cmp xzr, x19 + b.ge .LBB0_98 + .p2align 2 +.LBB0_97: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x10, #16 + fmla v0.2s, v1.2s, v2.s[0] + add x11, x11, #4 + prfm pldl1keep, [x13] + ldp d1, d3, [x10, #-16] + fmla v0.2s, v1.2s, v2.s[1] + fmla v0.2s, v3.2s, v2.s[2] + ldp d3, d1, [x10], #32 + prfm pldl1keep, [x12] + fmla v0.2s, v3.2s, v2.s[3] + ldur q2, [x12, #-16] + add x12, x12, #16 + cmp 
x11, x19 + b.lt .LBB0_97 +.LBB0_98: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #648] // 8-byte Folded Reload + fmla v0.2s, v1.2s, v2.s[0] + mov x10, xzr + ldr d3, [x8, x11, lsl #3] + ldr x11, [sp, #640] // 8-byte Folded Reload + fmla v0.2s, v3.2s, v2.s[1] + ldr d4, [x8, x11, lsl #3] + ldr x11, [sp, #632] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v2.s[2] + ldr d1, [x8, x11, lsl #3] + ldr x11, [sp, #224] // 8-byte Folded Reload + add x8, x8, x11 + ldr x11, [sp, #416] // 8-byte Folded Reload + fmla v0.2s, v1.2s, v2.s[3] + add x12, x29, xzr + cmp x12, x20 + b.ge .LBB0_100 + .p2align 2 +.LBB0_99: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x12, x8, x10, lsl #3 + add x12, x12, #8 + prfm pldl1keep, [x12] + ldr d1, [x8, x10, lsl #3] + prfm pldl1keep, [x11] + add x11, x11, #4 + ldr s2, [x14, x10, lsl #2] + add x10, x10, #1 + fmla v0.2s, v1.2s, v2.s[0] + add x12, x29, x10 + cmp x12, x20 + b.lt .LBB0_99 +.LBB0_100: // in Loop: Header=BB0_7 Depth=2 + str d0, [x9] +.LBB0_101: // in Loop: Header=BB0_7 Depth=2 + bl free + ldr x9, [sp, #432] // 8-byte Folded Reload + ldr x16, [sp, #560] // 8-byte Folded Reload + ldr x8, [sp, #296] // 8-byte Folded Reload + cmp x9, x8 + b.ge .LBB0_6 +.LBB0_102: // in Loop: Header=BB0_7 Depth=2 + ldr x8, [sp, #272] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + ldr x8, [sp, #520] // 8-byte Folded Reload + ldr x9, [sp, #512] // 8-byte Folded Reload + mov x10, xzr + mov x11, xzr + ldr x12, [sp, #672] // 8-byte Folded Reload + ldr x13, [sp, #576] // 8-byte Folded Reload + mul x8, x25, x8 + ldr x6, [sp, #584] // 8-byte Folded Reload + ldr x14, [sp, #560] // 8-byte Folded Reload + madd x9, x12, x9, x8 + ldr x8, [sp, #432] // 8-byte Folded Reload + add x12, x9, x8 + add x8, x13, x28 + add x8, x12, x8 + add x13, x12, x13 + ldr s1, [x6, x12, lsl #2] + add x12, x12, x28 + ldr s0, [x6, x8, lsl #2] + ldr s2, [x6, x13, lsl #2] + ldr s3, [x6, x12, lsl #2] + ldr x12, [sp, #328] // 8-byte Folded Reload + add x8, x0, #63 + and x8, x8, #0xffffffffffffffc0 + cmp xzr, x20 + b.ge .LBB0_104 + .p2align 2 +.LBB0_103: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x14, x10 + add x11, x11, #1 + prfm pldl1keep, [x13] + ldur s4, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s5, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s6, [x13, #-4] + add x13, x13, x21 + prfm pldl1keep, [x13] + ldur s7, [x13, #-4] + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x12, x12, #4 + fmla v1.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + fmla v2.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + str s16, [x8, x10] + add x10, x10, #4 + cmp x11, x20 + b.lt .LBB0_103 +.LBB0_104: // %.preheader36 + // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #192] // 8-byte Folded Reload + ldr x12, [sp, #552] // 8-byte Folded Reload + mov x1, xzr + add x10, x8, #12 + ldr x13, [sp, #544] // 8-byte Folded Reload + mov w16, #1 // =0x1 + mov w17, #2 // =0x2 + mov w15, #3 // =0x3 + mov w14, #4 // =0x4 + add x11, x8, x11 + b .LBB0_106 + .p2align 2 +.LBB0_105: // %.loopexit32 + // in Loop: Header=BB0_106 Depth=3 + add x13, x13, x23 + add x12, x12, x23 + mov x1, x14 + mov x14, x18 +.LBB0_106: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB0_108 Depth 4 + // Child Loop BB0_110 Depth 4 + madd x18, x1, x28, x9 + ldr x5, [sp, #432] // 8-byte Folded Reload + add x18, x18, x5 + madd x16, x16, x28, x9 + madd 
x17, x17, x28, x9 + madd x15, x15, x28, x9 + add x16, x16, x5 + add x15, x15, x5 + str s1, [x6, x18, lsl #2] + str s3, [x6, x16, lsl #2] + add x16, x17, x5 + str s2, [x6, x16, lsl #2] + str s0, [x6, x15, lsl #2] + ldr x15, [sp, #680] // 8-byte Folded Reload + cmp x14, x15 + b.ge .LBB0_111 +// %bb.107: // in Loop: Header=BB0_106 Depth=3 + madd x2, x14, x28, x9 + add x15, x14, #3 + add x16, x14, #1 + add x17, x14, #2 + madd x3, x16, x28, x9 + ldr s16, [x8] + mov x1, xzr + add x18, x14, #4 + madd x4, x17, x28, x9 + add x2, x2, x5 + ldr s1, [x6, x2, lsl #2] + madd x2, x15, x28, x9 + add x4, x4, x5 + ldr s2, [x6, x4, lsl #2] + ldr x4, [sp, #672] // 8-byte Folded Reload + add x2, x2, x5 + ldr s0, [x6, x2, lsl #2] + add x2, x3, x5 + ldr x3, [sp, #664] // 8-byte Folded Reload + ldr s3, [x6, x2, lsl #2] + ldr x2, [sp, #656] // 8-byte Folded Reload + mul x2, x25, x2 + madd x2, x4, x3, x2 + madd x3, x14, x22, x2 + lsl x3, x3, #2 + ldr q7, [x26, x3] + madd x3, x16, x22, x2 + lsl x3, x3, #2 + ldr q6, [x26, x3] + madd x3, x17, x22, x2 + madd x2, x15, x22, x2 + lsl x3, x3, #2 + lsl x2, x2, #2 + ldr q5, [x26, x3] + ldr q4, [x26, x2] + mov x2, x10 + mov x3, x13 + ext v20.16b, v7.16b, v7.16b, #8 + cmp xzr, x19 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.ge .LBB0_109 + .p2align 2 +.LBB0_108: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_106 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x2, #8 + fmla v1.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + add x1, x1, #4 + fmla v2.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + prfm pldl1keep, [x4] + add x4, x3, x21 + ldp s16, s21, [x2, #-8] + fmla v0.2s, v16.2s, v4.s[1] + fmla v1.2s, v16.2s, v7.s[1] + fmla v3.2s, v16.2s, v6.s[1] + fmla v2.2s, v16.2s, v5.s[1] + fmla v0.2s, v21.2s, v17.2s + fmla v1.2s, v21.2s, v20.2s + ldp s17, s16, [x2], #16 + fmla v3.2s, v21.2s, v19.2s + fmla v2.2s, v21.2s, v18.2s + prfm pldl1keep, [x3] + fmla v1.2s, v17.2s, v7.s[3] + ldur q7, [x3, #-16] + prfm pldl1keep, [x4] + fmla v3.2s, v17.2s, v6.s[3] + ldur q6, [x4, #-16] + add x4, x4, x21 + fmla v2.2s, v17.2s, v5.s[3] + fmla v0.2s, v17.2s, v4.s[3] + add x3, x3, #16 + prfm pldl1keep, [x4] + ldur q5, [x4, #-16] + add x4, x4, x21 + prfm pldl1keep, [x4] + ldur q4, [x4, #-16] + ext v20.16b, v7.16b, v7.16b, #8 + cmp x1, x19 + ext v19.16b, v6.16b, v6.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.lt .LBB0_108 +.LBB0_109: // in Loop: Header=BB0_106 Depth=3 + ldr x1, [sp, #648] // 8-byte Folded Reload + fmla v1.2s, v16.2s, v7.2s + fmla v3.2s, v16.2s, v6.2s + mov x2, x12 + fmla v2.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + mov x3, x29 + ldr s21, [x8, x1, lsl #2] + ldr x1, [sp, #640] // 8-byte Folded Reload + fmla v1.2s, v21.2s, v7.s[1] + ldr s16, [x8, x1, lsl #2] + ldr x1, [sp, #632] // 8-byte Folded Reload + fmla v3.2s, v21.2s, v6.s[1] + fmla v2.2s, v21.2s, v5.s[1] + fmla v0.2s, v21.2s, v4.s[1] + ldr s22, [x8, x1, lsl #2] + mov x1, x11 + fmla v1.2s, v16.2s, v20.2s + fmla v3.2s, v16.2s, v19.2s + fmla v2.2s, v16.2s, v18.2s + fmla v0.2s, v16.2s, v17.2s + fmla v1.2s, v22.2s, v7.s[3] + fmla v3.2s, v22.2s, v6.s[3] + fmla v2.2s, v22.2s, v5.s[3] + fmla v0.2s, v22.2s, v4.s[3] + cmp x29, x20 + b.ge .LBB0_105 + .p2align 2 +.LBB0_110: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // Parent Loop BB0_106 Depth=3 + // => This Inner Loop Header: Depth=4 + add x4, x2, x21 + prfm pldl1keep, [x1] + ldur s4, [x1, #-4] + add x3, x3, #1 + prfm pldl1keep, [x2] + 
ldur s5, [x2, #-4] + add x2, x2, #4 + add x1, x1, #4 + prfm pldl1keep, [x4] + ldur s6, [x4, #-4] + add x4, x4, x21 + fmla v1.2s, v4.2s, v5.2s + prfm pldl1keep, [x4] + ldur s7, [x4, #-4] + add x4, x4, x21 + prfm pldl1keep, [x4] + ldur s16, [x4, #-4] + fmla v3.2s, v4.2s, v6.2s + fmla v2.2s, v4.2s, v7.2s + fmla v0.2s, v4.2s, v16.2s + cmp x3, x20 + b.lt .LBB0_110 + b .LBB0_105 + .p2align 2 +.LBB0_111: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #680] // 8-byte Folded Reload + ldr x12, [sp, #592] // 8-byte Folded Reload + cmp x11, x12 + b.ge .LBB0_117 +// %bb.112: // in Loop: Header=BB0_7 Depth=2 + ldr x12, [sp, #656] // 8-byte Folded Reload + ldr x15, [sp, #664] // 8-byte Folded Reload + mov x13, xzr + mov x14, xzr + ldr x18, [sp, #672] // 8-byte Folded Reload + ldr x16, [sp, #680] // 8-byte Folded Reload + mul x12, x25, x12 + madd x11, x16, x28, x9 + ldr x17, [sp, #432] // 8-byte Folded Reload + ldr s4, [x8] + madd x12, x18, x15, x12 + madd x15, x16, x22, x12 + add x11, x11, x17 + ldr s1, [x6, x11, lsl #2] + lsl x15, x15, #2 + ldr q2, [x26, x15] + add x15, x16, #1 + madd x16, x15, x22, x12 + madd x12, x15, x28, x9 + add x12, x12, x17 + lsl x15, x16, #2 + ldr s0, [x6, x12, lsl #2] + ldr q3, [x26, x15] + ext v6.16b, v2.16b, v2.16b, #8 + cmp xzr, x19 + ext v5.16b, v3.16b, v3.16b, #8 + b.ge .LBB0_114 + .p2align 2 +.LBB0_113: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x1, x8, x13 + ldp x15, x17, [sp, #336] // 16-byte Folded Reload + fmla v1.2s, v4.2s, v2.2s + add x2, x1, #20 + fmla v0.2s, v4.2s, v3.2s + add x14, x14, #4 + prfm pldl1keep, [x2] + ldp s4, s7, [x1, #4] + add x15, x15, x13 + add x17, x17, x13 + add x13, x13, #16 + add x16, x15, #32 + add x18, x17, #32 + fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v7.2s, v5.2s + ldp s5, s4, [x1, #12] + fmla v1.2s, v7.2s, v6.2s + prfm pldl1keep, [x18] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x17, #16] + prfm pldl1keep, [x16] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x15, #16] + ext v6.16b, v2.16b, v2.16b, #8 + cmp x14, x19 + ext v5.16b, v3.16b, v3.16b, #8 + b.lt .LBB0_113 +.LBB0_114: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #648] // 8-byte Folded Reload + fmla v1.2s, v4.2s, v2.2s + fmla v0.2s, v4.2s, v3.2s + ldr x1, [sp, #352] // 8-byte Folded Reload + ldr x2, [sp, #424] // 8-byte Folded Reload + mov x13, xzr + mov x15, x29 + ldr s7, [x8, x14, lsl #2] + ldr x14, [sp, #640] // 8-byte Folded Reload + fmla v1.2s, v7.2s, v2.s[1] + ldr s4, [x8, x14, lsl #2] + ldr x14, [sp, #632] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v3.s[1] + ldr s7, [x8, x14, lsl #2] + ldr x14, [sp, #264] // 8-byte Folded Reload + fmla v1.2s, v4.2s, v6.2s + fmla v0.2s, v4.2s, v5.2s + add x14, x8, x14 + fmla v1.2s, v7.2s, v2.s[3] + fmla v0.2s, v7.2s, v3.s[3] + cmp x29, x20 + b.ge .LBB0_116 + .p2align 2 +.LBB0_115: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x16, x2, x13 + add x17, x1, x13 + add x18, x14, x13 + add x15, x15, #1 + add x16, x16, #4 + add x17, x17, #4 + add x18, x18, #4 + prfm pldl1keep, [x18] + ldr s2, [x14, x13] + prfm pldl1keep, [x17] + prfm pldl1keep, [x16] + ldr s3, [x1, x13] + fmla v1.2s, v2.2s, v3.2s + ldr s3, [x2, x13] + add x13, x13, #4 + fmla v0.2s, v2.2s, v3.2s + cmp x15, x20 + b.lt .LBB0_115 +.LBB0_116: // in Loop: Header=BB0_7 Depth=2 + str s1, [x6, x11, lsl #2] + str s0, [x6, x12, lsl #2] +.LBB0_117: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #528] // 8-byte Folded Reload + ldr x12, [sp, #592] 
// 8-byte Folded Reload + cmp x12, x11 + b.ge .LBB0_5 +// %bb.118: // in Loop: Header=BB0_7 Depth=2 + ldr x14, [sp, #592] // 8-byte Folded Reload + ldr x12, [sp, #432] // 8-byte Folded Reload + mov x11, xzr + madd x9, x14, x28, x9 + ldr x13, [sp, #664] // 8-byte Folded Reload + ldr x15, [sp, #672] // 8-byte Folded Reload + ldr s2, [x8] + add x9, x9, x12 + ldr x12, [sp, #656] // 8-byte Folded Reload + ldr s0, [x6, x9, lsl #2] + mul x12, x25, x12 + madd x12, x15, x13, x12 + madd x12, x14, x22, x12 + lsl x12, x12, #2 + ldr q1, [x26, x12] + ldp x12, x14, [sp, #456] // 16-byte Folded Reload + ext v3.16b, v1.16b, v1.16b, #8 + cmp xzr, x19 + b.ge .LBB0_120 + .p2align 2 +.LBB0_119: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x13, x10, #8 + fmla v0.2s, v2.2s, v1.2s + add x11, x11, #4 + prfm pldl1keep, [x13] + ldp s2, s4, [x10, #-8] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v4.2s, v3.2s + ldp s3, s2, [x10], #16 + prfm pldl1keep, [x12] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x12, #-16] + add x12, x12, #16 + ext v3.16b, v1.16b, v1.16b, #8 + cmp x11, x19 + b.lt .LBB0_119 +.LBB0_120: // in Loop: Header=BB0_7 Depth=2 + ldr x11, [sp, #648] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.2s + mov x10, xzr + ldr s4, [x8, x11, lsl #2] + ldr x11, [sp, #640] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[1] + ldr s5, [x8, x11, lsl #2] + ldr x11, [sp, #632] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v3.2s + ldr s2, [x8, x11, lsl #2] + ldr x11, [sp, #264] // 8-byte Folded Reload + add x8, x8, x11 + mov x11, x29 + fmla v0.2s, v2.2s, v1.s[3] + cmp x29, x20 + b.ge .LBB0_4 + .p2align 2 +.LBB0_121: // Parent Loop BB0_2 Depth=1 + // Parent Loop BB0_7 Depth=2 + // => This Inner Loop Header: Depth=3 + add x12, x14, x10 + add x13, x8, x10 + add x11, x11, #1 + add x12, x12, #4 + add x13, x13, #4 + prfm pldl1keep, [x13] + ldr s1, [x8, x10] + prfm pldl1keep, [x12] + ldr s2, [x14, x10] + add x10, x10, #4 + fmla v0.2s, v1.2s, v2.2s + cmp x11, x20 + b.lt .LBB0_121 + b .LBB0_4 +.LBB0_122: + ldr x0, [sp, #8] // 8-byte Folded Reload + bl free + add sp, sp, #688 + ldp d9, d8, [sp, #48] // 16-byte Folded Reload + ldp d11, d10, [sp, #32] // 16-byte Folded Reload + ldp d13, d12, [sp, #16] // 16-byte Folded Reload + ldp x20, x19, [sp, #144] // 16-byte Folded Reload + ldp x22, x21, [sp, #128] // 16-byte Folded Reload + ldp x24, x23, [sp, #112] // 16-byte Folded Reload + ldp x26, x25, [sp, #96] // 16-byte Folded Reload + ldp x28, x27, [sp, #80] // 16-byte Folded Reload + ldp x29, x30, [sp, #64] // 16-byte Folded Reload + ldp d15, d14, [sp], #160 // 16-byte Folded Reload + ret +.Lfunc_end0: + .size sbatch_matmul_4d_nt_mlir, .Lfunc_end0-sbatch_matmul_4d_nt_mlir + .cfi_endproc + // -- End function + .section ".note.GNU-stack","",@progbits diff --git a/third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemm_nn_alpha1_beta1_mlir.s b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemm_nn_alpha1_beta1_mlir.s new file mode 100644 index 00000000000000..efa5087d8c2dfe --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemm_nn_alpha1_beta1_mlir.s @@ -0,0 +1,4104 @@ + .text + .file "LLVMDialectModule" + .globl sgemm_nn_alpha1_beta1_mlir // -- Begin function sgemm_nn_alpha1_beta1_mlir + .p2align 4 + .type sgemm_nn_alpha1_beta1_mlir,@function +sgemm_nn_alpha1_beta1_mlir: // @sgemm_nn_alpha1_beta1_mlir + .cfi_startproc +// %bb.0: + str d12, [sp, #-144]! 
// 8-byte Folded Spill + stp d11, d10, [sp, #16] // 16-byte Folded Spill + stp x29, x30, [sp, #48] // 16-byte Folded Spill + stp x28, x27, [sp, #64] // 16-byte Folded Spill + stp x26, x25, [sp, #80] // 16-byte Folded Spill + stp x24, x23, [sp, #96] // 16-byte Folded Spill + stp x22, x21, [sp, #112] // 16-byte Folded Spill + stp x20, x19, [sp, #128] // 16-byte Folded Spill + stp d9, d8, [sp, #32] // 16-byte Folded Spill + sub sp, sp, #512 + .cfi_def_cfa_offset 656 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -144 + cmp x3, #0 + ldr x29, [sp, #688] + ldr x20, [sp, #656] + mov x22, x5 + cinv x8, x3, lt + ldr x26, [sp, #664] + ldr x27, [sp, #744] + mov x19, x4 + add x10, x8, x8, lsr #63 + add x9, x8, #7 + str x2, [sp, #320] // 8-byte Folded Spill + mov x25, x1 + str x3, [sp, #288] // 8-byte Folded Spill + asr x10, x10, #1 + cinv x23, x10, lt + cmp x8, #0 + add x10, x8, #3 + csel x9, x9, x8, lt + csel x8, x10, x8, lt + cmp x3, #0 + asr x9, x9, #3 + asr x8, x8, #2 + cinv x24, x9, lt + ldr x9, [sp, #680] + cinv x21, x8, lt + cmp x9, #0 + str x9, [sp, #128] // 8-byte Folded Spill + cinv x10, x9, lt + add x8, x10, #7 + cmp x10, #0 + str x10, [sp, #112] // 8-byte Folded Spill + csel x8, x8, x10, lt + cmp x9, #0 + ldr x10, [sp, #712] + ldr x9, [sp, #720] + asr x8, x8, #3 + cinv x8, x8, lt + str x8, [sp, #16] // 8-byte Folded Spill + lsl x8, x8, #3 + str x8, [sp, #384] // 8-byte Folded Spill + lsl x8, x4, #5 + stp x9, x10, [sp, #256] // 16-byte Folded Spill + add x0, x8, #64 + str x8, [sp, #504] // 8-byte Folded Spill + bl malloc + lsl x8, x24, #3 + mul x3, x24, x22 + mov w9, #1 // =0x1 + add x12, x0, #63 + str x8, [sp, #520] // 8-byte Folded Spill + lsl x8, x21, #2 + bfi x9, x21, #2, #62 + and x11, x19, #0x3 + str x8, [sp, #336] // 8-byte Folded Spill + lsl x8, x23, #1 + mul x9, x22, x9 + str x0, [sp, #56] // 8-byte Folded Spill + str x8, [sp, #312] // 8-byte Folded Spill + negs x8, x19 + mul x0, x23, x22 + mul x2, x21, x22 + and x13, x8, #0x3 + and x8, x12, #0xffffffffffffffc0 + ldr x12, [sp, #320] // 8-byte Folded Reload + lsl x5, x19, #2 + csneg x18, x11, x13, mi + add x11, x19, x3, lsl #3 + add x16, x19, x0, lsl #1 + lsl x28, x22, #2 + lsl x6, x18, #2 + add x10, x19, x22, lsl #3 + lsl x4, x22, #5 + mov w15, #28 // =0x1c + sub x23, x11, x18 + add x11, x19, x9 + add x9, x25, x9, lsl #2 + sub x13, x4, x28 + lsl x24, x12, #2 + sub x11, x11, x18 + add x14, x19, x2, lsl #2 + sub x12, x5, x6 + str x9, [sp, #168] // 8-byte Folded Spill + add x9, x24, x11, lsl #2 + lsl x21, x26, #2 + add x17, x12, #4 + madd x1, x29, x15, x20 + sub x15, x16, x18 + add x16, x24, x25 + stp x13, x12, [sp, #64] // 16-byte Folded Spill + sub x12, x10, x18 + sub x10, x14, x18 + madd x14, x29, x17, x21 + add x16, x13, x16 + add x9, x25, x9 + add x16, x16, #16 + str x9, [sp, #480] // 8-byte Folded Spill + add x9, x24, x10, lsl #2 + str x16, [sp, #240] // 8-byte Folded Spill + mov w16, #16 // =0x10 + sub x30, x19, x18 + sub x13, x16, x13 + add x9, x25, x9 + mul x17, x29, x30 + str x13, [sp, #232] // 8-byte Folded Spill + add x13, x20, x14 + stp x13, x9, [sp, #464] // 16-byte Folded Spill + add x9, x24, x15, lsl #2 + ldr x13, [sp, #504] // 8-byte Folded Reload + add x14, x5, x24 + sub x14, x14, 
x6 + add x16, x21, x17, lsl #2 + add x14, x14, x25 + add x9, x9, x25 + sub x10, x30, #3 + stp x17, x18, [sp, #96] // 16-byte Folded Spill + add x11, x20, x29, lsl #5 + add x9, x9, #4 + str x26, [sp, #280] // 8-byte Folded Spill + lsl x26, x29, #4 + stp x20, x29, [sp, #296] // 16-byte Folded Spill + str x9, [sp, #152] // 8-byte Folded Spill + sub x9, x30, #2 + sub x17, x13, x18, lsl #5 + add x13, x14, #4 + add x14, x24, x12, lsl #2 + add x12, x4, x24 + stp x10, x9, [sp, #400] // 16-byte Folded Spill + sub x9, x30, #1 + str x13, [sp, #224] // 8-byte Folded Spill + add x13, x20, x16 + add x12, x12, x25 + str x9, [sp, #416] // 8-byte Folded Spill + mov w9, #20 // =0x14 + str x13, [sp, #456] // 8-byte Folded Spill + add x13, x12, #32 + add x12, x14, x25 + madd x9, x29, x9, x20 + add x12, x12, #4 + stp x12, x13, [sp, #208] // 16-byte Folded Spill + add x12, x24, x3, lsl #5 + add x13, x24, x23, lsl #2 + stp x6, x14, [sp, #80] // 16-byte Folded Spill + mov w10, #24 // =0x18 + lsl x18, x29, #2 + add x12, x12, x25 + madd x6, x29, x10, x20 + add x10, x20, x18 + str x25, [sp, #328] // 8-byte Folded Spill + stp x9, x11, [sp, #440] // 16-byte Folded Spill + add x11, x20, x29, lsl #3 + add x14, x12, #32 + add x12, x13, x25 + add x12, x12, #4 + add x9, x20, x26 + mov x7, xzr + str x5, [sp, #120] // 8-byte Folded Spill + str x11, [sp, #432] // 8-byte Folded Spill + add x11, x8, #64 + stp x12, x14, [sp, #176] // 16-byte Folded Spill + add x12, x17, #32 + str x11, [sp, #192] // 8-byte Folded Spill + mov w11, #12 // =0xc + add x13, x8, x17 + sub x23, x30, #4 + madd x11, x29, x11, x20 + mov x20, x1 + mov x1, x9 + add x9, x8, x12 + stp x9, x4, [sp, #488] // 16-byte Folded Spill + add x9, x24, x0, lsl #3 + add x5, x8, #128 + stp x2, x0, [sp, #40] // 16-byte Folded Spill + mov x0, x10 + stp x9, x3, [sp, #24] // 16-byte Folded Spill + str x12, [sp, #160] // 8-byte Folded Spill + add x9, x25, x9 + str x30, [sp, #504] // 8-byte Folded Spill + str x24, [sp, #344] // 8-byte Folded Spill + str x9, [sp, #144] // 8-byte Folded Spill + add x9, x9, #32 + str x11, [sp, #424] // 8-byte Folded Spill + str x18, [sp, #272] // 8-byte Folded Spill + str x9, [sp, #136] // 8-byte Folded Spill + add x9, x25, x2, lsl #4 + ldr x25, [sp, #384] // 8-byte Folded Reload + str x13, [sp, #200] // 8-byte Folded Spill + str x9, [sp, #248] // 8-byte Folded Spill + b .LBB0_3 + .p2align 2 +.LBB0_1: // in Loop: Header=BB0_3 Depth=1 + stp q1, q0, [x10] +.LBB0_2: // %.backedge + // in Loop: Header=BB0_3 Depth=1 + ldp x9, x11, [sp, #440] // 16-byte Folded Reload + ldr x20, [sp, #376] // 8-byte Folded Reload + add x6, x6, #32 + add x1, x1, #32 + ldp x7, x0, [sp, #352] // 16-byte Folded Reload + ldr x30, [sp, #504] // 8-byte Folded Reload + add x10, x11, #32 + add x20, x20, #32 + add x0, x0, #32 + add x9, x9, #32 + stp x9, x10, [sp, #440] // 16-byte Folded Spill + ldp x9, x11, [sp, #424] // 16-byte Folded Reload + add x10, x11, #32 + add x9, x9, #32 + stp x9, x10, [sp, #424] // 16-byte Folded Spill + ldp x9, x11, [sp, #456] // 16-byte Folded Reload + add x10, x11, #32 + add x9, x9, #32 + stp x9, x10, [sp, #456] // 16-byte Folded Spill +.LBB0_3: // =>This Loop Header: Depth=1 + // Child Loop BB0_5 Depth 2 + // Child Loop BB0_7 Depth 2 + // Child Loop BB0_10 Depth 2 + // Child Loop BB0_12 Depth 3 + // Child Loop BB0_14 Depth 3 + // Child Loop BB0_19 Depth 2 + // Child Loop BB0_21 Depth 2 + // Child Loop BB0_24 Depth 2 + // Child Loop BB0_26 Depth 2 + // Child Loop BB0_29 Depth 2 + // Child Loop BB0_31 Depth 2 + cmp x7, x25 + b.ge .LBB0_32 +// %bb.4: // 
in Loop: Header=BB0_3 Depth=1 + add x10, x7, #8 + add x12, x7, x27, lsl #1 + ldr x17, [sp, #192] // 8-byte Folded Reload + mov x9, xzr + str x10, [sp, #352] // 8-byte Folded Spill + ldp x11, x10, [sp, #256] // 16-byte Folded Reload + stp x0, x1, [sp, #360] // 16-byte Folded Spill + ldp x14, x3, [sp, #232] // 16-byte Folded Reload + ldp x4, x24, [sp, #440] // 16-byte Folded Reload + str x6, [sp, #392] // 8-byte Folded Spill + str x20, [sp, #376] // 8-byte Folded Spill + add x2, x10, x11, lsl #2 + add x11, x27, x7 + lsl x10, x7, #2 + add x11, x2, x11, lsl #2 + add x15, x2, x12, lsl #2 + add x12, x12, x27 + add x13, x2, x10 + add x12, x2, x12, lsl #2 + ldp q1, q0, [x13] + ldp q3, q2, [x11] + add x11, x7, x27, lsl #2 + ldp q6, q5, [x15] + ldp x15, x18, [sp, #424] // 16-byte Folded Reload + ldp q7, q4, [x12] + add x12, x2, x11, lsl #2 + add x11, x11, x27 + add x11, x2, x11, lsl #2 + ldp q17, q16, [x12] + ldp x13, x12, [sp, #320] // 16-byte Folded Reload + ldp q19, q18, [x11] + mov w11, #6 // =0x6 + madd x11, x27, x11, x7 + add x16, x12, x13, lsl #2 + lsl x12, x22, #3 + add x11, x2, x11, lsl #2 + ldr q25, [x16, x12] + ldr x12, [sp, #280] // 8-byte Folded Reload + ldr q26, [x16, x28] + ldr q28, [x16, x22, lsl #4] + ldr q30, [x16] + ldp q21, q20, [x11] + mov w11, #12 // =0xc + mul x11, x22, x11 + ldr q27, [x16, x11] + sub x11, x7, x27 + add x11, x11, x27, lsl #3 + add x11, x2, x11, lsl #2 + ldp q23, q22, [x11] + ldr x11, [sp, #296] // 8-byte Folded Reload + add x11, x11, x12, lsl #2 + mov w12, #20 // =0x14 + mul x12, x22, x12 + add x10, x11, x10 + ldp q8, q9, [x10] + ldr q24, [x16, x12] + mov w12, #24 // =0x18 + mul x12, x22, x12 + ldr q29, [x16, x12] + prfm pldl1keep, [x3] + ldur q31, [x3, #-16] + cmp xzr, x23 + b.ge .LBB0_6 + .p2align 2 +.LBB0_5: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x4, x21 + add x29, x0, x21 + fmla v1.4s, v8.4s, v30.s[0] + fmla v0.4s, v9.4s, v30.s[0] + fmla v3.4s, v8.4s, v26.s[0] + fmla v2.4s, v9.4s, v26.s[0] + stp q8, q9, [x17, #-64] + fmla v6.4s, v8.4s, v25.s[0] + fmla v5.4s, v9.4s, v25.s[0] + prfm pldl1keep, [x12] + add x25, x6, x21 + fmla v7.4s, v8.4s, v27.s[0] + fmla v4.4s, v9.4s, v27.s[0] + add x12, x18, x21 + add x13, x20, x21 + fmla v16.4s, v9.4s, v28.s[0] + fmla v17.4s, v8.4s, v28.s[0] + add x10, x24, x21 + add x9, x9, #4 + fmla v18.4s, v9.4s, v24.s[0] + fmla v19.4s, v8.4s, v24.s[0] + add x24, x24, x26 + add x20, x20, x26 + fmla v20.4s, v9.4s, v29.s[0] + fmla v21.4s, v8.4s, v29.s[0] + add x6, x6, x26 + add x4, x4, x26 + fmla v22.4s, v9.4s, v31.s[0] + fmla v23.4s, v8.4s, v31.s[0] + ldp q8, q9, [x29] + fmla v0.4s, v9.4s, v30.s[1] + fmla v1.4s, v8.4s, v30.s[1] + stp q8, q9, [x17, #-32] + prfm pldl1keep, [x25] + fmla v2.4s, v9.4s, v26.s[1] + fmla v3.4s, v8.4s, v26.s[1] + add x0, x0, x26 + fmla v5.4s, v9.4s, v25.s[1] + fmla v6.4s, v8.4s, v25.s[1] + add x18, x18, x26 + fmla v4.4s, v9.4s, v27.s[1] + fmla v7.4s, v8.4s, v27.s[1] + fmla v17.4s, v8.4s, v28.s[1] + fmla v16.4s, v9.4s, v28.s[1] + fmla v19.4s, v8.4s, v24.s[1] + fmla v18.4s, v9.4s, v24.s[1] + fmla v21.4s, v8.4s, v29.s[1] + fmla v20.4s, v9.4s, v29.s[1] + fmla v23.4s, v8.4s, v31.s[1] + fmla v22.4s, v9.4s, v31.s[1] + ldp q9, q8, [x12] + add x12, x15, x21 + stp q9, q8, [x17] + prfm pldl1keep, [x13] + add x15, x15, x26 + ldp q11, q10, [x12] + add x12, x1, x21 + add x1, x1, x26 + fmla v1.4s, v9.4s, v30.s[2] + fmla v0.4s, v8.4s, v30.s[2] + stp q11, q10, [x17, #32] + fmla v3.4s, v9.4s, v26.s[2] + fmla v2.4s, v8.4s, v26.s[2] + prfm pldl1keep, [x10] + add x10, x3, x14 + fmla v6.4s, 
v9.4s, v25.s[2] + fmla v5.4s, v8.4s, v25.s[2] + add x3, x3, #16 + add x17, x17, #128 + fmla v7.4s, v9.4s, v27.s[2] + fmla v4.4s, v8.4s, v27.s[2] + fmla v16.4s, v8.4s, v28.s[2] + fmla v17.4s, v9.4s, v28.s[2] + fmla v18.4s, v8.4s, v24.s[2] + fmla v19.4s, v9.4s, v24.s[2] + fmla v20.4s, v8.4s, v29.s[2] + fmla v21.4s, v9.4s, v29.s[2] + fmla v22.4s, v8.4s, v31.s[2] + fmla v23.4s, v9.4s, v31.s[2] + ldp q8, q9, [x12] + prfm pldl1keep, [x10] + fmla v0.4s, v10.4s, v30.s[3] + fmla v1.4s, v11.4s, v30.s[3] + ldur q30, [x10, #-16] + add x10, x10, x28 + prfm pldl1keep, [x10] + fmla v2.4s, v10.4s, v26.s[3] + fmla v3.4s, v11.4s, v26.s[3] + ldur q26, [x10, #-16] + add x10, x10, x28 + prfm pldl1keep, [x10] + fmla v5.4s, v10.4s, v25.s[3] + fmla v6.4s, v11.4s, v25.s[3] + ldur q25, [x10, #-16] + add x10, x10, x28 + prfm pldl1keep, [x10] + fmla v4.4s, v10.4s, v27.s[3] + fmla v7.4s, v11.4s, v27.s[3] + ldur q27, [x10, #-16] + add x10, x10, x28 + prfm pldl1keep, [x10] + fmla v17.4s, v11.4s, v28.s[3] + fmla v16.4s, v10.4s, v28.s[3] + ldur q28, [x10, #-16] + add x10, x10, x28 + prfm pldl1keep, [x10] + fmla v19.4s, v11.4s, v24.s[3] + fmla v18.4s, v10.4s, v24.s[3] + ldur q24, [x10, #-16] + add x10, x10, x28 + prfm pldl1keep, [x10] + fmla v21.4s, v11.4s, v29.s[3] + fmla v20.4s, v10.4s, v29.s[3] + ldur q29, [x10, #-16] + fmla v23.4s, v11.4s, v31.s[3] + fmla v22.4s, v10.4s, v31.s[3] + prfm pldl1keep, [x3] + ldur q31, [x3, #-16] + cmp x9, x23 + b.lt .LBB0_5 +.LBB0_6: // in Loop: Header=BB0_3 Depth=1 + ldp x13, x12, [sp, #400] // 16-byte Folded Reload + ldr x14, [sp, #304] // 8-byte Folded Reload + add x10, x8, x23, lsl #5 + fmla v1.4s, v8.4s, v30.s[0] + fmla v0.4s, v9.4s, v30.s[0] + fmla v3.4s, v8.4s, v26.s[0] + fmla v2.4s, v9.4s, v26.s[0] + stp q8, q9, [x10] + fmla v6.4s, v8.4s, v25.s[0] + fmla v5.4s, v9.4s, v25.s[0] + fmla v4.4s, v9.4s, v27.s[0] + fmla v7.4s, v8.4s, v27.s[0] + ldp x1, x18, [sp, #456] // 16-byte Folded Reload + madd x9, x13, x14, x7 + fmla v16.4s, v9.4s, v28.s[0] + fmla v17.4s, v8.4s, v28.s[0] + madd x10, x12, x14, x7 + fmla v18.4s, v9.4s, v24.s[0] + fmla v19.4s, v8.4s, v24.s[0] + add x0, x8, x12, lsl #5 + ldr x12, [sp, #416] // 8-byte Folded Reload + fmla v20.4s, v9.4s, v29.s[0] + fmla v21.4s, v8.4s, v29.s[0] + mov x15, xzr + add x9, x11, x9, lsl #2 + fmla v22.4s, v9.4s, v31.s[0] + fmla v23.4s, v8.4s, v31.s[0] + add x10, x11, x10, lsl #2 + ldp q8, q9, [x9] + add x9, x8, x13, lsl #5 + fmla v0.4s, v9.4s, v30.s[1] + fmla v2.4s, v9.4s, v26.s[1] + fmla v5.4s, v9.4s, v25.s[1] + fmla v4.4s, v9.4s, v27.s[1] + fmla v16.4s, v9.4s, v28.s[1] + fmla v18.4s, v9.4s, v24.s[1] + fmla v20.4s, v9.4s, v29.s[1] + fmla v22.4s, v9.4s, v31.s[1] + fmla v1.4s, v8.4s, v30.s[1] + stp q8, q9, [x9] + fmla v3.4s, v8.4s, v26.s[1] + fmla v6.4s, v8.4s, v25.s[1] + fmla v7.4s, v8.4s, v27.s[1] + fmla v17.4s, v8.4s, v28.s[1] + fmla v19.4s, v8.4s, v24.s[1] + fmla v21.4s, v8.4s, v29.s[1] + fmla v23.4s, v8.4s, v31.s[1] + ldp q9, q8, [x10] + madd x10, x12, x14, x7 + ldr x14, [sp, #272] // 8-byte Folded Reload + add x10, x11, x10, lsl #2 + fmla v0.4s, v8.4s, v30.s[2] + fmla v2.4s, v8.4s, v26.s[2] + fmla v5.4s, v8.4s, v25.s[2] + fmla v4.4s, v8.4s, v27.s[2] + fmla v16.4s, v8.4s, v28.s[2] + fmla v18.4s, v8.4s, v24.s[2] + fmla v20.4s, v8.4s, v29.s[2] + fmla v22.4s, v8.4s, v31.s[2] + mov x11, x30 + add x30, x8, x12, lsl #5 + stp q9, q8, [x0] + fmla v1.4s, v9.4s, v30.s[2] + fmla v3.4s, v9.4s, v26.s[2] + fmla v6.4s, v9.4s, v25.s[2] + fmla v7.4s, v9.4s, v27.s[2] + fmla v17.4s, v9.4s, v28.s[2] + fmla v19.4s, v9.4s, v24.s[2] + fmla v21.4s, v9.4s, v29.s[2] 
+ fmla v23.4s, v9.4s, v31.s[2] + ldp q8, q9, [x10] + ldr x10, [sp, #224] // 8-byte Folded Reload + stp q8, q9, [x30] + fmla v0.4s, v9.4s, v30.s[3] + fmla v1.4s, v8.4s, v30.s[3] + fmla v2.4s, v9.4s, v26.s[3] + fmla v3.4s, v8.4s, v26.s[3] + fmla v5.4s, v9.4s, v25.s[3] + fmla v6.4s, v8.4s, v25.s[3] + fmla v7.4s, v8.4s, v27.s[3] + fmla v4.4s, v9.4s, v27.s[3] + fmla v17.4s, v8.4s, v28.s[3] + fmla v16.4s, v9.4s, v28.s[3] + fmla v19.4s, v8.4s, v24.s[3] + fmla v18.4s, v9.4s, v24.s[3] + fmla v21.4s, v8.4s, v29.s[3] + fmla v20.4s, v9.4s, v29.s[3] + fmla v23.4s, v8.4s, v31.s[3] + fmla v22.4s, v9.4s, v31.s[3] + cmp x11, x19 + b.ge .LBB0_8 + .p2align 2 +.LBB0_7: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x10, x28 + prfm pldl1keep, [x10] + ldur s24, [x10, #-4] + add x10, x10, #4 + add x13, x12, x28 + prfm pldl1keep, [x12] + ldur s25, [x12, #-4] + add x17, x13, x28 + prfm pldl1keep, [x13] + ldur s26, [x13, #-4] + add x12, x17, x28 + prfm pldl1keep, [x17] + ldur s27, [x17, #-4] + add x13, x12, x28 + prfm pldl1keep, [x12] + ldur s28, [x12, #-4] + add x12, x18, x15 + add x17, x13, x28 + prfm pldl1keep, [x13] + ldur s29, [x13, #-4] + add x13, x1, x15 + add x15, x15, x14 + prfm pldl1keep, [x17] + ldur s30, [x17, #-4] + add x17, x17, x28 + prfm pldl1keep, [x17] + ldur s31, [x17, #-4] + prfm pldl1keep, [x12] + add x12, x8, x11, lsl #5 + add x11, x11, #1 + ldp q8, q9, [x13] + fmla v0.4s, v9.4s, v24.s[0] + fmla v2.4s, v9.4s, v25.s[0] + fmla v5.4s, v9.4s, v26.s[0] + fmla v4.4s, v9.4s, v27.s[0] + fmla v16.4s, v9.4s, v28.s[0] + fmla v18.4s, v9.4s, v29.s[0] + fmla v20.4s, v9.4s, v30.s[0] + fmla v1.4s, v8.4s, v24.s[0] + fmla v3.4s, v8.4s, v25.s[0] + fmla v6.4s, v8.4s, v26.s[0] + fmla v7.4s, v8.4s, v27.s[0] + fmla v17.4s, v8.4s, v28.s[0] + fmla v19.4s, v8.4s, v29.s[0] + fmla v21.4s, v8.4s, v30.s[0] + fmla v23.4s, v8.4s, v31.s[0] + fmla v22.4s, v9.4s, v31.s[0] + stp q8, q9, [x12] + cmp x11, x19 + b.lt .LBB0_7 +.LBB0_8: // %.preheader29 + // in Loop: Header=BB0_3 Depth=1 + ldp x18, x13, [sp, #208] // 16-byte Folded Reload + mov x10, xzr + mov w6, #1 // =0x1 + mov w24, #2 // =0x2 + mov w20, #3 // =0x3 + mov w29, #4 // =0x4 + mov w15, #5 // =0x5 + mov w11, #6 // =0x6 + mov w25, #7 // =0x7 + mov w1, #8 // =0x8 + b .LBB0_10 + .p2align 2 +.LBB0_9: // %.loopexit28 + // in Loop: Header=BB0_10 Depth=2 + ldr x10, [sp, #496] // 8-byte Folded Reload + add x13, x13, x10 + add x18, x18, x10 + mov x10, x1 + mov x1, x3 +.LBB0_10: // Parent Loop BB0_3 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB0_12 Depth 3 + // Child Loop BB0_14 Depth 3 + madd x10, x10, x27, x7 + add x10, x2, x10, lsl #2 + stp q1, q0, [x10] + madd x10, x6, x27, x7 + add x10, x2, x10, lsl #2 + stp q3, q2, [x10] + madd x10, x24, x27, x7 + add x10, x2, x10, lsl #2 + stp q6, q5, [x10] + madd x10, x20, x27, x7 + add x10, x2, x10, lsl #2 + stp q7, q4, [x10] + madd x10, x29, x27, x7 + add x10, x2, x10, lsl #2 + stp q17, q16, [x10] + madd x10, x15, x27, x7 + add x10, x2, x10, lsl #2 + stp q19, q18, [x10] + madd x10, x11, x27, x7 + ldr x11, [sp, #520] // 8-byte Folded Reload + add x10, x2, x10, lsl #2 + cmp x1, x11 + stp q21, q20, [x10] + madd x10, x25, x27, x7 + add x10, x2, x10, lsl #2 + stp q23, q22, [x10] + b.ge .LBB0_15 +// %bb.11: // in Loop: Header=BB0_10 Depth=2 + madd x10, x1, x27, x7 + add x20, x1, #3 + add x29, x1, #4 + add x6, x1, #1 + madd x15, x20, x27, x7 + add x25, x1, #7 + add x24, x1, #2 + mov x4, xzr + madd x11, x6, x27, x7 + ldp q8, q9, [x8] + add x3, x1, #8 + madd x12, x24, x27, x7 + mov x17, x13 + add 
x10, x2, x10, lsl #2 + add x15, x2, x15, lsl #2 + ldp q1, q0, [x10] + madd x10, x29, x27, x7 + add x11, x2, x11, lsl #2 + ldp q7, q4, [x15] + add x15, x1, #5 + add x12, x2, x12, lsl #2 + ldp q3, q2, [x11] + add x11, x1, #6 + add x10, x2, x10, lsl #2 + ldp q6, q5, [x12] + ldp q17, q16, [x10] + madd x10, x15, x27, x7 + add x10, x2, x10, lsl #2 + ldp q19, q18, [x10] + madd x10, x11, x27, x7 + add x10, x2, x10, lsl #2 + ldp q21, q20, [x10] + madd x10, x25, x27, x7 + add x10, x2, x10, lsl #2 + ldp q23, q22, [x10] + mul x10, x1, x22 + lsl x10, x10, #2 + ldr q31, [x16, x10] + mul x10, x6, x22 + lsl x10, x10, #2 + ldr q30, [x16, x10] + mul x10, x24, x22 + lsl x10, x10, #2 + ldr q29, [x16, x10] + mul x10, x20, x22 + lsl x10, x10, #2 + ldr q28, [x16, x10] + mul x10, x29, x22 + lsl x10, x10, #2 + ldr q27, [x16, x10] + mul x10, x15, x22 + lsl x10, x10, #2 + ldr q26, [x16, x10] + mul x10, x11, x22 + lsl x10, x10, #2 + ldr q25, [x16, x10] + mul x10, x25, x22 + lsl x10, x10, #2 + ldr q24, [x16, x10] + mov x10, x5 + cmp xzr, x23 + b.ge .LBB0_13 + .p2align 2 +.LBB0_12: // Parent Loop BB0_3 Depth=1 + // Parent Loop BB0_10 Depth=2 + // => This Inner Loop Header: Depth=3 + add x14, x10, #32 + fmla v1.4s, v8.4s, v31.s[0] + fmla v0.4s, v9.4s, v31.s[0] + add x12, x10, #96 + fmla v2.4s, v9.4s, v30.s[0] + fmla v3.4s, v8.4s, v30.s[0] + prfm pldl1keep, [x14] + add x4, x4, #4 + fmla v5.4s, v9.4s, v29.s[0] + fmla v6.4s, v8.4s, v29.s[0] + fmla v4.4s, v9.4s, v28.s[0] + fmla v7.4s, v8.4s, v28.s[0] + fmla v16.4s, v9.4s, v27.s[0] + fmla v17.4s, v8.4s, v27.s[0] + fmla v18.4s, v9.4s, v26.s[0] + fmla v19.4s, v8.4s, v26.s[0] + fmla v20.4s, v9.4s, v25.s[0] + fmla v21.4s, v8.4s, v25.s[0] + fmla v22.4s, v9.4s, v24.s[0] + fmla v23.4s, v8.4s, v24.s[0] + ldp q8, q9, [x10, #-96] + fmla v0.4s, v9.4s, v31.s[1] + fmla v2.4s, v9.4s, v30.s[1] + fmla v1.4s, v8.4s, v31.s[1] + fmla v3.4s, v8.4s, v30.s[1] + fmla v6.4s, v8.4s, v29.s[1] + fmla v5.4s, v9.4s, v29.s[1] + fmla v7.4s, v8.4s, v28.s[1] + fmla v4.4s, v9.4s, v28.s[1] + fmla v17.4s, v8.4s, v27.s[1] + fmla v16.4s, v9.4s, v27.s[1] + fmla v19.4s, v8.4s, v26.s[1] + fmla v18.4s, v9.4s, v26.s[1] + fmla v21.4s, v8.4s, v25.s[1] + fmla v20.4s, v9.4s, v25.s[1] + fmla v23.4s, v8.4s, v24.s[1] + fmla v22.4s, v9.4s, v24.s[1] + ldp q9, q8, [x10, #-64] + prfm pldl1keep, [x12] + ldp q11, q10, [x10, #-32] + add x12, x17, x28 + fmla v1.4s, v9.4s, v31.s[2] + fmla v0.4s, v8.4s, v31.s[2] + fmla v2.4s, v8.4s, v30.s[2] + fmla v3.4s, v9.4s, v30.s[2] + fmla v5.4s, v8.4s, v29.s[2] + fmla v6.4s, v9.4s, v29.s[2] + fmla v4.4s, v8.4s, v28.s[2] + fmla v7.4s, v9.4s, v28.s[2] + fmla v16.4s, v8.4s, v27.s[2] + fmla v17.4s, v9.4s, v27.s[2] + fmla v18.4s, v8.4s, v26.s[2] + fmla v19.4s, v9.4s, v26.s[2] + fmla v20.4s, v8.4s, v25.s[2] + fmla v21.4s, v9.4s, v25.s[2] + fmla v22.4s, v8.4s, v24.s[2] + fmla v23.4s, v9.4s, v24.s[2] + ldp q8, q9, [x10], #128 + prfm pldl1keep, [x17] + fmla v0.4s, v10.4s, v31.s[3] + fmla v1.4s, v11.4s, v31.s[3] + ldur q31, [x17, #-16] + prfm pldl1keep, [x12] + add x17, x17, #16 + fmla v3.4s, v11.4s, v30.s[3] + fmla v2.4s, v10.4s, v30.s[3] + ldur q30, [x12, #-16] + add x12, x12, x28 + prfm pldl1keep, [x12] + fmla v6.4s, v11.4s, v29.s[3] + fmla v5.4s, v10.4s, v29.s[3] + ldur q29, [x12, #-16] + add x12, x12, x28 + prfm pldl1keep, [x12] + fmla v7.4s, v11.4s, v28.s[3] + fmla v4.4s, v10.4s, v28.s[3] + ldur q28, [x12, #-16] + add x12, x12, x28 + prfm pldl1keep, [x12] + fmla v17.4s, v11.4s, v27.s[3] + fmla v16.4s, v10.4s, v27.s[3] + ldur q27, [x12, #-16] + add x12, x12, x28 + prfm pldl1keep, [x12] + fmla 
v19.4s, v11.4s, v26.s[3] + fmla v18.4s, v10.4s, v26.s[3] + ldur q26, [x12, #-16] + add x12, x12, x28 + prfm pldl1keep, [x12] + fmla v21.4s, v11.4s, v25.s[3] + fmla v20.4s, v10.4s, v25.s[3] + ldur q25, [x12, #-16] + add x12, x12, x28 + prfm pldl1keep, [x12] + fmla v23.4s, v11.4s, v24.s[3] + fmla v22.4s, v10.4s, v24.s[3] + ldur q24, [x12, #-16] + cmp x4, x23 + b.lt .LBB0_12 +.LBB0_13: // in Loop: Header=BB0_10 Depth=2 + ldp q11, q10, [x9] + fmla v0.4s, v9.4s, v31.s[0] + fmla v1.4s, v8.4s, v31.s[0] + fmla v2.4s, v9.4s, v30.s[0] + fmla v3.4s, v8.4s, v30.s[0] + ldr x17, [sp, #488] // 8-byte Folded Reload + ldr x4, [sp, #504] // 8-byte Folded Reload + fmla v5.4s, v9.4s, v29.s[0] + fmla v6.4s, v8.4s, v29.s[0] + mov x10, x18 + fmla v4.4s, v9.4s, v28.s[0] + fmla v7.4s, v8.4s, v28.s[0] + fmla v16.4s, v9.4s, v27.s[0] + fmla v17.4s, v8.4s, v27.s[0] + fmla v18.4s, v9.4s, v26.s[0] + fmla v19.4s, v8.4s, v26.s[0] + fmla v20.4s, v9.4s, v25.s[0] + fmla v21.4s, v8.4s, v25.s[0] + fmla v22.4s, v9.4s, v24.s[0] + ldp q9, q12, [x0] + fmla v23.4s, v8.4s, v24.s[0] + fmla v1.4s, v11.4s, v31.s[1] + fmla v0.4s, v10.4s, v31.s[1] + fmla v3.4s, v11.4s, v30.s[1] + fmla v2.4s, v10.4s, v30.s[1] + fmla v6.4s, v11.4s, v29.s[1] + fmla v5.4s, v10.4s, v29.s[1] + fmla v7.4s, v11.4s, v28.s[1] + fmla v4.4s, v10.4s, v28.s[1] + fmla v17.4s, v11.4s, v27.s[1] + fmla v16.4s, v10.4s, v27.s[1] + fmla v19.4s, v11.4s, v26.s[1] + fmla v18.4s, v10.4s, v26.s[1] + fmla v21.4s, v11.4s, v25.s[1] + fmla v20.4s, v10.4s, v25.s[1] + fmla v23.4s, v11.4s, v24.s[1] + fmla v22.4s, v10.4s, v24.s[1] + fmla v0.4s, v12.4s, v31.s[2] + ldp q10, q8, [x30] + fmla v1.4s, v9.4s, v31.s[2] + fmla v2.4s, v12.4s, v30.s[2] + fmla v3.4s, v9.4s, v30.s[2] + fmla v5.4s, v12.4s, v29.s[2] + fmla v6.4s, v9.4s, v29.s[2] + fmla v4.4s, v12.4s, v28.s[2] + fmla v7.4s, v9.4s, v28.s[2] + fmla v16.4s, v12.4s, v27.s[2] + fmla v17.4s, v9.4s, v27.s[2] + fmla v18.4s, v12.4s, v26.s[2] + fmla v19.4s, v9.4s, v26.s[2] + fmla v20.4s, v12.4s, v25.s[2] + fmla v21.4s, v9.4s, v25.s[2] + fmla v22.4s, v12.4s, v24.s[2] + fmla v23.4s, v9.4s, v24.s[2] + fmla v1.4s, v10.4s, v31.s[3] + fmla v0.4s, v8.4s, v31.s[3] + fmla v3.4s, v10.4s, v30.s[3] + fmla v2.4s, v8.4s, v30.s[3] + fmla v6.4s, v10.4s, v29.s[3] + fmla v5.4s, v8.4s, v29.s[3] + fmla v7.4s, v10.4s, v28.s[3] + fmla v4.4s, v8.4s, v28.s[3] + fmla v17.4s, v10.4s, v27.s[3] + fmla v16.4s, v8.4s, v27.s[3] + fmla v19.4s, v10.4s, v26.s[3] + fmla v18.4s, v8.4s, v26.s[3] + fmla v21.4s, v10.4s, v25.s[3] + fmla v20.4s, v8.4s, v25.s[3] + fmla v23.4s, v10.4s, v24.s[3] + fmla v22.4s, v8.4s, v24.s[3] + cmp x4, x19 + b.ge .LBB0_9 + .p2align 2 +.LBB0_14: // Parent Loop BB0_3 Depth=1 + // Parent Loop BB0_10 Depth=2 + // => This Inner Loop Header: Depth=3 + add x12, x10, x28 + prfm pldl1keep, [x10] + ldur s24, [x10, #-4] + add x4, x4, #1 + prfm pldl1keep, [x12] + ldur s25, [x12, #-4] + add x12, x12, x28 + add x10, x10, #4 + prfm pldl1keep, [x12] + ldur s26, [x12, #-4] + add x12, x12, x28 + prfm pldl1keep, [x12] + ldur s27, [x12, #-4] + add x12, x12, x28 + prfm pldl1keep, [x12] + ldur s28, [x12, #-4] + add x12, x12, x28 + prfm pldl1keep, [x12] + ldur s29, [x12, #-4] + add x12, x12, x28 + prfm pldl1keep, [x12] + ldur s30, [x12, #-4] + add x12, x12, x28 + prfm pldl1keep, [x12] + ldur s31, [x12, #-4] + prfm pldl1keep, [x17] + ldp q8, q9, [x17, #-32] + add x17, x17, #32 + fmla v0.4s, v9.4s, v24.s[0] + fmla v2.4s, v9.4s, v25.s[0] + fmla v5.4s, v9.4s, v26.s[0] + fmla v4.4s, v9.4s, v27.s[0] + fmla v16.4s, v9.4s, v28.s[0] + fmla v18.4s, v9.4s, v29.s[0] + fmla v20.4s, v9.4s, 
v30.s[0] + fmla v1.4s, v8.4s, v24.s[0] + fmla v3.4s, v8.4s, v25.s[0] + fmla v6.4s, v8.4s, v26.s[0] + fmla v7.4s, v8.4s, v27.s[0] + fmla v17.4s, v8.4s, v28.s[0] + fmla v19.4s, v8.4s, v29.s[0] + fmla v21.4s, v8.4s, v30.s[0] + fmla v23.4s, v8.4s, v31.s[0] + fmla v22.4s, v9.4s, v31.s[0] + cmp x4, x19 + b.lt .LBB0_14 + b .LBB0_9 + .p2align 2 +.LBB0_15: // in Loop: Header=BB0_3 Depth=1 + ldp x17, x24, [sp, #336] // 16-byte Folded Reload + ldr x20, [sp, #312] // 8-byte Folded Reload + cmp x11, x17 + ldp x25, x6, [sp, #384] // 16-byte Folded Reload + ldr x29, [sp, #200] // 8-byte Folded Reload + b.lt .LBB0_18 +// %bb.16: // in Loop: Header=BB0_3 Depth=1 + cmp x17, x20 + b.lt .LBB0_23 +.LBB0_17: // in Loop: Header=BB0_3 Depth=1 + ldr x10, [sp, #288] // 8-byte Folded Reload + ldr x1, [sp, #368] // 8-byte Folded Reload + cmp x20, x10 + b.ge .LBB0_2 + b .LBB0_28 + .p2align 2 +.LBB0_18: // in Loop: Header=BB0_3 Depth=1 + ldr x15, [sp, #520] // 8-byte Folded Reload + ldp q20, q21, [x8] + mov x10, xzr + add x12, x15, #1 + add x13, x15, #2 + mul x11, x15, x27 + add x14, x15, #3 + mul x15, x15, x22 + madd x18, x12, x27, x7 + mul x12, x12, x22 + madd x1, x13, x27, x7 + lsl x3, x15, #2 + add x11, x11, x7 + lsl x12, x12, #2 + add x15, x2, x18, lsl #2 + madd x18, x14, x27, x7 + add x17, x2, x11, lsl #2 + add x11, x2, x1, lsl #2 + ldr x1, [sp, #184] // 8-byte Folded Reload + ldr q18, [x16, x3] + ldr q19, [x16, x12] + mul x12, x13, x22 + mov x13, x5 + ldp q3, q0, [x17] + ldp q4, q1, [x15] + ldp q5, q2, [x11] + add x18, x2, x18, lsl #2 + lsl x12, x12, #2 + ldr q17, [x16, x12] + mul x12, x14, x22 + ldp q7, q6, [x18] + lsl x12, x12, #2 + ldr q16, [x16, x12] + cmp xzr, x23 + b.ge .LBB0_20 + .p2align 2 +.LBB0_19: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x13, #32 + fmla v3.4s, v20.4s, v18.s[0] + fmla v0.4s, v21.4s, v18.s[0] + add x10, x10, #4 + prfm pldl1keep, [x12] + ldp q22, q23, [x13, #-96] + fmla v1.4s, v21.4s, v19.s[0] + fmla v4.4s, v20.4s, v19.s[0] + fmla v2.4s, v21.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + add x12, x13, #96 + fmla v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q21, q20, [x13, #-64] + prfm pldl1keep, [x12] + add x12, x1, x28 + add x14, x12, x28 + fmla v0.4s, v23.4s, v18.s[1] + fmla v1.4s, v23.4s, v19.s[1] + fmla v2.4s, v23.4s, v17.s[1] + fmla v6.4s, v23.4s, v16.s[1] + fmla v3.4s, v22.4s, v18.s[1] + fmla v4.4s, v22.4s, v19.s[1] + fmla v5.4s, v22.4s, v17.s[1] + fmla v7.4s, v22.4s, v16.s[1] + fmla v0.4s, v20.4s, v18.s[2] + ldp q22, q23, [x13, #-32] + fmla v1.4s, v20.4s, v19.s[2] + fmla v2.4s, v20.4s, v17.s[2] + fmla v6.4s, v20.4s, v16.s[2] + fmla v3.4s, v21.4s, v18.s[2] + fmla v4.4s, v21.4s, v19.s[2] + fmla v5.4s, v21.4s, v17.s[2] + fmla v7.4s, v21.4s, v16.s[2] + ldp q20, q21, [x13], #128 + prfm pldl1keep, [x1] + fmla v0.4s, v23.4s, v18.s[3] + fmla v1.4s, v23.4s, v19.s[3] + fmla v2.4s, v23.4s, v17.s[3] + fmla v6.4s, v23.4s, v16.s[3] + fmla v3.4s, v22.4s, v18.s[3] + ldur q18, [x1, #-16] + prfm pldl1keep, [x12] + fmla v4.4s, v22.4s, v19.s[3] + ldur q19, [x12, #-16] + add x12, x14, x28 + prfm pldl1keep, [x14] + add x1, x1, #16 + fmla v5.4s, v22.4s, v17.s[3] + ldur q17, [x14, #-16] + prfm pldl1keep, [x12] + fmla v7.4s, v22.4s, v16.s[3] + ldur q16, [x12, #-16] + cmp x10, x23 + b.lt .LBB0_19 +.LBB0_20: // in Loop: Header=BB0_3 Depth=1 + ldp q23, q22, [x9] + fmla v0.4s, v21.4s, v18.s[0] + fmla v3.4s, v20.4s, v18.s[0] + fmla v1.4s, v21.4s, v19.s[0] + fmla v4.4s, v20.4s, v19.s[0] + ldr x10, [sp, #176] // 8-byte Folded Reload + ldr x13, [sp, #488] // 
8-byte Folded Reload + fmla v2.4s, v21.4s, v17.s[0] + fmla v5.4s, v20.4s, v17.s[0] + ldr x1, [sp, #504] // 8-byte Folded Reload + fmla v6.4s, v21.4s, v16.s[0] + fmla v7.4s, v20.4s, v16.s[0] + ldp q20, q21, [x0] + fmla v0.4s, v22.4s, v18.s[1] + fmla v1.4s, v22.4s, v19.s[1] + fmla v2.4s, v22.4s, v17.s[1] + fmla v6.4s, v22.4s, v16.s[1] + fmla v3.4s, v23.4s, v18.s[1] + fmla v4.4s, v23.4s, v19.s[1] + fmla v5.4s, v23.4s, v17.s[1] + fmla v7.4s, v23.4s, v16.s[1] + fmla v0.4s, v21.4s, v18.s[2] + ldp q23, q22, [x30] + fmla v1.4s, v21.4s, v19.s[2] + fmla v2.4s, v21.4s, v17.s[2] + fmla v6.4s, v21.4s, v16.s[2] + fmla v3.4s, v20.4s, v18.s[2] + fmla v4.4s, v20.4s, v19.s[2] + fmla v5.4s, v20.4s, v17.s[2] + fmla v7.4s, v20.4s, v16.s[2] + fmla v0.4s, v22.4s, v18.s[3] + fmla v1.4s, v22.4s, v19.s[3] + fmla v2.4s, v22.4s, v17.s[3] + fmla v6.4s, v22.4s, v16.s[3] + fmla v3.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v19.s[3] + fmla v5.4s, v23.4s, v17.s[3] + fmla v7.4s, v23.4s, v16.s[3] + cmp x1, x19 + b.ge .LBB0_22 + .p2align 2 +.LBB0_21: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x12, x10, x28 + prfm pldl1keep, [x10] + ldur s16, [x10, #-4] + add x1, x1, #1 + prfm pldl1keep, [x12] + ldur s17, [x12, #-4] + add x12, x12, x28 + add x10, x10, #4 + prfm pldl1keep, [x12] + ldur s18, [x12, #-4] + add x12, x12, x28 + prfm pldl1keep, [x12] + ldur s19, [x12, #-4] + prfm pldl1keep, [x13] + ldp q20, q21, [x13, #-32] + add x13, x13, #32 + fmla v0.4s, v21.4s, v16.s[0] + fmla v1.4s, v21.4s, v17.s[0] + fmla v2.4s, v21.4s, v18.s[0] + fmla v3.4s, v20.4s, v16.s[0] + fmla v4.4s, v20.4s, v17.s[0] + fmla v5.4s, v20.4s, v18.s[0] + fmla v7.4s, v20.4s, v19.s[0] + fmla v6.4s, v21.4s, v19.s[0] + cmp x1, x19 + b.lt .LBB0_21 +.LBB0_22: // in Loop: Header=BB0_3 Depth=1 + stp q3, q0, [x17] + ldr x17, [sp, #336] // 8-byte Folded Reload + stp q4, q1, [x15] + stp q5, q2, [x11] + stp q7, q6, [x18] + cmp x17, x20 + b.ge .LBB0_17 +.LBB0_23: // in Loop: Header=BB0_3 Depth=1 + mul x10, x17, x27 + add x12, x17, #1 + ldp q6, q7, [x8] + madd x11, x12, x27, x7 + ldr x18, [sp, #168] // 8-byte Folded Reload + mov x13, xzr + mov x15, x5 + mul x14, x17, x22 + ldr x17, [sp, #248] // 8-byte Folded Reload + add x10, x10, x7 + mul x12, x12, x22 + lsl x14, x14, #2 + add x10, x2, x10, lsl #2 + add x11, x2, x11, lsl #2 + lsl x12, x12, #2 + ldr q5, [x16, x14] + ldr q4, [x16, x12] + ldp q1, q0, [x10] + ldp q3, q2, [x11] + cmp xzr, x23 + b.ge .LBB0_25 + .p2align 2 +.LBB0_24: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x6, x15, #32 + fmla v1.4s, v6.4s, v5.s[0] + fmla v0.4s, v7.4s, v5.s[0] + add x4, x15, #96 + prfm pldl1keep, [x6] + ldp q16, q17, [x15, #-96] + fmla v2.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q7, q6, [x15, #-64] + prfm pldl1keep, [x4] + add x12, x18, x24 + add x1, x17, x24 + add x14, x12, #32 + add x3, x1, #32 + add x13, x13, #4 + add x18, x18, #16 + add x17, x17, #16 + fmla v0.4s, v17.4s, v5.s[1] + fmla v2.4s, v17.4s, v4.s[1] + fmla v1.4s, v16.4s, v5.s[1] + fmla v3.4s, v16.4s, v4.s[1] + fmla v0.4s, v6.4s, v5.s[2] + ldp q16, q17, [x15, #-32] + fmla v2.4s, v6.4s, v4.s[2] + fmla v1.4s, v7.4s, v5.s[2] + fmla v3.4s, v7.4s, v4.s[2] + ldp q6, q7, [x15], #128 + prfm pldl1keep, [x3] + fmla v0.4s, v17.4s, v5.s[3] + fmla v2.4s, v17.4s, v4.s[3] + fmla v1.4s, v16.4s, v5.s[3] + ldr q5, [x1, #16] + prfm pldl1keep, [x14] + fmla v3.4s, v16.4s, v4.s[3] + ldr q4, [x12, #16] + cmp x13, x23 + b.lt .LBB0_24 +.LBB0_25: // in Loop: Header=BB0_3 Depth=1 + ldp q17, q16, [x9] + fmla v0.4s, v7.4s, v5.s[0] + 
fmla v1.4s, v6.4s, v5.s[0] + fmla v2.4s, v7.4s, v4.s[0] + fmla v3.4s, v6.4s, v4.s[0] + ldp q6, q7, [x0] + ldr x12, [sp, #504] // 8-byte Folded Reload + ldr x6, [sp, #392] // 8-byte Folded Reload + mov x13, xzr + mov x15, xzr + fmla v0.4s, v16.4s, v5.s[1] + fmla v2.4s, v16.4s, v4.s[1] + fmla v1.4s, v17.4s, v5.s[1] + fmla v3.4s, v17.4s, v4.s[1] + ldp q17, q16, [x30] + fmla v0.4s, v7.4s, v5.s[2] + fmla v2.4s, v7.4s, v4.s[2] + fmla v1.4s, v6.4s, v5.s[2] + fmla v3.4s, v6.4s, v4.s[2] + fmla v0.4s, v16.4s, v5.s[3] + fmla v2.4s, v16.4s, v4.s[3] + fmla v1.4s, v17.4s, v5.s[3] + fmla v3.4s, v17.4s, v4.s[3] + cmp x12, x19 + b.ge .LBB0_27 + .p2align 2 +.LBB0_26: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + ldp x3, x1, [sp, #472] // 16-byte Folded Reload + add x14, x29, x15, lsl #3 + add x12, x12, #1 + add x14, x14, #32 + add x17, x1, x15 + add x18, x3, x15 + add x17, x17, #4 + add x18, x18, #4 + prfm pldl1keep, [x18] + ldr s4, [x3, x15] + prfm pldl1keep, [x17] + ldr s5, [x1, x15] + add x17, x29, x13 + prfm pldl1keep, [x14] + add x15, x15, #4 + add x13, x13, #32 + ldp q6, q7, [x17] + fmla v0.4s, v7.4s, v4.s[0] + fmla v1.4s, v6.4s, v4.s[0] + fmla v2.4s, v7.4s, v5.s[0] + fmla v3.4s, v6.4s, v5.s[0] + cmp x12, x19 + b.lt .LBB0_26 +.LBB0_27: // in Loop: Header=BB0_3 Depth=1 + stp q1, q0, [x10] + stp q3, q2, [x11] + ldr x10, [sp, #288] // 8-byte Folded Reload + ldr x1, [sp, #368] // 8-byte Folded Reload + cmp x20, x10 + b.ge .LBB0_2 +.LBB0_28: // in Loop: Header=BB0_3 Depth=1 + mul x10, x20, x27 + ldp q4, q3, [x8] + ldr x13, [sp, #136] // 8-byte Folded Reload + mul x12, x20, x22 + mov x11, xzr + add x10, x10, x7 + lsl x12, x12, #2 + add x10, x2, x10, lsl #2 + ldr q2, [x16, x12] + mov x12, x5 + ldp q1, q0, [x10] + cmp xzr, x23 + b.ge .LBB0_30 + .p2align 2 +.LBB0_29: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x15, x12, #32 + fmla v1.4s, v4.4s, v2.s[0] + fmla v0.4s, v3.4s, v2.s[0] + add x14, x12, #96 + prfm pldl1keep, [x15] + ldp q5, q6, [x12, #-96] + add x11, x11, #4 + ldp q4, q3, [x12, #-64] + prfm pldl1keep, [x14] + fmla v0.4s, v6.4s, v2.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldp q5, q6, [x12, #-32] + prfm pldl1keep, [x13] + fmla v0.4s, v3.4s, v2.s[2] + fmla v1.4s, v4.4s, v2.s[2] + fmla v0.4s, v6.4s, v2.s[3] + fmla v1.4s, v5.4s, v2.s[3] + ldur q2, [x13, #-16] + ldp q4, q3, [x12], #128 + add x13, x13, #16 + cmp x11, x23 + b.lt .LBB0_29 +.LBB0_30: // in Loop: Header=BB0_3 Depth=1 + ldp q6, q5, [x9] + fmla v0.4s, v3.4s, v2.s[0] + fmla v1.4s, v4.4s, v2.s[0] + ldp q3, q4, [x0] + ldp x9, x11, [sp, #152] // 16-byte Folded Reload + ldr x12, [sp, #504] // 8-byte Folded Reload + ldr x15, [sp, #144] // 8-byte Folded Reload + fmla v0.4s, v5.4s, v2.s[1] + fmla v1.4s, v6.4s, v2.s[1] + ldp q6, q5, [x30] + fmla v0.4s, v4.4s, v2.s[2] + fmla v1.4s, v3.4s, v2.s[2] + fmla v0.4s, v5.4s, v2.s[3] + fmla v1.4s, v6.4s, v2.s[3] + cmp x12, x19 + b.ge .LBB0_1 + .p2align 2 +.LBB0_31: // Parent Loop BB0_3 Depth=1 + // => This Inner Loop Header: Depth=2 + add x14, x8, x12, lsl #5 + add x13, x8, x11 + prfm pldl1keep, [x9] + add x11, x11, #32 + ldr s2, [x15, x12, lsl #2] + prfm pldl1keep, [x13] + add x12, x12, #1 + ldp q3, q4, [x14] + add x9, x9, #4 + fmla v0.4s, v4.4s, v2.s[0] + fmla v1.4s, v3.4s, v2.s[0] + cmp x12, x19 + b.lt .LBB0_31 + b .LBB0_1 +.LBB0_32: + ldr x0, [sp, #56] // 8-byte Folded Reload + bl free + ldr x9, [sp, #112] // 8-byte Folded Reload + lsl x20, x22, #3 + str x20, [sp, #472] // 8-byte Folded Spill + add x8, x9, #3 + cmp x9, #0 + csel x8, x8, x9, lt + ldr x9, [sp, 
#128] // 8-byte Folded Reload + asr x8, x8, #2 + cmp x9, #0 + cinv x29, x8, lt + lsl x8, x29, #2 + str x8, [sp, #488] // 8-byte Folded Spill + cmp x25, x8 + ldp x9, x8, [sp, #256] // 16-byte Folded Reload + add x24, x8, x9, lsl #2 + b.ge .LBB0_63 +// %bb.33: + lsl x8, x19, #4 + str x29, [sp, #480] // 8-byte Folded Spill + add x0, x8, #64 + str x8, [sp, #464] // 8-byte Folded Spill + bl malloc + add x8, x25, x27, lsl #1 + add x10, x27, x25 + ldp x6, x5, [sp, #296] // 16-byte Folded Reload + lsl x10, x10, #2 + add x13, x25, x27, lsl #2 + add x11, x0, #63 + ldr x18, [sp, #328] // 8-byte Folded Reload + ldr q2, [x24, x10] + lsl x10, x8, #2 + add x8, x8, x27 + ldr q1, [x24, x10] + lsl x10, x13, #2 + lsl x8, x8, #2 + ldr q3, [x24, x10] + mov w10, #6 // =0x6 + ldr x1, [sp, #344] // 8-byte Folded Reload + ldr q4, [x24, x8] + mul x8, x27, x10 + mov w4, #12 // =0xc + add x13, x13, x27 + lsl x10, x13, #2 + lsl x9, x25, #2 + mov w16, #20 // =0x14 + ldr q5, [x24, x10] + add x10, x6, x21 + mov w15, #24 // =0x18 + ldr q0, [x24, x9] + ldr q23, [x10, x9] + add x8, x8, x25 + add x9, x18, x1 + mul x13, x22, x15 + ldr x7, [sp, #64] // 8-byte Folded Reload + lsl x8, x8, #2 + ldr q22, [x9, x13] + mov w13, #28 // =0x1c + ldr q6, [x24, x8] + sub x8, x25, x27 + ldr q16, [x9] + add x8, x8, x27, lsl #3 + ldr q17, [x9, x28] + ldr x29, [sp, #504] // 8-byte Folded Reload + ldr q18, [x9, x20] + ldr q20, [x9, x22, lsl #4] + lsl x8, x8, #2 + ldr x30, [sp, #104] // 8-byte Folded Reload + mov x12, xzr + ldr q7, [x24, x8] + and x8, x11, #0xffffffffffffffc0 + mul x11, x22, x4 + orr x3, x8, #0x20 + ldr q19, [x9, x11] + mul x11, x22, x16 + ldr q21, [x9, x11] + ldr x11, [sp, #16] // 8-byte Folded Reload + lsl x11, x11, #5 + madd x17, x5, x13, x11 + add x14, x11, x5, lsl #5 + madd x15, x5, x15, x11 + madd x16, x5, x16, x11 + madd x4, x5, x4, x11 + add x2, x11, x5, lsl #3 + add x13, x6, x14 + add x2, x6, x2 + add x14, x6, x17 + add x17, x1, x18 + add x1, x11, x5, lsl #2 + add x18, x11, x26 + mov w5, #16 // =0x10 + add x15, x6, x15 + add x16, x6, x16 + add x4, x6, x4 + add x17, x7, x17 + add x18, x6, x18 + sub x5, x5, x7 + add x17, x17, #16 + add x1, x6, x1 + prfm pldl1keep, [x17] + ldur q24, [x17, #-16] + fmla v0.4s, v23.4s, v16.s[0] + fmla v2.4s, v23.4s, v17.s[0] + cmp xzr, x23 + b.ge .LBB0_35 + .p2align 2 +.LBB0_34: // =>This Inner Loop Header: Depth=1 + add x6, x16, x21 + stur q23, [x3, #-32] + fmla v1.4s, v23.4s, v18.s[0] + fmla v4.4s, v23.4s, v19.s[0] + prfm pldl1keep, [x6] + ldr q25, [x1, x21] + fmla v3.4s, v23.4s, v20.s[0] + fmla v5.4s, v23.4s, v21.s[0] + fmla v6.4s, v23.4s, v22.s[0] + fmla v7.4s, v23.4s, v24.s[0] + add x6, x15, x21 + add x7, x17, x5 + add x20, x7, x28 + add x25, x20, x28 + add x12, x12, #4 + add x15, x15, x26 + add x16, x16, x26 + add x17, x17, #16 + add x1, x1, x26 + stur q25, [x3, #-16] + prfm pldl1keep, [x6] + ldr q23, [x2, x21] + fmla v0.4s, v25.4s, v16.s[1] + fmla v2.4s, v25.4s, v17.s[1] + fmla v1.4s, v25.4s, v18.s[1] + fmla v4.4s, v25.4s, v19.s[1] + fmla v3.4s, v25.4s, v20.s[1] + fmla v5.4s, v25.4s, v21.s[1] + fmla v6.4s, v25.4s, v22.s[1] + fmla v7.4s, v25.4s, v24.s[1] + add x6, x14, x21 + add x14, x14, x26 + add x2, x2, x26 + fmla v0.4s, v23.4s, v16.s[2] + fmla v2.4s, v23.4s, v17.s[2] + fmla v1.4s, v23.4s, v18.s[2] + fmla v4.4s, v23.4s, v19.s[2] + fmla v3.4s, v23.4s, v20.s[2] + fmla v5.4s, v23.4s, v21.s[2] + fmla v6.4s, v23.4s, v22.s[2] + fmla v7.4s, v23.4s, v24.s[2] + str q23, [x3] + prfm pldl1keep, [x6] + ldr q23, [x4, x21] + add x6, x13, x21 + add x13, x13, x26 + add x4, x4, x26 + str q23, [x3, 
#16] + prfm pldl1keep, [x6] + add x6, x25, x28 + fmla v0.4s, v23.4s, v16.s[3] + fmla v2.4s, v23.4s, v17.s[3] + fmla v1.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v19.s[3] + fmla v3.4s, v23.4s, v20.s[3] + fmla v5.4s, v23.4s, v21.s[3] + fmla v6.4s, v23.4s, v22.s[3] + fmla v7.4s, v23.4s, v24.s[3] + ldr q23, [x18, x21] + prfm pldl1keep, [x7] + ldur q16, [x7, #-16] + prfm pldl1keep, [x20] + ldur q17, [x20, #-16] + prfm pldl1keep, [x25] + ldur q18, [x25, #-16] + ldr x25, [sp, #384] // 8-byte Folded Reload + add x18, x18, x26 + add x3, x3, #64 + prfm pldl1keep, [x6] + ldur q19, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q20, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q21, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q22, [x6, #-16] + prfm pldl1keep, [x17] + ldur q24, [x17, #-16] + fmla v0.4s, v23.4s, v16.s[0] + fmla v2.4s, v23.4s, v17.s[0] + cmp x12, x23 + b.lt .LBB0_34 +.LBB0_35: + ldp x13, x14, [sp, #400] // 16-byte Folded Reload + ldr x15, [sp, #304] // 8-byte Folded Reload + fmla v1.4s, v23.4s, v18.s[0] + str q23, [x8, x23, lsl #4] + fmla v4.4s, v23.4s, v19.s[0] + fmla v3.4s, v23.4s, v20.s[0] + fmla v5.4s, v23.4s, v21.s[0] + fmla v6.4s, v23.4s, v22.s[0] + fmla v7.4s, v23.4s, v24.s[0] + ldr x7, [sp, #520] // 8-byte Folded Reload + mul x12, x13, x15 + add x12, x12, x25 + lsl x12, x12, #2 + ldr q23, [x10, x12] + mul x12, x14, x15 + add x12, x12, x25 + lsl x12, x12, #2 + str q23, [x8, x13, lsl #4] + ldr x13, [sp, #416] // 8-byte Folded Reload + fmla v0.4s, v23.4s, v16.s[1] + fmla v2.4s, v23.4s, v17.s[1] + fmla v1.4s, v23.4s, v18.s[1] + fmla v4.4s, v23.4s, v19.s[1] + fmla v3.4s, v23.4s, v20.s[1] + fmla v5.4s, v23.4s, v21.s[1] + fmla v6.4s, v23.4s, v22.s[1] + fmla v7.4s, v23.4s, v24.s[1] + ldr q23, [x10, x12] + madd x12, x13, x15, x25 + fmla v0.4s, v23.4s, v16.s[2] + str q23, [x8, x14, lsl #4] + fmla v2.4s, v23.4s, v17.s[2] + fmla v1.4s, v23.4s, v18.s[2] + fmla v4.4s, v23.4s, v19.s[2] + fmla v3.4s, v23.4s, v20.s[2] + fmla v5.4s, v23.4s, v21.s[2] + fmla v6.4s, v23.4s, v22.s[2] + fmla v7.4s, v23.4s, v24.s[2] + mov x14, x29 + lsl x12, x12, #2 + ldr q23, [x10, x12] + ldr x10, [sp, #72] // 8-byte Folded Reload + add x12, x10, #4 + ldp x17, x10, [sp, #272] // 16-byte Folded Reload + str q23, [x8, x13, lsl #4] + ldr x13, [sp, #120] // 8-byte Folded Reload + fmla v0.4s, v23.4s, v16.s[3] + fmla v2.4s, v23.4s, v17.s[3] + fmla v1.4s, v23.4s, v18.s[3] + fmla v4.4s, v23.4s, v19.s[3] + fmla v3.4s, v23.4s, v20.s[3] + fmla v5.4s, v23.4s, v21.s[3] + fmla v6.4s, v23.4s, v22.s[3] + fmla v7.4s, v23.4s, v24.s[3] + add x10, x11, x10, lsl #2 + ldr x11, [sp, #296] // 8-byte Folded Reload + add x10, x11, x10 + ldr x11, [sp, #320] // 8-byte Folded Reload + add x11, x13, x11, lsl #2 + ldr x13, [sp, #80] // 8-byte Folded Reload + sub x11, x11, x13 + ldr x13, [sp, #328] // 8-byte Folded Reload + add x13, x11, x13 + mul x11, x15, x12 + add x12, x13, #4 + ldr x13, [sp, #96] // 8-byte Folded Reload + lsl x13, x13, #2 + cmp x29, x19 + b.ge .LBB0_37 + .p2align 2 +.LBB0_36: // =>This Inner Loop Header: Depth=1 + add x16, x12, x28 + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x15, x10, x11 + prfm pldl1keep, [x16] + ldur s17, [x16, #-4] + add x16, x16, x28 + add x12, x12, #4 + prfm pldl1keep, [x16] + ldur s18, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s19, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s20, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s21, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s22, [x16, 
#-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s23, [x16, #-4] + prfm pldl1keep, [x15] + ldr q24, [x10, x13] + add x10, x10, x17 + fmla v0.4s, v24.4s, v16.s[0] + str q24, [x8, x14, lsl #4] + add x14, x14, #1 + fmla v2.4s, v24.4s, v17.s[0] + fmla v1.4s, v24.4s, v18.s[0] + fmla v4.4s, v24.4s, v19.s[0] + fmla v3.4s, v24.4s, v20.s[0] + fmla v5.4s, v24.4s, v21.s[0] + fmla v6.4s, v24.4s, v22.s[0] + fmla v7.4s, v24.4s, v23.s[0] + cmp x14, x19 + b.lt .LBB0_36 +.LBB0_37: // %.preheader27 + ldr x10, [sp, #344] // 8-byte Folded Reload + ldr x11, [sp, #496] // 8-byte Folded Reload + mov x6, xzr + mov w4, #7 // =0x7 + ldr x15, [sp, #328] // 8-byte Folded Reload + ldr x12, [sp, #88] // 8-byte Folded Reload + mov w3, #6 // =0x6 + mov w16, #5 // =0x5 + mov w1, #4 // =0x4 + mov w17, #3 // =0x3 + mov w18, #2 // =0x2 + mov w2, #1 // =0x1 + add x13, x11, x10 + sub x10, x8, x30, lsl #4 + add x14, x12, x15 + add x11, x8, #48 + mov w12, #8 // =0x8 + add x15, x13, x15 + add x13, x14, #4 + add x10, x10, x19, lsl #4 + add x14, x15, #32 + add x15, x10, #16 + b .LBB0_39 + .p2align 2 +.LBB0_38: // %.loopexit26 + // in Loop: Header=BB0_39 Depth=1 + ldr x6, [sp, #496] // 8-byte Folded Reload + ldr x7, [sp, #520] // 8-byte Folded Reload + add x14, x14, x6 + ldr x25, [sp, #384] // 8-byte Folded Reload + add x13, x13, x6 + mov x6, x12 + mov x12, x5 +.LBB0_39: // =>This Loop Header: Depth=1 + // Child Loop BB0_41 Depth 2 + // Child Loop BB0_43 Depth 2 + madd x5, x6, x27, x25 + cmp x12, x7 + lsl x5, x5, #2 + madd x2, x2, x27, x25 + madd x18, x18, x27, x25 + madd x17, x17, x27, x25 + madd x1, x1, x27, x25 + lsl x2, x2, #2 + lsl x18, x18, #2 + lsl x17, x17, #2 + lsl x1, x1, #2 + madd x16, x16, x27, x25 + lsl x16, x16, #2 + str q0, [x24, x5] + str q2, [x24, x2] + str q1, [x24, x18] + str q4, [x24, x17] + str q3, [x24, x1] + str q5, [x24, x16] + madd x16, x3, x27, x25 + lsl x16, x16, #2 + str q6, [x24, x16] + madd x16, x4, x27, x25 + lsl x16, x16, #2 + str q7, [x24, x16] + b.ge .LBB0_44 +// %bb.40: // in Loop: Header=BB0_39 Depth=1 + add x17, x12, #3 + add x2, x12, #1 + add x18, x12, #2 + mul x3, x12, x27 + mul x7, x17, x27 + add x1, x12, #4 + add x16, x12, #5 + ldr q24, [x8] + mul x4, x2, x27 + mov x6, xzr + add x3, x3, x25 + mul x5, x18, x27 + mul x20, x1, x27 + add x7, x7, x25 + lsl x3, x3, #2 + add x4, x4, x25 + add x5, x5, x25 + add x20, x20, x25 + lsl x7, x7, #2 + lsl x4, x4, #2 + ldr q0, [x24, x3] + mul x3, x16, x27 + lsl x5, x5, #2 + lsl x20, x20, #2 + ldr q4, [x24, x7] + mul x7, x12, x22 + ldr q2, [x24, x4] + ldr q1, [x24, x5] + ldr q3, [x24, x20] + mov x20, x14 + add x3, x3, x25 + lsl x7, x7, #2 + lsl x3, x3, #2 + ldr q23, [x9, x7] + mul x7, x2, x22 + ldr q5, [x24, x3] + add x3, x12, #6 + mul x4, x3, x27 + lsl x7, x7, #2 + ldr q22, [x9, x7] + mul x7, x18, x22 + add x4, x4, x25 + lsl x4, x4, #2 + lsl x7, x7, #2 + ldr q6, [x24, x4] + add x4, x12, #7 + mul x5, x4, x27 + ldr q21, [x9, x7] + mul x7, x17, x22 + add x5, x5, x25 + lsl x5, x5, #2 + lsl x7, x7, #2 + ldr q7, [x24, x5] + add x5, x12, #8 + ldr q20, [x9, x7] + mul x7, x1, x22 + lsl x7, x7, #2 + ldr q19, [x9, x7] + mul x7, x16, x22 + lsl x7, x7, #2 + ldr q18, [x9, x7] + mul x7, x3, x22 + lsl x7, x7, #2 + ldr q17, [x9, x7] + mul x7, x4, x22 + lsl x7, x7, #2 + ldr q16, [x9, x7] + mov x7, x11 + cmp xzr, x23 + b.ge .LBB0_42 + .p2align 2 +.LBB0_41: // Parent Loop BB0_39 Depth=1 + // => This Inner Loop Header: Depth=2 + add x25, x7, #32 + fmla v0.4s, v24.4s, v23.s[0] + fmla v2.4s, v24.4s, v22.s[0] + add x6, x6, #4 + fmla v1.4s, v24.4s, v21.s[0] + fmla v4.4s, 
v24.4s, v20.s[0] + prfm pldl1keep, [x25] + add x25, x20, x28 + fmla v3.4s, v24.4s, v19.s[0] + fmla v5.4s, v24.4s, v18.s[0] + fmla v6.4s, v24.4s, v17.s[0] + fmla v7.4s, v24.4s, v16.s[0] + ldp q24, q25, [x7, #-32] + fmla v0.4s, v24.4s, v23.s[1] + fmla v2.4s, v24.4s, v22.s[1] + fmla v1.4s, v24.4s, v21.s[1] + fmla v4.4s, v24.4s, v20.s[1] + fmla v3.4s, v24.4s, v19.s[1] + fmla v5.4s, v24.4s, v18.s[1] + fmla v6.4s, v24.4s, v17.s[1] + fmla v7.4s, v24.4s, v16.s[1] + fmla v0.4s, v25.4s, v23.s[2] + fmla v2.4s, v25.4s, v22.s[2] + ldp q26, q24, [x7], #64 + fmla v1.4s, v25.4s, v21.s[2] + fmla v4.4s, v25.4s, v20.s[2] + fmla v3.4s, v25.4s, v19.s[2] + prfm pldl1keep, [x20] + fmla v5.4s, v25.4s, v18.s[2] + fmla v6.4s, v25.4s, v17.s[2] + fmla v7.4s, v25.4s, v16.s[2] + fmla v0.4s, v26.4s, v23.s[3] + ldur q23, [x20, #-16] + prfm pldl1keep, [x25] + fmla v2.4s, v26.4s, v22.s[3] + ldur q22, [x25, #-16] + add x25, x25, x28 + fmla v1.4s, v26.4s, v21.s[3] + fmla v4.4s, v26.4s, v20.s[3] + fmla v3.4s, v26.4s, v19.s[3] + fmla v5.4s, v26.4s, v18.s[3] + add x20, x20, #16 + prfm pldl1keep, [x25] + ldur q21, [x25, #-16] + add x25, x25, x28 + fmla v6.4s, v26.4s, v17.s[3] + fmla v7.4s, v26.4s, v16.s[3] + prfm pldl1keep, [x25] + ldur q20, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q19, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q18, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q17, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q16, [x25, #-16] + cmp x6, x23 + b.lt .LBB0_41 +.LBB0_42: // in Loop: Header=BB0_39 Depth=1 + ldp x7, x6, [sp, #400] // 16-byte Folded Reload + fmla v0.4s, v24.4s, v23.s[0] + fmla v2.4s, v24.4s, v22.s[0] + fmla v1.4s, v24.4s, v21.s[0] + fmla v4.4s, v24.4s, v20.s[0] + mov x20, x29 + fmla v3.4s, v24.4s, v19.s[0] + fmla v5.4s, v24.4s, v18.s[0] + ldr q25, [x8, x7, lsl #4] + fmla v6.4s, v24.4s, v17.s[0] + fmla v7.4s, v24.4s, v16.s[0] + ldr q24, [x8, x6, lsl #4] + ldr x6, [sp, #416] // 8-byte Folded Reload + mov x7, x15 + ldr q26, [x8, x6, lsl #4] + mov x6, x13 + fmla v0.4s, v25.4s, v23.s[1] + fmla v2.4s, v25.4s, v22.s[1] + fmla v1.4s, v25.4s, v21.s[1] + fmla v4.4s, v25.4s, v20.s[1] + fmla v3.4s, v25.4s, v19.s[1] + fmla v5.4s, v25.4s, v18.s[1] + fmla v6.4s, v25.4s, v17.s[1] + fmla v7.4s, v25.4s, v16.s[1] + fmla v0.4s, v24.4s, v23.s[2] + fmla v2.4s, v24.4s, v22.s[2] + fmla v1.4s, v24.4s, v21.s[2] + fmla v4.4s, v24.4s, v20.s[2] + fmla v3.4s, v24.4s, v19.s[2] + fmla v5.4s, v24.4s, v18.s[2] + fmla v6.4s, v24.4s, v17.s[2] + fmla v7.4s, v24.4s, v16.s[2] + fmla v0.4s, v26.4s, v23.s[3] + fmla v2.4s, v26.4s, v22.s[3] + fmla v1.4s, v26.4s, v21.s[3] + fmla v4.4s, v26.4s, v20.s[3] + fmla v3.4s, v26.4s, v19.s[3] + fmla v5.4s, v26.4s, v18.s[3] + fmla v6.4s, v26.4s, v17.s[3] + fmla v7.4s, v26.4s, v16.s[3] + cmp x29, x19 + b.ge .LBB0_38 + .p2align 2 +.LBB0_43: // Parent Loop BB0_39 Depth=1 + // => This Inner Loop Header: Depth=2 + add x25, x6, x28 + prfm pldl1keep, [x6] + ldur s16, [x6, #-4] + add x20, x20, #1 + prfm pldl1keep, [x25] + ldur s17, [x25, #-4] + add x25, x25, x28 + add x6, x6, #4 + prfm pldl1keep, [x25] + ldur s18, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s19, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s20, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s21, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s22, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s23, [x25, #-4] + prfm pldl1keep, [x7] + ldur q24, [x7, #-16] + add x7, x7, #16 + fmla v0.4s, v24.4s, 
v16.s[0] + fmla v2.4s, v24.4s, v17.s[0] + fmla v1.4s, v24.4s, v18.s[0] + fmla v4.4s, v24.4s, v19.s[0] + fmla v3.4s, v24.4s, v20.s[0] + fmla v5.4s, v24.4s, v21.s[0] + fmla v6.4s, v24.4s, v22.s[0] + fmla v7.4s, v24.4s, v23.s[0] + cmp x20, x19 + b.lt .LBB0_43 + b .LBB0_38 +.LBB0_44: + ldr x13, [sp, #336] // 8-byte Folded Reload + cmp x7, x13 + b.lt .LBB0_47 +// %bb.45: + ldr x10, [sp, #312] // 8-byte Folded Reload + cmp x13, x10 + b.lt .LBB0_52 +.LBB0_46: + ldr x10, [sp, #288] // 8-byte Folded Reload + ldr x11, [sp, #312] // 8-byte Folded Reload + cmp x11, x10 + b.lt .LBB0_57 + b .LBB0_62 +.LBB0_47: + add x18, x7, #1 + add x1, x7, #2 + add x2, x7, #3 + mul x11, x7, x27 + mul x12, x18, x27 + mov x16, xzr + add x11, x11, x25 + mul x18, x18, x22 + mul x13, x1, x27 + mul x14, x2, x27 + lsl x18, x18, #2 + mul x15, x7, x22 + add x12, x12, x25 + add x13, x13, x25 + add x14, x14, x25 + lsl x17, x15, #2 + ldr q5, [x9, x17] + mov x17, x8 + add x11, x24, x11, lsl #2 + ldr q7, [x9, x18] + ldr q16, [x17], #48 + ldr q0, [x11] + mul x18, x1, x22 + lsl x18, x18, #2 + add x12, x24, x12, lsl #2 + add x13, x24, x13, lsl #2 + add x14, x24, x14, lsl #2 + ldr q1, [x12] + ldr q2, [x13] + ldr q3, [x14] + ldr q6, [x9, x18] + mul x18, x2, x22 + ldp x2, x1, [sp, #320] // 16-byte Folded Reload + lsl x18, x18, #2 + ldr q4, [x9, x18] + ldr x18, [sp, #32] // 8-byte Folded Reload + lsl x18, x18, #5 + add x18, x18, x2, lsl #2 + add x18, x18, x1 + add x18, x18, #32 + cmp xzr, x23 + b.ge .LBB0_49 + .p2align 2 +.LBB0_48: // =>This Inner Loop Header: Depth=1 + add x1, x17, #32 + fmla v0.4s, v16.4s, v5.s[0] + fmla v1.4s, v16.4s, v7.s[0] + add x16, x16, #4 + fmla v2.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v4.s[0] + prfm pldl1keep, [x1] + add x1, x18, x28 + ldp q16, q17, [x17, #-32] + fmla v0.4s, v16.4s, v5.s[1] + fmla v1.4s, v16.4s, v7.s[1] + fmla v2.4s, v16.4s, v6.s[1] + fmla v3.4s, v16.4s, v4.s[1] + fmla v0.4s, v17.4s, v5.s[2] + fmla v1.4s, v17.4s, v7.s[2] + fmla v2.4s, v17.4s, v6.s[2] + fmla v3.4s, v17.4s, v4.s[2] + ldp q17, q16, [x17], #64 + prfm pldl1keep, [x18] + fmla v0.4s, v17.4s, v5.s[3] + ldur q5, [x18, #-16] + prfm pldl1keep, [x1] + fmla v1.4s, v17.4s, v7.s[3] + ldur q7, [x1, #-16] + add x1, x1, x28 + fmla v2.4s, v17.4s, v6.s[3] + fmla v3.4s, v17.4s, v4.s[3] + add x18, x18, #16 + prfm pldl1keep, [x1] + ldur q6, [x1, #-16] + add x1, x1, x28 + prfm pldl1keep, [x1] + ldur q4, [x1, #-16] + cmp x16, x23 + b.lt .LBB0_48 +.LBB0_49: + ldp x17, x16, [sp, #400] // 16-byte Folded Reload + fmla v0.4s, v16.4s, v5.s[0] + fmla v1.4s, v16.4s, v7.s[0] + fmla v2.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v4.s[0] + add x15, x19, x15 + sub x15, x15, x30 + add x10, x10, #16 + ldr q17, [x8, x17, lsl #4] + fmla v0.4s, v17.4s, v5.s[1] + ldr q16, [x8, x16, lsl #4] + ldr x16, [sp, #416] // 8-byte Folded Reload + fmla v1.4s, v17.4s, v7.s[1] + fmla v2.4s, v17.4s, v6.s[1] + fmla v3.4s, v17.4s, v4.s[1] + ldr q18, [x8, x16, lsl #4] + ldr x16, [sp, #344] // 8-byte Folded Reload + add x15, x16, x15, lsl #2 + ldr x16, [sp, #328] // 8-byte Folded Reload + fmla v0.4s, v16.4s, v5.s[2] + fmla v1.4s, v16.4s, v7.s[2] + fmla v2.4s, v16.4s, v6.s[2] + fmla v3.4s, v16.4s, v4.s[2] + add x15, x15, x16 + mov x16, x29 + add x15, x15, #4 + fmla v0.4s, v18.4s, v5.s[3] + fmla v1.4s, v18.4s, v7.s[3] + fmla v2.4s, v18.4s, v6.s[3] + fmla v3.4s, v18.4s, v4.s[3] + cmp x29, x19 + b.ge .LBB0_51 + .p2align 2 +.LBB0_50: // =>This Inner Loop Header: Depth=1 + add x17, x15, x28 + prfm pldl1keep, [x15] + ldur s4, [x15, #-4] + add x16, x16, #1 + prfm pldl1keep, [x17] + ldur s5, 
[x17, #-4] + add x17, x17, x28 + add x15, x15, #4 + prfm pldl1keep, [x17] + ldur s6, [x17, #-4] + add x17, x17, x28 + prfm pldl1keep, [x17] + ldur s7, [x17, #-4] + prfm pldl1keep, [x10] + ldur q16, [x10, #-16] + add x10, x10, #16 + fmla v0.4s, v16.4s, v4.s[0] + fmla v1.4s, v16.4s, v5.s[0] + fmla v2.4s, v16.4s, v6.s[0] + fmla v3.4s, v16.4s, v7.s[0] + cmp x16, x19 + b.lt .LBB0_50 +.LBB0_51: + str q0, [x11] + str q1, [x12] + str q2, [x13] + ldr x13, [sp, #336] // 8-byte Folded Reload + str q3, [x14] + ldr x10, [sp, #312] // 8-byte Folded Reload + cmp x13, x10 + b.ge .LBB0_46 +.LBB0_52: + mul x14, x13, x22 + add x12, x13, #1 + ldr x18, [sp, #328] // 8-byte Folded Reload + mov x16, x8 + mul x10, x13, x27 + ldr q4, [x16], #48 + mov x15, xzr + mul x11, x12, x27 + lsl x13, x14, #2 + add x10, x10, x25 + add x11, x11, x25 + ldr q3, [x9, x13] + mul x13, x12, x22 + add x10, x24, x10, lsl #2 + add x11, x24, x11, lsl #2 + ldr q0, [x10] + ldr q1, [x11] + lsl x17, x13, #2 + ldr q2, [x9, x17] + add x17, x18, x17 + ldr x18, [sp, #248] // 8-byte Folded Reload + cmp xzr, x23 + b.ge .LBB0_54 + .p2align 2 +.LBB0_53: // =>This Inner Loop Header: Depth=1 + add x5, x16, #32 + ldr x3, [sp, #344] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v3.s[0] + fmla v1.4s, v4.4s, v2.s[0] + prfm pldl1keep, [x5] + ldp q4, q5, [x16, #-32] + add x15, x15, #4 + add x1, x17, x3 + add x3, x18, x3 + add x17, x17, #16 + add x18, x18, #16 + add x2, x1, #32 + add x4, x3, #32 + fmla v0.4s, v4.4s, v3.s[1] + fmla v1.4s, v4.4s, v2.s[1] + fmla v0.4s, v5.4s, v3.s[2] + fmla v1.4s, v5.4s, v2.s[2] + ldp q5, q4, [x16], #64 + prfm pldl1keep, [x4] + fmla v0.4s, v5.4s, v3.s[3] + ldr q3, [x3, #16] + prfm pldl1keep, [x2] + fmla v1.4s, v5.4s, v2.s[3] + ldr q2, [x1, #16] + cmp x15, x23 + b.lt .LBB0_53 +.LBB0_54: + ldr x15, [sp, #400] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v3.s[0] + fmla v1.4s, v4.4s, v2.s[0] + ldr x16, [sp, #344] // 8-byte Folded Reload + add x13, x19, x13 + ldr x17, [sp, #328] // 8-byte Folded Reload + sub x13, x13, x30 + add x14, x19, x14 + mul x12, x22, x12 + ldr x18, [sp, #40] // 8-byte Folded Reload + ldr q5, [x8, x15, lsl #4] + ldr x15, [sp, #408] // 8-byte Folded Reload + add x13, x16, x13, lsl #2 + add x12, x16, x12, lsl #2 + add x12, x17, x12 + ldr q4, [x8, x15, lsl #4] + ldr x15, [sp, #416] // 8-byte Folded Reload + fmla v0.4s, v5.4s, v3.s[1] + fmla v1.4s, v5.4s, v2.s[1] + ldr q5, [x8, x15, lsl #4] + sub x15, x14, x30 + add x14, x13, x17 + ldr x13, [sp, #464] // 8-byte Folded Reload + add x15, x16, x15, lsl #2 + add x16, x16, x18, lsl #4 + add x14, x14, #4 + fmla v0.4s, v4.4s, v3.s[2] + fmla v1.4s, v4.4s, v2.s[2] + add x15, x15, x17 + add x16, x17, x16 + mov x17, x29 + sub x13, x13, x30, lsl #4 + add x15, x15, #4 + fmla v0.4s, v5.4s, v3.s[3] + fmla v1.4s, v5.4s, v2.s[3] + add x13, x13, #16 + cmp x29, x19 + b.ge .LBB0_56 + .p2align 2 +.LBB0_55: // =>This Inner Loop Header: Depth=1 + add x18, x8, x13 + prfm pldl1keep, [x15] + ldr s2, [x16, x17, lsl #2] + prfm pldl1keep, [x14] + ldr s3, [x12, x17, lsl #2] + add x13, x13, #16 + prfm pldl1keep, [x18] + ldr q4, [x8, x17, lsl #4] + add x17, x17, #1 + add x14, x14, #4 + add x15, x15, #4 + fmla v0.4s, v4.4s, v2.s[0] + fmla v1.4s, v4.4s, v3.s[0] + cmp x17, x19 + b.lt .LBB0_55 +.LBB0_56: + str q0, [x10] + str q1, [x11] + ldr x10, [sp, #288] // 8-byte Folded Reload + ldr x11, [sp, #312] // 8-byte Folded Reload + cmp x11, x10 + b.ge .LBB0_62 +.LBB0_57: + ldr x11, [sp, #312] // 8-byte Folded Reload + mov x13, x8 + mov x12, xzr + mul x10, x11, x27 + mul x11, x11, x22 + ldr q2, [x13], 
#48 + lsl x14, x11, #2 + add x10, x10, x25 + ldr q1, [x9, x14] + ldr x9, [sp, #48] // 8-byte Folded Reload + ldp x15, x14, [sp, #320] // 16-byte Folded Reload + add x10, x24, x10, lsl #2 + ldr q0, [x10] + lsl x9, x9, #3 + add x9, x9, x15, lsl #2 + add x9, x9, x14 + add x9, x9, #32 + cmp xzr, x23 + b.ge .LBB0_59 + .p2align 2 +.LBB0_58: // =>This Inner Loop Header: Depth=1 + add x14, x13, #32 + fmla v0.4s, v2.4s, v1.s[0] + add x12, x12, #4 + prfm pldl1keep, [x14] + ldp q2, q3, [x13, #-32] + fmla v0.4s, v2.4s, v1.s[1] + fmla v0.4s, v3.4s, v1.s[2] + ldp q3, q2, [x13], #64 + prfm pldl1keep, [x9] + fmla v0.4s, v3.4s, v1.s[3] + ldur q1, [x9, #-16] + add x9, x9, #16 + cmp x12, x23 + b.lt .LBB0_58 +.LBB0_59: + ldr x9, [sp, #400] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[0] + ldr x12, [sp, #328] // 8-byte Folded Reload + ldr x13, [sp, #24] // 8-byte Folded Reload + ldr q3, [x8, x9, lsl #4] + ldr x9, [sp, #408] // 8-byte Folded Reload + fmla v0.4s, v3.4s, v1.s[1] + ldr q2, [x8, x9, lsl #4] + ldr x9, [sp, #416] // 8-byte Folded Reload + fmla v0.4s, v2.4s, v1.s[2] + ldr q4, [x8, x9, lsl #4] + add x9, x19, x11 + ldr x11, [sp, #344] // 8-byte Folded Reload + sub x9, x9, x30 + add x9, x11, x9, lsl #2 + add x11, x9, x12 + ldr x9, [sp, #464] // 8-byte Folded Reload + fmla v0.4s, v4.4s, v1.s[3] + add x12, x12, x13 + mov x13, x29 + add x11, x11, #4 + sub x9, x9, x30, lsl #4 + add x9, x9, #16 + cmp x29, x19 + b.ge .LBB0_61 + .p2align 2 +.LBB0_60: // =>This Inner Loop Header: Depth=1 + add x14, x8, x9 + prfm pldl1keep, [x11] + ldr s1, [x12, x13, lsl #2] + prfm pldl1keep, [x14] + ldr q2, [x8, x13, lsl #4] + add x13, x13, #1 + add x9, x9, #16 + add x11, x11, #4 + fmla v0.4s, v2.4s, v1.s[0] + cmp x13, x19 + b.lt .LBB0_60 +.LBB0_61: + str q0, [x10] +.LBB0_62: + bl free + ldp x20, x29, [sp, #472] // 16-byte Folded Reload +.LBB0_63: + ldr x8, [sp, #112] // 8-byte Folded Reload + ldr x9, [sp, #128] // 8-byte Folded Reload + add x8, x8, x8, lsr #63 + ldr x25, [sp, #488] // 8-byte Folded Reload + cmp x9, #0 + asr x8, x8, #1 + cinv x8, x8, lt + str x8, [sp, #464] // 8-byte Folded Spill + lsl x8, x8, #1 + cmp x25, x8 + str x8, [sp, #480] // 8-byte Folded Spill + b.ge .LBB0_94 +// %bb.64: + lsl x8, x19, #3 + add x0, x8, #64 + bl malloc + add x8, x27, x25 + add x10, x25, x27, lsl #1 + ldp x6, x5, [sp, #296] // 16-byte Folded Reload + add x11, x25, x27, lsl #2 + lsl x8, x8, #2 + ldr x18, [sp, #328] // 8-byte Folded Reload + ldr x1, [sp, #344] // 8-byte Folded Reload + ldr d5, [x24, x8] + lsl x8, x10, #2 + mov w4, #12 // =0xc + ldr d0, [x24, x8] + lsl x8, x11, #2 + add x10, x10, x27 + ldr d3, [x24, x8] + add x8, x11, x27 + mul x11, x22, x4 + lsl x8, x8, #2 + lsl x10, x10, #2 + lsl x9, x25, #2 + add x13, x0, #63 + ldr d4, [x24, x8] + mov w8, #6 // =0x6 + mov w16, #20 // =0x14 + madd x8, x27, x8, x25 + ldr d2, [x24, x10] + add x10, x6, x21 + ldr d1, [x24, x9] + ldr d23, [x10, x9] + add x9, x18, x1 + mov w15, #24 // =0x18 + ldr q19, [x9, x11] + mul x11, x22, x16 + ldr q17, [x9, x28] + ldr x7, [sp, #64] // 8-byte Folded Reload + lsl x8, x8, #2 + ldr q18, [x9, x20] + ldr q16, [x9] + ldr d6, [x24, x8] + sub x8, x25, x27 + ldr x30, [sp, #104] // 8-byte Folded Reload + add x8, x8, x27, lsl #3 + ldr q21, [x9, x11] + lsl x11, x29, #4 + ldr q20, [x9, x22, lsl #4] + add x14, x11, x5, lsl #5 + madd x16, x5, x16, x11 + lsl x8, x8, #2 + madd x4, x5, x4, x11 + ldr x29, [sp, #504] // 8-byte Folded Reload + add x2, x11, x5, lsl #3 + ldr d7, [x24, x8] + and x8, x13, #0xffffffffffffffc0 + mul x13, x22, x15 + madd x15, x5, x15, x11 + mov 
x12, xzr + add x16, x6, x16 + add x2, x6, x2 + orr x3, x8, #0x10 + add x4, x6, x4 + ldr q22, [x9, x13] + mov w13, #28 // =0x1c + add x15, x6, x15 + madd x17, x5, x13, x11 + add x13, x6, x14 + add x14, x6, x17 + add x17, x1, x18 + add x1, x11, x5, lsl #2 + add x18, x6, x26 + mov w5, #16 // =0x10 + add x17, x7, x17 + add x18, x18, x11 + sub x5, x5, x7 + add x17, x17, #16 + add x1, x6, x1 + prfm pldl1keep, [x17] + ldur q24, [x17, #-16] + fmla v1.2s, v23.2s, v16.s[0] + fmla v5.2s, v23.2s, v17.s[0] + cmp xzr, x23 + b.ge .LBB0_66 + .p2align 2 +.LBB0_65: // =>This Inner Loop Header: Depth=1 + add x6, x16, x21 + stur d23, [x3, #-16] + fmla v0.2s, v23.2s, v18.s[0] + fmla v2.2s, v23.2s, v19.s[0] + prfm pldl1keep, [x6] + ldr d25, [x1, x21] + fmla v3.2s, v23.2s, v20.s[0] + fmla v4.2s, v23.2s, v21.s[0] + fmla v6.2s, v23.2s, v22.s[0] + fmla v7.2s, v23.2s, v24.s[0] + add x6, x15, x21 + add x7, x17, x5 + add x20, x7, x28 + add x25, x20, x28 + add x12, x12, #4 + add x15, x15, x26 + add x16, x16, x26 + add x17, x17, #16 + add x1, x1, x26 + stur d25, [x3, #-8] + prfm pldl1keep, [x6] + ldr d23, [x2, x21] + fmla v1.2s, v25.2s, v16.s[1] + fmla v5.2s, v25.2s, v17.s[1] + fmla v0.2s, v25.2s, v18.s[1] + fmla v2.2s, v25.2s, v19.s[1] + fmla v3.2s, v25.2s, v20.s[1] + fmla v4.2s, v25.2s, v21.s[1] + fmla v6.2s, v25.2s, v22.s[1] + fmla v7.2s, v25.2s, v24.s[1] + add x6, x14, x21 + add x14, x14, x26 + add x2, x2, x26 + fmla v1.2s, v23.2s, v16.s[2] + fmla v5.2s, v23.2s, v17.s[2] + fmla v0.2s, v23.2s, v18.s[2] + fmla v2.2s, v23.2s, v19.s[2] + fmla v3.2s, v23.2s, v20.s[2] + fmla v4.2s, v23.2s, v21.s[2] + fmla v6.2s, v23.2s, v22.s[2] + fmla v7.2s, v23.2s, v24.s[2] + str d23, [x3] + prfm pldl1keep, [x6] + ldr d23, [x4, x21] + add x6, x13, x21 + add x13, x13, x26 + add x4, x4, x26 + str d23, [x3, #8] + prfm pldl1keep, [x6] + add x6, x25, x28 + fmla v1.2s, v23.2s, v16.s[3] + fmla v5.2s, v23.2s, v17.s[3] + fmla v0.2s, v23.2s, v18.s[3] + fmla v2.2s, v23.2s, v19.s[3] + fmla v3.2s, v23.2s, v20.s[3] + fmla v4.2s, v23.2s, v21.s[3] + fmla v6.2s, v23.2s, v22.s[3] + fmla v7.2s, v23.2s, v24.s[3] + ldr d23, [x18, x21] + prfm pldl1keep, [x7] + ldur q16, [x7, #-16] + prfm pldl1keep, [x20] + ldur q17, [x20, #-16] + prfm pldl1keep, [x25] + ldur q18, [x25, #-16] + add x18, x18, x26 + add x3, x3, #32 + prfm pldl1keep, [x6] + ldur q19, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q20, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q21, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q22, [x6, #-16] + prfm pldl1keep, [x17] + ldur q24, [x17, #-16] + fmla v1.2s, v23.2s, v16.s[0] + fmla v5.2s, v23.2s, v17.s[0] + cmp x12, x23 + b.lt .LBB0_65 +.LBB0_66: + ldp x13, x14, [sp, #400] // 16-byte Folded Reload + ldr x15, [sp, #304] // 8-byte Folded Reload + fmla v0.2s, v23.2s, v18.s[0] + ldr x20, [sp, #488] // 8-byte Folded Reload + str d23, [x8, x23, lsl #3] + fmla v2.2s, v23.2s, v19.s[0] + fmla v3.2s, v23.2s, v20.s[0] + fmla v4.2s, v23.2s, v21.s[0] + fmla v6.2s, v23.2s, v22.s[0] + fmla v7.2s, v23.2s, v24.s[0] + ldr x7, [sp, #520] // 8-byte Folded Reload + madd x12, x13, x15, x20 + lsl x12, x12, #2 + ldr d23, [x10, x12] + madd x12, x14, x15, x20 + lsl x12, x12, #2 + str d23, [x8, x13, lsl #3] + ldr x13, [sp, #416] // 8-byte Folded Reload + fmla v1.2s, v23.2s, v16.s[1] + fmla v5.2s, v23.2s, v17.s[1] + fmla v0.2s, v23.2s, v18.s[1] + fmla v2.2s, v23.2s, v19.s[1] + fmla v3.2s, v23.2s, v20.s[1] + fmla v4.2s, v23.2s, v21.s[1] + fmla v6.2s, v23.2s, v22.s[1] + fmla v7.2s, v23.2s, v24.s[1] + ldr d23, [x10, x12] + madd x12, x13, 
x15, x20 + fmla v1.2s, v23.2s, v16.s[2] + str d23, [x8, x14, lsl #3] + fmla v5.2s, v23.2s, v17.s[2] + fmla v0.2s, v23.2s, v18.s[2] + fmla v2.2s, v23.2s, v19.s[2] + fmla v3.2s, v23.2s, v20.s[2] + fmla v4.2s, v23.2s, v21.s[2] + fmla v6.2s, v23.2s, v22.s[2] + fmla v7.2s, v23.2s, v24.s[2] + mov x14, x29 + lsl x12, x12, #2 + ldr d23, [x10, x12] + ldr x10, [sp, #72] // 8-byte Folded Reload + add x12, x10, #4 + ldp x17, x10, [sp, #272] // 16-byte Folded Reload + str d23, [x8, x13, lsl #3] + ldr x13, [sp, #120] // 8-byte Folded Reload + fmla v1.2s, v23.2s, v16.s[3] + fmla v5.2s, v23.2s, v17.s[3] + fmla v0.2s, v23.2s, v18.s[3] + fmla v2.2s, v23.2s, v19.s[3] + fmla v3.2s, v23.2s, v20.s[3] + fmla v4.2s, v23.2s, v21.s[3] + fmla v6.2s, v23.2s, v22.s[3] + fmla v7.2s, v23.2s, v24.s[3] + add x10, x11, x10, lsl #2 + ldr x11, [sp, #296] // 8-byte Folded Reload + add x10, x11, x10 + ldr x11, [sp, #320] // 8-byte Folded Reload + add x11, x13, x11, lsl #2 + ldr x13, [sp, #80] // 8-byte Folded Reload + sub x11, x11, x13 + ldr x13, [sp, #328] // 8-byte Folded Reload + add x13, x11, x13 + mul x11, x15, x12 + add x12, x13, #4 + ldr x13, [sp, #96] // 8-byte Folded Reload + lsl x13, x13, #2 + cmp x29, x19 + b.ge .LBB0_68 + .p2align 2 +.LBB0_67: // =>This Inner Loop Header: Depth=1 + add x16, x12, x28 + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x15, x10, x11 + prfm pldl1keep, [x16] + ldur s17, [x16, #-4] + add x16, x16, x28 + add x12, x12, #4 + prfm pldl1keep, [x16] + ldur s18, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s19, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s20, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s21, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s22, [x16, #-4] + add x16, x16, x28 + prfm pldl1keep, [x16] + ldur s23, [x16, #-4] + prfm pldl1keep, [x15] + ldr d24, [x10, x13] + add x10, x10, x17 + fmla v1.2s, v24.2s, v16.s[0] + str d24, [x8, x14, lsl #3] + add x14, x14, #1 + fmla v5.2s, v24.2s, v17.s[0] + fmla v0.2s, v24.2s, v18.s[0] + fmla v2.2s, v24.2s, v19.s[0] + fmla v3.2s, v24.2s, v20.s[0] + fmla v4.2s, v24.2s, v21.s[0] + fmla v6.2s, v24.2s, v22.s[0] + fmla v7.2s, v24.2s, v23.s[0] + cmp x14, x19 + b.lt .LBB0_67 +.LBB0_68: // %.preheader25 + ldr x10, [sp, #344] // 8-byte Folded Reload + ldr x11, [sp, #496] // 8-byte Folded Reload + mov x6, xzr + mov w3, #7 // =0x7 + ldr x15, [sp, #328] // 8-byte Folded Reload + ldr x12, [sp, #88] // 8-byte Folded Reload + mov w2, #6 // =0x6 + mov w16, #5 // =0x5 + mov w18, #4 // =0x4 + mov w17, #3 // =0x3 + mov w1, #2 // =0x2 + mov w4, #1 // =0x1 + add x13, x11, x10 + sub x10, x8, x30, lsl #3 + add x14, x12, x15 + add x11, x8, #24 + mov w12, #8 // =0x8 + add x15, x13, x15 + add x13, x14, #4 + add x10, x10, x19, lsl #3 + add x14, x15, #32 + add x15, x10, #8 + b .LBB0_70 + .p2align 2 +.LBB0_69: // %.loopexit24 + // in Loop: Header=BB0_70 Depth=1 + ldp x20, x6, [sp, #488] // 16-byte Folded Reload + ldr x7, [sp, #520] // 8-byte Folded Reload + add x14, x14, x6 + add x13, x13, x6 + mov x6, x12 + mov x12, x5 +.LBB0_70: // =>This Loop Header: Depth=1 + // Child Loop BB0_72 Depth 2 + // Child Loop BB0_74 Depth 2 + madd x5, x6, x27, x20 + cmp x12, x7 + lsl x5, x5, #2 + madd x4, x4, x27, x20 + madd x1, x1, x27, x20 + madd x17, x17, x27, x20 + madd x18, x18, x27, x20 + lsl x4, x4, #2 + lsl x1, x1, #2 + lsl x17, x17, #2 + lsl x18, x18, #2 + madd x16, x16, x27, x20 + lsl x16, x16, #2 + str d1, [x24, x5] + str d5, [x24, x4] + str d0, [x24, x1] + str d2, [x24, x17] + str d3, [x24, x18] + str d4, 
[x24, x16] + madd x16, x2, x27, x20 + lsl x16, x16, #2 + str d6, [x24, x16] + madd x16, x3, x27, x20 + lsl x16, x16, #2 + str d7, [x24, x16] + b.ge .LBB0_75 +// %bb.71: // in Loop: Header=BB0_70 Depth=1 + add x17, x12, #3 + add x4, x12, #1 + add x1, x12, #2 + madd x2, x12, x27, x20 + madd x7, x17, x27, x20 + add x18, x12, #4 + add x16, x12, #5 + mov x25, x20 + madd x3, x4, x27, x20 + ldr d24, [x8] + mov x6, xzr + lsl x2, x2, #2 + madd x5, x1, x27, x20 + madd x20, x18, x27, x20 + lsl x7, x7, #2 + lsl x3, x3, #2 + ldr d1, [x24, x2] + madd x2, x16, x27, x25 + lsl x5, x5, #2 + lsl x20, x20, #2 + ldr d2, [x24, x7] + mul x7, x12, x22 + ldr d5, [x24, x3] + ldr d0, [x24, x5] + ldr d3, [x24, x20] + mov x20, x14 + lsl x2, x2, #2 + lsl x7, x7, #2 + ldr d4, [x24, x2] + add x2, x12, #6 + ldr q23, [x9, x7] + mul x7, x4, x22 + madd x3, x2, x27, x25 + lsl x7, x7, #2 + lsl x3, x3, #2 + ldr q22, [x9, x7] + mul x7, x1, x22 + ldr d6, [x24, x3] + add x3, x12, #7 + madd x5, x3, x27, x25 + lsl x7, x7, #2 + ldr q21, [x9, x7] + mul x7, x17, x22 + lsl x5, x5, #2 + ldr d7, [x24, x5] + add x5, x12, #8 + lsl x7, x7, #2 + ldr q20, [x9, x7] + mul x7, x18, x22 + lsl x7, x7, #2 + ldr q19, [x9, x7] + mul x7, x16, x22 + lsl x7, x7, #2 + ldr q18, [x9, x7] + mul x7, x2, x22 + lsl x7, x7, #2 + ldr q17, [x9, x7] + mul x7, x3, x22 + lsl x7, x7, #2 + ldr q16, [x9, x7] + mov x7, x11 + cmp xzr, x23 + b.ge .LBB0_73 + .p2align 2 +.LBB0_72: // Parent Loop BB0_70 Depth=1 + // => This Inner Loop Header: Depth=2 + add x25, x7, #16 + fmla v1.2s, v24.2s, v23.s[0] + fmla v5.2s, v24.2s, v22.s[0] + add x6, x6, #4 + fmla v0.2s, v24.2s, v21.s[0] + fmla v2.2s, v24.2s, v20.s[0] + prfm pldl1keep, [x25] + add x25, x20, x28 + fmla v3.2s, v24.2s, v19.s[0] + fmla v4.2s, v24.2s, v18.s[0] + fmla v6.2s, v24.2s, v17.s[0] + fmla v7.2s, v24.2s, v16.s[0] + ldp d24, d25, [x7, #-16] + fmla v1.2s, v24.2s, v23.s[1] + fmla v5.2s, v24.2s, v22.s[1] + fmla v0.2s, v24.2s, v21.s[1] + fmla v2.2s, v24.2s, v20.s[1] + fmla v3.2s, v24.2s, v19.s[1] + fmla v4.2s, v24.2s, v18.s[1] + fmla v6.2s, v24.2s, v17.s[1] + fmla v7.2s, v24.2s, v16.s[1] + fmla v1.2s, v25.2s, v23.s[2] + fmla v5.2s, v25.2s, v22.s[2] + ldp d26, d24, [x7], #32 + fmla v0.2s, v25.2s, v21.s[2] + fmla v2.2s, v25.2s, v20.s[2] + fmla v3.2s, v25.2s, v19.s[2] + prfm pldl1keep, [x20] + fmla v4.2s, v25.2s, v18.s[2] + fmla v6.2s, v25.2s, v17.s[2] + fmla v7.2s, v25.2s, v16.s[2] + fmla v1.2s, v26.2s, v23.s[3] + ldur q23, [x20, #-16] + prfm pldl1keep, [x25] + fmla v5.2s, v26.2s, v22.s[3] + ldur q22, [x25, #-16] + add x25, x25, x28 + fmla v0.2s, v26.2s, v21.s[3] + fmla v2.2s, v26.2s, v20.s[3] + fmla v3.2s, v26.2s, v19.s[3] + fmla v4.2s, v26.2s, v18.s[3] + add x20, x20, #16 + prfm pldl1keep, [x25] + ldur q21, [x25, #-16] + add x25, x25, x28 + fmla v6.2s, v26.2s, v17.s[3] + fmla v7.2s, v26.2s, v16.s[3] + prfm pldl1keep, [x25] + ldur q20, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q19, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q18, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q17, [x25, #-16] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur q16, [x25, #-16] + cmp x6, x23 + b.lt .LBB0_72 +.LBB0_73: // in Loop: Header=BB0_70 Depth=1 + ldp x7, x6, [sp, #400] // 16-byte Folded Reload + fmla v1.2s, v24.2s, v23.s[0] + fmla v5.2s, v24.2s, v22.s[0] + fmla v0.2s, v24.2s, v21.s[0] + fmla v2.2s, v24.2s, v20.s[0] + mov x20, x29 + fmla v3.2s, v24.2s, v19.s[0] + fmla v4.2s, v24.2s, v18.s[0] + ldr d25, [x8, x7, lsl #3] + fmla v6.2s, v24.2s, v17.s[0] + fmla v7.2s, v24.2s, v16.s[0] 
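+ // NOTE (annotation, not compiler output; assumed reading of the generated kernel):
+ // .LBB0_73 is the tail of the 4x-unrolled k-loop -- the remaining packed values
+ // (d24, d26) are reloaded and lanes [1]-[3] of the broadcast scalars v16-v23 are
+ // applied to the 2-lane accumulators v0-v7, before the scalar k-remainder loop .LBB0_74.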
+ ldr d24, [x8, x6, lsl #3] + ldr x6, [sp, #416] // 8-byte Folded Reload + mov x7, x15 + ldr d26, [x8, x6, lsl #3] + mov x6, x13 + fmla v1.2s, v25.2s, v23.s[1] + fmla v5.2s, v25.2s, v22.s[1] + fmla v0.2s, v25.2s, v21.s[1] + fmla v2.2s, v25.2s, v20.s[1] + fmla v3.2s, v25.2s, v19.s[1] + fmla v4.2s, v25.2s, v18.s[1] + fmla v6.2s, v25.2s, v17.s[1] + fmla v7.2s, v25.2s, v16.s[1] + fmla v1.2s, v24.2s, v23.s[2] + fmla v5.2s, v24.2s, v22.s[2] + fmla v0.2s, v24.2s, v21.s[2] + fmla v2.2s, v24.2s, v20.s[2] + fmla v3.2s, v24.2s, v19.s[2] + fmla v4.2s, v24.2s, v18.s[2] + fmla v6.2s, v24.2s, v17.s[2] + fmla v7.2s, v24.2s, v16.s[2] + fmla v1.2s, v26.2s, v23.s[3] + fmla v5.2s, v26.2s, v22.s[3] + fmla v0.2s, v26.2s, v21.s[3] + fmla v2.2s, v26.2s, v20.s[3] + fmla v3.2s, v26.2s, v19.s[3] + fmla v4.2s, v26.2s, v18.s[3] + fmla v6.2s, v26.2s, v17.s[3] + fmla v7.2s, v26.2s, v16.s[3] + cmp x29, x19 + b.ge .LBB0_69 + .p2align 2 +.LBB0_74: // Parent Loop BB0_70 Depth=1 + // => This Inner Loop Header: Depth=2 + add x25, x6, x28 + prfm pldl1keep, [x6] + ldur s16, [x6, #-4] + add x20, x20, #1 + prfm pldl1keep, [x25] + ldur s17, [x25, #-4] + add x25, x25, x28 + add x6, x6, #4 + prfm pldl1keep, [x25] + ldur s18, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s19, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s20, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s21, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s22, [x25, #-4] + add x25, x25, x28 + prfm pldl1keep, [x25] + ldur s23, [x25, #-4] + prfm pldl1keep, [x7] + ldur d24, [x7, #-8] + add x7, x7, #8 + fmla v1.2s, v24.2s, v16.s[0] + fmla v5.2s, v24.2s, v17.s[0] + fmla v0.2s, v24.2s, v18.s[0] + fmla v2.2s, v24.2s, v19.s[0] + fmla v3.2s, v24.2s, v20.s[0] + fmla v4.2s, v24.2s, v21.s[0] + fmla v6.2s, v24.2s, v22.s[0] + fmla v7.2s, v24.2s, v23.s[0] + cmp x20, x19 + b.lt .LBB0_74 + b .LBB0_69 +.LBB0_75: + ldr x13, [sp, #336] // 8-byte Folded Reload + cmp x7, x13 + b.lt .LBB0_78 +// %bb.76: + ldr x10, [sp, #312] // 8-byte Folded Reload + cmp x13, x10 + b.lt .LBB0_83 +.LBB0_77: + ldr x10, [sp, #288] // 8-byte Folded Reload + ldr x11, [sp, #312] // 8-byte Folded Reload + cmp x11, x10 + b.lt .LBB0_88 + b .LBB0_93 +.LBB0_78: + add x18, x7, #1 + add x1, x7, #2 + mul x15, x7, x22 + add x2, x7, #3 + madd x12, x18, x27, x20 + mov x17, x8 + ldr d16, [x17], #24 + mul x18, x18, x22 + mov x16, xzr + lsl x14, x15, #2 + mul x11, x7, x27 + madd x13, x1, x27, x20 + add x11, x11, x20 + lsl x18, x18, #2 + add x11, x24, x11, lsl #2 + ldr q5, [x9, x14] + ldr q7, [x9, x18] + mul x18, x1, x22 + ldr d0, [x11] + madd x14, x2, x27, x20 + lsl x18, x18, #2 + add x12, x24, x12, lsl #2 + add x13, x24, x13, lsl #2 + add x14, x24, x14, lsl #2 + ldr d1, [x12] + ldr d2, [x13] + ldr q6, [x9, x18] + mul x18, x2, x22 + ldp x2, x1, [sp, #320] // 16-byte Folded Reload + ldr d3, [x14] + lsl x18, x18, #2 + ldr q4, [x9, x18] + ldr x18, [sp, #32] // 8-byte Folded Reload + lsl x18, x18, #5 + add x18, x18, x2, lsl #2 + add x18, x18, x1 + add x18, x18, #32 + cmp xzr, x23 + b.ge .LBB0_80 + .p2align 2 +.LBB0_79: // =>This Inner Loop Header: Depth=1 + add x1, x17, #16 + fmla v0.2s, v16.2s, v5.s[0] + fmla v1.2s, v16.2s, v7.s[0] + add x16, x16, #4 + fmla v2.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v4.s[0] + prfm pldl1keep, [x1] + add x1, x18, x28 + ldp d16, d17, [x17, #-16] + fmla v0.2s, v16.2s, v5.s[1] + fmla v1.2s, v16.2s, v7.s[1] + fmla v2.2s, v16.2s, v6.s[1] + fmla v3.2s, v16.2s, v4.s[1] + fmla v0.2s, v17.2s, v5.s[2] + fmla v1.2s, v17.2s, v7.s[2] + fmla 
v2.2s, v17.2s, v6.s[2] + fmla v3.2s, v17.2s, v4.s[2] + ldp d17, d16, [x17], #32 + prfm pldl1keep, [x18] + fmla v0.2s, v17.2s, v5.s[3] + ldur q5, [x18, #-16] + prfm pldl1keep, [x1] + fmla v1.2s, v17.2s, v7.s[3] + ldur q7, [x1, #-16] + add x1, x1, x28 + fmla v2.2s, v17.2s, v6.s[3] + fmla v3.2s, v17.2s, v4.s[3] + add x18, x18, #16 + prfm pldl1keep, [x1] + ldur q6, [x1, #-16] + add x1, x1, x28 + prfm pldl1keep, [x1] + ldur q4, [x1, #-16] + cmp x16, x23 + b.lt .LBB0_79 +.LBB0_80: + ldp x17, x16, [sp, #400] // 16-byte Folded Reload + fmla v0.2s, v16.2s, v5.s[0] + fmla v1.2s, v16.2s, v7.s[0] + fmla v2.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v4.s[0] + add x15, x19, x15 + sub x15, x15, x30 + add x10, x10, #8 + ldr d17, [x8, x17, lsl #3] + fmla v0.2s, v17.2s, v5.s[1] + ldr d16, [x8, x16, lsl #3] + ldr x16, [sp, #416] // 8-byte Folded Reload + fmla v1.2s, v17.2s, v7.s[1] + fmla v2.2s, v17.2s, v6.s[1] + fmla v3.2s, v17.2s, v4.s[1] + ldr d18, [x8, x16, lsl #3] + ldr x16, [sp, #344] // 8-byte Folded Reload + add x15, x16, x15, lsl #2 + ldr x16, [sp, #328] // 8-byte Folded Reload + fmla v0.2s, v16.2s, v5.s[2] + fmla v1.2s, v16.2s, v7.s[2] + fmla v2.2s, v16.2s, v6.s[2] + fmla v3.2s, v16.2s, v4.s[2] + add x15, x15, x16 + mov x16, x29 + add x15, x15, #4 + fmla v0.2s, v18.2s, v5.s[3] + fmla v1.2s, v18.2s, v7.s[3] + fmla v2.2s, v18.2s, v6.s[3] + fmla v3.2s, v18.2s, v4.s[3] + cmp x29, x19 + b.ge .LBB0_82 + .p2align 2 +.LBB0_81: // =>This Inner Loop Header: Depth=1 + add x17, x15, x28 + prfm pldl1keep, [x15] + ldur s4, [x15, #-4] + add x16, x16, #1 + prfm pldl1keep, [x17] + ldur s5, [x17, #-4] + add x17, x17, x28 + add x15, x15, #4 + prfm pldl1keep, [x17] + ldur s6, [x17, #-4] + add x17, x17, x28 + prfm pldl1keep, [x17] + ldur s7, [x17, #-4] + prfm pldl1keep, [x10] + ldur d16, [x10, #-8] + add x10, x10, #8 + fmla v0.2s, v16.2s, v4.s[0] + fmla v1.2s, v16.2s, v5.s[0] + fmla v2.2s, v16.2s, v6.s[0] + fmla v3.2s, v16.2s, v7.s[0] + cmp x16, x19 + b.lt .LBB0_81 +.LBB0_82: + str d0, [x11] + str d1, [x12] + str d2, [x13] + ldr x13, [sp, #336] // 8-byte Folded Reload + str d3, [x14] + ldr x10, [sp, #312] // 8-byte Folded Reload + cmp x13, x10 + b.ge .LBB0_77 +.LBB0_83: + mul x10, x13, x27 + add x12, x13, #1 + mov x16, x8 + ldr x18, [sp, #328] // 8-byte Folded Reload + mul x13, x13, x22 + ldr d4, [x16], #24 + mov x15, xzr + madd x11, x12, x27, x20 + lsl x14, x13, #2 + add x10, x10, x20 + add x10, x24, x10, lsl #2 + ldr q3, [x9, x14] + mul x14, x12, x22 + add x11, x24, x11, lsl #2 + ldr d0, [x10] + ldr d1, [x11] + lsl x17, x14, #2 + ldr q2, [x9, x17] + add x17, x18, x17 + cmp xzr, x23 + b.ge .LBB0_85 + .p2align 2 +.LBB0_84: // =>This Inner Loop Header: Depth=1 + add x4, x16, #16 + ldr x2, [sp, #344] // 8-byte Folded Reload + ldr x5, [sp, #248] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[0] + prfm pldl1keep, [x4] + fmla v1.2s, v4.2s, v2.s[0] + ldp d4, d5, [x16, #-16] + add x15, x15, #4 + add x18, x17, x2 + add x2, x5, x2 + add x5, x5, #16 + add x17, x17, #16 + add x1, x18, #32 + add x3, x2, #32 + fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v5.2s, v3.s[2] + fmla v1.2s, v5.2s, v2.s[2] + ldp d5, d4, [x16], #32 + prfm pldl1keep, [x3] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x2, #16] + prfm pldl1keep, [x1] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x18, #16] + str x5, [sp, #248] // 8-byte Folded Spill + cmp x15, x23 + b.lt .LBB0_84 +.LBB0_85: + ldr x15, [sp, #400] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v3.s[0] + fmla v1.2s, v4.2s, v2.s[0] + ldr x17, [sp, #344] // 8-byte Folded Reload + add 
x13, x19, x13 + mul x12, x22, x12 + ldr x16, [sp, #328] // 8-byte Folded Reload + add x12, x17, x12, lsl #2 + ldr d5, [x8, x15, lsl #3] + ldr x15, [sp, #408] // 8-byte Folded Reload + add x12, x16, x12 + ldr d4, [x8, x15, lsl #3] + ldr x15, [sp, #416] // 8-byte Folded Reload + fmla v0.2s, v5.2s, v3.s[1] + fmla v1.2s, v5.2s, v2.s[1] + ldr d5, [x8, x15, lsl #3] + sub x15, x13, x30 + add x13, x19, x14 + add x14, x17, x15, lsl #2 + ldr x15, [sp, #40] // 8-byte Folded Reload + sub x13, x13, x30 + fmla v0.2s, v4.2s, v3.s[2] + fmla v1.2s, v4.2s, v2.s[2] + add x13, x17, x13, lsl #2 + add x14, x14, x16 + add x13, x13, x16 + add x14, x14, #4 + add x15, x17, x15, lsl #4 + add x13, x13, #4 + fmla v0.2s, v5.2s, v3.s[3] + fmla v1.2s, v5.2s, v2.s[3] + add x15, x16, x15 + mov x16, x29 + cmp x29, x19 + b.ge .LBB0_87 + .p2align 2 +.LBB0_86: // =>This Inner Loop Header: Depth=1 + add x17, x8, x16, lsl #3 + prfm pldl1keep, [x14] + ldr s2, [x15, x16, lsl #2] + prfm pldl1keep, [x13] + ldr s3, [x12, x16, lsl #2] + add x13, x13, #4 + add x17, x17, #8 + add x14, x14, #4 + prfm pldl1keep, [x17] + ldr d4, [x8, x16, lsl #3] + add x16, x16, #1 + fmla v0.2s, v4.2s, v2.s[0] + fmla v1.2s, v4.2s, v3.s[0] + cmp x16, x19 + b.lt .LBB0_86 +.LBB0_87: + str d0, [x10] + str d1, [x11] + ldr x10, [sp, #288] // 8-byte Folded Reload + ldr x11, [sp, #312] // 8-byte Folded Reload + cmp x11, x10 + b.ge .LBB0_93 +.LBB0_88: + ldr x11, [sp, #312] // 8-byte Folded Reload + mov x13, x8 + mov x12, xzr + mul x10, x11, x27 + mul x11, x11, x22 + ldr d2, [x13], #24 + lsl x14, x11, #2 + add x10, x10, x20 + ldr q1, [x9, x14] + ldr x9, [sp, #48] // 8-byte Folded Reload + ldp x15, x14, [sp, #320] // 16-byte Folded Reload + add x10, x24, x10, lsl #2 + ldr d0, [x10] + lsl x9, x9, #3 + add x9, x9, x15, lsl #2 + add x9, x9, x14 + add x9, x9, #32 + cmp xzr, x23 + b.ge .LBB0_90 + .p2align 2 +.LBB0_89: // =>This Inner Loop Header: Depth=1 + add x14, x13, #16 + fmla v0.2s, v2.2s, v1.s[0] + add x12, x12, #4 + prfm pldl1keep, [x14] + ldp d2, d3, [x13, #-16] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v3.2s, v1.s[2] + ldp d3, d2, [x13], #32 + prfm pldl1keep, [x9] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x9, #-16] + add x9, x9, #16 + cmp x12, x23 + b.lt .LBB0_89 +.LBB0_90: + ldr x9, [sp, #400] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[0] + ldr x12, [sp, #24] // 8-byte Folded Reload + ldr d3, [x8, x9, lsl #3] + ldr x9, [sp, #408] // 8-byte Folded Reload + fmla v0.2s, v3.2s, v1.s[1] + ldr d2, [x8, x9, lsl #3] + ldr x9, [sp, #416] // 8-byte Folded Reload + fmla v0.2s, v2.2s, v1.s[2] + ldr d3, [x8, x9, lsl #3] + add x9, x19, x11 + ldr x11, [sp, #344] // 8-byte Folded Reload + sub x9, x9, x30 + add x9, x11, x9, lsl #2 + ldr x11, [sp, #328] // 8-byte Folded Reload + fmla v0.2s, v3.2s, v1.s[3] + add x9, x9, x11 + add x11, x11, x12 + mov x12, x29 + add x9, x9, #4 + cmp x29, x19 + b.ge .LBB0_92 + .p2align 2 +.LBB0_91: // =>This Inner Loop Header: Depth=1 + add x13, x8, x12, lsl #3 + prfm pldl1keep, [x9] + ldr s1, [x11, x12, lsl #2] + add x9, x9, #4 + add x13, x13, #8 + prfm pldl1keep, [x13] + ldr d2, [x8, x12, lsl #3] + add x12, x12, #1 + fmla v0.2s, v2.2s, v1.s[0] + cmp x12, x19 + b.lt .LBB0_91 +.LBB0_92: + str d0, [x10] +.LBB0_93: + bl free + ldr x20, [sp, #472] // 8-byte Folded Reload +.LBB0_94: + ldr x8, [sp, #128] // 8-byte Folded Reload + ldr x25, [sp, #480] // 8-byte Folded Reload + cmp x25, x8 + b.ge .LBB0_126 +// %bb.95: + ldr x8, [sp, #120] // 8-byte Folded Reload + add x0, x8, #64 + bl malloc + add x10, x25, x27, lsl #2 + ldr x18, [sp, #328] // 8-byte 
Folded Reload + ldr x1, [sp, #344] // 8-byte Folded Reload + mov w5, #12 // =0xc + add x9, x25, x27, lsl #1 + sub x13, x25, x27 + mov w11, #6 // =0x6 + add x8, x27, x25 + add x16, x10, x27 + ldr s2, [x24, x10, lsl #2] + mul x10, x22, x5 + add x15, x9, x27 + ldr s5, [x24, x9, lsl #2] + add x13, x13, x27, lsl #3 + ldr s3, [x24, x16, lsl #2] + add x9, x18, x1 + mov w16, #20 // =0x14 + ldr s4, [x24, x15, lsl #2] + mov w15, #24 // =0x18 + madd x11, x27, x11, x25 + ldr q19, [x9, x10] + mul x10, x22, x16 + add x14, x0, #63 + ldr s0, [x24, x13, lsl #2] + mul x13, x22, x15 + ldr x4, [sp, #64] // 8-byte Folded Reload + ldp x7, x6, [sp, #296] // 16-byte Folded Reload + ldr q22, [x9, x13] + mov w13, #28 // =0x1c + ldr s1, [x24, x11, lsl #2] + add x11, x7, x21 + ldr q21, [x9, x10] + ldr x10, [sp, #464] // 8-byte Folded Reload + ldr q16, [x9] + ldr s7, [x24, x8, lsl #2] + ldr s6, [x24, x25, lsl #2] + ldr s23, [x11, x25, lsl #2] + ldr q17, [x9, x28] + ldr q18, [x9, x20] + ldr q20, [x9, x22, lsl #4] + lsl x10, x10, #3 + and x8, x14, #0xffffffffffffffc0 + ldr x29, [sp, #504] // 8-byte Folded Reload + ldr x30, [sp, #104] // 8-byte Folded Reload + madd x17, x6, x13, x10 + add x14, x10, x6, lsl #5 + madd x15, x6, x15, x10 + madd x16, x6, x16, x10 + madd x5, x6, x5, x10 + add x2, x10, x6, lsl #3 + mov w3, #16 // =0x10 + mov x12, xzr + add x13, x7, x14 + add x2, x7, x2 + sub x3, x3, x4 + add x14, x7, x17 + add x17, x1, x18 + add x1, x10, x6, lsl #2 + add x18, x26, x10 + add x15, x7, x15 + add x16, x7, x16 + add x5, x7, x5 + add x17, x4, x17 + add x18, x7, x18 + orr x4, x8, #0x8 + add x17, x17, #16 + add x1, x7, x1 + .p2align 2 +.LBB0_96: // =>This Inner Loop Header: Depth=1 + prfm pldl1keep, [x17] + ldur q24, [x17, #-16] + ext v31.16b, v16.16b, v16.16b, #8 + ext v8.16b, v17.16b, v17.16b, #8 + cmp x12, x23 + ext v30.16b, v18.16b, v18.16b, #8 + ext v29.16b, v19.16b, v19.16b, #8 + ext v28.16b, v20.16b, v20.16b, #8 + ext v27.16b, v21.16b, v21.16b, #8 + ext v26.16b, v22.16b, v22.16b, #8 + ext v25.16b, v24.16b, v24.16b, #8 + b.ge .LBB0_98 +// %bb.97: // in Loop: Header=BB0_96 Depth=1 + add x6, x16, x21 + stur s23, [x4, #-8] + fmla v4.2s, v23.2s, v19.2s + fmla v6.2s, v23.2s, v16.2s + prfm pldl1keep, [x6] + ldr s9, [x1, x21] + add x6, x15, x21 + fmla v7.2s, v23.2s, v17.2s + fmla v5.2s, v23.2s, v18.2s + fmla v2.2s, v23.2s, v20.2s + fmla v3.2s, v23.2s, v21.2s + fmla v1.2s, v23.2s, v22.2s + fmla v0.2s, v23.2s, v24.2s + add x7, x14, x21 + add x20, x17, x3 + add x25, x20, x28 + add x12, x12, #4 + add x14, x14, x26 + add x15, x15, x26 + add x16, x16, x26 + add x17, x17, #16 + add x1, x1, x26 + stur s9, [x4, #-4] + prfm pldl1keep, [x6] + ldr s23, [x2, x21] + fmla v4.2s, v9.2s, v19.s[1] + fmla v6.2s, v9.2s, v16.s[1] + fmla v7.2s, v9.2s, v17.s[1] + fmla v5.2s, v9.2s, v18.s[1] + fmla v2.2s, v9.2s, v20.s[1] + fmla v3.2s, v9.2s, v21.s[1] + fmla v1.2s, v9.2s, v22.s[1] + fmla v0.2s, v9.2s, v24.s[1] + add x6, x13, x21 + add x13, x13, x26 + add x2, x2, x26 + str s23, [x4] + prfm pldl1keep, [x7] + fmla v4.2s, v23.2s, v29.2s + ldr s29, [x5, x21] + fmla v6.2s, v23.2s, v31.2s + fmla v7.2s, v23.2s, v8.2s + fmla v5.2s, v23.2s, v30.2s + fmla v2.2s, v23.2s, v28.2s + add x7, x25, x28 + fmla v3.2s, v23.2s, v27.2s + fmla v1.2s, v23.2s, v26.2s + fmla v0.2s, v23.2s, v25.2s + add x5, x5, x26 + str s29, [x4, #4] + prfm pldl1keep, [x6] + add x6, x7, x28 + fmla v6.2s, v29.2s, v16.s[3] + fmla v7.2s, v29.2s, v17.s[3] + fmla v5.2s, v29.2s, v18.s[3] + fmla v4.2s, v29.2s, v19.s[3] + fmla v2.2s, v29.2s, v20.s[3] + ldr s23, [x18, x21] + prfm pldl1keep, [x20] 
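+ // NOTE (annotation, not compiler output; assumed reading): reload the operand
+ // vectors q16-q22, with prefetches, for the next trip around the software-pipelined
+ // loop .LBB0_96; the final cached partial products are drained after the loop in .LBB0_98.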
+ ldur q16, [x20, #-16] + prfm pldl1keep, [x25] + ldur q17, [x25, #-16] + prfm pldl1keep, [x7] + ldur q18, [x7, #-16] + fmla v3.2s, v29.2s, v21.s[3] + fmla v1.2s, v29.2s, v22.s[3] + fmla v0.2s, v29.2s, v24.s[3] + add x18, x18, x26 + add x4, x4, #16 + prfm pldl1keep, [x6] + ldur q19, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q20, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q21, [x6, #-16] + add x6, x6, x28 + prfm pldl1keep, [x6] + ldur q22, [x6, #-16] + b .LBB0_96 +.LBB0_98: + ldp x13, x14, [sp, #400] // 16-byte Folded Reload + ldr x15, [sp, #304] // 8-byte Folded Reload + fmla v6.2s, v23.2s, v16.2s + ldr x25, [sp, #480] // 8-byte Folded Reload + str s23, [x8, x23, lsl #2] + fmla v7.2s, v23.2s, v17.2s + fmla v5.2s, v23.2s, v18.2s + fmla v4.2s, v23.2s, v19.2s + fmla v2.2s, v23.2s, v20.2s + fmla v3.2s, v23.2s, v21.2s + fmla v1.2s, v23.2s, v22.2s + fmla v0.2s, v23.2s, v24.2s + ldr x7, [sp, #520] // 8-byte Folded Reload + ldr x16, [sp, #96] // 8-byte Folded Reload + madd x12, x13, x15, x25 + ldr s23, [x11, x12, lsl #2] + madd x12, x14, x15, x25 + str s23, [x8, x13, lsl #2] + ldr x13, [sp, #416] // 8-byte Folded Reload + fmla v6.2s, v23.2s, v16.s[1] + fmla v7.2s, v23.2s, v17.s[1] + fmla v5.2s, v23.2s, v18.s[1] + fmla v4.2s, v23.2s, v19.s[1] + fmla v2.2s, v23.2s, v20.s[1] + fmla v3.2s, v23.2s, v21.s[1] + fmla v1.2s, v23.2s, v22.s[1] + fmla v0.2s, v23.2s, v24.s[1] + ldr s23, [x11, x12, lsl #2] + madd x12, x13, x15, x25 + fmla v6.2s, v23.2s, v31.2s + str s23, [x8, x14, lsl #2] + fmla v7.2s, v23.2s, v8.2s + fmla v5.2s, v23.2s, v30.2s + fmla v4.2s, v23.2s, v29.2s + fmla v2.2s, v23.2s, v28.2s + fmla v3.2s, v23.2s, v27.2s + fmla v1.2s, v23.2s, v26.2s + fmla v0.2s, v23.2s, v25.2s + ldr s31, [x11, x12, lsl #2] + ldr x11, [sp, #72] // 8-byte Folded Reload + ldp x17, x12, [sp, #272] // 16-byte Folded Reload + add x10, x10, x12, lsl #2 + ldr x12, [sp, #296] // 8-byte Folded Reload + add x11, x11, #4 + mul x11, x15, x11 + str s31, [x8, x13, lsl #2] + ldr x13, [sp, #120] // 8-byte Folded Reload + fmla v6.2s, v31.2s, v16.s[3] + fmla v7.2s, v31.2s, v17.s[3] + fmla v5.2s, v31.2s, v18.s[3] + fmla v4.2s, v31.2s, v19.s[3] + fmla v2.2s, v31.2s, v20.s[3] + fmla v3.2s, v31.2s, v21.s[3] + fmla v1.2s, v31.2s, v22.s[3] + fmla v0.2s, v31.2s, v24.s[3] + add x10, x12, x10 + ldr x12, [sp, #320] // 8-byte Folded Reload + add x12, x13, x12, lsl #2 + ldr x13, [sp, #80] // 8-byte Folded Reload + sub x12, x12, x13 + ldr x13, [sp, #328] // 8-byte Folded Reload + add x12, x12, x13 + mov x13, x29 + add x12, x12, #4 + cmp x29, x19 + b.ge .LBB0_100 + .p2align 2 +.LBB0_99: // =>This Inner Loop Header: Depth=1 + add x15, x12, x28 + prfm pldl1keep, [x12] + ldur s16, [x12, #-4] + add x14, x10, x11 + prfm pldl1keep, [x15] + ldur s17, [x15, #-4] + add x15, x15, x28 + add x12, x12, #4 + prfm pldl1keep, [x15] + ldur s18, [x15, #-4] + add x15, x15, x28 + prfm pldl1keep, [x15] + ldur s19, [x15, #-4] + add x15, x15, x28 + prfm pldl1keep, [x15] + ldur s20, [x15, #-4] + add x15, x15, x28 + prfm pldl1keep, [x15] + ldur s21, [x15, #-4] + add x15, x15, x28 + prfm pldl1keep, [x15] + ldur s22, [x15, #-4] + add x15, x15, x28 + prfm pldl1keep, [x15] + ldur s23, [x15, #-4] + prfm pldl1keep, [x14] + ldr s24, [x10, x16, lsl #2] + add x10, x10, x17 + fmla v6.2s, v24.2s, v16.2s + str s24, [x8, x13, lsl #2] + add x13, x13, #1 + fmla v7.2s, v24.2s, v17.2s + fmla v5.2s, v24.2s, v18.2s + fmla v4.2s, v24.2s, v19.2s + fmla v2.2s, v24.2s, v20.2s + fmla v3.2s, v24.2s, v21.2s + fmla v1.2s, v24.2s, v22.2s + fmla v0.2s, v24.2s, v23.2s + 
cmp x13, x19 + b.lt .LBB0_99 +.LBB0_100: // %.preheader + ldr x10, [sp, #344] // 8-byte Folded Reload + ldr x11, [sp, #496] // 8-byte Folded Reload + mov x6, xzr + mov w16, #7 // =0x7 + ldr x15, [sp, #328] // 8-byte Folded Reload + ldr x12, [sp, #88] // 8-byte Folded Reload + mov w17, #6 // =0x6 + mov w18, #5 // =0x5 + mov w1, #4 // =0x4 + mov w2, #3 // =0x3 + mov w3, #2 // =0x2 + mov w4, #1 // =0x1 + add x13, x11, x10 + sub x10, x8, x30, lsl #2 + add x14, x12, x15 + add x11, x8, #12 + mov w12, #8 // =0x8 + add x15, x13, x15 + add x13, x14, #4 + add x10, x10, x19, lsl #2 + add x14, x15, #32 + add x15, x10, #4 + b .LBB0_102 + .p2align 2 +.LBB0_101: // %.loopexit + // in Loop: Header=BB0_102 Depth=1 + ldr x6, [sp, #496] // 8-byte Folded Reload + ldr x7, [sp, #520] // 8-byte Folded Reload + add x14, x14, x6 + add x13, x13, x6 + mov x6, x12 + mov x12, x5 +.LBB0_102: // =>This Loop Header: Depth=1 + // Child Loop BB0_104 Depth 2 + // Child Loop BB0_106 Depth 2 + madd x5, x6, x27, x25 + cmp x12, x7 + str s6, [x24, x5, lsl #2] + madd x4, x4, x27, x25 + madd x3, x3, x27, x25 + madd x2, x2, x27, x25 + madd x1, x1, x27, x25 + str s7, [x24, x4, lsl #2] + str s5, [x24, x3, lsl #2] + str s4, [x24, x2, lsl #2] + str s2, [x24, x1, lsl #2] + madd x18, x18, x27, x25 + str s3, [x24, x18, lsl #2] + madd x17, x17, x27, x25 + str s1, [x24, x17, lsl #2] + madd x16, x16, x27, x25 + str s0, [x24, x16, lsl #2] + b.ge .LBB0_107 +// %bb.103: // in Loop: Header=BB0_102 Depth=1 + madd x3, x12, x27, x25 + add x2, x12, #3 + add x18, x12, #5 + add x1, x12, #4 + madd x4, x2, x27, x25 + add x17, x12, #6 + add x16, x12, #7 + ldr s24, [x8] + madd x7, x18, x27, x25 + mov x6, xzr + ldr s6, [x24, x3, lsl #2] + add x3, x12, #2 + madd x5, x1, x27, x25 + madd x20, x17, x27, x25 + madd x21, x16, x27, x25 + ldr s3, [x24, x7, lsl #2] + ldr s4, [x24, x4, lsl #2] + ldr s2, [x24, x5, lsl #2] + ldr s0, [x24, x21, lsl #2] + madd x4, x3, x27, x25 + ldr s1, [x24, x20, lsl #2] + mov x20, x14 + mul x7, x12, x22 + ldr s5, [x24, x4, lsl #2] + add x4, x12, #1 + lsl x7, x7, #2 + ldr q23, [x9, x7] + mul x7, x4, x22 + madd x5, x4, x27, x25 + lsl x7, x7, #2 + ldr s7, [x24, x5, lsl #2] + add x5, x12, #8 + ldr q22, [x9, x7] + mul x7, x3, x22 + lsl x7, x7, #2 + ldr q21, [x9, x7] + mul x7, x2, x22 + lsl x7, x7, #2 + ldr q20, [x9, x7] + mul x7, x1, x22 + lsl x7, x7, #2 + ldr q19, [x9, x7] + mul x7, x18, x22 + lsl x7, x7, #2 + ldr q18, [x9, x7] + mul x7, x17, x22 + lsl x7, x7, #2 + ldr q17, [x9, x7] + mul x7, x16, x22 + lsl x7, x7, #2 + ldr q16, [x9, x7] + mov x7, x11 + fmla v6.2s, v24.2s, v23.2s + cmp xzr, x23 + b.ge .LBB0_105 + .p2align 2 +.LBB0_104: // Parent Loop BB0_102 Depth=1 + // => This Inner Loop Header: Depth=2 + add x21, x7, #8 + fmla v4.2s, v24.2s, v20.2s + fmla v7.2s, v24.2s, v22.2s + add x6, x6, #4 + prfm pldl1keep, [x21] + ldp s27, s25, [x7, #-8] + fmla v5.2s, v24.2s, v21.2s + fmla v2.2s, v24.2s, v19.2s + fmla v3.2s, v24.2s, v18.2s + fmla v1.2s, v24.2s, v17.2s + add x21, x20, x28 + ext v28.16b, v20.16b, v20.16b, #8 + fmla v0.2s, v24.2s, v16.2s + fmla v4.2s, v27.2s, v20.s[1] + fmla v6.2s, v27.2s, v23.s[1] + fmla v7.2s, v27.2s, v22.s[1] + fmla v5.2s, v27.2s, v21.s[1] + fmla v2.2s, v27.2s, v19.s[1] + ldp s26, s24, [x7], #16 + prfm pldl1keep, [x20] + fmla v3.2s, v27.2s, v18.s[1] + fmla v1.2s, v27.2s, v17.s[1] + fmla v0.2s, v27.2s, v16.s[1] + fmla v4.2s, v25.2s, v28.2s + ext v30.16b, v23.16b, v23.16b, #8 + ext v31.16b, v22.16b, v22.16b, #8 + fmla v6.2s, v25.2s, v30.2s + fmla v7.2s, v25.2s, v31.2s + fmla v6.2s, v26.2s, v23.s[3] + ldur q23, [x20, 
#-16] + ext v29.16b, v21.16b, v21.16b, #8 + ext v28.16b, v19.16b, v19.16b, #8 + fmla v5.2s, v25.2s, v29.2s + prfm pldl1keep, [x21] + add x20, x20, #16 + fmla v2.2s, v25.2s, v28.2s + fmla v7.2s, v26.2s, v22.s[3] + ldur q22, [x21, #-16] + add x21, x21, x28 + prfm pldl1keep, [x21] + fmla v5.2s, v26.2s, v21.s[3] + ldur q21, [x21, #-16] + add x21, x21, x28 + prfm pldl1keep, [x21] + ext v28.16b, v18.16b, v18.16b, #8 + fmla v3.2s, v25.2s, v28.2s + fmla v4.2s, v26.2s, v20.s[3] + ldur q20, [x21, #-16] + add x21, x21, x28 + prfm pldl1keep, [x21] + fmla v2.2s, v26.2s, v19.s[3] + ldur q19, [x21, #-16] + add x21, x21, x28 + prfm pldl1keep, [x21] + fmla v3.2s, v26.2s, v18.s[3] + ldur q18, [x21, #-16] + add x21, x21, x28 + prfm pldl1keep, [x21] + ext v28.16b, v17.16b, v17.16b, #8 + fmla v1.2s, v25.2s, v28.2s + fmla v1.2s, v26.2s, v17.s[3] + ldur q17, [x21, #-16] + add x21, x21, x28 + prfm pldl1keep, [x21] + ext v27.16b, v16.16b, v16.16b, #8 + fmla v0.2s, v25.2s, v27.2s + fmla v0.2s, v26.2s, v16.s[3] + ldur q16, [x21, #-16] + fmla v6.2s, v24.2s, v23.2s + cmp x6, x23 + b.lt .LBB0_104 +.LBB0_105: // in Loop: Header=BB0_102 Depth=1 + ldp x7, x6, [sp, #400] // 16-byte Folded Reload + fmla v7.2s, v24.2s, v22.2s + fmla v4.2s, v24.2s, v20.2s + fmla v5.2s, v24.2s, v21.2s + fmla v2.2s, v24.2s, v19.2s + mov x20, x29 + fmla v3.2s, v24.2s, v18.2s + fmla v1.2s, v24.2s, v17.2s + ldr s26, [x8, x7, lsl #2] + fmla v0.2s, v24.2s, v16.2s + ldr s27, [x8, x6, lsl #2] + ext v24.16b, v23.16b, v23.16b, #8 + ldr x6, [sp, #416] // 8-byte Folded Reload + mov x7, x15 + ldr s25, [x8, x6, lsl #2] + mov x6, x13 + fmla v6.2s, v26.2s, v23.s[1] + fmla v7.2s, v26.2s, v22.s[1] + fmla v4.2s, v26.2s, v20.s[1] + fmla v2.2s, v26.2s, v19.s[1] + fmla v5.2s, v26.2s, v21.s[1] + fmla v3.2s, v26.2s, v18.s[1] + fmla v1.2s, v26.2s, v17.s[1] + fmla v0.2s, v26.2s, v16.s[1] + ext v26.16b, v21.16b, v21.16b, #8 + fmla v6.2s, v27.2s, v24.2s + ext v24.16b, v22.16b, v22.16b, #8 + fmla v5.2s, v27.2s, v26.2s + fmla v7.2s, v27.2s, v24.2s + ext v24.16b, v20.16b, v20.16b, #8 + ext v26.16b, v17.16b, v17.16b, #8 + fmla v1.2s, v27.2s, v26.2s + fmla v4.2s, v27.2s, v24.2s + ext v24.16b, v19.16b, v19.16b, #8 + fmla v6.2s, v25.2s, v23.s[3] + fmla v5.2s, v25.2s, v21.s[3] + fmla v2.2s, v27.2s, v24.2s + fmla v7.2s, v25.2s, v22.s[3] + ext v24.16b, v18.16b, v18.16b, #8 + fmla v1.2s, v25.2s, v17.s[3] + fmla v3.2s, v27.2s, v24.2s + ext v24.16b, v16.16b, v16.16b, #8 + fmla v4.2s, v25.2s, v20.s[3] + fmla v0.2s, v27.2s, v24.2s + fmla v2.2s, v25.2s, v19.s[3] + fmla v3.2s, v25.2s, v18.s[3] + fmla v0.2s, v25.2s, v16.s[3] + cmp x29, x19 + b.ge .LBB0_101 + .p2align 2 +.LBB0_106: // Parent Loop BB0_102 Depth=1 + // => This Inner Loop Header: Depth=2 + add x21, x6, x28 + prfm pldl1keep, [x6] + ldur s16, [x6, #-4] + add x20, x20, #1 + prfm pldl1keep, [x21] + ldur s17, [x21, #-4] + add x21, x21, x28 + add x6, x6, #4 + prfm pldl1keep, [x21] + ldur s18, [x21, #-4] + add x21, x21, x28 + prfm pldl1keep, [x21] + ldur s19, [x21, #-4] + add x21, x21, x28 + prfm pldl1keep, [x21] + ldur s20, [x21, #-4] + add x21, x21, x28 + prfm pldl1keep, [x21] + ldur s21, [x21, #-4] + add x21, x21, x28 + prfm pldl1keep, [x21] + ldur s22, [x21, #-4] + add x21, x21, x28 + prfm pldl1keep, [x21] + ldur s23, [x21, #-4] + prfm pldl1keep, [x7] + ldur s24, [x7, #-4] + add x7, x7, #4 + fmla v6.2s, v24.2s, v16.2s + fmla v7.2s, v24.2s, v17.2s + fmla v5.2s, v24.2s, v18.2s + fmla v4.2s, v24.2s, v19.2s + fmla v2.2s, v24.2s, v20.2s + fmla v3.2s, v24.2s, v21.2s + fmla v1.2s, v24.2s, v22.2s + fmla v0.2s, v24.2s, v23.2s + cmp x20, 
x19 + b.lt .LBB0_106 + b .LBB0_101 +.LBB0_107: + ldr x13, [sp, #336] // 8-byte Folded Reload + cmp x7, x13 + b.lt .LBB0_110 +// %bb.108: + ldr x11, [sp, #312] // 8-byte Folded Reload + cmp x13, x11 + b.lt .LBB0_115 +.LBB0_109: + ldr x11, [sp, #288] // 8-byte Folded Reload + ldr x12, [sp, #312] // 8-byte Folded Reload + cmp x12, x11 + b.lt .LBB0_120 + b .LBB0_125 +.LBB0_110: + add x18, x7, #1 + add x1, x7, #2 + add x2, x7, #3 + mul x14, x7, x27 + madd x13, x18, x27, x25 + mov x16, xzr + add x14, x14, x25 + mul x18, x18, x22 + mul x15, x7, x22 + madd x12, x1, x27, x25 + lsl x17, x15, #2 + lsl x18, x18, #2 + madd x11, x2, x27, x25 + ldr s2, [x24, x14, lsl #2] + ldr s0, [x24, x11, lsl #2] + ldr s1, [x24, x12, lsl #2] + ldr s3, [x24, x13, lsl #2] + ldr q6, [x9, x17] + ldr q7, [x9, x18] + mul x18, x1, x22 + mov x17, x8 + ldr s16, [x17], #12 + lsl x18, x18, #2 + ldr q5, [x9, x18] + mul x18, x2, x22 + lsl x18, x18, #2 + ldp x2, x1, [sp, #320] // 16-byte Folded Reload + ldr q4, [x9, x18] + ldr x18, [sp, #32] // 8-byte Folded Reload + lsl x18, x18, #5 + add x18, x18, x2, lsl #2 + add x18, x18, x1 + add x18, x18, #32 + ext v20.16b, v6.16b, v6.16b, #8 + cmp xzr, x23 + ext v19.16b, v7.16b, v7.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.ge .LBB0_112 + .p2align 2 +.LBB0_111: // =>This Inner Loop Header: Depth=1 + add x1, x17, #8 + fmla v2.2s, v16.2s, v6.2s + fmla v3.2s, v16.2s, v7.2s + add x16, x16, #4 + fmla v1.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + prfm pldl1keep, [x1] + add x1, x18, x28 + ldp s16, s21, [x17, #-8] + fmla v0.2s, v16.2s, v4.s[1] + fmla v2.2s, v16.2s, v6.s[1] + fmla v3.2s, v16.2s, v7.s[1] + fmla v1.2s, v16.2s, v5.s[1] + fmla v0.2s, v21.2s, v17.2s + fmla v2.2s, v21.2s, v20.2s + ldp s17, s16, [x17], #16 + fmla v3.2s, v21.2s, v19.2s + fmla v1.2s, v21.2s, v18.2s + prfm pldl1keep, [x18] + fmla v2.2s, v17.2s, v6.s[3] + ldur q6, [x18, #-16] + prfm pldl1keep, [x1] + fmla v3.2s, v17.2s, v7.s[3] + ldur q7, [x1, #-16] + add x1, x1, x28 + fmla v1.2s, v17.2s, v5.s[3] + fmla v0.2s, v17.2s, v4.s[3] + add x18, x18, #16 + prfm pldl1keep, [x1] + ldur q5, [x1, #-16] + add x1, x1, x28 + prfm pldl1keep, [x1] + ldur q4, [x1, #-16] + ext v20.16b, v6.16b, v6.16b, #8 + cmp x16, x23 + ext v19.16b, v7.16b, v7.16b, #8 + ext v18.16b, v5.16b, v5.16b, #8 + ext v17.16b, v4.16b, v4.16b, #8 + b.lt .LBB0_111 +.LBB0_112: + ldp x17, x16, [sp, #400] // 16-byte Folded Reload + fmla v2.2s, v16.2s, v6.2s + fmla v3.2s, v16.2s, v7.2s + fmla v1.2s, v16.2s, v5.2s + fmla v0.2s, v16.2s, v4.2s + add x15, x19, x15 + ldr s21, [x8, x17, lsl #2] + ldr s16, [x8, x16, lsl #2] + ldr x16, [sp, #416] // 8-byte Folded Reload + ldr x17, [sp, #344] // 8-byte Folded Reload + ldr s22, [x8, x16, lsl #2] + sub x16, x15, x30 + add x15, x10, #4 + add x16, x17, x16, lsl #2 + ldr x17, [sp, #328] // 8-byte Folded Reload + fmla v2.2s, v21.2s, v6.s[1] + fmla v3.2s, v21.2s, v7.s[1] + fmla v1.2s, v21.2s, v5.s[1] + fmla v0.2s, v21.2s, v4.s[1] + add x16, x16, x17 + mov x17, x29 + fmla v2.2s, v16.2s, v20.2s + fmla v3.2s, v16.2s, v19.2s + fmla v1.2s, v16.2s, v18.2s + fmla v0.2s, v16.2s, v17.2s + add x16, x16, #4 + fmla v2.2s, v22.2s, v6.s[3] + fmla v3.2s, v22.2s, v7.s[3] + fmla v1.2s, v22.2s, v5.s[3] + fmla v0.2s, v22.2s, v4.s[3] + cmp x29, x19 + b.ge .LBB0_114 + .p2align 2 +.LBB0_113: // =>This Inner Loop Header: Depth=1 + add x18, x16, x28 + prfm pldl1keep, [x16] + ldur s4, [x16, #-4] + add x17, x17, #1 + prfm pldl1keep, [x18] + ldur s5, [x18, #-4] + add x18, x18, x28 + add x16, x16, #4 + prfm pldl1keep, [x18] + ldur 
s6, [x18, #-4] + add x18, x18, x28 + prfm pldl1keep, [x18] + ldur s7, [x18, #-4] + prfm pldl1keep, [x15] + ldur s16, [x15, #-4] + add x15, x15, #4 + fmla v2.2s, v16.2s, v4.2s + fmla v3.2s, v16.2s, v5.2s + fmla v1.2s, v16.2s, v6.2s + fmla v0.2s, v16.2s, v7.2s + cmp x17, x19 + b.lt .LBB0_113 +.LBB0_114: + str s2, [x24, x14, lsl #2] + str s3, [x24, x13, lsl #2] + ldr x13, [sp, #336] // 8-byte Folded Reload + str s1, [x24, x12, lsl #2] + str s0, [x24, x11, lsl #2] + ldr x11, [sp, #312] // 8-byte Folded Reload + cmp x13, x11 + b.ge .LBB0_109 +.LBB0_115: + mul x11, x13, x27 + add x14, x13, #1 + ldr x1, [sp, #328] // 8-byte Folded Reload + ldr x2, [sp, #344] // 8-byte Folded Reload + mul x13, x13, x22 + ldr s4, [x8] + mov x15, xzr + mov x16, xzr + madd x12, x14, x27, x25 + lsl x17, x13, #2 + mul x14, x14, x22 + add x18, x1, x2 + add x11, x11, x25 + ldr q3, [x9, x17] + lsl x17, x14, #2 + ldr s0, [x24, x11, lsl #2] + ldr s1, [x24, x12, lsl #2] + ldr q2, [x9, x17] + add x17, x18, x17 + ldr x18, [sp, #40] // 8-byte Folded Reload + add x18, x2, x18, lsl #4 + add x18, x1, x18 + ext v6.16b, v3.16b, v3.16b, #8 + cmp xzr, x23 + ext v5.16b, v2.16b, v2.16b, #8 + b.ge .LBB0_117 + .p2align 2 +.LBB0_116: // =>This Inner Loop Header: Depth=1 + add x5, x8, x15 + fmla v0.2s, v4.2s, v3.2s + fmla v1.2s, v4.2s, v2.2s + add x1, x17, x15 + add x6, x5, #20 + add x3, x18, x15 + add x2, x1, #32 + add x4, x3, #32 + prfm pldl1keep, [x6] + ldp s4, s7, [x5, #4] + add x16, x16, #4 + add x15, x15, #16 + fmla v1.2s, v4.2s, v2.s[1] + fmla v0.2s, v4.2s, v3.s[1] + fmla v1.2s, v7.2s, v5.2s + ldp s5, s4, [x5, #12] + fmla v0.2s, v7.2s, v6.2s + prfm pldl1keep, [x4] + fmla v0.2s, v5.2s, v3.s[3] + ldr q3, [x3, #16] + prfm pldl1keep, [x2] + fmla v1.2s, v5.2s, v2.s[3] + ldr q2, [x1, #16] + ext v6.16b, v3.16b, v3.16b, #8 + cmp x16, x23 + ext v5.16b, v2.16b, v2.16b, #8 + b.lt .LBB0_116 +.LBB0_117: + ldp x17, x16, [sp, #400] // 16-byte Folded Reload + fmla v0.2s, v4.2s, v3.2s + fmla v1.2s, v4.2s, v2.2s + add x13, x19, x13 + mov x15, xzr + ldr s7, [x8, x17, lsl #2] + ldr s4, [x8, x16, lsl #2] + ldr x16, [sp, #416] // 8-byte Folded Reload + ldr x17, [sp, #328] // 8-byte Folded Reload + fmla v0.2s, v7.2s, v3.s[1] + fmla v1.2s, v7.2s, v2.s[1] + ldr s7, [x8, x16, lsl #2] + sub x16, x13, x30 + add x13, x19, x14 + ldr x14, [sp, #344] // 8-byte Folded Reload + sub x13, x13, x30 + fmla v0.2s, v4.2s, v6.2s + fmla v1.2s, v4.2s, v5.2s + add x13, x14, x13, lsl #2 + add x14, x14, x16, lsl #2 + mov x16, x29 + add x13, x17, x13 + add x14, x17, x14 + fmla v0.2s, v7.2s, v3.s[3] + fmla v1.2s, v7.2s, v2.s[3] + cmp x29, x19 + b.ge .LBB0_119 + .p2align 2 +.LBB0_118: // =>This Inner Loop Header: Depth=1 + add x17, x10, x15 + add x18, x13, x15 + add x1, x14, x15 + add x16, x16, #1 + add x17, x17, #4 + add x18, x18, #4 + add x1, x1, #4 + prfm pldl1keep, [x1] + prfm pldl1keep, [x18] + ldr s2, [x14, x15] + prfm pldl1keep, [x17] + ldr s3, [x10, x15] + fmla v0.2s, v3.2s, v2.2s + ldr s2, [x13, x15] + add x15, x15, #4 + fmla v1.2s, v3.2s, v2.2s + cmp x16, x19 + b.lt .LBB0_118 +.LBB0_119: + str s0, [x24, x11, lsl #2] + str s1, [x24, x12, lsl #2] + ldr x11, [sp, #288] // 8-byte Folded Reload + ldr x12, [sp, #312] // 8-byte Folded Reload + cmp x12, x11 + b.ge .LBB0_125 +.LBB0_120: + mul x11, x12, x27 + mov x14, x8 + mov x13, xzr + add x11, x11, x25 + mul x12, x12, x22 + lsl x15, x12, #2 + ldr s2, [x14], #12 + ldr q1, [x9, x15] + ldr x9, [sp, #48] // 8-byte Folded Reload + ldr x15, [sp, #320] // 8-byte Folded Reload + ldr s0, [x24, x11, lsl #2] + lsl x9, x9, #3 + add x9, x9, 
x15, lsl #2 + ldr x15, [sp, #328] // 8-byte Folded Reload + add x9, x9, x15 + add x9, x9, #32 + ext v3.16b, v1.16b, v1.16b, #8 + cmp xzr, x23 + b.ge .LBB0_122 + .p2align 2 +.LBB0_121: // =>This Inner Loop Header: Depth=1 + add x15, x14, #8 + fmla v0.2s, v2.2s, v1.2s + add x13, x13, #4 + prfm pldl1keep, [x15] + ldp s2, s4, [x14, #-8] + fmla v0.2s, v2.2s, v1.s[1] + fmla v0.2s, v4.2s, v3.2s + ldp s3, s2, [x14], #16 + prfm pldl1keep, [x9] + fmla v0.2s, v3.2s, v1.s[3] + ldur q1, [x9, #-16] + add x9, x9, #16 + ext v3.16b, v1.16b, v1.16b, #8 + cmp x13, x23 + b.lt .LBB0_121 +.LBB0_122: + ldp x14, x13, [sp, #400] // 16-byte Folded Reload + fmla v0.2s, v2.2s, v1.2s + mov x9, xzr + ldr s4, [x8, x14, lsl #2] + ldr s2, [x8, x13, lsl #2] + ldr x13, [sp, #416] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[1] + ldr s4, [x8, x13, lsl #2] + add x8, x19, x12 + ldr x12, [sp, #344] // 8-byte Folded Reload + sub x8, x8, x30 + fmla v0.2s, v2.2s, v3.2s + add x8, x12, x8, lsl #2 + ldr x12, [sp, #328] // 8-byte Folded Reload + fmla v0.2s, v4.2s, v1.s[3] + add x8, x12, x8 + cmp x29, x19 + b.ge .LBB0_124 + .p2align 2 +.LBB0_123: // =>This Inner Loop Header: Depth=1 + add x12, x10, x9 + add x13, x8, x9 + add x29, x29, #1 + add x12, x12, #4 + add x13, x13, #4 + prfm pldl1keep, [x13] + prfm pldl1keep, [x12] + ldr s1, [x10, x9] + ldr s2, [x8, x9] + add x9, x9, #4 + fmla v0.2s, v1.2s, v2.2s + cmp x29, x19 + b.lt .LBB0_123 +.LBB0_124: + str s0, [x24, x11, lsl #2] +.LBB0_125: + bl free +.LBB0_126: + add sp, sp, #512 + ldp d9, d8, [sp, #32] // 16-byte Folded Reload + ldp d11, d10, [sp, #16] // 16-byte Folded Reload + ldp x20, x19, [sp, #128] // 16-byte Folded Reload + ldp x22, x21, [sp, #112] // 16-byte Folded Reload + ldp x24, x23, [sp, #96] // 16-byte Folded Reload + ldp x26, x25, [sp, #80] // 16-byte Folded Reload + ldp x28, x27, [sp, #64] // 16-byte Folded Reload + ldp x29, x30, [sp, #48] // 16-byte Folded Reload + ldr d12, [sp], #144 // 8-byte Folded Reload + ret +.Lfunc_end0: + .size sgemm_nn_alpha1_beta1_mlir, .Lfunc_end0-sgemm_nn_alpha1_beta1_mlir + .cfi_endproc + // -- End function + .section ".note.GNU-stack","",@progbits diff --git a/third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemv_n_alpha1_beta1_mlir.s b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemv_n_alpha1_beta1_mlir.s new file mode 100644 index 00000000000000..ffd32ba76066c8 --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/kernels/sgemv_n_alpha1_beta1_mlir.s @@ -0,0 +1,709 @@ + .text + .file "LLVMDialectModule" + .globl sgemv_n_alpha1_beta1_mlir // -- Begin function sgemv_n_alpha1_beta1_mlir + .p2align 4 + .type sgemv_n_alpha1_beta1_mlir,@function +sgemv_n_alpha1_beta1_mlir: // @sgemv_n_alpha1_beta1_mlir + .cfi_startproc +// %bb.0: + sub sp, sp, #112 + stp x29, x30, [sp, #16] // 16-byte Folded Spill + stp x28, x27, [sp, #32] // 16-byte Folded Spill + stp x26, x25, [sp, #48] // 16-byte Folded Spill + stp x24, x23, [sp, #64] // 16-byte Folded Spill + stp x22, x21, [sp, #80] // 16-byte Folded Spill + stp x20, x19, [sp, #96] // 16-byte Folded Spill + .cfi_def_cfa_offset 112 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + cmp x4, #0 + ldr x9, [sp, #112] + lsl x7, x5, #2 + mov x27, xzr + cinv x8, x4, lt + add x0, x1, #448 + lsl x2, x5, #4 + add x10, x8, x8, lsr #63 + add x11, x8, #3 + add x12, 
x8, #7 + add x19, x7, #448 + asr x10, x10, #1 + mov x15, x9 + cinv x10, x10, lt + cmp x8, #0 + csel x11, x11, x8, lt + csel x8, x12, x8, lt + cmp x4, #0 + asr x11, x11, #2 + asr x8, x8, #3 + cinv x12, x11, lt + cinv x14, x8, lt + cmp x3, #0 + lsl x11, x10, #3 + cinv x6, x3, lt + lsl x13, x12, #4 + lsl x16, x14, #5 + add x21, x9, x11 + lsl x12, x12, #2 + lsl x14, x14, #3 + add x8, x6, #3 + cmp x6, #0 + csel x8, x8, x6, lt + cmp x3, #0 + asr x8, x8, #2 + cinv x8, x8, lt + stp x8, x9, [sp] // 16-byte Folded Spill + lsl x17, x8, #2 + add x8, x11, x1 + add x20, x8, #72 + lsl x8, x10, #1 + add x10, x13, #128 + add x22, x1, x10 + add x23, x9, x10 + add x10, x16, #256 + add x24, x1, x10 + add x25, x9, x10 + ldr x10, [sp, #152] + b .LBB0_2 + .p2align 2 +.LBB0_1: // in Loop: Header=BB0_2 Depth=1 + mov s5, v0.s[2] + fadd s2, s2, s0 + add x0, x0, x2 + add x20, x20, x2 + add x24, x24, x2 + add x22, x22, x2 + fadd s3, s3, s5 + mov s5, v0.s[1] + mov s0, v0.s[3] + fadd s4, s4, s5 + fadd s0, s1, s0 + mov v2.s[1], v4.s[0] + mov v2.s[2], v3.s[0] + mov v2.s[3], v0.s[0] + str q2, [x26] +.LBB0_2: // =>This Loop Header: Depth=1 + // Child Loop BB0_4 Depth 2 + // Child Loop BB0_6 Depth 2 + // Child Loop BB0_8 Depth 2 + // Child Loop BB0_10 Depth 2 + cmp x27, x17 + b.ge .LBB0_11 +// %bb.3: // in Loop: Header=BB0_2 Depth=1 + add x26, x10, x27, lsl #2 + movi v4.2d, #0000000000000000 + movi v3.2d, #0000000000000000 + movi v5.2d, #0000000000000000 + movi v0.2d, #0000000000000000 + mov x28, x0 + mov x29, xzr + mov x30, x15 + ldr q1, [x26] + movi v7.2d, #0000000000000000 + movi v16.2d, #0000000000000000 + add x27, x27, #4 + movi v2.2d, #0000000000000000 + movi v6.2d, #0000000000000000 + cmp xzr, x14 + b.ge .LBB0_5 + .p2align 2 +.LBB0_4: // Parent Loop BB0_2 Depth=1 + // => This Inner Loop Header: Depth=2 + sub x18, x28, #448 + prfm pldl1keep, [x28] + add x28, x28, #32 + add x29, x29, #8 + ldp q18, q17, [x18] + add x18, x18, x19 + prfm pldl1keep, [x18] + sub x18, x18, #448 + add x9, x18, x19 + ldp q20, q19, [x18] + prfm pldl1keep, [x9] + sub x9, x9, #448 + add x18, x9, x19 + ldp q22, q21, [x9] + add x9, x30, #448 + prfm pldl1keep, [x18] + ldp q23, q24, [x18, #-448]! 
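+ // NOTE (annotation, not compiler output; assumed reading): four strips of the matrix
+ // are now resident in q17-q24; next, eight elements of the vector operand are loaded
+ // (q26, q25) and eight 4-lane partial dot products are accumulated, to be reduced
+ // horizontally in .LBB0_5.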
+ prfm pldl1keep, [x9] + ldp q26, q25, [x30], #32 + fmla v6.4s, v21.4s, v25.4s + fmla v3.4s, v19.4s, v25.4s + fmla v7.4s, v24.4s, v25.4s + fmla v0.4s, v17.4s, v25.4s + fmla v5.4s, v22.4s, v26.4s + fmla v4.4s, v20.4s, v26.4s + fmla v2.4s, v18.4s, v26.4s + fmla v16.4s, v23.4s, v26.4s + cmp x29, x14 + b.lt .LBB0_4 +.LBB0_5: // in Loop: Header=BB0_2 Depth=1 + mov s17, v1.s[3] + mov s18, v16.s[1] + mov x28, x25 + mov x29, x14 + mov x30, x24 + fadd s17, s17, s16 + fadd s17, s17, s18 + mov s18, v16.s[2] + mov s16, v16.s[3] + fadd s17, s17, s18 + fadd s16, s17, s16 + mov s17, v7.s[1] + fadd s16, s16, s7 + fadd s16, s16, s17 + mov s17, v7.s[2] + mov s7, v7.s[3] + fadd s16, s16, s17 + mov s17, v5.s[1] + fadd s7, s16, s7 + mov s16, v1.s[2] + fadd s16, s16, s5 + fadd s16, s16, s17 + mov s17, v5.s[2] + mov s5, v5.s[3] + fadd s16, s16, s17 + fadd s5, s16, s5 + mov s16, v6.s[1] + fadd s5, s5, s6 + fadd s5, s5, s16 + mov s16, v6.s[2] + mov s6, v6.s[3] + fadd s5, s5, s16 + mov s16, v4.s[1] + fadd s5, s5, s6 + mov s6, v1.s[1] + fadd s1, s1, s2 + fadd s6, s6, s4 + fadd s6, s6, s16 + mov s16, v4.s[2] + mov s4, v4.s[3] + fadd s6, s6, s16 + fadd s4, s6, s4 + mov s6, v3.s[1] + fadd s4, s4, s3 + fadd s4, s4, s6 + mov s6, v3.s[2] + mov s3, v3.s[3] + fadd s4, s4, s6 + fadd s3, s4, s3 + mov s4, v2.s[1] + fadd s1, s1, s4 + mov s4, v2.s[2] + mov s2, v2.s[3] + fadd s1, s1, s4 + movi v4.2d, #0000000000000000 + fadd s1, s1, s2 + mov s2, v0.s[1] + fadd s1, s1, s0 + fadd s1, s1, s2 + mov s2, v0.s[2] + mov s0, v0.s[3] + fadd s1, s1, s2 + movi v2.2d, #0000000000000000 + fadd s0, s1, s0 + movi v1.2d, #0000000000000000 + mov v0.s[1], v3.s[0] + movi v3.2d, #0000000000000000 + mov v0.s[2], v5.s[0] + mov v0.s[3], v7.s[0] + str q0, [x26] + cmp x14, x12 + b.ge .LBB0_7 + .p2align 2 +.LBB0_6: // Parent Loop BB0_2 Depth=1 + // => This Inner Loop Header: Depth=2 + add x9, x30, x7 + prfm pldl1keep, [x30] + ldur q5, [x30, #-256] + add x30, x30, #16 + add x18, x9, x7 + prfm pldl1keep, [x9] + ldur q6, [x9, #-256] + add x29, x29, #4 + add x9, x18, x7 + prfm pldl1keep, [x18] + ldur q7, [x18, #-256] + prfm pldl1keep, [x9] + ldur q16, [x9, #-256] + prfm pldl1keep, [x28] + ldur q17, [x28, #-256] + add x28, x28, #16 + fmla v3.4s, v7.4s, v17.4s + fmla v1.4s, v16.4s, v17.4s + fmla v4.4s, v6.4s, v17.4s + fmla v2.4s, v5.4s, v17.4s + cmp x29, x12 + b.lt .LBB0_6 +.LBB0_7: // in Loop: Header=BB0_2 Depth=1 + mov s5, v0.s[1] + mov s6, v4.s[1] + mov x28, x23 + mov x29, x22 + mov s7, v1.s[1] + mov x30, x12 + fadd s5, s5, s4 + fadd s5, s5, s6 + mov s6, v4.s[2] + mov s4, v4.s[3] + fadd s5, s5, s6 + fadd s6, s0, s2 + fadd s4, s5, s4 + mov s5, v2.s[1] + fadd s5, s6, s5 + mov s6, v2.s[2] + mov s2, v2.s[3] + fadd s5, s5, s6 + movi d6, #0000000000000000 + fadd s2, s5, s2 + mov s5, v3.s[1] + mov v2.s[1], v4.s[0] + mov s4, v0.s[2] + mov s0, v0.s[3] + fadd s4, s4, s3 + fadd s0, s0, s1 + fadd s4, s4, s5 + mov s5, v3.s[2] + fadd s0, s0, s7 + mov s7, v1.s[2] + mov s3, v3.s[3] + mov s1, v1.s[3] + fadd s4, s4, s5 + fadd s0, s0, s7 + movi d5, #0000000000000000 + fadd s3, s4, s3 + fadd s0, s0, s1 + movi d4, #0000000000000000 + mov v2.s[2], v3.s[0] + movi d3, #0000000000000000 + mov v2.s[3], v0.s[0] + str q2, [x26] + cmp x12, x8 + b.ge .LBB0_9 + .p2align 2 +.LBB0_8: // Parent Loop BB0_2 Depth=1 + // => This Inner Loop Header: Depth=2 + add x9, x29, x7 + prfm pldl1keep, [x29] + ldur d0, [x29, #-128] + add x29, x29, #8 + add x18, x9, x7 + prfm pldl1keep, [x9] + ldur d1, [x9, #-128] + add x30, x30, #2 + add x9, x18, x7 + prfm pldl1keep, [x18] + ldur d7, [x18, #-128] + prfm 
pldl1keep, [x9] + ldur d16, [x9, #-128] + prfm pldl1keep, [x28] + ldur d17, [x28, #-128] + add x28, x28, #8 + fmla v5.2s, v7.2s, v17.2s + fmla v6.2s, v16.2s, v17.2s + fmla v4.2s, v1.2s, v17.2s + fmla v3.2s, v0.2s, v17.2s + cmp x30, x8 + b.lt .LBB0_8 +.LBB0_9: // in Loop: Header=BB0_2 Depth=1 + mov s0, v2.s[3] + mov s1, v6.s[1] + mov x28, x21 + mov x29, x20 + mov x30, x8 + fadd s0, s0, s6 + mov s6, v2.s[2] + fadd s6, s6, s5 + fadd s1, s0, s1 + mov s0, v2.s[1] + fadd s2, s2, s3 + fadd s0, s0, s4 + mov s4, v4.s[1] + fadd s4, s0, s4 + mov s0, v5.s[1] + fadd s5, s6, s0 + mov s0, v3.s[1] + movi d3, #0000000000000000 + fadd s0, s2, s0 + movi d2, #0000000000000000 + mov v0.s[1], v4.s[0] + movi d4, #0000000000000000 + mov v0.s[2], v5.s[0] + mov v0.s[3], v1.s[0] + movi d1, #0000000000000000 + str q0, [x26] + cmp x8, x4 + b.ge .LBB0_1 + .p2align 2 +.LBB0_10: // Parent Loop BB0_2 Depth=1 + // => This Inner Loop Header: Depth=2 + add x9, x29, x7 + prfm pldl1keep, [x29] + ldur s5, [x29, #-72] + add x29, x29, #4 + add x18, x9, x7 + prfm pldl1keep, [x9] + ldur s6, [x9, #-72] + add x30, x30, #1 + prfm pldl1keep, [x18] + add x9, x18, x7 + ldur s7, [x18, #-72] + add x18, x28, #72 + prfm pldl1keep, [x9] + ldur s16, [x9, #-72] + prfm pldl1keep, [x18] + ldr s17, [x28], #4 + fmul s7, s7, s17 + fmul s6, s6, s17 + fmul s5, s5, s17 + fadd v3.2s, v3.2s, v7.2s + fmul s7, s16, s17 + fadd v4.2s, v4.2s, v6.2s + fadd v2.2s, v2.2s, v5.2s + fadd v1.2s, v1.2s, v7.2s + cmp x30, x4 + b.lt .LBB0_10 + b .LBB0_1 +.LBB0_11: + add x9, x6, x6, lsr #63 + cmp x3, #0 + asr x9, x9, #1 + cinv x2, x9, lt + lsl x0, x2, #1 + cmp x17, x0 + b.ge .LBB0_21 +// %bb.12: + ldr x9, [sp] // 8-byte Folded Reload + movi v4.2d, #0000000000000000 + movi v2.2d, #0000000000000000 + mov x7, x15 + movi v3.2d, #0000000000000000 + movi v0.2d, #0000000000000000 + mov x19, xzr + mul x9, x9, x5 + add x18, x1, x9, lsl #4 + add x9, x17, #1 + add x17, x10, x17, lsl #2 + mul x9, x5, x9 + ldr d1, [x17] + mov x20, x18 + add x6, x1, x9, lsl #2 + mov x21, x6 + cmp xzr, x14 + b.ge .LBB0_14 + .p2align 2 +.LBB0_13: // =>This Inner Loop Header: Depth=1 + add x9, x20, #736 + add x19, x19, #8 + prfm pldl1keep, [x9] + add x9, x21, #736 + ldp q6, q5, [x20], #32 + prfm pldl1keep, [x9] + add x9, x7, #736 + ldp q16, q7, [x21], #32 + prfm pldl1keep, [x9] + ldr q17, [x7, #16] + fmla v0.4s, v5.4s, v17.4s + fmla v3.4s, v7.4s, v17.4s + ldr q5, [x7], #32 + fmla v4.4s, v16.4s, v5.4s + fmla v2.4s, v6.4s, v5.4s + cmp x19, x14 + b.lt .LBB0_13 +.LBB0_14: + mov s5, v1.s[1] + mov s6, v4.s[1] + mov x7, x18 + mov x19, x6 + fadd s1, s1, s2 + mov x20, x14 + mov x21, x15 + fadd s5, s5, s4 + fadd s5, s5, s6 + mov s6, v4.s[2] + mov s4, v4.s[3] + fadd s5, s5, s6 + fadd s4, s5, s4 + mov s5, v3.s[1] + fadd s4, s4, s3 + fadd s4, s4, s5 + mov s5, v3.s[2] + mov s3, v3.s[3] + fadd s4, s4, s5 + fadd s3, s4, s3 + mov s4, v2.s[1] + fadd s1, s1, s4 + mov s4, v2.s[2] + mov s2, v2.s[3] + fadd s1, s1, s4 + fadd s1, s1, s2 + mov s2, v0.s[1] + fadd s1, s1, s0 + fadd s1, s1, s2 + mov s2, v0.s[2] + mov s0, v0.s[3] + fadd s1, s1, s2 + movi v2.2d, #0000000000000000 + fadd s0, s1, s0 + movi v1.2d, #0000000000000000 + mov v0.s[1], v3.s[0] + str d0, [x17] + cmp x14, x12 + b.ge .LBB0_16 + .p2align 2 +.LBB0_15: // =>This Inner Loop Header: Depth=1 + add x9, x7, x16 + add x20, x20, #4 + add x9, x9, #432 + prfm pldl1keep, [x9] + add x9, x19, x16 + ldr q3, [x7, x16] + add x7, x7, #16 + add x9, x9, #432 + prfm pldl1keep, [x9] + add x9, x21, x16 + add x9, x9, #432 + ldr q4, [x19, x16] + add x19, x19, #16 + prfm pldl1keep, [x9] + 
ldr q5, [x21, x16] + add x21, x21, #16 + fmla v2.4s, v4.4s, v5.4s + fmla v1.4s, v3.4s, v5.4s + cmp x20, x12 + b.lt .LBB0_15 +.LBB0_16: + mov s3, v0.s[1] + mov s4, v2.s[1] + mov x7, x18 + mov x19, x6 + fadd s0, s0, s1 + mov x20, x15 + mov x21, x12 + fadd s3, s3, s2 + fadd s3, s3, s4 + mov s4, v2.s[2] + mov s2, v2.s[3] + fadd s3, s3, s4 + fadd s2, s3, s2 + mov s3, v1.s[1] + fadd s0, s0, s3 + mov s3, v1.s[2] + mov s1, v1.s[3] + fadd s0, s0, s3 + fadd s0, s0, s1 + movi d1, #0000000000000000 + mov v0.s[1], v2.s[0] + movi d2, #0000000000000000 + str d0, [x17] + cmp x12, x8 + b.ge .LBB0_18 + .p2align 2 +.LBB0_17: // =>This Inner Loop Header: Depth=1 + add x9, x7, x13 + add x21, x21, #2 + add x9, x9, #216 + prfm pldl1keep, [x9] + add x9, x19, x13 + ldr d3, [x7, x13] + add x7, x7, #8 + add x9, x9, #216 + prfm pldl1keep, [x9] + add x9, x20, x13 + add x9, x9, #216 + ldr d4, [x19, x13] + add x19, x19, #8 + prfm pldl1keep, [x9] + ldr d5, [x20, x13] + add x20, x20, #8 + fmla v2.2s, v4.2s, v5.2s + fmla v1.2s, v3.2s, v5.2s + cmp x21, x8 + b.lt .LBB0_17 +.LBB0_18: + mov s3, v0.s[1] + fadd s0, s0, s1 + mov x7, x15 + mov x19, x8 + fadd s3, s3, s2 + mov s2, v2.s[1] + fadd s2, s3, s2 + mov s3, v1.s[1] + movi d1, #0000000000000000 + fadd s0, s0, s3 + mov v0.s[1], v2.s[0] + movi d2, #0000000000000000 + str d0, [x17] + cmp x8, x4 + b.ge .LBB0_20 + .p2align 2 +.LBB0_19: // =>This Inner Loop Header: Depth=1 + add x9, x18, x11 + add x19, x19, #1 + add x9, x9, #128 + prfm pldl1keep, [x9] + add x9, x6, x11 + ldr s3, [x18, x11] + add x18, x18, #4 + add x9, x9, #128 + prfm pldl1keep, [x9] + add x9, x7, x11 + add x9, x9, #128 + ldr s4, [x6, x11] + add x6, x6, #4 + prfm pldl1keep, [x9] + ldr s5, [x7, x11] + add x7, x7, #4 + fmul s4, s4, s5 + fmul s3, s3, s5 + fadd v1.2s, v1.2s, v4.2s + fadd v2.2s, v2.2s, v3.2s + cmp x19, x4 + b.lt .LBB0_19 +.LBB0_20: + mov s3, v0.s[1] + fadd s0, s2, s0 + fadd s1, s1, s3 + mov v0.s[1], v1.s[0] + str d0, [x17] +.LBB0_21: + cmp x0, x3 + b.ge .LBB0_31 +// %bb.22: + mul x17, x2, x5 + ldr s2, [x10, x0, lsl #2] + movi v0.2d, #0000000000000000 + movi v1.2d, #0000000000000000 + mov x18, xzr + add x2, x1, x17, lsl #3 + cmp xzr, x14 + b.ge .LBB0_24 + .p2align 2 +.LBB0_23: // =>This Inner Loop Header: Depth=1 + add x9, x2, #1152 + add x18, x18, #8 + prfm pldl1keep, [x9] + add x9, x15, #1152 + ldp q3, q4, [x2], #32 + prfm pldl1keep, [x9] + ldr q5, [x15] + fmla v1.4s, v3.4s, v5.4s + ldr q3, [x15, #16] + add x15, x15, #32 + fmla v0.4s, v4.4s, v3.4s + cmp x18, x14 + b.lt .LBB0_23 +.LBB0_24: + fadd s2, s2, s1 + mov s3, v1.s[1] + ldr x18, [sp, #8] // 8-byte Folded Reload + add x9, x16, x17, lsl #3 + add x15, x1, x9 + add x16, x18, x16 + fadd s2, s2, s3 + mov s3, v1.s[2] + mov s1, v1.s[3] + fadd s2, s2, s3 + fadd s1, s2, s1 + mov s2, v0.s[1] + fadd s1, s1, s0 + fadd s1, s1, s2 + mov s2, v0.s[2] + mov s0, v0.s[3] + fadd s1, s1, s2 + fadd s0, s1, s0 + movi v1.2d, #0000000000000000 + str s0, [x10, x0, lsl #2] + cmp x14, x12 + b.ge .LBB0_26 + .p2align 2 +.LBB0_25: // =>This Inner Loop Header: Depth=1 + add x9, x15, #672 + add x14, x14, #4 + prfm pldl1keep, [x9] + add x9, x16, #672 + ldr q2, [x15], #16 + prfm pldl1keep, [x9] + ldr q3, [x16], #16 + fmla v1.4s, v2.4s, v3.4s + cmp x14, x12 + b.lt .LBB0_25 +.LBB0_26: + fadd s0, s0, s1 + mov s2, v1.s[1] + add x9, x13, x17, lsl #3 + add x13, x18, x13 + add x14, x1, x9 + fadd s0, s0, s2 + mov s2, v1.s[2] + mov s1, v1.s[3] + fadd s0, s0, s2 + fadd s0, s0, s1 + movi d1, #0000000000000000 + str s0, [x10, x0, lsl #2] + cmp x12, x8 + b.ge .LBB0_28 + .p2align 2 +.LBB0_27: // 
=>This Inner Loop Header: Depth=1 + add x9, x14, #336 + add x12, x12, #2 + prfm pldl1keep, [x9] + add x9, x13, #336 + ldr d2, [x14], #8 + prfm pldl1keep, [x9] + ldr d3, [x13], #8 + fmla v1.2s, v2.2s, v3.2s + cmp x12, x8 + b.lt .LBB0_27 +.LBB0_28: + fadd s0, s0, s1 + mov s2, v1.s[1] + add x9, x11, x17, lsl #3 + add x12, x1, x9 + movi d1, #0000000000000000 + add x9, x18, x11 + fadd s0, s0, s2 + str s0, [x10, x0, lsl #2] + cmp x8, x4 + b.ge .LBB0_30 + .p2align 2 +.LBB0_29: // =>This Inner Loop Header: Depth=1 + add x11, x12, #200 + add x8, x8, #1 + prfm pldl1keep, [x11] + add x11, x9, #200 + ldr s2, [x12], #4 + prfm pldl1keep, [x11] + ldr s3, [x9], #4 + fmul s2, s2, s3 + fadd v1.2s, v1.2s, v2.2s + cmp x8, x4 + b.lt .LBB0_29 +.LBB0_30: + fadd s0, s1, s0 + str s0, [x10, x0, lsl #2] +.LBB0_31: + ldp x20, x19, [sp, #96] // 16-byte Folded Reload + ldp x22, x21, [sp, #80] // 16-byte Folded Reload + ldp x24, x23, [sp, #64] // 16-byte Folded Reload + ldp x26, x25, [sp, #48] // 16-byte Folded Reload + ldp x28, x27, [sp, #32] // 16-byte Folded Reload + ldp x29, x30, [sp, #16] // 16-byte Folded Reload + add sp, sp, #112 + ret +.Lfunc_end0: + .size sgemv_n_alpha1_beta1_mlir, .Lfunc_end0-sgemv_n_alpha1_beta1_mlir + .cfi_endproc + // -- End function + .section ".note.GNU-stack","",@progbits diff --git a/third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_3d.cpp b/third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_3d.cpp new file mode 100644 index 00000000000000..c65157a12444ae --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_3d.cpp @@ -0,0 +1,46 @@ +#include <assert.h> +#include <string.h> + +#include <stdint.h> +#include <blas_mlir.h> // assumed header name: declares the CBLAS enums and BLASINT +#include <memref_args.h> // assumed header name: defines the Memref_*_Args helpers + +extern "C" void sbatch_matmul_3d_nn_mlir( + /* A */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, + /* B */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, + /* C */ float *, float *, int64_t, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t); + +extern "C" void sbatch_matmul_3d_nt_mlir( + /* A */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, + /* B */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, + /* C */ float *, float *, int64_t, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t); + +// C interface +extern "C" void cblas_sbatch_matmul_mlir( + const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const BLASINT BATCH, const BLASINT M, + const BLASINT N, const BLASINT K, const float *A, const BLASINT LDA, + const float *B, const BLASINT LDB, float *C, const BLASINT LDC) { + + // The mini library only provides the NN and NT row-major variants. + assert(Order == CblasRowMajor); + assert(TransA == CblasNoTrans); + + // The batched kernels accumulate into C, so clear it first (beta == 0 semantics). + memset(C, 0, BATCH * M * N * sizeof(float)); + + if (TransB == CblasTrans) { + sbatch_matmul_3d_nt_mlir(/* A */ Memref_3D_Args(A, BATCH, M, K, LDA), + /* B */ Memref_3D_Args(B, BATCH, N, K, LDB), + /* C */ Memref_3D_Args(C, BATCH, M, N, LDC)); + } else { + sbatch_matmul_3d_nn_mlir(/* A */ Memref_3D_Args(A, BATCH, M, K, LDA), + /* B */ Memref_3D_Args(B, BATCH, K, N, LDB), + /* C */ Memref_3D_Args(C, BATCH, M, N, LDC)); + } +} diff --git a/third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_4d.cpp b/third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_4d.cpp new file mode 100644 index 00000000000000..f92e217d3a1693 --- /dev/null +++
b/third_party/xla/xla/service/libs/libblas_mlir/src/sbatch_matmul_4d.cpp @@ -0,0 +1,49 @@ +#include <assert.h> +#include <string.h> + +#include <stdint.h> +#include <blas_mlir.h> // assumed header name: declares the CBLAS enums and BLASINT +#include <memref_args.h> // assumed header name: defines the Memref_*_Args helpers + +extern "C" void sbatch_matmul_4d_nn_mlir( + /* A */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, int64_t, int64_t, + /* B */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, int64_t, int64_t, + /* C */ float *, float *, int64_t, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, int64_t); + +extern "C" void sbatch_matmul_4d_nt_mlir( + /* A */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, int64_t, int64_t, + /* B */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, int64_t, int64_t, + /* C */ float *, float *, int64_t, int64_t, int64_t, int64_t, int64_t, + int64_t, int64_t, int64_t, int64_t); + +// C interface +extern "C" void cblas_sbatch_matmul_4d_mlir( + const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const BLASINT BATCH1, + const BLASINT BATCH2, const BLASINT M, const BLASINT N, const BLASINT K, + const float *A, const BLASINT LDA, const float *B, const BLASINT LDB, + float *C, const BLASINT LDC) { + + // The mini library only provides the NN and NT row-major variants. + assert(Order == CblasRowMajor); + assert(TransA == CblasNoTrans); + + // The batched kernels accumulate into C, so clear it first (beta == 0 semantics). + memset(C, 0, BATCH1 * BATCH2 * M * N * sizeof(float)); + + if (TransB == CblasTrans) { + sbatch_matmul_4d_nt_mlir( + /* A */ Memref_4D_Args(A, BATCH1, BATCH2, M, K, LDA), + /* B */ Memref_4D_Args(B, BATCH1, BATCH2, N, K, LDB), + /* C */ Memref_4D_Args(C, BATCH1, BATCH2, M, N, LDC)); + } else { + sbatch_matmul_4d_nn_mlir( + /* A */ Memref_4D_Args(A, BATCH1, BATCH2, M, K, LDA), + /* B */ Memref_4D_Args(B, BATCH1, BATCH2, K, N, LDB), + /* C */ Memref_4D_Args(C, BATCH1, BATCH2, M, N, LDC)); + } +} diff --git a/third_party/xla/xla/service/libs/libblas_mlir/src/sgemm.cpp b/third_party/xla/xla/service/libs/libblas_mlir/src/sgemm.cpp new file mode 100644 index 00000000000000..b51efca3f51b71 --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/src/sgemm.cpp @@ -0,0 +1,43 @@ +#include <assert.h> +#include <string.h> + +#include <stdint.h> +#include <blas_mlir.h> // assumed header name: declares the CBLAS enums and BLASINT + +#include <memref_args.h> // assumed header name: defines the Memref_*_Args helpers + +extern "C" void sgemm_nn_alpha1_beta1_mlir( + /* alpha */ float, + /* beta */ float, + /* A */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, + /* B */ const float *, const float *, int64_t, int64_t, int64_t, int64_t, + int64_t, + /* C */ float *, float *, int64_t, int64_t, int64_t, int64_t, int64_t); + +// C interface +extern "C" void cblas_sgemm_mlir( + const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_TRANSPOSE TransB, const BLASINT M, const BLASINT N, + const BLASINT K, const float alpha, const float *A, const BLASINT LDA, + const float *B, const BLASINT LDB, const float beta, float *C, + const BLASINT LDC) { + // The mini library only provides the NN variant with alpha == 1 and beta == 1 or 0.
+ assert(Order == CblasRowMajor); + assert(TransA == CblasNoTrans); + assert(TransB == CblasNoTrans); + assert(alpha == 1.0); + assert(beta == 1.0 || beta == 0.0); + + // beta == 0 is handled by clearing C up front, which is faster than a + // separate scaled-C kernel path. + if (beta == 0.0) { + memset(C, 0, M * N * sizeof(float)); + } + + // Call the MLIR-generated kernel; after the optional memset, beta folds to 1. + sgemm_nn_alpha1_beta1_mlir(/* alpha */ 1.0, + /* beta */ 1.0, + /* A */ Memref_2D_Args(A, M, K, LDA), + /* B */ Memref_2D_Args(B, K, N, LDB), + /* C */ Memref_2D_Args(C, M, N, LDC)); +} diff --git a/third_party/xla/xla/service/libs/libblas_mlir/src/sgemv.cpp b/third_party/xla/xla/service/libs/libblas_mlir/src/sgemv.cpp new file mode 100644 index 00000000000000..4ee3441735218a --- /dev/null +++ b/third_party/xla/xla/service/libs/libblas_mlir/src/sgemv.cpp @@ -0,0 +1,43 @@ +#include <assert.h> +#include <stdint.h> +#include <string.h> + +#include <blas_mlir.h> // assumed header name: declares the CBLAS enums and BLASINT +#include <memref_args.h> // assumed header name: defines the Memref_*_Args helpers + +extern "C" void sgemv_n_alpha1_beta1_mlir(/* alpha */ float, + /* beta */ float, + /* A */ const float *, const float *, + int64_t, int64_t, int64_t, int64_t, + int64_t, + /* X */ const float *, const float *, + int64_t, int64_t, int64_t, + /* Y */ float *, float *, int64_t, + int64_t, int64_t); + +// C interface +extern "C" void cblas_sgemv_mlir(const enum CBLAS_ORDER Order, + const enum CBLAS_TRANSPOSE TransA, + const BLASINT M, const BLASINT N, + const float alpha, const float *A, + const BLASINT LDA, const float *X, + const BLASINT INCX, const float beta, float *Y, + const BLASINT INCY) { + // The mini library only provides the non-transposed (N) row-major variant + // with alpha == 1 and beta == 0 or 1. + assert(TransA == CblasNoTrans); + assert(Order == CblasRowMajor); + assert(alpha == 1.0); + assert(beta == 1.0 || beta == 0.0); + + // beta == 0 is handled by clearing Y up front, which is faster than a + // separate scaled-Y kernel path. + if (beta == 0.0) { + memset(Y, 0, M * sizeof(float)); + } + + // Call the MLIR-generated kernel; after the optional memset, beta folds to 1. + sgemv_n_alpha1_beta1_mlir(/* alpha */ 1.0, + /* beta */ 1.0, + /* A */ Memref_2D_Args(A, M, N, LDA), + /* X */ Memref_1D_Args(X, N, INCX), + /* Y */ Memref_1D_Args(Y, M, INCY)); +} diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto index ca8ba0553bd56a..7d2a49a59c2f75 100644 --- a/third_party/xla/xla/xla.proto +++ b/third_party/xla/xla/xla.proto @@ -222,6 +222,9 @@ message DebugOptions { // When true, XLA:CPU uses XNNPACK to execute supported operations. bool xla_cpu_use_xnnpack = 359; + bool xla_cpu_enable_xnnpack = 389; + bool xla_cpu_use_kernel_selector = 390; + // Enabling this will enable optimizations that ignore the possibility of NaN. bool xla_enable_fast_math = 335; @@ -1208,7 +1211,7 @@ message DebugOptions { // Note: when adding a new flag, please add it to one of the hardware-specific // or hardware-agnostic sections at the top of this proto message. - // Next id: 389 + // Next id: 391 // Extra options to pass to the compilation backend (e.g. LLVM); specific // interpretation of these values is left to the backend.
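Reviewer note: to exercise the new CBLAS-style wrappers outside of XLA, a standalone driver like the sketch below could be compiled and linked against libblas_mlir. It is not part of the patch: it assumes BLASINT is a plain integer typedef and that the CBLAS enums and wrapper prototypes come from a library header, hypothetically named <blas_mlir.h> here to match the assumed include above. Inside XLA, these kernels would instead be reached through the new flags, e.g. XLA_FLAGS=--xla_cpu_use_kernel_selector=true.

// smoke_test_sgemm_mlir.cpp -- hypothetical standalone check for
// cblas_sgemm_mlir (declared in sgemm.cpp above).
#include <blas_mlir.h>  // assumed header name, see sgemm.cpp
#include <cstdio>

int main() {
  // Row-major 2x3 * 3x2 product. The wrapper asserts alpha == 1 and
  // beta in {0, 1}; beta == 0 makes it clear C before accumulating.
  const float A[6] = {1, 2, 3, 4, 5, 6};  // 2x3, LDA = 3
  const float B[6] = {1, 0, 0, 1, 1, 1};  // 3x2, LDB = 2
  float C[4] = {-1, -1, -1, -1};          // 2x2, LDC = 2; overwritten below
  cblas_sgemm_mlir(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                   /*M=*/2, /*N=*/2, /*K=*/3,
                   /*alpha=*/1.0f, A, /*LDA=*/3, B, /*LDB=*/2,
                   /*beta=*/0.0f, C, /*LDC=*/2);
  // Expected result: C = {4, 5, 10, 11}.
  std::printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]);
  return 0;
}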